In [2]:
# Step 1: Install Required Libraries (for Colab or local)
# !pip install transformers torch datasets pandas

# Step 2: Import Necessary Libraries
import pandas as pd
import json
import torch
import os
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
!pip install transformers torch datasets pandas scikit-learn

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score


os.environ["WANDB_DISABLED"] = "true"

# Step 3: Load Your Data
data = pd.read_parquet('/content/senti.parquet')  # Ensure the path is correct
print(data.head())  # Inspect the first few rows of the dataset
print(data.columns)  # Check the column names

# Load the label mapping from the JSON file
with open('/content/labels.json', 'r') as f:
    labels = json.load(f)

print(labels)  # Check the label mapping

# Step 4: Preprocess Data (with padding and truncation)
max_len = 128  # Adjust this value based on your use case

# Tokenize text without converting to tensors directly
def tokenize_function(text):
    return tokenizer(
        text,
        padding='max_length',
        truncation=True,
        max_length=max_len
    )

# Initialize the tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

# Apply the tokenizer to the 'Text' column
data['inputs'] = data['Text'].apply(lambda x: tokenize_function(x))

# Map sentiment labels to numerical values
label_mapping = {'SP': 0, 'WP': 1, 'NU': 2, 'WN': 3, 'SN': 4}
data['label'] = data['Polarity'].map(label_mapping)

# Step 5: Train-test Split
train_data, val_data = train_test_split(data, test_size=0.2, random_state=42)

# Step 6: Create Custom Dataset Class (updated)
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, text_data, labels):
        self.input_ids = torch.stack([torch.tensor(item['input_ids']) for item in text_data])
        self.attention_masks = torch.stack([torch.tensor(item['attention_mask']) for item in text_data])
        self.labels = torch.tensor(labels.tolist(), dtype=torch.long)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return {
            'input_ids': self.input_ids[idx],
            'attention_mask': self.attention_masks[idx],
            'labels': self.labels[idx]
        }

# Create datasets
train_dataset = CustomDataset(train_data['inputs'], train_data['label'])
val_dataset = CustomDataset(val_data['inputs'], val_data['label'])

# Step 7: Load the mBERT Sequence Classification Model
model = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased', num_labels=5)

# Freeze all BERT layers
for param in model.bert.parameters():
    param.requires_grad = False

# Define a compute_metrics function for evaluation
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    accuracy = accuracy_score(labels, preds)
    f1 = f1_score(labels, preds, average='weighted')  # Use weighted F1 score
    return {'accuracy': accuracy, 'f1': f1}

# Only the classifier head will be trained
# Step 8: Train the Model
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=4,  # Adjust based on GPU capacity
    gradient_accumulation_steps=2,  # Simulate larger batch sizes if needed
    per_device_eval_batch_size=4,
    warmup_steps=500,
    weight_decay=0.01,
    fp16=True,  # Mixed precision training for GPU memory optimization
    logging_dir='./logs',
    evaluation_strategy="epoch",  # Evaluate after every epoch
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,  # Pass the validation set
    compute_metrics=compute_metrics  # Include the metrics function
)

# Start training
trainer.train()

# Evaluate the model
trainer.evaluate()

Collecting datasets
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.17-py310-none-any.whl.metadata (7.2 kB)
INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with other requirements. This could take a while.
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.1-py3-none-any.whl (471 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m471.6/471.6 kB[0m [31m21.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.9 MB/s[0m eta [36m0:00:

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,1.5773,1.570686,0.317071,0.254451
2,1.5646,1.555078,0.329714,0.276237
3,1.5585,1.550461,0.332643,0.279577


{'eval_loss': 1.550460934638977,
 'eval_accuracy': 0.33264285714285713,
 'eval_f1': 0.2795771011675683,
 'eval_runtime': 44.4831,
 'eval_samples_per_second': 314.727,
 'eval_steps_per_second': 78.682,
 'epoch': 3.0}