<a href="https://colab.research.google.com/github/imvktiwari/final_samudra_project/blob/main/mbert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# Step 1: Install Required Libraries (for Colab or local)
# !pip install transformers torch datasets pandas

# Step 2: Import Necessary Libraries
import pandas as pd
import json
import torch
import os
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split


# Keep the Trainer from reporting to Weights & Biases. transformers warns that
# this environment variable is deprecated in favour of the `report_to`
# training argument; it stays here so this step keeps working on its own.
os.environ["WANDB_DISABLED"] = "true"

# Step 3: Load Your Data
# The parquet file must provide the 'Text' and 'Polarity' columns used below.
data = pd.read_parquet('/content/senti.parquet')  # Ensure the path is correct
print(data.head())  # Inspect the first few rows of the dataset
print(data.columns)  # Check the column names

# Load the label mapping from the JSON file.
# The encoding is spelled out so decoding does not depend on the platform's
# default locale encoding.
with open('/content/labels.json', 'r', encoding='utf-8') as f:
    labels = json.load(f)

print(labels)  # Check the label mapping

# Step 4: Preprocess Data (with padding and truncation)
# Every example is padded to / truncated at this many tokens.
max_len = 128  # Adjust this value based on your use case

# Tokenize text without converting to tensors directly
def tokenize_function(text):
    """Encode ``text`` with the module-level ``tokenizer``.

    The encoding is padded to, and truncated at, ``max_len`` tokens. No
    ``return_tensors`` argument is passed, so conversion to tensors is left
    to the caller.
    """
    encode_options = {
        'padding': 'max_length',
        'truncation': True,
        'max_length': max_len,
    }
    return tokenizer(text, **encode_options)

# Initialize the tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

# Apply the tokenizer to the 'Text' column (one encoding per row)
data['inputs'] = data['Text'].apply(tokenize_function)

# Map sentiment labels to numerical values
label_mapping = {'SP': 0, 'WP': 1, 'NU': 2, 'WN': 3, 'SN': 4}
data['label'] = data['Polarity'].map(label_mapping)

# Series.map yields NaN for any polarity code that is missing from
# label_mapping, which silently turns the label column into floats. Fail here,
# with the offending codes, instead of handing corrupt labels to training.
unmapped = data['label'].isna()
if unmapped.any():
    unknown_codes = sorted(
        data.loc[unmapped, 'Polarity'].dropna().astype(str).unique()
    )
    raise ValueError(
        f"{int(unmapped.sum())} row(s) have a 'Polarity' value missing from "
        f"label_mapping: {unknown_codes}"
    )
data['label'] = data['label'].astype('int64')

# Step 5: Train-test Split
# 80/20 split with a fixed seed so the partition is reproducible.
train_data, val_data = train_test_split(data, test_size=0.2, random_state=42)

# Step 6: Create Custom Dataset Class (updated)
class CustomDataset(torch.utils.data.Dataset):
    """Map-style dataset of pre-tokenized examples and their class labels.

    Args:
        text_data: Iterable of mappings, each providing ``'input_ids'`` and
            ``'attention_mask'`` sequences. The sequences must all have the
            same length so they can be stacked into one tensor.
        labels: One integer class label per example. Either an object that
            exposes ``tolist()`` or any plain iterable of integers.

    Raises:
        ValueError: If the number of examples and the number of labels differ.
    """

    def __init__(self, text_data, labels):
        # Materialize once: the encodings are read twice below, which would
        # exhaust a one-shot iterable such as a generator.
        encodings = list(text_data)
        label_values = labels.tolist() if hasattr(labels, 'tolist') else list(labels)

        # __len__ is derived from the labels, so a mismatch would otherwise
        # only show up later as an out-of-range index during training.
        if len(encodings) != len(label_values):
            raise ValueError(
                f"Got {len(encodings)} encoded examples but {len(label_values)} labels"
            )

        self.input_ids = torch.stack(
            [torch.tensor(item['input_ids']) for item in encodings]
        )
        self.attention_masks = torch.stack(
            [torch.tensor(item['attention_mask']) for item in encodings]
        )
        self.labels = torch.tensor(label_values, dtype=torch.long)

    def __len__(self):
        """Return the number of examples."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the example at ``idx`` as a dict with 'input_ids', 'attention_mask' and 'labels'."""
        return {
            'input_ids': self.input_ids[idx],
            'attention_mask': self.attention_masks[idx],
            'labels': self.labels[idx]
        }

# Create datasets: wrap the encodings and labels of each split
train_dataset, val_dataset = (
    CustomDataset(split['inputs'], split['label'])
    for split in (train_data, val_data)
)

# Step 7: Load the mBERT Sequence Classification Model
# num_labels=5 matches the five polarity classes in label_mapping.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-multilingual-cased',
    num_labels=5,
)

# Freeze all BERT layers: no encoder parameter receives gradient updates
model.bert.requires_grad_(False)

# Only the classifier head will be trained
# Step 8: Train the Model
# NOTE(review): no learning_rate is set, so the library default applies. With
# the encoder frozen, the logged loss barely moved over three epochs
# (1.5759 -> 1.5587) - consider a larger learning rate for the head or
# unfreezing some encoder layers; confirm with an experiment.
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=4,  # Adjust based on GPU capacity
    gradient_accumulation_steps=2,  # Simulate larger batch sizes if needed
    per_device_eval_batch_size=4,
    warmup_steps=500,
    weight_decay=0.01,
    # Mixed precision needs a CUDA device; enable it only when one is present
    # so the script still runs on a CPU-only machine.
    fp16=torch.cuda.is_available(),
    logging_dir='./logs',
    # NOTE(review): newer transformers releases spell this `eval_strategy` -
    # confirm against the installed version before renaming.
    evaluation_strategy="epoch",  # Evaluate after every epoch
    # Explicitly disable logging integrations (W&B etc.); this is the
    # replacement recommended by the WANDB_DISABLED deprecation warning.
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,  # Pass the validation set
)

# Start training
trainer.train()


                                 ID  \
0  7abbf5aeb011e883c0a47a5e299b371e   
1  1089584daf51e3be29f985bcb935d1fa   
2  1742705348624306b2a6e9c256213808   
3  31aa85f5fd4918a960937825b226b597   
4  37c5947791f4d1a787ba9b1111b0e87b   

                                              Text Polarity    Domain  
0  যাওয়ার সময় যেন স্বাস্থ্যমন্ত্রীকে সাথে নিয়ে যায়       NU  facebook  
1              তার আগে যদি আপনি বিদায় নিতেন স্যার,       NU  facebook  
2   রাষ্ট্রের তহবিল কি একেবারে তলানিতে গিয়ে ঠেকেছে       NU  facebook  
3            সাথে আপনাকে চিপ গেস্ট হিসেবে নিয়ে যাক       NU  facebook  
4     শ্রিপাকে ও রিমান্ডে নিয়ে জিজ্ঞাসাবাদ করা হোক       NU  facebook  
Index(['ID', 'Text', 'Polarity', 'Domain'], dtype='object')
{'SP': 'Strongly Positive', 'WP': 'Weakly Positive', 'NU': 'Neutral', 'WN': 'Weakly Negative', 'SN': 'Strongly Negative'}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss
1,1.5759,1.570111
2,1.5626,1.554654
3,1.5587,1.550117


TrainOutput(global_step=21000, training_loss=1.573301045735677, metrics={'train_runtime': 1199.581, 'train_samples_per_second': 140.049, 'train_steps_per_second': 17.506, 'total_flos': 1.1050961983488e+16, 'train_loss': 1.573301045735677, 'epoch': 3.0})