<a href="https://colab.research.google.com/github/imvktiwari/final_samudra_project/blob/main/muriltrain.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Step 1: Import Necessary Libraries
import pandas as pd
import json
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

import os
os.environ["WANDB_DISABLED"] = "true"

# Step 2: Load Your Data
# The parquet file is expected to provide at least the 'Text' and 'Polarity'
# columns, which the preprocessing steps below read.
data = pd.read_parquet('/content/senti.parquet')  # Ensure the path is correct
print(data.head())  # Inspect the first few rows of the dataset
print(data.columns)  # Check the column names

# Load the label mapping from the JSON file.
# The encoding is given explicitly so decoding does not depend on the
# platform's default locale.
with open('/content/labels.json', 'r', encoding='utf-8') as f:
    labels = json.load(f)

print(labels)  # Check the label mapping

# Step 3: Preprocess Data (with padding and truncation)
# Fixed sequence length that tokenization pads/truncates to; tune it to the
# typical length of your texts.
max_len = 128

# MuRIL tokenizer, loaded by checkpoint name.
tokenizer = AutoTokenizer.from_pretrained(
    "google/muril-base-cased",
)

# Tokenize text with fixed padding and truncation
def tokenize_function(text):
    """Encode ``text`` with the module-level ``tokenizer``.

    The text is padded to, and truncated at, ``max_len`` tokens and the
    encoding is requested as PyTorch tensors.

    Args:
        text: The raw text to encode.

    Returns:
        Whatever ``tokenizer`` returns for these options; the rest of the
        notebook reads its ``'input_ids'`` and ``'attention_mask'`` entries.
    """
    encode_options = {
        "padding": 'max_length',
        "truncation": True,
        "max_length": max_len,
        "return_tensors": "pt",
    }
    return tokenizer(text, **encode_options)

# Apply the tokenizer to the 'Text' column (one encoding per row)
data['inputs'] = data['Text'].apply(tokenize_function)

# Map sentiment labels to numerical values
label_mapping = {'SP': 0, 'WP': 1, 'NU': 2, 'WN': 3, 'SN': 4}
data['label'] = data['Polarity'].map(label_mapping)

# Series.map turns every code missing from label_mapping into NaN. Fail here
# with a clear message rather than letting NaN reach the label tensor.
unmapped_codes = data.loc[data['label'].isna(), 'Polarity'].unique()
if len(unmapped_codes) > 0:
    raise ValueError(
        f"Polarity values without an entry in label_mapping: {list(unmapped_codes)}"
    )
data['label'] = data['label'].astype('int64')

# Step 4: Train-test Split (80/20, fixed seed for reproducibility)
train_data, val_data = train_test_split(data, test_size=0.2, random_state=42)

# Step 5: Create Custom Dataset Class
class CustomDataset(torch.utils.data.Dataset):
    """Map-style dataset holding pre-tokenized inputs and their labels.

    Args:
        text_data: Iterable of mappings, each with ``'input_ids'`` and
            ``'attention_mask'`` tensors. A leading batch axis of size 1
            (as produced by tokenizing one text with ``return_tensors="pt"``)
            is removed; all rows must then share the same shape.
        labels: Object with a ``tolist()`` method (e.g. a pandas Series)
            yielding one integer class id per item of ``text_data``.

    Raises:
        ValueError: If the number of encodings and labels differ, or if any
            label is NaN (e.g. an unmapped class code).
    """

    def __init__(self, text_data, labels):
        encodings = list(text_data)
        label_values = labels.tolist()

        if len(encodings) != len(label_values):
            raise ValueError(
                f"Got {len(encodings)} encodings but {len(label_values)} labels"
            )
        # NaN is the only value that is not equal to itself; converting it to
        # an integer tensor would not give a meaningful class id.
        if any(value != value for value in label_values):
            raise ValueError("labels contain missing (NaN) values")

        self.input_ids = torch.stack(
            [self._drop_batch_axis(item['input_ids']) for item in encodings]
        )
        self.attention_masks = torch.stack(
            [self._drop_batch_axis(item['attention_mask']) for item in encodings]
        )
        self.labels = torch.tensor(label_values, dtype=torch.long)

    @staticmethod
    def _drop_batch_axis(tensor):
        """Strip leading size-1 axes but always keep the last (sequence) axis.

        A bare ``squeeze()`` would also collapse the sequence axis when the
        sequence length happens to be 1.
        """
        while tensor.dim() > 1 and tensor.size(0) == 1:
            tensor = tensor[0]
        return tensor

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        # Keys follow the names the rest of the notebook passes to Trainer.
        return {
            'input_ids': self.input_ids[idx],
            'attention_mask': self.attention_masks[idx],
            'labels': self.labels[idx]
        }

# Create datasets
train_dataset = CustomDataset(train_data['inputs'], train_data['label'])
val_dataset = CustomDataset(val_data['inputs'], val_data['label'])

# Step 6: Load the MuRIL Sequence Classification Model
# num_labels is derived from label_mapping so the head always matches the labels.
model = AutoModelForSequenceClassification.from_pretrained(
    "google/muril-base-cased", num_labels=len(label_mapping)
)

# Freeze the embeddings and lower encoder layers, but keep the top encoder
# layers and the pooler trainable. With the *entire* encoder frozen, the logged
# run stayed at a loss of ~1.605 (about ln 5, i.e. uniform guessing over the
# 5 classes) and ~22% accuracy: the freshly initialised linear head alone did
# not learn anything. Raise this value (up to the number of encoder layers) to
# fine-tune more of the model, at the cost of GPU memory.
num_trainable_encoder_layers = 2

for param in model.bert.parameters():
    param.requires_grad = False
    param.data = param.data.contiguous()  # Ensure tensors are contiguous

trainable_modules = list(model.bert.encoder.layer[-num_trainable_encoder_layers:])
if model.bert.pooler is not None:
    trainable_modules.append(model.bert.pooler)
for module in trainable_modules:
    for param in module.parameters():
        param.requires_grad = True

# Step 7: Define compute_metrics function for Trainer
def compute_metrics(eval_pred):
    """Compute classification accuracy for the Trainer's evaluation loop.

    Args:
        eval_pred: A ``(logits, labels)`` pair of NumPy arrays. ``logits`` has
            the class scores on its last axis; if it is a tuple of arrays,
            the first element is taken as the logits.

    Returns:
        ``{"accuracy": <fraction of predictions equal to labels>}``.

    Raises:
        ValueError: If predictions and labels do not have the same shape.
    """
    logits, labels = eval_pred
    if isinstance(logits, tuple):
        logits = logits[0]

    # The inputs are already NumPy arrays, so argmax directly instead of
    # round-tripping through torch tensors.
    predictions = logits.argmax(axis=-1)
    if predictions.shape != labels.shape:
        raise ValueError(
            f"Shape mismatch: predictions {predictions.shape} vs labels {labels.shape}"
        )

    accuracy = float((predictions == labels).mean())
    return {"accuracy": accuracy}


# Step 8: Training arguments
import inspect  # only needed to pick the right keyword name below

# Newer transformers releases renamed `evaluation_strategy` to `eval_strategy`.
# Use whichever name the installed version accepts so this cell keeps working
# across versions.
eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=4,  # Adjust based on GPU capacity
    gradient_accumulation_steps=2,  # Simulate larger batch sizes if needed
    per_device_eval_batch_size=4,
    warmup_steps=500,
    weight_decay=0.01,
    # Mixed precision needs a CUDA device; fall back to full precision on CPU
    # instead of failing at start-up.
    fp16=torch.cuda.is_available(),
    logging_dir='./logs',
    # Supported way to switch off logging integrations; the WANDB_DISABLED
    # environment variable is reported as deprecated by transformers.
    report_to="none",
    **{eval_strategy_key: "epoch"},  # Evaluate after every epoch
)

# Step 9: Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics  # Add the custom metrics function
)

# Step 10: Start training
trainer.train()

# Step 11: Evaluate the model on the validation set after training
eval_results = trainer.evaluate()

# Step 12: Print accuracy metrics
# Trainer prefixes the keys returned by compute_metrics with "eval_".
print(f"Validation Accuracy: {eval_results['eval_accuracy']:.4f}")

                                 ID  \
0  7abbf5aeb011e883c0a47a5e299b371e   
1  1089584daf51e3be29f985bcb935d1fa   
2  1742705348624306b2a6e9c256213808   
3  31aa85f5fd4918a960937825b226b597   
4  37c5947791f4d1a787ba9b1111b0e87b   

                                              Text Polarity    Domain  
0  যাওয়ার সময় যেন স্বাস্থ্যমন্ত্রীকে সাথে নিয়ে যায়       NU  facebook  
1              তার আগে যদি আপনি বিদায় নিতেন স্যার,       NU  facebook  
2   রাষ্ট্রের তহবিল কি একেবারে তলানিতে গিয়ে ঠেকেছে       NU  facebook  
3            সাথে আপনাকে চিপ গেস্ট হিসেবে নিয়ে যাক       NU  facebook  
4     শ্রিপাকে ও রিমান্ডে নিয়ে জিজ্ঞাসাবাদ করা হোক       NU  facebook  
Index(['ID', 'Text', 'Polarity', 'Domain'], dtype='object')
{'SP': 'Strongly Positive', 'WP': 'Weakly Positive', 'NU': 'Neutral', 'WN': 'Weakly Negative', 'SN': 'Strongly Negative'}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/muril-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Accuracy
1,1.6056,1.604908,0.223786
2,1.6062,1.604844,0.270357
3,1.6061,1.604812,0.223786


Validation Accuracy: 0.2238
