<a href="https://colab.research.google.com/github/imvktiwari/final_samudra_project/blob/main/mbert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Step 1: Install Required Libraries
# NOTE(review): scikit-learn (for train_test_split) and a parquet engine
# (for pd.read_parquet) are used later in this cell but are not installed
# here -- presumably they are pre-installed on the Colab runtime; confirm
# before running this notebook in another environment.
!pip install transformers torch datasets pandas

# Step 2: Import Necessary Libraries
import pandas as pd
import json
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split

# Step 3: Load Your Data
# The frame must provide the 'Text' and 'Polarity' columns that the
# preprocessing step reads.
data = pd.read_parquet('/content/senti.parquet')  # Ensure the path is correct
print(data.head())  # Inspect the first few rows of the dataset
print(data.columns)  # Check the column names

# Load the label mapping from the JSON file.
# JSON text is UTF-8 by specification, so the encoding is passed explicitly
# instead of relying on the platform's locale default.
with open('/content/labels.json', 'r', encoding='utf-8') as f:
    labels = json.load(f)

print(labels)  # Check the label mapping

# Step 4: Preprocess Data (with padding and truncation)
max_len = 128  # Every encoding is padded/truncated to this many tokens

# Tokenize text with fixed padding and truncation
def tokenize_function(text):
    """Encode ``text`` with the module-level ``tokenizer``.

    The input is padded up to, and truncated down to, ``max_len`` tokens and
    the result is requested as PyTorch tensors.

    Args:
        text: The raw text to encode.

    Returns:
        Whatever ``tokenizer(...)`` returns for these options -- presumably a
        mapping with 'input_ids' and 'attention_mask' tensors; verify against
        the tokenizer in use.
    """
    encode_options = {
        'padding': 'max_length',
        'truncation': True,
        'max_length': max_len,
        'return_tensors': "pt",
    }
    return tokenizer(text, **encode_options)

# Initialize the tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

# Apply the tokenizer to the 'Text' column (one encoding per row)
data['inputs'] = data['Text'].apply(tokenize_function)

# Map sentiment labels to numerical values
label_mapping = {'SP': 0, 'WP': 1, 'NU': 2, 'WN': 3, 'SN': 4}
data['label'] = data['Polarity'].map(label_mapping)

# Series.map yields NaN for every value that is not a key of label_mapping.
# Such rows would otherwise flow into the long-typed label tensor as garbage
# class ids, so stop here with a message that names the offending values.
unmapped = data.loc[data['label'].isna(), 'Polarity'].unique().tolist()
if unmapped:
    raise ValueError(
        f"Unmapped 'Polarity' values: {unmapped!r}; "
        f"expected one of {sorted(label_mapping)}"
    )
# With no NaN left the column can be stored as plain integers.
data['label'] = data['label'].astype(int)

# Step 5: Train-test Split
# 80/20 split with a fixed seed so the partition is reproducible.
train_data, val_data = train_test_split(data, test_size=0.2, random_state=42)

# Step 6: Create Custom Dataset Class (updated)
class CustomDataset(torch.utils.data.Dataset):
    """Map-style dataset over pre-tokenized encodings and integer labels.

    Args:
        text_data: Iterable of mapping-like encodings. Each item must provide
            'input_ids' and 'attention_mask' tensors; a leading batch
            dimension of size 1 is removed from each.
        labels: Integer class ids aligned with ``text_data``. Any object with
            ``tolist()`` (pandas Series, NumPy array, tensor) or a plain
            iterable is accepted.

    Raises:
        ValueError: If the number of encodings and labels differ.
    """

    def __init__(self, text_data, labels):
        encodings = list(text_data)
        label_values = labels.tolist() if hasattr(labels, 'tolist') else list(labels)
        if len(encodings) != len(label_values):
            raise ValueError(
                f"Got {len(encodings)} encodings but {len(label_values)} labels"
            )

        self.input_ids = torch.stack(
            [self._drop_batch_dim(item['input_ids']) for item in encodings]
        )
        self.attention_masks = torch.stack(
            [self._drop_batch_dim(item['attention_mask']) for item in encodings]
        )
        self.labels = torch.tensor(label_values, dtype=torch.long)  # Ensure labels are tensors

    @staticmethod
    def _drop_batch_dim(tensor):
        """Remove only a leading size-1 dimension from ``tensor``.

        A bare ``squeeze()`` would also collapse a sequence of length 1 into a
        scalar, which changes the shape of the stacked batch.
        """
        return tensor.squeeze(0) if tensor.dim() > 1 else tensor

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        # Key names follow the keyword arguments the model's forward() takes.
        return {
            'input_ids': self.input_ids[idx],
            'attention_mask': self.attention_masks[idx],
            'labels': self.labels[idx]
        }

# Create datasets (training split first, then validation)
train_dataset, val_dataset = (
    CustomDataset(split['inputs'], split['label'])
    for split in (train_data, val_data)
)

# Step 7: Load the Sequence Classification Model
# The checkpoint is multilingual BERT ('bert-base-multilingual-cased') with a
# 5-way classification head, one output per sentiment class.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-multilingual-cased',
    num_labels=5,
)

# Ensure all model parameters are contiguous
for weight in model.parameters():
    if weight.is_contiguous():
        continue
    weight.data = weight.data.contiguous()

# Step 8: Train the Model
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=4,  # Adjust as necessary
    gradient_accumulation_steps=2,   # To simulate larger batch sizes if needed
    per_device_eval_batch_size=4,
    warmup_steps=500,
    weight_decay=0.01,
    # Mixed precision needs a CUDA device; requesting it unconditionally makes
    # TrainingArguments fail on a CPU-only runtime, so enable it only on GPU.
    fp16=torch.cuda.is_available(),
    logging_dir='./logs',
    # NOTE(review): newer transformers releases name this argument
    # `eval_strategy` -- confirm against the installed version.
    evaluation_strategy="epoch",  # Evaluate after every epoch
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,  # Pass the validation set
)

# Start training
trainer.train()

# Step 9: Save the Fine-Tuned Model
# Model weights and tokenizer files are written to the same directory.
# NOTE: the directory name says "muril", but the checkpoint fine-tuned in
# this notebook is 'bert-base-multilingual-cased'.
for artifact in (model, tokenizer):
    artifact.save_pretrained('./fine_tuned_muril')

# Step 10: Evaluate the Model
# Runs the trainer's evaluation on the validation set and prints the metrics.
eval_results = trainer.evaluate()
print(eval_results)


Collecting datasets
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.17-py310-none-any.whl.metadata (7.2 kB)
INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with other requirements. This could take a while.
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.1-py3-none-any.whl (471 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m471.6/471.6 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.5 MB/s[0m eta [36m0:00:0

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
