In [1]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split


In [2]:
# Load Rusad dataset (TSV file)
rusad_df = pd.read_csv("/content/Dataset 11000 Reviews.tsv", sep='\t')
print("Rusad dataset shape:", rusad_df.shape)
print(rusad_df.head())

Rusad dataset shape: (10999, 2)
   neg  \
0  neg   
1  neg   
2  neg   
3  neg   
4  neg   

   ab achanak khawaja saad rafique ko khiyaal aagaya woh bhe peshawar mai is line pai 10 saal sai koi train nahe guzri  
0   adha drama to censor hi hojaye gaa , khaas to...                                                                    
1   bekaar fuzool end !!!!! moti budhi laila jeet...                                                                    
2            choor kasuri choor jhootay ka moo kaala                                                                    
3   gali gali mein shor hai gaaanjaaaa shair chor...                                                                    
4   hum kia aur hamari logic kia bhonkoon k muted...                                                                    


In [3]:
# Load generated dataset (CSV)
generated_df = pd.read_csv("/content/ur_en_generated.csv")
print("Generated dataset shape:", generated_df.shape)
print(generated_df.head())


Generated dataset shape: (9999, 2)
                                            text     label
0                   perfect service, bilkul mast  positive
1  Mujhe laga pasand nahi aya, but overall worst  negative
2   Mujhe laga ٹھیک ہی تھا, but overall balanced   neutral
3         awesome quality, acha laga performance  positive
4           trash app hai, bohat pasand nahi aya  negative


In [5]:
import pandas as pd

# Load Rusad dataset (TSV)
rusad_df = pd.read_csv("/content/Dataset 11000 Reviews.tsv", sep='\t')
print("Rusad dataset loaded. Shape:", rusad_df.shape)

# Load generated dataset (CSV)
generated_df = pd.read_csv("/content/ur_en_generated.csv")
print("Generated dataset loaded. Shape:", generated_df.shape)


Rusad dataset loaded. Shape: (10999, 2)
Generated dataset loaded. Shape: (9999, 2)


In [6]:
print("Rusad columns:", rusad_df.columns)
print("Rusad sample data:\n", rusad_df.head(), "\n")

print("Generated dataset columns:", generated_df.columns)
print("Generated dataset sample data:\n", generated_df.head())


Rusad columns: Index(['neg', ' ab achanak khawaja saad rafique ko khiyaal aagaya woh bhe peshawar mai is line pai 10 saal sai koi train nahe guzri'], dtype='object')
Rusad sample data:
    neg  \
0  neg   
1  neg   
2  neg   
3  neg   
4  neg   

   ab achanak khawaja saad rafique ko khiyaal aagaya woh bhe peshawar mai is line pai 10 saal sai koi train nahe guzri  
0   adha drama to censor hi hojaye gaa , khaas to...                                                                    
1   bekaar fuzool end !!!!! moti budhi laila jeet...                                                                    
2            choor kasuri choor jhootay ka moo kaala                                                                    
3   gali gali mein shor hai gaaanjaaaa shair chor...                                                                    
4   hum kia aur hamari logic kia bhonkoon k muted...                                                                     

Generated dataset columns

In [7]:
# Rename Rusad columns
rusad_df = rusad_df.rename(columns={
    rusad_df.columns[0]: 'label',
    rusad_df.columns[1]: 'text'
})

# Remove first row if it's a repeated header
if rusad_df.iloc[0]['label'] == 'neg':  # checking first row label
    rusad_df = rusad_df.iloc[1:].reset_index(drop=True)

# Lowercase and strip text
rusad_df['text'] = rusad_df['text'].astype(str).str.lower().str.strip()
rusad_df['label'] = rusad_df['label'].astype(str).str.lower().str.strip()


In [8]:
generated_df['text'] = generated_df['text'].astype(str).str.lower().str.strip()
generated_df['label'] = generated_df['label'].astype(str).str.lower().str.strip()


In [9]:
print("Rusad dataset sample after cleaning:\n", rusad_df.head())
print("Generated dataset sample after cleaning:\n", generated_df.head())

print("Rusad labels:", rusad_df['label'].value_counts())
print("Generated labels:", generated_df['label'].value_counts())


Rusad dataset sample after cleaning:
   label                                               text
0   neg  bekaar fuzool end !!!!! moti budhi laila jeet ...
1   neg            choor kasuri choor jhootay ka moo kaala
2   neg  gali gali mein shor hai gaaanjaaaa shair chor hai
3   neg  hum kia aur hamari logic kia bhonkoon k muted ...
4   neg  imran khan altaf hussain chachoo haroon ab sad...
Generated dataset sample after cleaning:
                                             text     label
0                   perfect service, bilkul mast  positive
1  mujhe laga pasand nahi aya, but overall worst  negative
2   mujhe laga ٹھیک ہی تھا, but overall balanced   neutral
3         awesome quality, acha laga performance  positive
4           trash app hai, bohat pasand nahi aya  negative
Rusad labels: label
pos    5699
neg    5298
          1
Name: count, dtype: int64
Generated labels: label
positive    3333
negative    3333
neutral     3333
Name: count, dtype: int64


In [10]:
# Map Rusad labels to match Generated dataset
rusad_df['label'] = rusad_df['label'].replace({'pos': 'positive', 'neg': 'negative'})

# Check label distribution after mapping
print("Rusad labels after mapping:\n", rusad_df['label'].value_counts())


Rusad labels after mapping:
 label
positive    5699
negative    5298
               1
Name: count, dtype: int64


In [12]:
from sklearn.model_selection import train_test_split
import pandas as pd

# ------------------------------
# Step 1: Combine datasets
# ------------------------------
combined_df = pd.concat([rusad_df, generated_df], ignore_index=True)

# ------------------------------
# Step 2: Remove invalid or unexpected labels
# ------------------------------
valid_labels = ['positive', 'negative', 'neutral']
invalid_rows = combined_df[~combined_df['label'].isin(valid_labels)]
print("Rows with invalid/missing labels:\n", invalid_rows)

# Keep only valid rows
combined_df = combined_df[combined_df['label'].isin(valid_labels)].reset_index(drop=True)

# ------------------------------
# Step 3: Shuffle dataset
# ------------------------------
combined_df = combined_df.sample(frac=1, random_state=42).reset_index(drop=True)

# Check combined label distribution
print("Combined label distribution:\n", combined_df['label'].value_counts())

# ------------------------------
# Step 4: Split dataset (stratified)
# ------------------------------
train_df, temp_df = train_test_split(
    combined_df, test_size=0.2, random_state=42, stratify=combined_df['label']
)
val_df, test_df = train_test_split(
    temp_df, test_size=0.5, random_state=42, stratify=temp_df['label']
)

print("Train shape:", train_df.shape)
print("Validation shape:", val_df.shape)
print("Test shape:", test_df.shape)

# ------------------------------
# Step 5: Save splits for later use
# ------------------------------
train_df.to_csv("/content/train_combined.csv", index=False)
val_df.to_csv("/content/val_combined.csv", index=False)
test_df.to_csv("/content/test_combined.csv", index=False)

print("Datasets saved successfully!")


Rows with invalid/missing labels:
     label                                               text
470        bad naseeb jb khud ko kaha jye to khush naseeb...
Combined label distribution:
 label
positive    9032
negative    8631
neutral     3333
Name: count, dtype: int64
Train shape: (16796, 2)
Validation shape: (2100, 2)
Test shape: (2100, 2)
Datasets saved successfully!


In [13]:
# Downsample generated dataset to 2,500 samples
synthetic_sample_size = 2500
generated_df_sampled = generated_df.sample(n=synthetic_sample_size, random_state=42).reset_index(drop=True)

print("Downsampled synthetic dataset shape:", generated_df_sampled.shape)
print(generated_df_sampled['label'].value_counts())


Downsampled synthetic dataset shape: (2500, 2)
label
neutral     858
positive    828
negative    814
Name: count, dtype: int64


In [14]:
# Merge Rusad (real) + sampled synthetic dataset
combined_df_realistic = pd.concat([rusad_df, generated_df_sampled], ignore_index=True)

# Shuffle the combined dataset
combined_df_realistic = combined_df_realistic.sample(frac=1, random_state=42).reset_index(drop=True)

# Check label distribution after merging
print("Combined realistic dataset label distribution:\n", combined_df_realistic['label'].value_counts())


Combined realistic dataset label distribution:
 label
positive    6527
negative    6112
neutral      858
               1
Name: count, dtype: int64


In [15]:
from sklearn.model_selection import train_test_split

# Keep only valid labels
valid_labels = ['positive', 'negative', 'neutral']
combined_df_realistic = combined_df_realistic[combined_df_realistic['label'].isin(valid_labels)].reset_index(drop=True)

# Shuffle
combined_df_realistic = combined_df_realistic.sample(frac=1, random_state=42).reset_index(drop=True)

# Stratified split: 80% train, 10% val, 10% test
train_df, temp_df = train_test_split(
    combined_df_realistic, test_size=0.2, random_state=42, stratify=combined_df_realistic['label']
)
val_df, test_df = train_test_split(
    temp_df, test_size=0.5, random_state=42, stratify=temp_df['label']
)

# Check shapes and label distribution
print("Train shape:", train_df.shape)
print("Validation shape:", val_df.shape)
print("Test shape:", test_df.shape)

print("\nTrain label distribution:\n", train_df['label'].value_counts())
print("\nValidation label distribution:\n", val_df['label'].value_counts())
print("\nTest label distribution:\n", test_df['label'].value_counts())

# Save datasets
train_df.to_csv("/content/train_realistic.csv", index=False)
val_df.to_csv("/content/val_realistic.csv", index=False)
test_df.to_csv("/content/test_realistic.csv", index=False)

print("\nRealistic datasets saved successfully!")


Train shape: (10797, 2)
Validation shape: (1350, 2)
Test shape: (1350, 2)

Train label distribution:
 label
positive    5221
negative    4889
neutral      687
Name: count, dtype: int64

Validation label distribution:
 label
positive    653
negative    611
neutral      86
Name: count, dtype: int64

Test label distribution:
 label
positive    653
negative    612
neutral      85
Name: count, dtype: int64

Realistic datasets saved successfully!


In [16]:
!pip install transformers datasets torch --quiet

import torch
from transformers import AutoTokenizer
from torch.utils.data import Dataset, DataLoader
import pandas as pd


In [17]:
# Load datasets
train_df = pd.read_csv("/content/train_realistic.csv")
val_df = pd.read_csv("/content/val_realistic.csv")
test_df = pd.read_csv("/content/test_realistic.csv")

print("Train shape:", train_df.shape)
print("Validation shape:", val_df.shape)
print("Test shape:", test_df.shape)


Train shape: (10797, 2)
Validation shape: (1350, 2)
Test shape: (1350, 2)


In [18]:
model_name = "xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Check tokenizer
print("Tokenizer loaded:", tokenizer)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

Tokenizer loaded: XLMRobertaTokenizerFast(name_or_path='xlm-roberta-base', vocab_size=250002, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': '<mask>'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	0: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	250001: AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=False, special=True),
}
)


In [19]:
class SentimentDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_length=128):
        self.texts = dataframe['text'].tolist()
        self.labels = dataframe['label'].tolist()
        self.tokenizer = tokenizer
        self.max_length = max_length
        # Map labels to integers
        self.label2id = {'positive': 0, 'negative': 1, 'neutral': 2}

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = self.label2id[self.labels[idx]]

        encoding = self.tokenizer(
            text,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt'
        )

        return {
            'input_ids': encoding['input_ids'].squeeze(),  # remove batch dim
            'attention_mask': encoding['attention_mask'].squeeze(),
            'labels': torch.tensor(label, dtype=torch.long)
        }


In [20]:
# Parameters
max_length = 128
batch_size = 16

# Datasets
train_dataset = SentimentDataset(train_df, tokenizer, max_length)
val_dataset = SentimentDataset(val_df, tokenizer, max_length)
test_dataset = SentimentDataset(test_df, tokenizer, max_length)

# DataLoaders
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

print("DataLoaders ready!")


DataLoaders ready!


In [22]:
from transformers import AutoModelForSequenceClassification
from torch.optim import AdamW  # Correct import

# Initialize XLM-RoBERTa for 3-class classification
model = AutoModelForSequenceClassification.from_pretrained(
    "xlm-roberta-base",
    num_labels=3
)

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Optimizer
optimizer = AdamW(model.parameters(), lr=2e-5)

# Loss function
loss_fn = torch.nn.CrossEntropyLoss()

print("Model and optimizer ready on device:", device)


model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model and optimizer ready on device: cuda


In [23]:
from tqdm import tqdm

epochs = 2  # start with 2 epochs, increase later if needed

for epoch in range(epochs):
    model.train()
    loop = tqdm(train_loader, leave=True)
    total_loss = 0

    for batch in loop:
        optimizer.zero_grad()

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

        loop.set_description(f'Epoch {epoch+1}')
        loop.set_postfix(loss=loss.item())

    print(f"Epoch {epoch+1} average loss: {total_loss/len(train_loader):.4f}")


Epoch 1: 100%|██████████| 675/675 [04:33<00:00,  2.47it/s, loss=0.306]


Epoch 1 average loss: 0.5310


Epoch 2: 100%|██████████| 675/675 [04:38<00:00,  2.42it/s, loss=0.326]

Epoch 2 average loss: 0.3406





In [24]:
from sklearn.metrics import accuracy_score, f1_score

def evaluate(model, dataloader):
    model.eval()
    preds = []
    true_labels = []

    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            predictions = torch.argmax(logits, dim=-1)

            preds.extend(predictions.cpu().numpy())
            true_labels.extend(labels.cpu().numpy())

    acc = accuracy_score(true_labels, preds)
    f1 = f1_score(true_labels, preds, average='weighted')
    return acc, f1

# Example: Evaluate on validation set
val_acc, val_f1 = evaluate(model, val_loader)
print(f"Validation Accuracy: {val_acc:.4f}, F1-score: {val_f1:.4f}")


Validation Accuracy: 0.8526, F1-score: 0.8521


In [25]:
import os

# Define save directory
save_dir = "/content/xlm_roberta_sentiment_model"
os.makedirs(save_dir, exist_ok=True)

# Save model
model.save_pretrained(save_dir)

# Save tokenizer
tokenizer.save_pretrained(save_dir)

print(f"Model and tokenizer saved successfully at: {save_dir}")


Model and tokenizer saved successfully at: /content/xlm_roberta_sentiment_model


In [26]:
import shutil

# Path of the saved model folder
save_dir = "/content/xlm_roberta_sentiment_model"

# Create a zip file
shutil.make_archive("/content/xlm_roberta_sentiment_model", 'zip', save_dir)

print("Model folder zipped successfully!")


Model folder zipped successfully!


In [28]:
from google.colab import files

# Download the zip
files.download("/content/xlm_roberta_sentiment_model.zip")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [29]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Load from saved folder
save_dir = "/content/xlm_roberta_sentiment_model"
tokenizer = AutoTokenizer.from_pretrained(save_dir)
model = AutoModelForSequenceClassification.from_pretrained(save_dir)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

print("Model loaded successfully for inference!")


Model loaded successfully for inference!


In [30]:
def predict_sentiment(text):
    # Tokenize input
    encoding = tokenizer(
        text,
        padding='max_length',
        truncation=True,
        max_length=128,
        return_tensors='pt'
    )
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    # Get model prediction
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        pred_label_id = torch.argmax(logits, dim=-1).item()

    # Map back to label
    id2label = {0: 'positive', 1: 'negative', 2: 'neutral'}
    return id2label[pred_label_id]


In [31]:
samples = [
    "Mujhe ye app bohat pasand aayi, service amazing thi!",
    "Worst experience ever, bilkul time waste hua.",
    "The product is okay, kuch acha hai aur kuch bekar."
]

for s in samples:
    print(f"Text: {s}")
    print(f"Predicted Sentiment: {predict_sentiment(s)}\n")


Text: Mujhe ye app bohat pasand aayi, service amazing thi!
Predicted Sentiment: positive

Text: Worst experience ever, bilkul time waste hua.
Predicted Sentiment: negative

Text: The product is okay, kuch acha hai aur kuch bekar.
Predicted Sentiment: neutral



In [32]:
import pandas as pd
import random

# Load your existing datasets
rusad_df = pd.read_csv("/content/train_combined.csv")  # or wherever your saved realistic dataset is
generated_df = pd.read_csv("/content/ur_en_generated.csv")

# Create a list of neutral sentences (code-mixed Roman Urdu + English)
neutral_sentences = [
    "Main bazar gaya aur sab theek tha.",
    "Kal mausam thoda sa cloudy tha.",
    "Office me sab log kaam kar rahe the.",
    "Mujhe library me kuch books mil gayi.",
    "School me students morning assembly me khade the.",
    "Ghar me TV chal raha tha.",
    "Bus station par log wait kar rahe the.",
    "Market me fruits aur vegetables available the.",
    "Main park me walk kar raha tha.",
    "Mausam thoda sa thanda tha.",
    "Train station par train delay thi.",
    "Main coffee shop me tha aur log quietly baat kar rahe the.",
    "Hospital ke bahar log line me khade the.",
    "Library me students quietly padh rahe the.",
    "Main restaurant me order place kar raha tha."
]

# Upsample neutral sentences to reach ~3k
augmented_neutral = []
while len(augmented_neutral) < 3000:
    sentence = random.choice(neutral_sentences)
    augmented_neutral.append({"text": sentence, "label": "neutral"})

neutral_df = pd.DataFrame(augmented_neutral)

print("Augmented neutral dataset shape:", neutral_df.shape)
neutral_df.head()


Augmented neutral dataset shape: (3000, 2)


Unnamed: 0,text,label
0,Main restaurant me order place kar raha tha.,neutral
1,Market me fruits aur vegetables available the.,neutral
2,Hospital ke bahar log line me khade the.,neutral
3,Main restaurant me order place kar raha tha.,neutral
4,Main coffee shop me tha aur log quietly baat k...,neutral


In [33]:
from sklearn.model_selection import train_test_split

# Load your existing combined realistic dataset (positive + negative + small neutral)
combined_df = pd.read_csv("/content/train_combined.csv")  # this should be the realistic dataset you already prepared

# Drop any rows with missing labels just in case
combined_df = combined_df.dropna(subset=["label", "text"])

# Merge augmented neutral dataset
full_df = pd.concat([combined_df, neutral_df], ignore_index=True)

# Shuffle the dataset
full_df = full_df.sample(frac=1, random_state=42).reset_index(drop=True)

# Check the label distribution
print("Updated label distribution:\n", full_df['label'].value_counts())

# Split dataset: 80% train, 10% validation, 10% test (stratified by label)
train_df, temp_df = train_test_split(full_df, test_size=0.2, random_state=42, stratify=full_df['label'])
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42, stratify=temp_df['label'])

print("Train shape:", train_df.shape)
print("Validation shape:", val_df.shape)
print("Test shape:", test_df.shape)

# Save splits for fine-tuning
train_df.to_csv("/content/train_balanced.csv", index=False)
val_df.to_csv("/content/val_balanced.csv", index=False)
test_df.to_csv("/content/test_balanced.csv", index=False)

print("Balanced datasets saved successfully!")


Updated label distribution:
 label
positive    7225
negative    6905
neutral     5666
Name: count, dtype: int64
Train shape: (15836, 2)
Validation shape: (1980, 2)
Test shape: (1980, 2)
Balanced datasets saved successfully!


In [35]:
# Install dependencies
!pip install transformers datasets accelerate evaluate





In [43]:
import os
os.environ["WANDB_DISABLED"] = "true"


In [44]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import evaluate
import pandas as pd

In [45]:
# Set device
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", device)

# Load train/val datasets
train_df = pd.read_csv("/content/train_balanced.csv")
val_df = pd.read_csv("/content/val_balanced.csv")

# Convert to Hugging Face Dataset
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)

# Label mapping
label_list = ["negative", "neutral", "positive"]
label2id = {label: i for i, label in enumerate(label_list)}
id2label = {i: label for label, i in label2id.items()}

Using device: cuda


In [46]:
def encode_labels(example):
    example["label"] = label2id[example["label"]]
    return example

train_dataset = train_dataset.map(encode_labels)
val_dataset = val_dataset.map(encode_labels)

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")

# Tokenization
def tokenize(batch):
    return tokenizer(batch["text"], padding="max_length", truncation=True, max_length=128)

train_dataset = train_dataset.map(tokenize, batched=True)
val_dataset = val_dataset.map(tokenize, batched=True)

# Set format for PyTorch
columns = ["input_ids", "attention_mask", "label"]
train_dataset.set_format(type="torch", columns=columns)
val_dataset.set_format(type="torch", columns=columns)

# Load model
model = AutoModelForSequenceClassification.from_pretrained(
    "xlm-roberta-base",
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id
)
model.to(device)

# Metric using 'evaluate' library
metric = evaluate.load("f1")


Map:   0%|          | 0/15836 [00:00<?, ? examples/s]

Map:   0%|          | 0/1980 [00:00<?, ? examples/s]

Map:   0%|          | 0/15836 [00:00<?, ? examples/s]

Map:   0%|          | 0/1980 [00:00<?, ? examples/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [47]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    f1 = metric.compute(predictions=preds, references=labels, average="weighted")["f1"]
    return {"f1": f1}

In [48]:
# Training arguments
training_args = TrainingArguments(
    output_dir="/content/xlm_roberta_sentiment",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=2,
    logging_dir="./logs",
    load_best_model_at_end=True,
    metric_for_best_model="f1"
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [49]:
# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)


In [50]:
# Fine-tune
trainer.train()

# Save final model and tokenizer
model.save_pretrained("/content/xlm_roberta_sentiment_final")
tokenizer.save_pretrained("/content/xlm_roberta_sentiment_final")

print("Fine-tuning complete and model saved!")

Epoch,Training Loss,Validation Loss,F1
1,0.4456,0.202823,0.906639
2,0.2083,0.214788,0.915652
3,0.1688,0.235283,0.924755


Epoch,Training Loss,Validation Loss,F1
1,0.4456,0.202823,0.906639
2,0.2083,0.214788,0.915652
3,0.1688,0.235283,0.924755


Fine-tuning complete and model saved!


In [51]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import shutil

# Suppose your trained model is saved in "xlm_roberta_sentiment_final"
model_save_path = "/content/xlm_roberta_sentiment_final"

# Save the model and tokenizer
model.save_pretrained(model_save_path)
tokenizer.save_pretrained(model_save_path)

# Zip the folder for easy download
shutil.make_archive("/content/xlm_roberta_sentiment_final", 'zip', model_save_path)

# After this, download the zip file from Colab
from google.colab import files
files.download("/content/xlm_roberta_sentiment_final.zip")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [52]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Path to your saved model folder
model_path = "/content/xlm_roberta_sentiment_final"

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)

# Use CPU or GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

# Function to predict sentiment
def predict_sentiment(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(device)
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        pred = torch.argmax(logits, dim=1).item()
    # Map prediction to label
    labels = ["negative", "neutral", "positive"]  # ensure same order as during training
    return labels[pred]




Text: Mujhe ye app bohat pasand aayi, service amazing thi!
Predicted Sentiment: positive

Text: Worst experience ever, bilkul time waste hua.
Predicted Sentiment: negative

Text: Kal mausam thoda sa cloudy tha.
Predicted Sentiment: neutral



In [56]:
# Test examples
examples = [
    "School me students morning assembly me khade the.",
    "Worst experience ever, bilkul time waste hua.",
    "Main bazar gaya aur sab theek tha."
]

for text in examples:
    print("Text:", text)
    print("Predicted Sentiment:", predict_sentiment(text))
    print()

Text: School me students morning assembly me khade the.
Predicted Sentiment: neutral

Text: Worst experience ever, bilkul time waste hua.
Predicted Sentiment: negative

Text: Main bazar gaya aur sab theek tha.
Predicted Sentiment: neutral

