<a href="https://colab.research.google.com/github/harshit1441/NLP/blob/main/NLP_final_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Cell 1: Install libraries (run in Colab). If you're running locally, you may not need installs.
!pip install -q datasets transformers evaluate seqeval accelerate gradio scikit-learn matplotlib torch>=1.12.0
!pip install -U transformers accelerate


# GPU check (optional)
import torch
print("PyTorch version:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("GPU:", torch.cuda.get_device_name(0))


In [None]:
# Cell 2: Imports and global config
import os
import random
import numpy as np
from datasets import load_dataset, DatasetDict
from transformers import (AutoTokenizer, AutoModelForSequenceClassification,
                          TrainingArguments, Trainer, DataCollatorWithPadding)
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import torch

# Reproducibility: seed Python's, NumPy's and PyTorch's RNGs with one shared value
seed = 42
for _seed_rng in (random.seed, np.random.seed, torch.manual_seed):
    _seed_rng(seed)
del _seed_rng  # keep the notebook namespace free of loop leftovers

# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print("Device:", device)


In [None]:
# Cell 3: Load the review dataset and take a quick look at it
# NOTE: the original plan was amazon_reviews_multi (multilingual, 1-5 star ratings),
# but the dataset actually loaded below is amazon_polarity. Later cells rely on
# its 'content' (review text) and 'label' (0: negative, 1: positive) columns.
ds = load_dataset("amazon_polarity")  # used as a fallback for amazon_reviews_multi
# Note: If you prefer amazon_reviews_multi, replace above with:
# ds = load_dataset("amazon_reviews_multi", "default")
# NOTE(review): later cells assume a binary 'label' column and a 'content' text
# column; they would presumably need adapting for that dataset — TODO confirm its schema.

# Quick peek: print the available splits and the first training example
print(ds)
print(ds['train'][0])


In [None]:
# Cell 4: Build small train/test subsets for quick experiments
# amazon_polarity labels are already binary (0: negative, 1: positive), so no
# label mapping is required here.
# Both subsets are carved out of ds['train']: 10% is held out as the test pool,
# then each side is shuffled and truncated to a demo-sized sample.
# If the dataset has no language field, multilingual evaluation has to be
# simulated separately (e.g. via translation or other sources).

_holdout_split = ds['train'].train_test_split(test_size=0.1, seed=seed)
_train_subset = _holdout_split['train'].shuffle(seed=seed).select(range(20000))  # limit size for demo
_test_subset = _holdout_split['test'].shuffle(seed=seed).select(range(5000))

dataset = DatasetDict({'train': _train_subset, 'test': _test_subset})
print(dataset)
print("Example:", dataset['train'][0])


In [None]:
# Cell 5: Light model selection + tokenizer setup (memory-friendly)
import os
from transformers import AutoTokenizer

# Disable Weights & Biases tracking
os.environ.update({"WANDB_DISABLED": "true"})

# Smaller (distilled) checkpoints to reduce RAM/VRAM
model_name_mono, model_name_multi = (
    "distilbert-base-uncased",             # English only
    "distilbert-base-multilingual-cased",  # Multilingual
)

# One tokenizer per checkpoint, loaded in the same order as the names above
tokenizer_mono, tokenizer_multi = (
    AutoTokenizer.from_pretrained(checkpoint)
    for checkpoint in (model_name_mono, model_name_multi)
)

# Upper bound on tokens per review; longer inputs are truncated at tokenization time
max_length = 256
print("Tokenizers loaded successfully ✅")


In [None]:
# Cell 6: Tokenize and keep 'label' column (avoid memory overflow)
from datasets import DatasetDict

# --- Limit dataset size for Colab (adjust numbers if you have more RAM) ---
# Note: this overwrites `dataset` with the smaller sample.
dataset = DatasetDict({
    split_name: dataset[split_name].shuffle(seed=42).select(range(n_rows))
    for split_name, n_rows in (("train", 2000), ("test", 500))
})

def tokenize_mono(examples):
    """Tokenize a batch of reviews with the English tokenizer, truncating to max_length."""
    texts = examples["content"]
    return tokenizer_mono(texts, truncation=True, max_length=max_length)

def tokenize_multi(examples):
    """Tokenize a batch of reviews with the multilingual tokenizer, truncating to max_length."""
    texts = examples["content"]
    return tokenizer_multi(texts, truncation=True, max_length=max_length)

def _drop_extra_columns(tokenized):
    """Return `tokenized` with every column except model inputs and the label removed."""
    wanted = ["input_ids","attention_mask","label"]
    extra = [name for name in tokenized["train"].column_names if name not in wanted]
    return tokenized.remove_columns(extra)

# Tokenize both variants first, then strip unused columns to free RAM
tokenized_mono = dataset.map(tokenize_mono, batched=True)
tokenized_multi = dataset.map(tokenize_multi, batched=True)
tokenized_mono = _drop_extra_columns(tokenized_mono)
tokenized_multi = _drop_extra_columns(tokenized_multi)

print("✅ Tokenization done")
print("Columns:", tokenized_mono["train"].column_names)


In [None]:
# Cell 7 (final, tested on transformers 4.40+)
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding
from sklearn.metrics import accuracy_score, f1_score
import numpy as np, torch, os

def make_model_and_trainer(model_name, tokenized_dataset, tokenizer, output_dir,
                           epochs=1, batch_size=8, learning_rate=2e-5):
    """Build a binary sequence-classification model and a Trainer for it.

    Args:
        model_name: Checkpoint name or path handed to
            ``AutoModelForSequenceClassification.from_pretrained``.
        tokenized_dataset: Mapping with tokenized ``"train"`` and ``"test"``
            splits; ``"test"`` is used as the evaluation set.
        tokenizer: Tokenizer matching ``model_name``; used for dynamic padding
            and passed on to the Trainer.
        output_dir: Trainer output directory; logs go to ``<output_dir>/logs``.
        epochs: Number of training epochs (may be fractional, e.g. ``0.5``).
        batch_size: Per-device batch size for both training and evaluation.
        learning_rate: Initial learning rate.

    Returns:
        Tuple ``(model, trainer)``. The model is moved to the notebook-level
        ``device`` before the Trainer is created.
    """
    import inspect  # local import: only needed for the version checks below

    os.environ["WANDB_DISABLED"] = "true"
    num_labels = 2

    model = AutoModelForSequenceClassification.from_pretrained(
        model_name, num_labels=num_labels
    ).to(device)

    training_kwargs = dict(
        output_dir=output_dir,
        save_strategy="no",
        logging_strategy="epoch",
        learning_rate=learning_rate,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=epochs,
        weight_decay=0.01,
        gradient_checkpointing=True,
        fp16=torch.cuda.is_available(),
        report_to="none",          # disable wandb/tensorboard
        logging_dir=f"{output_dir}/logs"
    )
    # The evaluation-schedule argument was renamed across transformers releases
    # (`evaluation_strategy` -> `eval_strategy`); use whichever name the
    # installed version accepts instead of hard-coding one.
    ta_params = inspect.signature(TrainingArguments.__init__).parameters
    eval_key = "eval_strategy" if "eval_strategy" in ta_params else "evaluation_strategy"
    training_kwargs[eval_key] = "epoch"
    args = TrainingArguments(**training_kwargs)

    data_collator = DataCollatorWithPadding(tokenizer)

    def compute_metrics(pred):
        """Return accuracy and weighted F1 from the argmax over the predictions."""
        labels = pred.label_ids
        preds = np.argmax(pred.predictions, axis=1)
        return {
            "accuracy": accuracy_score(labels, preds),
            "f1": f1_score(labels, preds, average="weighted")
        }

    # Newer transformers releases take the tokenizer as `processing_class` and
    # deprecate the `tokenizer` keyword; pick the name the installed Trainer
    # supports so the notebook survives the unpinned upgrade in Cell 1.
    trainer_params = inspect.signature(Trainer.__init__).parameters
    tokenizer_key = "processing_class" if "processing_class" in trainer_params else "tokenizer"

    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=tokenized_dataset["train"],
        eval_dataset=tokenized_dataset["test"],
        data_collator=data_collator,
        compute_metrics=compute_metrics,
        **{tokenizer_key: tokenizer}
    )

    return model, trainer

In [None]:
# Cell 8: Train the monolingual model (fast demo mode)
mono_out = "runs/distilbert_fastdemo"

# The small per-device batch size is passed at construction time: changing
# per_device_*_batch_size on trainer.args after the Trainer has been built is
# not reliably honored. save_strategy="no" is already set by
# make_model_and_trainer, and logging is per-epoch there, so those overrides
# were redundant and are dropped.
model_mono, trainer_mono = make_model_and_trainer(
    model_name_mono, tokenized_mono, tokenizer_mono,
    output_dir=mono_out, epochs=0.5, batch_size=2
)

# Override trainer args for speed: skip evaluation during training.
# The attribute is called `eval_strategy` or `evaluation_strategy` depending on
# the transformers version; assigning a name the args object does not use
# would silently do nothing, so only existing attributes are set.
for _strategy_attr in ("eval_strategy", "evaluation_strategy"):
    if hasattr(trainer_mono.args, _strategy_attr):
        setattr(trainer_mono.args, _strategy_attr, "no")
# NOTE(review): set on the already-built arguments, as in the original cell;
# confirm the installed transformers version honors this after construction.
trainer_mono.args.gradient_accumulation_steps = 4

trainer_mono.train()
print("✅ Training finished (fast demo mode)")

# Evaluate once after training so the monolingual metrics are available to
# later cells, matching `eval_multi` for the multilingual model.
eval_mono = trainer_mono.evaluate()
print("Monolingual eval:", eval_mono)


In [None]:
# Cell 9: Train the multilingual model (distilbert-base-multilingual-cased) on the same English training data
multi_out = "runs/xlm_roberta_multi_demo"  # historical directory name; the model is not XLM-R
# Same per-device batch size as the monolingual run so the comparison is fair.
# It is passed at construction time because changing per_device_*_batch_size on
# trainer.args after the Trainer is built is not reliably honored.
model_multi, trainer_multi = make_model_and_trainer(model_name_multi, tokenized_multi, tokenizer_multi, multi_out, epochs=0.5, batch_size=2, learning_rate=2e-5)
# Override trainer args for speed. These overrides previously targeted
# `trainer_mono` (copy-paste slip), so they never reached this trainer.
# The attribute is called `eval_strategy` or `evaluation_strategy` depending on
# the transformers version; only existing attributes are set.
for _strategy_attr in ("eval_strategy", "evaluation_strategy"):
    if hasattr(trainer_multi.args, _strategy_attr):
        setattr(trainer_multi.args, _strategy_attr, "no")
# NOTE(review): set on the already-built arguments; confirm the installed
# transformers version honors this after construction.
trainer_multi.args.gradient_accumulation_steps = 4
trainer_multi.train()
eval_multi = trainer_multi.evaluate()
print("Multilingual eval:", eval_multi)


In [None]:
# Cell 10: Evaluate and produce predictions + classification report
# (The cell body used to be pasted twice, which redefined the function and ran
# both prediction passes a second time; it now runs once.)
def get_preds_and_report(trainer, dataset, tokenizer):
    """Predict on ``dataset`` with ``trainer`` and print a classification report.

    Args:
        trainer: Object whose ``predict(dataset)`` result exposes
            ``predictions`` (scores, one row per example) and ``label_ids``.
        dataset: Tokenized split to run predictions on.
        tokenizer: Unused; kept so existing call sites keep working.

    Returns:
        Tuple ``(preds, labels)``: the argmax class per example and the
        reference labels.
    """
    preds_output = trainer.predict(dataset)
    preds = np.argmax(preds_output.predictions, axis=1)
    labels = preds_output.label_ids
    print(classification_report(labels, preds, digits=4))
    return preds, labels

print("Monolingual model report:")
mono_preds, mono_labels = get_preds_and_report(trainer_mono, tokenized_mono['test'], tokenizer_mono)

print("Multilingual model report:")
multi_preds, multi_labels = get_preds_and_report(trainer_multi, tokenized_multi['test'], tokenizer_multi)


In [None]:
# Cell 11: Simple visualizations
import seaborn as sns
from sklearn.metrics import ConfusionMatrixDisplay

# Accuracy and weighted F1, computed from the test-set predictions collected in
# Cell 10. The previous version read `eval_mono`, which no cell defined, and
# its `or 0` fallback would have plotted a missing metric as a score of 0.
mono_acc = accuracy_score(mono_labels, mono_preds)
mono_f1 = f1_score(mono_labels, mono_preds, average="weighted")
multi_acc = accuracy_score(multi_labels, multi_preds)
multi_f1 = f1_score(multi_labels, multi_preds, average="weighted")

# Bar chart comparing accuracy/f1
# Tick labels name the checkpoints actually trained (both are DistilBERT
# variants), not BERT / XLM-R.
labels = ["Monolingual (DistilBERT)", "Multilingual (DistilBERT-multilingual)"]
accs = [mono_acc, multi_acc]
f1s = [mono_f1, multi_f1]

x = np.arange(len(labels))
width = 0.35  # bar width; the two bars of a model sit side by side around its tick

fig, ax = plt.subplots(figsize=(8,4))
ax.bar(x - width/2, accs, width, label='Accuracy')
ax.bar(x + width/2, f1s, width, label='F1 (weighted)')
ax.set_xticks(x)
ax.set_xticklabels(labels, rotation=20)
ax.set_ylim(0,1)
ax.set_ylabel("Score")
ax.set_title("Model comparison (quick demo)")
ax.legend()
plt.show()

# Confusion matrix for multilingual model
cm_multi = confusion_matrix(multi_labels, multi_preds)
disp = ConfusionMatrixDisplay(cm_multi, display_labels=["neg","pos"])
disp.plot(cmap=plt.cm.Blues)
plt.title("Confusion Matrix - Multilingual Model")
plt.show()


In [None]:
# Cell 12: Show examples where the models disagree
def decode_input(tokenizer, tokenized_batch, idx):
    """Rebuild readable text for example ``idx`` from its token ids.

    The original text column is dropped after tokenization, so the text is
    recovered by decoding ``input_ids`` with special tokens skipped.
    """
    all_input_ids = tokenized_batch['input_ids']
    return tokenizer.decode(all_input_ids[idx], skip_special_tokens=True)

# We'll compare predictions on the first 200 test examples and show up to 10
# examples where the two models disagree.
def _predict_single(model, example):
    """Return the predicted class index for one tokenized example.

    ``example`` is a single row (a dict holding the token-id list and attention
    mask of one review). The lists are wrapped in an outer list to form a batch
    of size 1. The previous code indexed them with the example index, which
    selected a single token and could raise IndexError on short reviews.
    """
    batch = {
        key: torch.tensor([example[key]], device=device)
        for key in ("input_ids", "attention_mask")
    }
    # .item() extracts the Python int directly from the 1-element tensor
    return int(model(**batch).logits.argmax(dim=-1).item())

# Put both models in inference mode before predicting
model_mono.eval()
model_multi.eval()

disagreements = []
with torch.no_grad():
    for i in range(min(200, len(tokenized_mono['test']))):
        mono_example = tokenized_mono['test'][i]
        m_pred = _predict_single(model_mono, mono_example)
        x_pred = _predict_single(model_multi, tokenized_multi['test'][i])

        if m_pred != x_pred:
            true_label = mono_example['label']
            text = decode_input(tokenizer_mono, tokenized_mono['test'], i)
            disagreements.append((i, text, true_label, m_pred, x_pred))
            if len(disagreements) >= 10:
                break

for idx, text, true, mono_p, multi_p in disagreements:
    print(f"Idx {idx} | True: {true} | Mono: {mono_p} | Multi: {multi_p}\n{text}\n---\n")


In [None]:
# Cell 13: Gradio demo to try both models interactively
import gradio as gr

# Reuse the in-memory fine-tuned models and their tokenizers for the demo;
# inputs are moved to `device` at prediction time.
mono_pipeline_tokenizer = tokenizer_mono
multi_pipeline_tokenizer = tokenizer_multi
model_mono.eval()
model_multi.eval()

def _predict_class(text, tokenizer, model):
    """Tokenize ``text`` and return the index of the highest-scoring class."""
    encoded = tokenizer(text, truncation=True, padding=True, return_tensors="pt", max_length=max_length).to(device)
    with torch.no_grad():
        logits = model(**encoded).logits
    # .item() reads the scalar straight from the 1-element tensor; converting a
    # 1-element NumPy array with int() is deprecated in recent NumPy releases.
    return torch.argmax(logits, dim=1).item()

def predict_both(text):
    """Classify ``text`` with both models.

    Args:
        text: Review text entered in the demo.

    Returns:
        Tuple ``(monolingual_label, multilingual_label)``, each either
        ``"Negative"`` or ``"Positive"``.
    """
    label_map = {0:"Negative", 1:"Positive"}
    pred_m = _predict_class(text, mono_pipeline_tokenizer, model_mono)
    pred_x = _predict_class(text, multi_pipeline_tokenizer, model_multi)
    return label_map[pred_m], label_map[pred_x]

# Output labels name the checkpoints actually trained (both are DistilBERT
# variants), not BERT / XLM-R.
iface = gr.Interface(fn=predict_both,
                     inputs=gr.Textbox(lines=4, placeholder="Enter review here..."),
                     outputs=[gr.Label(num_top_classes=1, label="Monolingual (DistilBERT)"),
                              gr.Label(num_top_classes=1, label="Multilingual (DistilBERT-multilingual)")],
                     title="Monolingual vs Multilingual Sentiment Demo",
                     description="Enter text (any language). Models trained on English demo subset; multilingual model may generalize better to other languages.")
iface.launch(share=False)


In [None]:
# Cell 14: Persist the fine-tuned models and their tokenizers to local folders
save_dir_mono = "saved_models/bert_mono"
save_dir_multi = "saved_models/xlm_roberta_multi"

# Create both target folders up front (no error if they already exist)
for _target_dir in (save_dir_mono, save_dir_multi):
    os.makedirs(_target_dir, exist_ok=True)

# Each model is written next to its tokenizer so a folder can be reloaded on its own
_save_jobs = (
    (model_mono, tokenizer_mono, save_dir_mono),
    (model_multi, tokenizer_multi, save_dir_multi),
)
for _model, _tokenizer, _target_dir in _save_jobs:
    _model.save_pretrained(_target_dir)
    _tokenizer.save_pretrained(_target_dir)

print("Saved models to:", save_dir_mono, save_dir_multi)
