In [1]:
!pip install -U "pandas" "indic-nlp-library" "transformers[torch]" "datasets" "httpx==0.24.0" "accelerate>=0.26.0" "scikit-learn"
!git clone https://github.com/anoopkunchukuttan/indic_nlp_resources.git indic_nlp_resources

Collecting pandas
  Downloading pandas-2.3.3-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (91 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m91.2/91.2 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting indic-nlp-library
  Downloading indic_nlp_library-0.92-py3-none-any.whl.metadata (5.7 kB)
Collecting datasets
  Downloading datasets-4.4.1-py3-none-any.whl.metadata (19 kB)
Collecting httpx==0.24.0
  Downloading httpx-0.24.0-py3-none-any.whl.metadata (8.1 kB)
Collecting scikit-learn
  Downloading scikit_learn-1.7.2-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (11 kB)
Collecting httpcore<0.18.0,>=0.15.0 (from httpx==0.24.0)
  Downloading httpcore-0.17.3-py3-none-any.whl.metadata (18 kB)
Collecting sphinx-argparse (from indic-nlp-library)
  Downloading sphinx_argparse-0.5.2-py3-none-any.whl.metadata (3.7 kB)
Collecting sphinx-rtd-theme (from indic-nlp-library)
  Downloading sphinx_rtd_theme-3.0.2-py2.py3-none-a

Cloning into 'indic_nlp_resources'...
remote: Enumerating objects: 139, done.[K
remote: Counting objects: 100% (13/13), done.[K
remote: Compressing objects: 100% (13/13), done.[K
remote: Total 139 (delta 2), reused 2 (delta 0), pack-reused 126 (from 1)[K
Receiving objects: 100% (139/139), 149.77 MiB | 35.69 MiB/s, done.
Resolving deltas: 100% (53/53), done.
Updating files: 100% (28/28), done.


In [1]:
!pip install indic-nlp-library -q
!git clone https://github.com/anoopkunchukuttan/indic_nlp_resources.git

from indicnlp import common
from indicnlp.normalize.indic_normalize import IndicNormalizerFactory
import pandas as pd
from sklearn.model_selection import train_test_split
import re
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments, AutoTokenizer
from datasets import Dataset
import torch
import os

# Set up IndicNLP resources
common.set_resources_path("./indic_nlp_resources")
factory = IndicNormalizerFactory()
normalizer = factory.get_normalizer("ta")

print("Dependencies and resources set up successfully!")

fatal: destination path 'indic_nlp_resources' already exists and is not an empty directory.
Dependencies and resources set up successfully!


In [2]:
import pandas as pd

# Load with UTF-8 (try this first)
df = pd.read_csv('/content/Tamil-News-Headlines.csv', encoding='utf-8')

# Check if Tamil text is correctA
print("First Tamil news text:")
print(df['News'].iloc[0])

# If garbled, try latin-1 and save fixed version
if 'à' in df['News'].iloc[0]:  # Check for garbled chars
    df_latin1 = pd.read_csv('/content/Tamil-News-Headlines.csv', encoding='latin-1')
    df_latin1.to_csv('fixed_encoding.csv', encoding='utf-8', index=False)
    print("Saved fixed file as 'fixed_encoding.csv'")
    df = df_latin1  # Use fixed data

# Save for next steps
df.to_csv('dataset_ready.csv', index=False, encoding='utf-8')
print("Dataset ready!")

First Tamil news text:
 பாஸ்வேர்டை பகிரும் பயனர்களிடம் கூடுதல் கட்டணம்: நெட்ஃப்ளிக்ஸ் பலே திட்டம்  
Dataset ready!


In [3]:

# Set up IndicNLP for Tamil normalization
resources_path = "./indic_nlp_resources"
# The resource cloning is already done in cell 07376836, no need to check or clone again
common.set_resources_path(resources_path)
factory = IndicNormalizerFactory()
normalizer = factory.get_normalizer("ta")

# Load data with error handling for encoding
try:
    df = pd.read_csv('dataset_ready.csv', encoding='utf-8')
except UnicodeDecodeError:
    df = pd.read_csv('dataset_ready.csv', encoding='utf-8-sig')  # Handle UTF-8 with BOM

# Clean text function
def clean_text(text):
    if not isinstance(text, str):  # Handle non-string inputs
        text = str(text)
    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)
    # Keep all Tamil Unicode characters (U+0B80 to U+0BFF) and basic punctuation
    text = re.sub(r'[^\u0B80-\u0BFF\s.,!?]', '', text)
    # Normalize Tamil
    return normalizer.normalize(text.strip()) if text.strip() else ""

# Apply cleaning and print sample for debugging
df['News'] = df['News'].apply(clean_text)
print("Sample cleaned text:", df['News'].head().tolist())

# Select needed columns and rename
df_train = df[['News', 'Authenticity']].copy()
df_train.columns = ['text', 'label']

# Check label distribution
print("Label distribution (0=real, 1=fake):")
print(df_train['label'].value_counts())

# Balance if needed (optional)
if len(df_train[df_train['label'] == 0]) > len(df_train[df_train['label'] == 1]):
    df_real = df_train[df_train['label'] == 0].sample(len(df_train[df_train['label'] == 1]), random_state=42)
    df_train = pd.concat([df_real, df_train[df_train['label'] == 1]])

# Split data
train_df, temp_df = train_test_split(df_train, test_size=0.2, random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

# Save with explicit encoding
train_df.to_csv('train.csv', index=False, encoding='utf-8-sig')
val_df.to_csv('val.csv', index=False, encoding='utf-8-sig')
test_df.to_csv('test.csv', index=False, encoding='utf-8-sig')

print(f"Train: {len(train_df)}, Val: {len(val_df)}, Test: {len(test_df)}")

Sample cleaned text: ['பாஸ்வேர்டை பகிரும் பயனர்களிடம் கூடுதல் கட்டணம் நெட்ஃப்ளிக்ஸ் பலே திட்டம்', 'இந்தியாவில் நடப்பு ஆண்டின் இறுதிக்குள் ஏகே ரக துப்பாக்கிகள் தயாரிப்பு?', 'பட்ஜெட் விலையில் மோட்டோ  ஸ்மார்ட்போன் இந்தியாவில் அறிமுகம்  விலை ...', 'கலாம் கண்ட கடைசி கனவை நனவாக்குவோம்', 'பட்ஜெட் விலையில் ரெட்மி ஏ ஸ்மார்ட்போன் இந்தியாவில் அறிமுகம்  விலை ...']
Label distribution (0=real, 1=fake):
label
1    2902
0    2324
Name: count, dtype: int64
Train: 4180, Val: 523, Test: 523


In [4]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import torch
import numpy as np
from sklearn.metrics import accuracy_score, f1_score
from datasets import Dataset

# List of models to compare
models_to_test = {
    "XLM-RoBERTa": "xlm-roberta-base",
    "mBERT": "bert-base-multilingual-cased",
    "MuRIL": "google/muril-base-cased",
    "Distil-mBERT": "distilbert-base-multilingual-cased"
}

# Same training arguments for fair comparison
training_args = TrainingArguments(
    output_dir='./results_comparison',  # will create subfolders per model
    num_train_epochs=4,                 # 4 epochs works better for Tamil headlines
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=100,
    weight_decay=0.01,
    logging_dir='./logs',
    eval_strategy="epoch", # This was previously fixed and seems to be accepted
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True,
    report_to=[],  # no wandb
    logging_strategy="epoch", # Reverting to original name to fix current TypeError
    save_total_limit=2,
    seed=42,
    fp16=True,  # if you have GPU
)

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    acc = accuracy_score(labels, predictions)
    f1 = f1_score(labels, predictions, average='binary')
    return {"accuracy": acc, "f1": f1}

results = []

print("Starting model comparison...\n")

for name, model_id in models_to_test.items():
    print(f"\n=== Training {name} ({model_id}) ===")

    # Create model-specific output dir so checkpoints don't clash
    training_args.output_dir = f"./results_{name.replace('/', '_')}"

    # Load tokenizer & model
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModelForSequenceClassification.from_pretrained(model_id, num_labels=2)

    # Tokenize function
    def tokenize_function(examples):
        return tokenizer(
            examples["text"],
            truncation=True,
            padding="max_length",
            max_length=128
        )

    # Create datasets fresh for this tokenizer
    train_ds = Dataset.from_pandas(train_df.reset_index(drop=True))
    val_ds   = Dataset.from_pandas(val_df.reset_index(drop=True))
    test_ds  = Dataset.from_pandas(test_df.reset_index(drop=True))

    train_ds = train_ds.map(tokenize_function, batched=True)
    val_ds   = val_ds.map(tokenize_function, batched=True)
    test_ds  = test_ds.map(tokenize_function, batched=True)

    # Rename label → labels (Trainer requirement) and remove text column
    train_ds = train_ds.rename_column("label", "labels").remove_columns(["text"])
    val_ds   = val_ds.rename_column("label", "labels").remove_columns(["text"])
    test_ds  = test_ds.rename_column("label", "labels").remove_columns(["text"])

    # Torch format
    train_ds.set_format("torch")
    val_ds.set_format("torch")
    test_ds.set_format("torch")

    # Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_ds,
        eval_dataset=val_ds,
        compute_metrics=compute_metrics,
    )

    # Train
    trainer.train()

    # Evaluate on test set
    test_results = trainer.evaluate(test_ds)

    results.append({
        "Model": name,
        "Test Accuracy": round(test_results["eval_accuracy"], 4),
        "Test F1": round(test_results["eval_f1"], 4),
        "Best Val Accuracy": round(trainer.state.best_metric, 4)
    })

    print(f"✓ {name} → Test Acc: {test_results['eval_accuracy']:.4f} | F1: {test_results['eval_f1']:.4f}\n")

# Final comparison table
import pandas as pd
results_df = pd.DataFrame(results)
results_df = results_df.sort_values("Test Accuracy", ascending=False).reset_index(drop=True)
print("\nFINAL RESULTS (sorted by Test Accuracy):")
display(results_df)

Starting model comparison...


=== Training XLM-RoBERTa (xlm-roberta-base) ===


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/4180 [00:00<?, ? examples/s]

Map:   0%|          | 0/523 [00:00<?, ? examples/s]

Map:   0%|          | 0/523 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.3401,0.214017,0.952199,0.954955
2,0.1515,0.121614,0.971319,0.972171
3,0.1001,0.136908,0.971319,0.972678
4,0.0437,0.135697,0.969407,0.971014


✓ XLM-RoBERTa → Test Acc: 0.9694 | F1: 0.9720


=== Training mBERT (bert-base-multilingual-cased) ===


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/4180 [00:00<?, ? examples/s]

Map:   0%|          | 0/523 [00:00<?, ? examples/s]

Map:   0%|          | 0/523 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.2851,0.228029,0.944551,0.945794
2,0.1576,0.159331,0.957935,0.959108
3,0.0776,0.169701,0.961759,0.96337
4,0.0376,0.185516,0.963671,0.965138


✓ mBERT → Test Acc: 0.9637 | F1: 0.9672


=== Training MuRIL (google/muril-base-cased) ===


tokenizer_config.json:   0%|          | 0.00/206 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/411 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/113 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/953M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/953M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/muril-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/4180 [00:00<?, ? examples/s]

Map:   0%|          | 0/523 [00:00<?, ? examples/s]

Map:   0%|          | 0/523 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.382,0.123006,0.969407,0.97037
2,0.1053,0.104093,0.977055,0.978102
3,0.045,0.090988,0.98088,0.981618
4,0.0153,0.107434,0.98088,0.981685


✓ MuRIL → Test Acc: 0.9828 | F1: 0.9843


=== Training Distil-mBERT (distilbert-base-multilingual-cased) ===


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/466 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/542M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/4180 [00:00<?, ? examples/s]

Map:   0%|          | 0/523 [00:00<?, ? examples/s]

Map:   0%|          | 0/523 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.3103,0.402933,0.879541,0.893039
2,0.1615,0.181204,0.938815,0.939623
3,0.1006,0.192314,0.948375,0.949533
4,0.0581,0.215877,0.946463,0.948529


✓ Distil-mBERT → Test Acc: 0.9369 | F1: 0.9422


FINAL RESULTS (sorted by Test Accuracy):


Unnamed: 0,Model,Test Accuracy,Test F1,Best Val Accuracy
0,MuRIL,0.9828,0.9843,0.9809
1,XLM-RoBERTa,0.9694,0.972,0.9713
2,mBERT,0.9637,0.9672,0.9637
3,Distil-mBERT,0.9369,0.9422,0.9484



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.




Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.




Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.



In [7]:
import json
import os
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# FINAL BEST MODEL – Save it properly
best_model_name = "google/muril-base-cased" # Original model ID
muril_output_dir = "./results_MuRIL" # Directory where MuRIL checkpoints were saved

# Path to trainer_state.json
trainer_state_path = os.path.join(muril_output_dir, "trainer_state.json")

best_checkpoint_path = None
if os.path.exists(trainer_state_path):
    with open(trainer_state_path, "r") as f:
        trainer_state = json.load(f)
        best_checkpoint_path = trainer_state.get("best_model_checkpoint")

if best_checkpoint_path:
    print(f"Loading best model from checkpoint: {best_checkpoint_path}")
    # Load tokenizer from the original model ID
    tokenizer = AutoTokenizer.from_pretrained(best_model_name)

    # Load the fine-tuned model from the best checkpoint path
    model = AutoModelForSequenceClassification.from_pretrained(
        best_checkpoint_path,
        num_labels=2
    )

    # Define the final save path for the best model
    final_save_path = "./tamil_fake_news_muril_final"

    # Save the model and tokenizer to the new final directory
    model.save_pretrained(final_save_path)
    tokenizer.save_pretrained(final_save_path)

    print(f"Best model and tokenizer saved to {final_save_path}")
else:
    print(f"Error: Could not find 'best_model_checkpoint' in {trainer_state_path}. "
          "Please ensure training for MuRIL completed successfully and generated this file.")


Error: Could not find 'best_model_checkpoint' in ./results_MuRIL/trainer_state.json. Please ensure training for MuRIL completed successfully and generated this file.
