In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
%%capture
!pip install datasets

# Detection

In [None]:
import pandas as pd
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, EarlyStoppingCallback
import numpy as np

In [None]:
def return_trainer(model, tokenizer, lang, model_name, train_df, val_df, test_df):
  def tokenize(batch):
    return tokenizer(batch['text'], max_length=256, padding="max_length", truncation=True)

  train_dataset = Dataset.from_pandas(train_hin)
  train_dataset = train_dataset.map(tokenize, batched=True)
  val_dataset = Dataset.from_pandas(val_hin)
  val_dataset = val_dataset.map(tokenize, batched=True)
  test_dataset = Dataset.from_pandas(test_hin)
  test_dataset = test_dataset.map(tokenize, batched=True)

  training_args = TrainingArguments(
      output_dir=f'./results/{lang}_{model_name}',
      save_total_limit=1,
      eval_strategy="epoch",
      save_strategy="epoch",
      per_device_train_batch_size=16,
      per_device_eval_batch_size=16,
      num_train_epochs=10,
      learning_rate=2e-5,
      warmup_steps=300,
      weight_decay=0.01,
      logging_steps=100,
      load_best_model_at_end=True,
      metric_for_best_model="eval_loss",
      gradient_accumulation_steps=2,
      greater_is_better=False,
      report_to="none"
  )

  def compute_metrics(eval_pred):
      logits, labels = eval_pred
      preds = np.argmax(logits, axis=-1)
      precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
      acc = accuracy_score(labels, preds)
      return {
          'accuracy': acc,
          'f1': f1,
          'precision': precision,
          'recall': recall
      }

  trainer = Trainer(
      model=model,
      args=training_args,
      train_dataset=train_dataset,
      eval_dataset=val_dataset,
      compute_metrics=compute_metrics,
      callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
  )

  return trainer, test_dataset


In [None]:
def evaluate_model(trainer, test_dataset):

    predictions = trainer.predict(test_dataset)

    logits = predictions.predictions
    labels = predictions.label_ids

    preds = np.argmax(logits, axis=-1)

    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)

    conf_matrix = confusion_matrix(labels, preds)

    # Print results
    print("Evaluation Results:")
    print(f"Accuracy: {acc:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-score: {f1:.4f}")
    print("\nConfusion Matrix:")
    print(conf_matrix)

    return {
        'accuracy': acc,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'confusion_matrix': conf_matrix
    }



## Hindi

In [None]:

train_hin = pd.read_excel('/content/drive/Shareddrives/FYP 2024-2025/Phase-2/Wiki_WikiBias/mitigation/train_hindi.xlsx')
val_hin = pd.read_excel('/content/drive/Shareddrives/FYP 2024-2025/Phase-2/Wiki_WikiBias/mitigation/val_hindi.xlsx')
test_hin = pd.read_excel('/content/drive/Shareddrives/FYP 2024-2025/Phase-2/Wiki_WikiBias/mitigation/test_hindi.xlsx')

train_bias = train_hin['biased_sent'].tolist()
train_non_bias = train_hin['debiased_sent'].tolist()
train_bias_labels = [1] * len(train_bias)
train_non_bias_labels = [0] * len(train_non_bias)

train_sentences = train_bias + train_non_bias
train_labels = train_bias_labels + train_non_bias_labels

train_hin = pd.DataFrame({'text': train_sentences, 'labels': train_labels})

val_bias = val_hin['biased_sent'].tolist()
val_non_bias = val_hin['debiased_sent'].tolist()
val_bias_labels = [1] * len(val_bias)
val_non_bias_labels = [0] * len(val_non_bias)

val_sentences = val_bias + val_non_bias
val_labels = val_bias_labels + val_non_bias_labels

val_hin = pd.DataFrame({'text': val_sentences, 'labels': val_labels})

test_bias = test_hin['biased_sent'].tolist()
test_non_bias = test_hin['debiased_sent'].tolist()
test_bias_labels = [1] * len(test_bias)
test_non_bias_labels = [0] * len(test_non_bias)

test_sentences = test_bias + test_non_bias
test_labels = test_bias_labels + test_non_bias_labels

test_hin = pd.DataFrame({'text': test_sentences, 'labels': test_labels})

### MuRIL

In [None]:
model_name = "google/muril-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
trainer, test_dataset = return_trainer(model, tokenizer, 'hindi', 'muril', train_hin, val_hin, test_hin)
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.6932,0.693156,0.5,0.0,0.0,0.0
2,0.6896,0.693102,0.5,0.0,0.0,0.0
3,0.6897,0.683509,0.62993,0.623377,0.634615,0.612529
4,0.6756,0.655315,0.639211,0.629321,0.647059,0.612529
5,0.6486,0.64744,0.643852,0.64672,0.641553,0.651972
6,0.621,0.65552,0.640371,0.559659,0.721612,0.457077
7,0.5938,0.652741,0.646172,0.597094,0.693252,0.524362
8,0.5738,0.657677,0.639211,0.595579,0.677515,0.531323


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


TrainOutput(global_step=1064, training_loss=0.6445388793945312, metrics={'train_runtime': 1856.1298, 'train_samples_per_second': 22.811, 'train_steps_per_second': 0.711, 'total_flos': 4456048833576960.0, 'train_loss': 0.6445388793945312, 'epoch': 8.0})

In [None]:
test_results = evaluate_model(trainer, test_dataset)

Evaluation Results:
Accuracy: 0.6362
Precision: 0.6371
Recall: 0.6326
F1-score: 0.6349

Confusion Matrix:
[[545 307]
 [313 539]]


### IndicBERTV2

In [None]:
model_name = "ai4bharat/IndicBERTv2-MLM-Sam-TLM"
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
tokenizer = AutoTokenizer.from_pretrained(model_name)
trainer, test_dataset = return_trainer(model, tokenizer, 'hindi', 'indic_bert', train_hin, val_hin, test_hin)
trainer.train()


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/639 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ai4bharat/IndicBERTv2-MLM-Sam-TLM and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/51.0 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.75M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Map:   0%|          | 0/4234 [00:00<?, ? examples/s]

Map:   0%|          | 0/862 [00:00<?, ? examples/s]

Map:   0%|          | 0/1704 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.6945,0.692569,0.516241,0.572308,0.512868,0.647332
2,0.6891,0.690889,0.525522,0.657167,0.514436,0.909513
3,0.6903,0.681653,0.578886,0.644466,0.557627,0.763341
4,0.6648,0.656377,0.643852,0.675818,0.620155,0.742459
5,0.6095,0.662962,0.641531,0.62363,0.65641,0.593968
6,0.5681,0.656565,0.639211,0.625752,0.65,0.603248
7,0.4953,0.716028,0.653132,0.611183,0.695266,0.545244


TrainOutput(global_step=931, training_loss=0.6227623604550141, metrics={'train_runtime': 1698.6457, 'train_samples_per_second': 24.926, 'train_steps_per_second': 0.777, 'total_flos': 3899042729379840.0, 'train_loss': 0.6227623604550141, 'epoch': 7.0})

In [None]:
test_results = evaluate_model(trainer, test_dataset)

Evaluation Results:
Accuracy: 0.6426
Precision: 0.6172
Recall: 0.7512
F1-score: 0.6776

Confusion Matrix:
[[455 397]
 [212 640]]


## Tamil

In [None]:
train_tam = pd.read_excel('/content/drive/Shareddrives/FYP 2024-2025/Phase-2/Wiki_WikiBias/mitigation/train_tamil.xlsx')
val_tam = pd.read_excel('/content/drive/Shareddrives/FYP 2024-2025/Phase-2/Wiki_WikiBias/mitigation/val_tamil.xlsx')
test_tam = pd.read_excel('/content/drive/Shareddrives/FYP 2024-2025/Phase-2/Wiki_WikiBias/mitigation/test_tamil.xlsx')

train_bias = train_tam['biased_sent'].tolist()
train_non_bias = train_tam['debiased_sent'].tolist()
train_bias_labels = [1] * len(train_bias)
train_non_bias_labels = [0] * len(train_non_bias)

train_sentences = train_bias + train_non_bias
train_labels = train_bias_labels + train_non_bias_labels

train_tam = pd.DataFrame({'text': train_sentences, 'labels': train_labels})

val_bias = val_tam['biased_sent'].tolist()
val_non_bias = val_tam['debiased_sent'].tolist()
val_bias_labels = [1] * len(val_bias)
val_non_bias_labels = [0] * len(val_non_bias)

val_sentences = val_bias + val_non_bias
val_labels = val_bias_labels + val_non_bias_labels

val_tam = pd.DataFrame({'text': val_sentences, 'labels': val_labels})

test_bias = test_tam['biased_sent'].tolist()
test_non_bias = test_tam['debiased_sent'].tolist()
test_bias_labels = [1] * len(test_bias)
test_non_bias_labels = [0] * len(test_non_bias)

test_sentences = test_bias + test_non_bias
test_labels = test_bias_labels + test_non_bias_labels

test_tam = pd.DataFrame({'text': test_sentences, 'labels': test_labels})

### MuRIL

In [None]:
model_name = "google/muril-base-cased"
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
tokenizer = AutoTokenizer.from_pretrained(model_name)
trainer, test_dataset = return_trainer(model, tokenizer, 'tamil', 'muril', train_tam, val_tam, test_tam)
trainer.train()


config.json:   0%|          | 0.00/411 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/953M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/muril-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/206 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/3.16M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/953M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/113 [00:00<?, ?B/s]

Map:   0%|          | 0/4234 [00:00<?, ? examples/s]

Map:   0%|          | 0/862 [00:00<?, ? examples/s]

Map:   0%|          | 0/1704 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.6931,0.693124,0.5058,0.093617,0.564103,0.051044
2,0.6897,0.684223,0.598608,0.68315,0.564297,0.865429
3,0.6842,0.647772,0.636891,0.565881,0.703448,0.473318
4,0.6272,0.642631,0.639211,0.657111,0.62605,0.691415
5,0.5951,0.654625,0.645012,0.611675,0.67507,0.559165
6,0.5502,0.65182,0.635731,0.6075,0.658537,0.563805


In [None]:
test_results = evaluate_model(trainer, test_dataset)

### IndicBERTV2

In [None]:
model_name = "ai4bharat/IndicBERTv2-MLM-Sam-TLM"
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
tokenizer = AutoTokenizer.from_pretrained(model_name)
trainer, test_dataset = return_trainer(model, tokenizer, 'tamil', 'indic_bert', train_tam, val_tam, test_tam)
trainer.train()


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ai4bharat/IndicBERTv2-MLM-Sam-TLM and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/4234 [00:00<?, ? examples/s]

Map:   0%|          | 0/862 [00:00<?, ? examples/s]

Map:   0%|          | 0/1704 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.6924,0.69195,0.518561,0.571723,0.51487,0.642691
2,0.689,0.689948,0.533643,0.650435,0.520167,0.867749
3,0.6897,0.683589,0.553364,0.648402,0.534639,0.823666
4,0.6651,0.659548,0.611369,0.678811,0.578431,0.821346
5,0.6115,0.647371,0.638051,0.597938,0.672464,0.538283
6,0.5776,0.653568,0.640371,0.630072,0.648649,0.612529
7,0.5119,0.689817,0.635731,0.593264,0.671554,0.531323
8,0.4718,0.731459,0.643852,0.597641,0.686747,0.529002


TrainOutput(global_step=1064, training_loss=0.6072163761110234, metrics={'train_runtime': 2051.0871, 'train_samples_per_second': 20.643, 'train_steps_per_second': 0.644, 'total_flos': 4456048833576960.0, 'train_loss': 0.6072163761110234, 'epoch': 8.0})

In [None]:
test_results = evaluate_model(trainer, test_dataset)

Evaluation Results:
Accuracy: 0.6432
Precision: 0.6826
Recall: 0.5352
F1-score: 0.6000

Confusion Matrix:
[[640 212]
 [396 456]]


# Mitigation

In [None]:
import torch
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Trainer, TrainingArguments
from datasets import Dataset

In [None]:
%%capture
!pip install evaluate
!pip install rouge_score
!pip install bert_score

In [None]:
from evaluate import load

# Load evaluation metrics
metric_meteor = load("meteor")
metric_rouge = load("rouge")
metric_bleu = load("bleu")
metric_bertscore = load("bertscore")

## Hindi

In [None]:
train_dataset = load_dataset("/content/drive/Shareddrives/FYP 2024-2025/Phase-2/Wiki_WikiBias/mitigation/train_hindi.xlsx")
val_dataset = load_dataset("/content/drive/Shareddrives/FYP 2024-2025/Phase-2/Wiki_WikiBias/mitigation/val_hindi.xlsx")
test_dataset = load_dataset("/content/drive/Shareddrives/FYP 2024-2025/Phase-2/Wiki_WikiBias/mitigation/test_hindi.xlsx")

### IndicBART

In [None]:
# Load IndicBART model & tokenizer
model_name = "ai4bharat/IndicBART"
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False, keep_accents=True)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/498 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/832 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/1.90M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/221 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/398 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/976M [00:00<?, ?B/s]

In [None]:
def load_dataset(excel_path):
    df = pd.read_excel(excel_path)
    dataset = Dataset.from_pandas(df[['biased_sent', 'debiased_sent']])
    return dataset

def preprocess_function(examples, tokenizer):
    lang_token = "<2hi>"  # Change based on language, e.g., "<2ta>" for Tamil
    inputs = [text + " </s> " + lang_token for text in examples['biased_sent']]
    targets = [lang_token + " " + text + " </s>" for text in examples['debiased_sent']]

    model_inputs = tokenizer(inputs, truncation=True, padding="max_length", max_length=128)
    labels = tokenizer(targets, truncation=True, padding="max_length", max_length=128)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [None]:
train_tokenized = train_dataset.map(lambda x: preprocess_function(x, tokenizer), batched=True)
val_tokenized = val_dataset.map(lambda x: preprocess_function(x, tokenizer), batched=True)
test_tokenized = test_dataset.map(lambda x: preprocess_function(x, tokenizer), batched=True)


Map:   0%|          | 0/2117 [00:00<?, ? examples/s]

Map:   0%|          | 0/431 [00:00<?, ? examples/s]

Map:   0%|          | 0/852 [00:00<?, ? examples/s]

In [None]:
from transformers import EarlyStoppingCallback
training_args = TrainingArguments(
    output_dir="./results/indicbart",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=1e-3,
    warmup_steps=300,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # gradient_accumulation_steps=2,
    num_train_epochs=10,
    weight_decay=0.01,
    lr_scheduler_type="linear",
    logging_dir="./logs",
    logging_steps=10,
    save_total_limit=1,
    push_to_hub=False,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    report_to = "none"

)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized,
    eval_dataset=val_tokenized,
    processing_class=tokenizer,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

trainer.train()


Epoch,Training Loss,Validation Loss
1,0.2387,0.25498
2,0.2067,0.250106
3,0.1183,0.265013
4,0.1161,0.287085
5,0.0745,0.303924


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=1325, training_loss=0.5298707369813379, metrics={'train_runtime': 218.08, 'train_samples_per_second': 97.074, 'train_steps_per_second': 12.152, 'total_flos': 1433725084631040.0, 'train_loss': 0.5298707369813379, 'epoch': 5.0})

In [None]:
def evaluate():
    model.eval()
    references, predictions = [], []

    for example in test_dataset:
        inputs = tokenizer(example["biased_sent"] + " </s> <2hi>", return_tensors="pt", padding="max_length", truncation=True, max_length=128)
        with torch.no_grad():
            outputs = model.generate(input_ids=inputs["input_ids"].to(model.device))

        pred_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
        ref_text = example["debiased_sent"]

        predictions.append(pred_text)
        references.append([ref_text])

    # Compute evaluation scores
    meteor_score = metric_meteor.compute(predictions=predictions, references=references)
    rouge_score = metric_rouge.compute(predictions=predictions, references=references)
    bleu_score = metric_bleu.compute(predictions=predictions, references=references)
    bert_score = metric_bertscore.compute(predictions=predictions, references=references, model_type="xlm-roberta-base")
    return meteor_score, rouge_score, bleu_score, bert_score


meteor_score, rouge_score, bleu_score, bert_score =evaluate()



In [None]:
print("METEOR:", meteor_score)
print("ROUGE:", rouge_score)
print("BLEU:", bleu_score)
print("BERTScore Precision:", sum(bert_score["precision"])/len(bert_score["precision"]))
print("BERTScore Recall:", sum(bert_score["recall"])/len(bert_score["recall"]))
print("BERTScore F1:", sum(bert_score["f1"])/len(bert_score["f1"]))

METEOR: {'meteor': np.float64(0.4133469199468718)}
ROUGE: {'rouge1': np.float64(0.2339587642404544), 'rouge2': np.float64(0.08111590417928446), 'rougeL': np.float64(0.23387054390575512), 'rougeLsum': np.float64(0.23459414216456467)}
BLEU: {'bleu': 0.16727204571485468, 'precisions': [0.8499138478280585, 0.7603931203931203, 0.6993457041724767, 0.648371104815864], 'brevity_penalty': 0.22734803577185303, 'length_ratio': 0.40301889550820513, 'translation_length': 11027, 'reference_length': 27361}
BERTScore Precision: 0.9471208817662208
BERTScore Recall: 0.8831048877866056
BERTScore F1: 0.9135537653321951


### mT0


In [None]:
model_name = "bigscience/mt0-base"
tokenizer = AutoTokenizer.from_pretrained(model_name, legacy=False, use_fast=False)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

In [None]:
def load_dataset(excel_path):
    df = pd.read_excel(excel_path)
    dataset = Dataset.from_pandas(df[['biased_sent', 'debiased_sent']])
    return dataset

def preprocess_function(examples, tokenizer, max_length=128):
    inputs = [text for text in examples['biased_sent']]
    targets = examples['debiased_sent']
    model_inputs = tokenizer(inputs, text_target=targets, max_length=max_length, truncation=True, padding="max_length")
    return model_inputs



In [None]:
train_tokenized = train_dataset.map(lambda x: preprocess_function(x, tokenizer), batched=True)
val_tokenized = val_dataset.map(lambda x: preprocess_function(x, tokenizer), batched=True)
test_tokenized = test_dataset.map(lambda x: preprocess_function(x, tokenizer), batched=True)


Map:   0%|          | 0/2117 [00:00<?, ? examples/s]

Map:   0%|          | 0/431 [00:00<?, ? examples/s]

Map:   0%|          | 0/852 [00:00<?, ? examples/s]

In [None]:
from transformers import EarlyStoppingCallback
training_args = TrainingArguments(
    output_dir="./results/hindi_mt0",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=1e-3,
    warmup_steps=300,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # gradient_accumulation_steps=2,
    num_train_epochs=10,
    weight_decay=0.01,
    lr_scheduler_type="linear",
    logging_dir="./logs",
    logging_steps=10,
    save_total_limit=1,
    push_to_hub=False,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    report_to = "none"

)

# data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized,
    eval_dataset=val_tokenized,
    processing_class=tokenizer,
    # data_collator=data_collator,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

trainer.train()


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss
1,0.7392,0.306522
2,0.2949,0.304406
3,0.1786,0.307874
4,0.1755,0.331671
5,0.1338,0.365297


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight'].


TrainOutput(global_step=1325, training_loss=0.6207481119767675, metrics={'train_runtime': 439.0838, 'train_samples_per_second': 48.214, 'train_steps_per_second': 6.035, 'total_flos': 3172975826042880.0, 'train_loss': 0.6207481119767675, 'epoch': 5.0})

In [None]:
def evaluate():
    model.eval()
    references, predictions = [], []

    for example in test_dataset:
        inputs = tokenizer(example["biased_sent"], return_tensors="pt", padding="max_length", truncation=True, max_length=128)
        with torch.no_grad():
            outputs = model.generate(input_ids=inputs["input_ids"].to(model.device))

        pred_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
        ref_text = example["debiased_sent"]

        predictions.append(pred_text)
        references.append([ref_text])

    # Compute evaluation scores
    meteor_score = metric_meteor.compute(predictions=predictions, references=references)
    rouge_score = metric_rouge.compute(predictions=predictions, references=references)
    bleu_score = metric_bleu.compute(predictions=predictions, references=references)
    bert_score = metric_bertscore.compute(predictions=predictions, references=references, model_type="xlm-roberta-base")
    return meteor_score, rouge_score, bleu_score, bert_score


meteor_score, rouge_score, bleu_score, bert_score =evaluate()



[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [None]:
print("METEOR:", meteor_score)
print("ROUGE:", rouge_score)
print("BLEU:", bleu_score)
print("BERTScore Precision:", sum(bert_score["precision"])/len(bert_score["precision"]))
print("BERTScore Recall:", sum(bert_score["recall"])/len(bert_score["recall"]))
print("BERTScore F1:", sum(bert_score["f1"])/len(bert_score["f1"]))

METEOR: {'meteor': np.float64(0.33280081343776446)}
ROUGE: {'rouge1': np.float64(0.2169703405618899), 'rouge2': np.float64(0.0759040353230494), 'rougeL': np.float64(0.21704486176317161), 'rougeLsum': np.float64(0.21725631567180864)}
BLEU: {'bleu': 0.08337898469078743, 'precisions': [0.8411927877947295, 0.7505128205128205, 0.6874370413009067, 0.6356182354870449], 'brevity_penalty': 0.11504980023918018, 'length_ratio': 0.3162165125543657, 'translation_length': 8652, 'reference_length': 27361}
BERTScore Precision: 0.9376382664055891
BERTScore Recall: 0.8629380717104029
BERTScore F1: 0.8983377527463045


## Tamil


In [None]:
train_dataset = load_dataset("/content/drive/Shareddrives/FYP 2024-2025/Phase-2/Wiki_WikiBias/mitigation/train_tamil.xlsx")
val_dataset = load_dataset("/content/drive/Shareddrives/FYP 2024-2025/Phase-2/Wiki_WikiBias/mitigation/val_tamil.xlsx")
test_dataset = load_dataset("/content/drive/Shareddrives/FYP 2024-2025/Phase-2/Wiki_WikiBias/mitigation/test_tamil.xlsx")

### IndicBART

In [None]:
# Load IndicBART model & tokenizer
model_name = "ai4bharat/IndicBART"
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False, keep_accents=True)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

In [None]:
def load_dataset(excel_path):
    df = pd.read_excel(excel_path)
    dataset = Dataset.from_pandas(df[['biased_sent', 'debiased_sent']])
    return dataset

def preprocess_function(examples, tokenizer):
    lang_token = "<2ta>"
    inputs = [text + " </s> " + lang_token for text in examples['biased_sent']]
    targets = [lang_token + " " + text + " </s>" for text in examples['debiased_sent']]

    model_inputs = tokenizer(inputs, truncation=True, padding="max_length", max_length=128)
    labels = tokenizer(targets, truncation=True, padding="max_length", max_length=128)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [None]:
train_tokenized = train_dataset.map(lambda x: preprocess_function(x, tokenizer), batched=True)
val_tokenized = val_dataset.map(lambda x: preprocess_function(x, tokenizer), batched=True)
test_tokenized = test_dataset.map(lambda x: preprocess_function(x, tokenizer), batched=True)


Map:   0%|          | 0/2117 [00:00<?, ? examples/s]

Map:   0%|          | 0/431 [00:00<?, ? examples/s]

Map:   0%|          | 0/852 [00:00<?, ? examples/s]

In [None]:
from transformers import EarlyStoppingCallback
training_args = TrainingArguments(
    output_dir="./results/indicbart",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=1e-5,
    warmup_steps=300,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    # gradient_accumulation_steps=2,
    num_train_epochs=5,
    weight_decay=0.01,
    lr_scheduler_type="linear",
    logging_dir="./logs",
    logging_steps=10,
    save_total_limit=1,
    push_to_hub=False,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    report_to = "none"

)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized,
    eval_dataset=val_tokenized,
    processing_class=tokenizer,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

trainer.train()


Epoch,Training Loss,Validation Loss
1,4.6463,3.815746
2,3.7626,3.197142
3,2.8156,2.616468
4,2.5876,2.370398
5,2.1007,2.22676


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=335, training_loss=3.3793461529176625, metrics={'train_runtime': 177.5478, 'train_samples_per_second': 59.618, 'train_steps_per_second': 1.887, 'total_flos': 1433725084631040.0, 'train_loss': 3.3793461529176625, 'epoch': 5.0})

In [None]:
def evaluate():
    model.eval()
    references, predictions = [], []

    for example in test_dataset:
        inputs = tokenizer(example["biased_sent"] + " </s> <2ta>", return_tensors="pt", padding="max_length", truncation=True, max_length=128)
        with torch.no_grad():
            outputs = model.generate(input_ids=inputs["input_ids"].to(model.device))

        pred_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
        ref_text = example["debiased_sent"]

        predictions.append(pred_text)
        references.append([ref_text])

    # Compute evaluation scores
    meteor_score = metric_meteor.compute(predictions=predictions, references=references)
    rouge_score = metric_rouge.compute(predictions=predictions, references=references)
    bleu_score = metric_bleu.compute(predictions=predictions, references=references)
    bert_score = metric_bertscore.compute(predictions=predictions, references=references, model_type="xlm-roberta-base")
    return meteor_score, rouge_score, bleu_score, bert_score


meteor_score, rouge_score, bleu_score, bert_score =evaluate()



In [None]:
print("METEOR:", meteor_score)
print("ROUGE:", rouge_score)
print("BLEU:", bleu_score)
print("BERTScore Precision:", sum(bert_score["precision"])/len(bert_score["precision"]))
print("BERTScore Recall:", sum(bert_score["recall"])/len(bert_score["recall"]))
print("BERTScore F1:", sum(bert_score["f1"])/len(bert_score["f1"]))

METEOR: {'meteor': np.float64(0.048602681658825794)}
ROUGE: {'rouge1': np.float64(0.11530572322825841), 'rouge2': np.float64(0.02904929577464789), 'rougeL': np.float64(0.11508495416946125), 'rougeLsum': np.float64(0.11495919964229827)}
BLEU: {'bleu': 0.00043194153687797294, 'precisions': [0.3309162821357943, 0.1466544454628781, 0.067618332081142, 0.0415944540727903], 'brevity_penalty': 0.003996194856251499, 'length_ratio': 0.15331749962100158, 'translation_length': 3034, 'reference_length': 19789}
BERTScore Precision: 0.8401447227443328
BERTScore Recall: 0.7757191285141197
BERTScore F1: 0.8063538180830333


### mT0

In [None]:
model_name = "bigscience/mt0-base"
tokenizer = AutoTokenizer.from_pretrained(model_name, legacy=False, use_fast=False)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

In [None]:
def load_dataset(excel_path):
    df = pd.read_excel(excel_path)
    dataset = Dataset.from_pandas(df[['biased_sent', 'debiased_sent']])
    return dataset

def preprocess_function(examples, tokenizer, max_length=128):
    inputs = [text for text in examples['biased_sent']]
    targets = examples['debiased_sent']
    model_inputs = tokenizer(inputs, text_target=targets, max_length=max_length, truncation=True, padding="max_length")
    return model_inputs

In [None]:
train_tokenized = train_dataset.map(lambda x: preprocess_function(x, tokenizer), batched=True)
val_tokenized = val_dataset.map(lambda x: preprocess_function(x, tokenizer), batched=True)
test_tokenized = test_dataset.map(lambda x: preprocess_function(x, tokenizer), batched=True)


Map:   0%|          | 0/2117 [00:00<?, ? examples/s]

Map:   0%|          | 0/431 [00:00<?, ? examples/s]

Map:   0%|          | 0/852 [00:00<?, ? examples/s]

In [None]:
from transformers import EarlyStoppingCallback
training_args = TrainingArguments(
    output_dir="./results/hindi_mt0",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=1e-3,
    warmup_steps=300,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # gradient_accumulation_steps=2,
    num_train_epochs=10,
    weight_decay=0.01,
    lr_scheduler_type="linear",
    logging_dir="./logs",
    logging_steps=10,
    save_total_limit=1,
    push_to_hub=False,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    report_to = "none"

)

# data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized,
    eval_dataset=val_tokenized,
    processing_class=tokenizer,
    # data_collator=data_collator,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

trainer.train()


Epoch,Training Loss,Validation Loss
1,0.3824,0.367279
2,0.3822,0.351627
3,0.2312,0.343777
4,0.2269,0.363614
5,0.1164,0.4304
6,0.0809,0.457996


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight'].


TrainOutput(global_step=1590, training_loss=0.5527403136469283, metrics={'train_runtime': 602.5713, 'train_samples_per_second': 35.133, 'train_steps_per_second': 4.398, 'total_flos': 3807570991251456.0, 'train_loss': 0.5527403136469283, 'epoch': 6.0})

In [None]:
from evaluate import load
import torch

# Load evaluation metrics
metric_meteor = load("meteor")
metric_rouge = load("rouge")
metric_bleu = load("bleu")
metric_bertscore = load("bertscore")

def evaluate():
    model.eval()
    references, predictions = [], []

    for example in test_dataset:
        inputs = tokenizer(example["biased_sent"], return_tensors="pt", padding="max_length", truncation=True, max_length=128)
        with torch.no_grad():
            outputs = model.generate(input_ids=inputs["input_ids"].to(model.device))

        pred_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
        ref_text = example["debiased_sent"]

        predictions.append(pred_text)
        references.append([ref_text])

    # Compute evaluation scores
    meteor_score = metric_meteor.compute(predictions=predictions, references=references)
    rouge_score = metric_rouge.compute(predictions=predictions, references=references)
    bleu_score = metric_bleu.compute(predictions=predictions, references=references)
    bert_score = metric_bertscore.compute(predictions=predictions, references=references, model_type="xlm-roberta-base")
    return meteor_score, rouge_score, bleu_score, bert_score


meteor_score, rouge_score, bleu_score, bert_score =evaluate()



[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [None]:
print("METEOR:", meteor_score)
print("ROUGE:", rouge_score)
print("BLEU:", bleu_score)
print("BERTScore Precision:", sum(bert_score["precision"])/len(bert_score["precision"]))
print("BERTScore Recall:", sum(bert_score["recall"])/len(bert_score["recall"]))
print("BERTScore F1:", sum(bert_score["f1"])/len(bert_score["f1"]))

METEOR: {'meteor': np.float64(0.371699154928877)}
ROUGE: {'rouge1': np.float64(0.22702058309452666), 'rouge2': np.float64(0.08140950403274344), 'rougeL': np.float64(0.2261795097182421), 'rougeLsum': np.float64(0.22590263805052538)}
BLEU: {'bleu': 0.12954719941010803, 'precisions': [0.8008189142781666, 0.6892394701592499, 0.6059314811658428, 0.5355790312936017], 'brevity_penalty': 0.19913158138339657, 'length_ratio': 0.3825862853100207, 'translation_length': 7571, 'reference_length': 19789}
BERTScore Precision: 0.9448689232130005
BERTScore Recall: 0.8767607472312282
BERTScore F1: 0.9091245427797657
