In [None]:
import os
import pandas as pd
from google.colab import drive

# Mount Google Drive so the dataset CSVs under /content/drive are readable.
_mount_point = '/content/drive'
drive.mount(_mount_point)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install transformers
!pip install datasets
!pip install evaluate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import transformers
import torch
import evaluate
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, pipeline
from sklearn.model_selection import train_test_split
from datasets import load_dataset, Dataset

**Dataset Preparation**

In [1]:
# Load the train / test splits from Google Drive.
# Intended class encoding: 0 = fake, 1 = real.
# NOTE(review): the value_counts() output further down shows the `Label` column
# holding the strings 'Real' / 'Fake' rather than 0 / 1 - confirm against the CSVs.
_csv_paths = {
    'train': '/content/drive/MyDrive/Multilingual_Datasets/train_dataframe.csv',
    'test': '/content/drive/MyDrive/Multilingual_Datasets/test_dataframe.csv',
}
data_train, data_test = (pd.read_csv(_csv_paths[split]) for split in ('train', 'test'))

NameError: ignored

In [None]:
# Class balance of the training split (number of rows per distinct `Label` value).
train_labels = data_train['Label']
train_labels.value_counts()

Real    1549
Fake    1437
Name: Label, dtype: int64

**Model Training**

In [None]:
# Name of the pretrained checkpoint; the tokenizer and the classifier below
# are both loaded from it.
checkpoint = "xlm-roberta-base"

In [None]:
# Tokenization
# Load the tokenizer that matches `checkpoint`.
# `do_lower_case` was dropped: it is a BERT-style option that is not among the
# documented XLM-RoBERTa tokenizer parameters, so passing it only suggested a
# lower-casing step that is not part of this tokenizer's configuration.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [None]:
# Use the GPU when torch can see one, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Two output classes (see the `Label` column of the dataset).
num_labels = 2

# Sequence-classification model initialised from `checkpoint`; attention maps
# and hidden states are explicitly not returned.
multilingual_model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=num_labels,
    output_attentions=False,
    output_hidden_states=False,
)
multilingual_model = multilingual_model.to(device)

In [None]:
# Train / validation / test split.
#
# The provided test CSV is halved into a final test set and a validation set.
# Each resulting frame keeps the original `News` and `Label` columns and gains an
# integer `labels` column (0 = fake, 1 = real): the Trainer drops columns it does
# not recognise and needs integer class ids under `labels` to compute the loss,
# which the string-valued `Label` column cannot provide.
LABEL2ID = {'fake': 0, 'real': 1}


def _encode_labels(labels):
    """Return `labels` as int64 class ids (0 = fake, 1 = real).

    Numeric input is only cast to int64. Anything else is compared
    case-insensitively (after stripping whitespace) against LABEL2ID.

    Raises:
        ValueError: if a value is not a key of LABEL2ID.
    """
    if pd.api.types.is_numeric_dtype(labels):
        return labels.astype('int64')
    ids = labels.astype(str).str.strip().str.lower().map(LABEL2ID)
    unmapped = ids.isna()
    if unmapped.any():
        unknown = sorted(labels[unmapped].astype(str).unique())
        raise ValueError(f'Unexpected values in Label column: {unknown}')
    return ids.astype('int64')


def _build_frame(texts, labels):
    """Join texts and labels on their index and add the integer `labels` column."""
    frame = pd.concat([texts, labels], axis=1, join='inner')
    frame['labels'] = _encode_labels(frame['Label'])
    return frame


x_train = data_train['News']
x_test = data_test['News']
y_train = data_train['Label']
y_test = data_test['Label']

# Fixed random_state keeps the test / validation halves the same on every run.
x_test_final, x_val, y_test_final, y_val = train_test_split(x_test, y_test, test_size=0.5, random_state=1000)

df_train = _build_frame(x_train, y_train)
df_test = _build_frame(x_test_final, y_test_final)
df_val = _build_frame(x_val, y_val)

In [None]:
# Row counts of the train, test and validation frames, one per line.
print(len(df_train), len(df_test), len(df_val), sep='\n')

2986
939
939


In [None]:
# Tokenizing function
def tokenize_function(batch):
    """Tokenize the `News` texts of a batch with the notebook-level `tokenizer`.

    Args:
        batch: mapping with a "News" entry holding the texts to encode.

    Returns:
        Whatever `tokenizer` returns for those texts with the options below.
    """
    encode_options = dict(
        add_special_tokens=True,
        return_attention_mask=True,
        # Pads to the longest text of the batch passed to this call.
        padding=True,
        # Inputs longer than `max_length` tokens are cut off.
        truncation=True,
        max_length=512,
        return_tensors='pt',
    )
    texts = batch["News"]
    return tokenizer(texts, **encode_options)

In [None]:
# Wrap each pandas frame in a `datasets.Dataset` ...
final_dataset_train, final_dataset_test, final_dataset_val = (
    Dataset.from_pandas(frame) for frame in (df_train, df_test, df_val)
)

# ... then tokenize every split. `batched=True` hands `tokenize_function` lists of
# rows; `batch_size=None` is passed so each split goes through in one call
# (the progress bars below report a single batch per split).
encoded_data_train, encoded_data_test, encoded_data_val = (
    split.map(tokenize_function, batched=True, batch_size=None)
    for split in (final_dataset_train, final_dataset_test, final_dataset_val)
)

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [None]:
# Training configuration
batch_size = 4

# Number of full batches of `batch_size` in the training set, used as the logging
# interval (presumably one log entry per epoch on a single device - confirm).
logging_steps = len(final_dataset_train) // batch_size

# Output directory for checkpoints, derived from the checkpoint name.
model_name = f'multilingual-{checkpoint}-fakenewsdetection'

_training_kwargs = dict(
    output_dir=model_name,
    # Optimisation
    num_train_epochs=10,
    learning_rate=1e-5,
    adam_epsilon=1e-8,
    optim='adamw_torch',
    # weight_decay=0.01 (disabled)
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # Evaluate and checkpoint once per epoch; reload the checkpoint with the
    # best `eval_loss` when training finishes.
    evaluation_strategy='epoch',
    save_strategy='epoch',
    metric_for_best_model='eval_loss',
    load_best_model_at_end=True,
    # Logging / progress display
    logging_steps=logging_steps,
    log_level='error',
    disable_tqdm=False,
)
training_args = TrainingArguments(**_training_kwargs)

In [None]:
# Metric objects are loaded once and reused, instead of being re-loaded on
# every evaluation pass.
_METRIC_CACHE = {}


def compute_metrics(eval_preds):
    """Compute the GLUE/MRPC metrics (accuracy and F1) for one evaluation pass.

    Args:
        eval_preds: pair `(logits, labels)`. `logits` has one score per class
            along its last axis; if it is a tuple, its first element is used.

    Returns:
        The dict produced by the metric's `compute` for the arg-max predictions.
    """
    metric = _METRIC_CACHE.get('glue/mrpc')
    if metric is None:
        # First call only: load the metric and keep it for later evaluations.
        metric = _METRIC_CACHE['glue/mrpc'] = evaluate.load('glue', 'mrpc')

    logits, labels = eval_preds
    if isinstance(logits, tuple):
        # Presumably the model returned extra outputs next to the logits;
        # the class scores come first.
        logits = logits[0]

    # Predicted class = index of the highest score on the last axis.
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [None]:
# Release GPU memory cached by torch before training starts.
torch.cuda.empty_cache()

# Train on the encoded training split and evaluate on the validation split;
# the test split is not used here.
trainer_xlm = Trainer(
    model=multilingual_model,
    args=training_args,
    train_dataset=encoded_data_train,
    eval_dataset=encoded_data_val,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)
trainer_xlm.train()