<h2>Installation

In [1]:
!pip install sentence_transformers
!pip install --upgrade --no-cache-dir gdown

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


<h2>Imports

In [2]:
import torch 
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:
device

device(type='cuda')

<h2>Downloading data

In [5]:
!gdown 1Nk7eeRzyIAzqkdviopWIy72XGVlu7MTS
!gdown 1_3e8jv8uG4zRkHEimaH1VwHxEc_5k2JX

Downloading...
From (uriginal): https://drive.google.com/uc?id=1Nk7eeRzyIAzqkdviopWIy72XGVlu7MTS
From (redirected): https://drive.google.com/uc?id=1Nk7eeRzyIAzqkdviopWIy72XGVlu7MTS&confirm=t&uuid=3b6f04d6-0380-47b3-b0a6-e7072c122693
To: /content/WELFake_Dataset.csv
100% 245M/245M [00:01<00:00, 190MB/s]
Downloading...
From: https://drive.google.com/uc?id=1_3e8jv8uG4zRkHEimaH1VwHxEc_5k2JX
To: /content/data.csv
100% 12.6M/12.6M [00:00<00:00, 160MB/s]


<h2>Preparing Dataset

In [6]:
def read_train(split_dir):
    df = pd.read_csv(split_dir)
    df = df.dropna()
    df = df[ : 12000]
    text = df['text'].to_list()
    label = df['label'].to_list()
    return text, label

train_texts, train_labels = read_train('/content/WELFake_Dataset.csv')

In [7]:
def read_test(split_dir):
    df = pd.read_csv(split_dir)
    df = df.dropna()
    df = df[ : 2000]
    text = df['Body'].to_list()
    label = df['Label'].to_list()
    return text, label

test_texts, test_labels = read_test('/content/data.csv')

In [8]:
test_texts, val_texts, test_labels, val_labels = train_test_split(test_texts, test_labels, test_size=.5)

In [9]:
from transformers import DistilBertTokenizerFast
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

In [10]:
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings = tokenizer(val_texts, truncation=True, padding=True)
test_encodings = tokenizer(test_texts, truncation=True, padding=True)

In [11]:
class IMDbDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = IMDbDataset(train_encodings, train_labels)
val_dataset = IMDbDataset(val_encodings, val_labels)
test_dataset = IMDbDataset(test_encodings, test_labels)

<h2>Training

In [55]:
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments
from transformers import EarlyStoppingCallback, IntervalStrategy, get_scheduler
from torch.utils.data import DataLoader, RandomSampler
from transformers import AdamW
from transformers import set_seed
import torch
import numpy as np

set_seed(42)

# Define the model and training arguments
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=2,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    learning_rate=2e-5,
    load_best_model_at_end=True,
    evaluation_strategy="steps",   # set evaluation strategy to "steps"
    eval_steps=10,                 # evaluate every 10 steps
    save_strategy="steps",
    save_steps=10,                 # save every 10 steps
)

# Define the optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=training_args.learning_rate, weight_decay=training_args.weight_decay)
scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=training_args.warmup_steps,
    num_training_steps=len(train_dataset) // training_args.gradient_accumulation_steps * training_args.num_train_epochs,
)


# Define the trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    optimizers=(optimizer, scheduler),
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3, early_stopping_threshold=0.01)],
)

# Train the model
trainer.train()

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classi

Step,Training Loss,Validation Loss
10,0.6976,0.69048
20,0.6977,0.691289
30,0.7018,0.692543
40,0.6908,0.694389


TrainOutput(global_step=40, training_loss=0.6969654202461243, metrics={'train_runtime': 110.3298, 'train_samples_per_second': 326.294, 'train_steps_per_second': 20.393, 'total_flos': 84779135139840.0, 'train_loss': 0.6969654202461243, 'epoch': 0.05})

In [13]:
!pip install huggingface_hub

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [14]:
from huggingface_hub import notebook_login
notebook_login()

Token is valid.
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [22]:
# model.push_to_hub("Fake_News_model")

<h2>Inference

In [61]:
from torch.utils.data import DataLoader
from transformers import AutoModelForSequenceClassification
from tqdm import tqdm

model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased")
model.to(device)

test_texts, test_labels = read_test('/content/data.csv')
test_encodings = tokenizer(test_texts, truncation=True, padding=True)
test_dataset = IMDbDataset(test_encodings, test_labels)

test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classi

In [70]:
y_pred = []
y_true = []
with torch.no_grad():
  for batch in tqdm(test_loader):
      input_ids = batch['input_ids'].to(device)
      attention_mask = batch['attention_mask'].to(device)
      labels = batch['labels'].to(device)
      outputs = model(input_ids, attention_mask=attention_mask, labels=labels).logits
      predicted_class_id = outputs.argmax(dim = 1)
      y_true.extend(labels.cpu().detach().numpy())
      y_pred.extend(predicted_class_id.cpu().detach().numpy())

100%|██████████| 125/125 [00:33<00:00,  3.70it/s]


In [71]:
from sklearn.metrics import classification_report as clfr
print(f"Performance on test data -> \n{clfr(y_true, y_pred)}")

Performance on test data -> 
              precision    recall  f1-score   support

           0       0.53      1.00      0.69      1056
           1       1.00      0.00      0.00       944

    accuracy                           0.53      2000
   macro avg       0.76      0.50      0.35      2000
weighted avg       0.75      0.53      0.37      2000



In [72]:
from sklearn.metrics import f1_score
f1 = f1_score(y_true, y_pred,average='macro')
print(f1)

0.3467210488487084


In [65]:
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=False)
y_pred = []
y_true = []
with torch.no_grad():
  for batch in tqdm(train_loader):
      input_ids = batch['input_ids'].to(device)
      attention_mask = batch['attention_mask'].to(device)
      labels = batch['labels'].to(device)
      outputs = model(input_ids, attention_mask=attention_mask, labels=labels).logits
      predicted_class_id = outputs.argmax(dim = 1)
      y_true.extend(labels.cpu().detach().numpy())
      y_pred.extend(predicted_class_id.cpu().detach().numpy())

100%|██████████| 375/375 [03:16<00:00,  1.90it/s]


In [66]:
from sklearn.metrics import classification_report as clfr
print(f"Performance on Train data -> \n{clfr(y_true, y_pred)}")

Performance on Train data -> 
              precision    recall  f1-score   support

           0       0.48      1.00      0.65      5727
           1       0.00      0.00      0.00      6273

    accuracy                           0.48     12000
   macro avg       0.24      0.50      0.32     12000
weighted avg       0.23      0.48      0.31     12000



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [69]:
from sklearn.metrics import f1_score
f1 = f1_score(y_true, y_pred, average='macro')
print(f1)

0.32306650871551873
