<h2>Installation</h2>

In [1]:
!pip install sentence_transformers
!pip install --upgrade --no-cache-dir gdown

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentence_transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 KB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting transformers<5.0.0,>=4.6.0
  Downloading transformers-4.26.1-py3-none-any.whl (6.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m100.3 MB/s[0m eta [36m0:00:00[0m
Collecting sentencepiece
  Downloading sentencepiece-0.1.97-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m69.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub>=0.4.0
  Downloading huggingface_hub-0.13.1-py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.

<h2>Imports</h2>

In [1]:
import torch 
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

<h2>Downloading data</h2>

In [3]:
# Fetch the two CSV files used below from Google Drive by file id.
# According to the recorded output, this id is saved as
# /content/WELFake_Dataset.csv (read by read_train).
!gdown 1Nk7eeRzyIAzqkdviopWIy72XGVlu7MTS
# According to the recorded output, this id is saved as /content/data.csv
# (read by read_test).
# NOTE(review): the recorded output also lists a fake_or_real_news.csv
# download that no command in this cell requests -- it looks like output left
# over from an earlier version of the cell; confirm and re-run.
!gdown 1_3e8jv8uG4zRkHEimaH1VwHxEc_5k2JX

Downloading...
From: https://drive.google.com/uc?id=1Nk7eeRzyIAzqkdviopWIy72XGVlu7MTS
To: /content/WELFake_Dataset.csv
100% 245M/245M [00:00<00:00, 272MB/s]
Downloading...
From: https://drive.google.com/uc?id=1pFFcunSiNS6PCGd9c_MvvPukJqZp1lHs
To: /content/fake_or_real_news.csv
100% 30.7M/30.7M [00:00<00:00, 179MB/s] 
Downloading...
From: https://drive.google.com/uc?id=1_3e8jv8uG4zRkHEimaH1VwHxEc_5k2JX
To: /content/data.csv
100% 12.6M/12.6M [00:00<00:00, 90.5MB/s]


<h2>Preparing Dataset</h2>

In [3]:
def read_train(split_dir, max_rows=12000):
    """Load the training texts and labels from a CSV file.

    Rows that contain a missing value in *any* column are dropped first; the
    first ``max_rows`` of the remaining rows are then kept, in file order.

    Args:
        split_dir: Path (or anything else ``pd.read_csv`` accepts) of the CSV
            file. The file must provide ``text`` and ``label`` columns.
        max_rows: Upper bound on the number of rows returned. Pass ``None``
            to keep every row. Defaults to 12000, the cap that used to be
            hard-coded here.

    Returns:
        A ``(texts, labels)`` tuple of two equally long lists.
    """
    df = pd.read_csv(split_dir).dropna()
    if max_rows is not None:
        # Positional slice: keep the leading rows that survived dropna().
        df = df.iloc[:max_rows]
    return df['text'].to_list(), df['label'].to_list()

# Materialise the training texts and their labels from the WELFake CSV.
train_texts, train_labels = read_train(split_dir='/content/WELFake_Dataset.csv')

In [14]:
def read_test(split_dir, max_rows=2000):
    """Load the evaluation texts and labels from a CSV file.

    Rows that contain a missing value in *any* column are dropped first; the
    first ``max_rows`` of the remaining rows are then kept, in file order.

    Args:
        split_dir: Path (or anything else ``pd.read_csv`` accepts) of the CSV
            file. The file must provide ``Body`` and ``Label`` columns.
        max_rows: Upper bound on the number of rows returned. Pass ``None``
            to keep every row. Defaults to 2000, the cap that used to be
            hard-coded here.

    Returns:
        A ``(texts, labels)`` tuple of two equally long lists.
    """
    df = pd.read_csv(split_dir).dropna()
    if max_rows is not None:
        # Positional slice: keep the leading rows that survived dropna().
        df = df.iloc[:max_rows]
    return df['Body'].to_list(), df['Label'].to_list()

# Materialise the evaluation texts and their labels from data.csv.
test_texts, test_labels = read_test(split_dir='/content/data.csv')

In [5]:
# Split the loaded evaluation rows 50/50 into a test half and a validation
# half. random_state pins the shuffle so the same rows land in each half on
# every run; without it the split changes with each execution.
# Caution: this cell rebinds its own inputs (test_texts / test_labels), so
# running it a second time without re-running the read_test cell halves the
# test set again.
test_texts, val_texts, test_labels, val_labels = train_test_split(
    test_texts, test_labels, test_size=.5, random_state=42
)

In [23]:
# Load the fast tokenizer for the 'distilbert-base-uncased' checkpoint -- the
# same checkpoint name the training cell passes to
# DistilBertForSequenceClassification.from_pretrained.
from transformers import DistilBertTokenizerFast
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

loading file vocab.txt from cache at /root/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/1c4513b2eedbda136f57676a34eea67aba266e5c/vocab.txt
loading file tokenizer.json from cache at /root/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/1c4513b2eedbda136f57676a34eea67aba266e5c/tokenizer.json
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at None
loading file tokenizer_config.json from cache at /root/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/1c4513b2eedbda136f57676a34eea67aba266e5c/tokenizer_config.json
loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/1c4513b2eedbda136f57676a34eea67aba266e5c/config.json
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropo

In [8]:
# Tokenise the three splits with identical settings (truncation on, padding
# on), in the order train -> validation -> test.
train_encodings, val_encodings, test_encodings = (
    tokenizer(texts, truncation=True, padding=True)
    for texts in (train_texts, val_texts, test_texts)
)

In [9]:
class IMDbDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with their labels.

    ``encodings`` is a mapping from field name to a per-sample sequence (this
    notebook passes tokenizer outputs); ``labels`` is a sequence with one
    entry per sample. Despite the class name, the notebook feeds it news
    texts.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # The number of samples is defined by the labels.
        return len(self.labels)

    def __getitem__(self, idx):
        # Build one sample: every encoding field at position ``idx`` becomes a
        # tensor, and the matching label is stored under 'labels'.
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Wrap each split's encodings and labels in a dataset object.
train_dataset = IMDbDataset(encodings=train_encodings, labels=train_labels)
val_dataset = IMDbDataset(encodings=val_encodings, labels=val_labels)
test_dataset = IMDbDataset(encodings=test_encodings, labels=test_labels)

<h2>Training</h2>

In [10]:
from transformers import (
    DistilBertForSequenceClassification,
    Trainer,
    TrainingArguments,
    set_seed,
)

# Seed the random number generators *before* the model is created. The
# classification head is not part of the pretrained checkpoint and is
# initialised randomly (see the "newly initialized" warning this cell prints),
# so without a seed at this point every run starts from different head
# weights. 42 is also the default value of TrainingArguments.seed.
set_seed(42)

training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=32,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,                # log the training loss every 10 steps
)

model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")

# NOTE(review): eval_dataset is supplied, yet the recorded log shows only the
# training loss -- presumably no evaluation schedule is configured; confirm
# whether validation metrics were intended during training.
trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset             # evaluation dataset
)

# Last expression of the cell, so the returned TrainOutput is displayed.
trainer.train()

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classi

Step,Training Loss
10,0.7004
20,0.6844
30,0.6936
40,0.6747
50,0.6598
60,0.6196
70,0.5424
80,0.4741
90,0.3936
100,0.3091


Saving model checkpoint to ./results/checkpoint-500
Configuration saved in ./results/checkpoint-500/config.json
Model weights saved in ./results/checkpoint-500/pytorch_model.bin
Saving model checkpoint to ./results/checkpoint-1000
Configuration saved in ./results/checkpoint-1000/config.json
Model weights saved in ./results/checkpoint-1000/pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=1125, training_loss=0.09697152109154397, metrics={'train_runtime': 1765.343, 'train_samples_per_second': 20.393, 'train_steps_per_second': 0.637, 'total_flos': 4768826351616000.0, 'train_loss': 0.09697152109154397, 'epoch': 3.0})

In [11]:
!pip install huggingface_hub

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [12]:
# Authenticate with the Hugging Face Hub so the upload in the next cell is
# allowed. Going by the recorded output, a successful login stores the token
# on disk and in the git credential helper.
from huggingface_hub import notebook_login
notebook_login()

Token is valid.
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [13]:
# Upload the fine-tuned model to the Hub repository "Fake_News_model" under
# the logged-in account. The recorded output lists pytorch_model.bin and
# config.json as the uploaded files.
# NOTE(review): only `model` is pushed here; the tokenizer is not uploaded by
# this cell -- confirm whether the repository should also contain it.
model.push_to_hub("Fake_News_model")

Configuration saved in /tmp/tmp9922ccmw/config.json
Model weights saved in /tmp/tmp9922ccmw/pytorch_model.bin
Uploading the following files to fahad1247/Fake_News_model: pytorch_model.bin,config.json


Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

pytorch_model.bin:   0%|          | 0.00/268M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/fahad1247/Fake_News_model/commit/d82f4b921d3ea3c19e8bdb280e74b9fd72e9c3d2', commit_message='Upload DistilBertForSequenceClassification', commit_description='', oid='d82f4b921d3ea3c19e8bdb280e74b9fd72e9c3d2', pr_url=None, pr_revision=None, pr_num=None)

<h2>Inference</h2>

In [48]:
from torch.utils.data import DataLoader
from transformers import AutoModelForSequenceClassification
from tqdm import tqdm

# Reload the fine-tuned checkpoint from the Hub and move it to the selected
# device (Module.to returns the module itself, so the call can be chained).
model = AutoModelForSequenceClassification.from_pretrained(
    "fahad1247/Fake_News_model"
).to(device)

# Rebuild the test set from the CSV. This replaces the earlier test/validation
# halves with everything read_test returns.
test_texts, test_labels = read_test(split_dir='/content/data.csv')
test_encodings = tokenizer(test_texts, padding=True, truncation=True)
test_dataset = IMDbDataset(encodings=test_encodings, labels=test_labels)

# Fixed order (no shuffling), 16 samples per batch.
test_loader = DataLoader(dataset=test_dataset, shuffle=False, batch_size=16)

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--fahad1247--Fake_News_model/snapshots/d82f4b921d3ea3c19e8bdb280e74b9fd72e9c3d2/config.json
Model config DistilBertConfig {
  "_name_or_path": "fahad1247/Fake_News_model",
  "activation": "gelu",
  "architectures": [
    "DistilBertForSequenceClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "problem_type": "single_label_classification",
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "torch_dtype": "float32",
  "transformers_version": "4.26.1",
  "vocab_size": 30522
}

loading weights file pytorch_model.bin from cache at /root/.cache/huggingface/hub/models--fahad1247--Fake_News_model/snapshots/d82f4b921d3ea3c19e8bdb280e74b9fd72e9c3d2/pyto

In [50]:
# Collect the gold labels and the predicted class ids for every test batch.
y_pred, y_true = [], []
with torch.no_grad():  # inference only: no gradients are needed
    for batch in tqdm(test_loader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(
            input_ids,
            attention_mask=attention_mask,
            labels=labels,
        ).logits
        # The predicted class is the index of the largest logit in each row.
        predicted_class_id = torch.argmax(outputs, dim=1)
        y_true += list(labels.detach().cpu().numpy())
        y_pred += list(predicted_class_id.detach().cpu().numpy())

100%|██████████| 125/125 [00:32<00:00,  3.81it/s]


In [51]:
from sklearn.metrics import classification_report as clfr

# Print per-class precision / recall / F1 for the test predictions.
# NOTE(review): the recorded run reports 0.20 accuracy here versus 1.00 on the
# training data. A score that far below chance on two classes may mean the
# label encodings of the two CSV files are inverted relative to each other --
# confirm against the dataset documentation.
print("Performance on test data -> \n" + clfr(y_true, y_pred))

Performance on test data -> 
              precision    recall  f1-score   support

           0       0.10      0.07      0.08      1056
           1       0.25      0.35      0.29       944

    accuracy                           0.20      2000
   macro avg       0.18      0.21      0.19      2000
weighted avg       0.17      0.20      0.18      2000



In [52]:
# Run the same prediction pass over the training set, in fixed order, to
# compare against the test-set numbers.
train_loader = DataLoader(dataset=train_dataset, shuffle=False, batch_size=32)
y_pred, y_true = [], []
with torch.no_grad():  # inference only: no gradients are needed
    for batch in tqdm(train_loader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(
            input_ids,
            attention_mask=attention_mask,
            labels=labels,
        ).logits
        # The predicted class is the index of the largest logit in each row.
        predicted_class_id = torch.argmax(outputs, dim=1)
        y_true += list(labels.detach().cpu().numpy())
        y_pred += list(predicted_class_id.detach().cpu().numpy())

100%|██████████| 375/375 [03:32<00:00,  1.76it/s]


In [53]:
from sklearn.metrics import classification_report as clfr

# Print per-class precision / recall / F1 for the training-set predictions.
print("Performance on Train data -> \n" + clfr(y_true, y_pred))

Performance on Train data -> 
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      5727
           1       1.00      1.00      1.00      6273

    accuracy                           1.00     12000
   macro avg       1.00      1.00      1.00     12000
weighted avg       1.00      1.00      1.00     12000

