In [1]:
import pandas as pd
import os
import numpy as np

In [2]:
print(os.listdir())

['collection_of_news.csv']


In [3]:
df = pd.read_csv("collection_of_news.csv", sep=",")
df = df.dropna()

In [4]:
# !pip install transformers tensorflow datasets torch

In [6]:
import tensorflow as tf
from datasets import Dataset, load_metric
from transformers import BertTokenizer, TFBertForSequenceClassification
from transformers import pipeline
import torch
from sklearn.metrics import classification_report
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch.nn as nn
from sklearn.model_selection import train_test_split

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [27]:
sampled_df = df.groupby('label').apply(lambda x: x.sample(min(70000, len(x)))).reset_index(drop=True)
label_mapping = {"real": 0, "fake": 1}
df['label'] = df['label'].map(label_mapping)
df



Unnamed: 0,text,label
0,Donald Trump just couldn t wish all Americans ...,1
1,House Intelligence Committee Chairman Devin Nu...,1
2,"On Friday, it was revealed that former Milwauk...",1
3,"On Christmas day, Donald Trump announced that ...",1
4,Pope Francis used his annual Christmas Day mes...,1
...,...,...
153442,"Mr. Leroux was born Hervé Peugnet on May 30, 1...",0
153443,Ethical questions loom after Vice President Mi...,0
153444,Trump Jr. Is Soon To Give A 30-Minute Speech F...,1
153445,SHANGHAI (Reuters) - China said it plans to ac...,0


In [28]:
tokenizer = AutoTokenizer.from_pretrained("vikram71198/distilroberta-base-finetuned-fake-news-detection")
model = AutoModelForSequenceClassification.from_pretrained("vikram71198/distilroberta-base-finetuned-fake-news-detection")
model.to(device)


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-5): 6 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (

In [29]:
dataset = Dataset.from_pandas(df)
def tokenize_function(examples):
    return tokenizer(examples['text'], padding='max_length', truncation=True)

tokenized_datasets = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/153446 [00:00<?, ? examples/s]

In [30]:
train_df, val_df = train_test_split(df, test_size=0.2)
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)

# Tokenize the datasets
tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)
tokenized_val_dataset = val_dataset.map(tokenize_function, batched=True)

accuracy_metric = load_metric("accuracy")
def compute_metrics(p):
    preds = np.argmax(p.predictions, axis=1)
    return accuracy_metric.compute(predictions=preds, references=p.label_ids)

Map:   0%|          | 0/122756 [00:00<?, ? examples/s]

Map:   0%|          | 0/30690 [00:00<?, ? examples/s]

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


In [31]:
# !pip install transformers[torch] accelerate -U


In [34]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir='./results',
    eval_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    num_train_epochs=5,
    weight_decay=0.01,
)

# Define the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    compute_metrics=compute_metrics
)

# Train the model
trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.160049,0.930694
2,No log,0.150137,0.935549
3,No log,0.142777,0.938286
4,No log,0.143879,0.940209
5,0.145300,0.141357,0.940828




TrainOutput(global_step=600, training_loss=0.14129503726959228, metrics={'train_runtime': 1093.2106, 'train_samples_per_second': 561.447, 'train_steps_per_second': 0.549, 'total_flos': 8.130583994707968e+16, 'train_loss': 0.14129503726959228, 'epoch': 5.0})

In [35]:
model.save_pretrained('./fine-tuned-model')
tokenizer.save_pretrained('./fine-tuned-tokenizer')

('./fine-tuned-tokenizer/tokenizer_config.json',
 './fine-tuned-tokenizer/special_tokens_map.json',
 './fine-tuned-tokenizer/vocab.json',
 './fine-tuned-tokenizer/merges.txt',
 './fine-tuned-tokenizer/added_tokens.json',
 './fine-tuned-tokenizer/tokenizer.json')

In [36]:
model_path = './fine-tuned-model'
tokenizer_path = './fine-tuned-tokenizer'

tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)

In [40]:
def classify_news(texts):
    # Tokenize the texts with truncation and padding
    inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True, max_length=512)

    # Make predictions
    with torch.no_grad():
        outputs = model(**inputs)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1).numpy()

    return predictions, logits.numpy()

In [47]:
test = val_df.sample(1000)
test['label'].value_counts()

label
0    611
1    389
Name: count, dtype: int64

In [49]:
counter = 0
preds = []
for i,text in enumerate(test['text'].values):
  if counter <= i:
    print("Iteration ", i)
    counter += 100
  pred, _ = classify_news(text)
  preds.append(pred[0])

Iteration  0
Iteration  100
Iteration  200
Iteration  300
Iteration  400
Iteration  500
Iteration  600
Iteration  700
Iteration  800
Iteration  900


In [51]:
print(classification_report(test['label'].values, preds))

              precision    recall  f1-score   support

           0       0.94      0.98      0.96       611
           1       0.96      0.90      0.93       389

    accuracy                           0.95      1000
   macro avg       0.95      0.94      0.94      1000
weighted avg       0.95      0.95      0.95      1000

