# Setup Environment

In [None]:
# Install specific libraries
! pip install transformers
! pip install pycaret

Collecting pycaret
  Downloading pycaret-3.3.0-py3-none-any.whl (485 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.9/485.9 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
Collecting scikit-learn>1.4.0 (from pycaret)
  Downloading scikit_learn-1.4.1.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.1/12.1 MB[0m [31m87.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pyod>=1.1.3 (from pycaret)
  Downloading pyod-1.1.3.tar.gz (160 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m160.5/160.5 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting imbalanced-learn>=0.12.0 (from pycaret)
  Downloading imbalanced_learn-0.12.2-py3-none-any.whl (257 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m258.0/258.0 kB[0m [31m26.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting category-e

In [None]:
!pip install scikit-learn



In [None]:
import numpy as np
import pandas as pd
import pycaret
import transformers
from transformers import AutoModel, BertTokenizerFast
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import torch
import torch.nn as nn
# Select the compute device: use the GPU when one is visible to PyTorch,
# otherwise fall back to the CPU so the notebook still runs on CPU-only machines.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load Dataset

In [None]:
# Load Dataset
real = pd.read_csv('https://raw.githubusercontent.com/mnkd246/fake-news-detection/main/news_dataset/True.csv')
fake = pd.read_csv('https://raw.githubusercontent.com/mnkd246/fake-news-detection/main/news_dataset/Fake.csv')

# Work on a reproducible 33% subsample of each source to keep training time manageable
real = real.sample(frac=0.33, random_state=2024)
fake = fake.sample(frac=0.33, random_state=2024)

# Generate labels True/Fake in new 'Target' columns
real['Target'] = 'True'
fake['Target'] = 'Fake'

# Merge the real and fake articles and shuffle them into a single df called 'data'.
# pd.concat replaces DataFrame.append (removed in pandas 2.0); the shuffle is seeded
# so that the row order -- and everything derived from it -- is reproducible.
data = (
    pd.concat([real, fake])
    .sample(frac=1, random_state=2024)
    .reset_index(drop=True)
)

# See how the data looks like
print(data.shape)
data.head()

(14817, 5)


Unnamed: 0,title,text,subject,date,Target
0,Reopen the Kurt Cobain Case? [POLL],21st Century Wire asks The tragic death of Kur...,Middle-east,"February 17, 2016",Fake
1,Man Who Penned ‘Benghazi Mom’s’ GOP Conventio...,The man who wrote the speech for Patricia Smit...,News,"August 17, 2016",Fake
2,McConnell happier with Trump tweets after tax ...,WASHINGTON (Reuters) - A summer spat between P...,politicsNews,"December 22, 2017",True
3,British police release two men in Parsons Gree...,(Reuters) - British police said they have rele...,worldnews,"September 22, 2017",True
4,Hamas says ready to hand Gaza to a Palestinian...,CAIRO/RAMALLAH (Reuters) - Hamas has agreed to...,worldnews,"September 17, 2017",True


In [None]:
# Numeric target: 1 for real ('True') articles, 0 for fake ones.
# Built with an explicit comparison + integer cast instead of pd.get_dummies, whose
# columns are boolean on recent pandas versions and would later produce bool label tensors.
data['class'] = (data['Target'] == 'True').astype(int)

In [None]:
# Preview the first rows to check the numeric 'class' column against 'Target'.
data.head()

Unnamed: 0,title,text,subject,date,Target,class
0,Reopen the Kurt Cobain Case? [POLL],21st Century Wire asks The tragic death of Kur...,Middle-east,"February 17, 2016",Fake,0
1,Man Who Penned ‘Benghazi Mom’s’ GOP Conventio...,The man who wrote the speech for Patricia Smit...,News,"August 17, 2016",Fake,0
2,McConnell happier with Trump tweets after tax ...,WASHINGTON (Reuters) - A summer spat between P...,politicsNews,"December 22, 2017",True,1
3,British police release two men in Parsons Gree...,(Reuters) - British police said they have rele...,worldnews,"September 22, 2017",True,1
4,Hamas says ready to hand Gaza to a Palestinian...,CAIRO/RAMALLAH (Reuters) - Hamas has agreed to...,worldnews,"September 17, 2017",True,1


In [None]:
# Total number of rows in the combined (real + fake) dataset.
len(data)

14817

# Train-test-split

In [None]:
# Train-Validation-Test set split into 70:15:15 ratio
# Step 1: hold out 30% of the rows (stratified by class) as a temporary set.
train_text, temp_text, train_labels, temp_labels = train_test_split(
    data['title'], data['class'],
    random_state=2024,
    test_size=0.3,
    stratify=data['class'])
# Step 2: split the hold-out in half, giving 15% validation and 15% test overall.
# (test_size must be 0.5 here; 0.3 would yield a 70:21:9 split instead.)
val_text, test_text, val_labels, test_labels = train_test_split(
    temp_text, temp_labels,
    random_state=2024,
    test_size=0.5,
    stratify=temp_labels)


## BERT Fine-tuning

### Load pretrained BERT Model

In [None]:
# Fetch the pretrained encoder and its matching fast tokenizer from the
# HuggingFace hub; both must come from the same checkpoint.
checkpoint_name = 'bert-base-uncased'
bert = AutoModel.from_pretrained(checkpoint_name)
tokenizer = BertTokenizerFast.from_pretrained(checkpoint_name)

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

### Prepare Input Data

In [None]:
MAX_LENGTH = 50             # number of tokens every title is padded / truncated to
MAX_LENGHT = MAX_LENGTH     # misspelled legacy name, kept so existing references keep working


def _encode_titles(texts):
    """Tokenize a pandas Series of titles into fixed-length BERT inputs.

    Every sequence is truncated or padded to exactly MAX_LENGTH tokens.
    `padding='max_length'` replaces the deprecated `pad_to_max_length=True`.
    Returns the tokenizer's batch encoding (includes 'input_ids' and 'attention_mask').
    """
    return tokenizer.batch_encode_plus(
        texts.tolist(),
        max_length=MAX_LENGTH,
        padding='max_length',
        truncation=True
    )


# Tokenize and encode sequences in the train, validation and test sets
tokens_train = _encode_titles(train_text)
tokens_val = _encode_titles(val_text)
tokens_test = _encode_titles(test_text)

In [None]:
# Convert the tokenizer output and the labels to tensors.
# Labels are created as int64 (torch.long) explicitly: nn.NLLLoss expects class
# indices of that dtype, regardless of how the 'class' column is stored in pandas.
train_seq = torch.tensor(tokens_train['input_ids'])
train_mask = torch.tensor(tokens_train['attention_mask'])
train_y = torch.tensor(train_labels.tolist(), dtype=torch.long)

val_seq = torch.tensor(tokens_val['input_ids'])
val_mask = torch.tensor(tokens_val['attention_mask'])
val_y = torch.tensor(val_labels.tolist(), dtype=torch.long)

test_seq = torch.tensor(tokens_test['input_ids'])
test_mask = torch.tensor(tokens_test['attention_mask'])
test_y = torch.tensor(test_labels.tolist(), dtype=torch.long)

In [None]:
# Batching: wrap the tensors in datasets and expose them through DataLoaders
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

batch_size = 32  # number of samples per mini-batch

# Training set -- batches are drawn in random order
train_data = TensorDataset(train_seq, train_mask, train_y)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(
    train_data,
    batch_size=batch_size,
    sampler=train_sampler,
)

# Validation set -- batches are visited in their stored order
val_data = TensorDataset(val_seq, val_mask, val_y)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(
    val_data,
    batch_size=batch_size,
    sampler=val_sampler,
)

### Define Model Architecture

In [None]:
class BERT_Arch(nn.Module):
    """Binary classifier head on top of a pretrained BERT-style encoder.

    The encoder's pooled output is passed through
    Linear -> ReLU -> Dropout -> Linear(2) -> LogSoftmax, so `forward` returns
    log-probabilities (suitable for `nn.NLLLoss`), not raw logits.

    Args:
        bert: encoder module whose call returns a mapping containing
            'pooler_output'. The input width of the first dense layer is read
            from `bert.config.hidden_size` when available and falls back to 768.
    """

    def __init__(self, bert):
        super(BERT_Arch, self).__init__()
        self.bert = bert
        # Width of the pooled encoder output; 768 is kept as the fallback value.
        hidden_size = getattr(getattr(bert, "config", None), "hidden_size", 768)
        self.dropout = nn.Dropout(0.1)            # dropout layer
        self.relu = nn.ReLU()                     # relu activation function
        self.fc1 = nn.Linear(hidden_size, 512)    # dense layer 1
        self.fc2 = nn.Linear(512, 2)              # dense layer 2 (output layer, 2 classes)
        self.softmax = nn.LogSoftmax(dim=1)       # log-softmax over the class dimension

    def forward(self, sent_id, mask):
        """Return per-class log-probabilities for a batch.

        Args:
            sent_id: token ids, passed to the encoder as its first argument.
            mask: attention mask, passed to the encoder as `attention_mask`.
        """
        cls_hs = self.bert(sent_id, attention_mask=mask)['pooler_output']
        x = self.fc1(cls_hs)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.fc2(x)                           # output layer
        x = self.softmax(x)                       # log-probabilities
        return x

model = BERT_Arch(bert)
# Defining the hyperparameters (optimizer, loss function and the epochs)
# Define the optimizer.
# torch.optim.AdamW replaces transformers.AdamW, which is deprecated and no longer
# shipped by recent transformers releases. eps and weight_decay are set explicitly
# to the values the transformers implementation used by default, so the update
# rule stays the same.
optimizer = torch.optim.AdamW(model.parameters(),
                              lr=1e-5,            # learning rate
                              eps=1e-6,
                              weight_decay=0.0)
# Define the loss function (the model outputs log-probabilities, hence NLLLoss)
cross_entropy  = nn.NLLLoss()
# Number of training epochs
epochs = 2

### Define Train & Evaluate Function

In [None]:
# Defining training and evaluation functions
def train():
  """Run one training epoch over `train_dataloader`.

  Uses the module-level `model`, `optimizer` and `cross_entropy`. Each batch is
  moved to whichever device the model's parameters live on before the forward
  pass, so the function works unchanged on CPU and GPU.

  Returns:
    float: mean training loss over all batches of the epoch.
  """
  model.train()                                                 # enable dropout
  model_device = next(model.parameters()).device
  n_batches = len(train_dataloader)
  total_loss = 0.0

  for step, batch in enumerate(train_dataloader):               # iterate over batches
    if step % 50 == 0 and not step == 0:                        # progress update after every 50 batches
      print('  Batch {:>5,}  of  {:>5,}.'.format(step, n_batches))
    sent_id, mask, labels = [t.to(model_device) for t in batch] # put the batch on the model's device
    model.zero_grad()                                           # clear previously calculated gradients
    preds = model(sent_id, mask)                                # log-probabilities for the current batch
    loss = cross_entropy(preds, labels)                         # loss between actual & predicted values
    total_loss += loss.item()
    loss.backward()                                             # backward pass to calculate the gradients
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)     # clip gradient norm to 1.0 against exploding gradients
    optimizer.step()                                            # update parameters

  return total_loss / n_batches                                 # average loss of the epoch

def evaluate():
  """Compute the mean loss of the current model over `val_dataloader`.

  Uses the module-level `model` and `cross_entropy`. Runs in eval mode without
  gradient tracking; each batch is moved to the device the model's parameters
  live on before the forward pass.

  Returns:
    float: mean validation loss over all batches.
  """
  print("\nEvaluating...")
  model.eval()                                                  # deactivate dropout layers
  model_device = next(model.parameters()).device
  n_batches = len(val_dataloader)
  total_loss = 0.0

  for step, batch in enumerate(val_dataloader):                 # iterate over batches
    if step % 50 == 0 and not step == 0:                        # progress update every 50 batches
      print('  Batch {:>5,}  of  {:>5,}.'.format(step, n_batches))
    sent_id, mask, labels = [t.to(model_device) for t in batch] # put the batch on the model's device
    with torch.no_grad():                                       # no autograd bookkeeping during validation
      preds = model(sent_id, mask)                              # log-probabilities for the current batch
      loss = cross_entropy(preds, labels)                       # validation loss of the batch
    total_loss += loss.item()

  return total_loss / n_batches                                 # average validation loss

### Model training

In [None]:
# Fit the model for `epochs` epochs, checkpointing whenever validation improves
train_losses, valid_losses = [], []      # per-epoch loss history
best_valid_loss = float('inf')           # lowest validation loss seen so far

for epoch in range(epochs):
    print('\n Epoch {:} / {:}'.format(epoch + 1, epochs))

    # one pass over the training data, then score on the validation data
    train_loss = train()
    valid_loss = evaluate()

    # keep the weights of the best-performing epoch on disk
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'c2_new_model_weights.pt')

    # record the losses of this epoch
    train_losses.append(train_loss)
    valid_losses.append(valid_loss)

    print(f'\nTraining Loss: {train_loss:.3f}')
    print(f'Validation Loss: {valid_loss:.3f}')


 Epoch 1 / 2
  Batch    50  of    325.
  Batch   100  of    325.
  Batch   150  of    325.
  Batch   200  of    325.
  Batch   250  of    325.
  Batch   300  of    325.

Evaluating...
  Batch    50  of     98.

Training Loss: 0.194
Validation Loss: 0.103

 Epoch 2 / 2
  Batch    50  of    325.
  Batch   100  of    325.
  Batch   150  of    325.
  Batch   200  of    325.
  Batch   250  of    325.
  Batch   300  of    325.

Evaluating...
  Batch    50  of     98.

Training Loss: 0.068
Validation Loss: 0.060


### Model performance

In [None]:
# Score the held-out test set.
# Switch to eval mode explicitly so dropout is off even when this cell is run
# on its own, rather than relying on the mode left behind by a previous cell.
model.eval()
with torch.no_grad():
  preds = model(test_seq, test_mask)          # log-probabilities for every test title
  preds = preds.detach().cpu().numpy()

preds = np.argmax(preds, axis = 1)            # most likely class per sample
print(classification_report(test_y, preds))

              precision    recall  f1-score   support

           0       0.99      0.97      0.98       698
           1       0.96      0.99      0.98       636

    accuracy                           0.98      1334
   macro avg       0.98      0.98      0.98      1334
weighted avg       0.98      0.98      0.98      1334



# Tweets

In [None]:
# Dependencies
!pip install spacy
!python -m spacy download en_core_web_sm
!pip install beautifulsoup4
!pip install textblob
!pip install git+https://github.com/laxmimerit/preprocess_kgptalkie.git --upgrade --force-reinstall

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m86.8 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
Collecting git+https://github.com/laxmimerit/preprocess_kgptalkie.git
  Cloning https://github.com/laxmimerit/preprocess_kgptalkie.git to /tmp/pip-req-build-junw8ybf
  Running command git clone --filter=blob:none --quiet https://github.com/laxmimerit/preprocess_kgptalkie.git /tmp/pip-req-build-junw8ybf
  Resolved https://github.com/laxmime

In [None]:
import preprocess_kgptalkie as ps

In [None]:
# Removing emojis from the tweets
import re

# Compiled once at import time. Each range is listed separately on purpose: the
# single span U+24C2..U+1F251 used previously also covered CJK, Hangul,
# full-width forms and other ordinary text, which was silently deleted.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags (regional indicator symbols)
    "\U0001F900-\U0001F9FF"  # supplemental symbols & pictographs
    "\U0001F200-\U0001F251"  # enclosed ideographic supplement
    "\U00002600-\U000026FF"  # miscellaneous symbols
    "\U00002702-\U000027B0"  # dingbats
    "\U00002B00-\U00002BFF"  # miscellaneous symbols and arrows
    "\U000024C2"             # circled latin capital letter M
    "\U0000FE0F"             # variation selector-16 (emoji presentation)
    "]+",
    flags=re.UNICODE,
)


def remove_emojis(text):
    """Return `text` with emoji characters removed.

    Only characters inside the emoji-related Unicode ranges of `_EMOJI_PATTERN`
    are dropped; all other characters, including non-Latin scripts, are kept.

    Args:
        text: the string to clean.

    Returns:
        The input string without the matched emoji characters.
    """
    return _EMOJI_PATTERN.sub(r'', text)

In [None]:
# Download the labelled tweets and strip emojis from the text column.
tweets_url = 'https://raw.githubusercontent.com/joshndala/fake-news-detection/main/twitter_dataset/twitter_data.csv'
tweets_df = pd.read_csv(tweets_url)

tweets_text_clean = tweets_df['text'].apply(remove_emojis)
tweets_df['text'] = tweets_text_clean

In [None]:
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

# Tokenize tweets into fixed-length PyTorch tensors.
# `padding='max_length'` replaces the deprecated `pad_to_max_length=True`.
tokens = tokenizer.batch_encode_plus(
    tweets_df['text'].tolist(),
    max_length = 50,  # Use the same max_length as during training
    padding='max_length',
    truncation=True,
    return_tensors="pt"
)

# Extract input IDs and attention masks
input_ids = tokens['input_ids']
attention_mask = tokens['attention_mask']

In [None]:
model.eval()  # Set the model to evaluation mode (dropout off)

with torch.no_grad():  # Deactivate gradients for inference
    # The model ends in LogSoftmax, so it returns log-probabilities rather than logits.
    log_probs = model(sent_id=input_ids, mask=attention_mask)
    # exp() converts log-probabilities back into class probabilities.
    predictions = torch.exp(log_probs)
    # Predicted class = the one with the highest probability
    predicted_classes = torch.argmax(predictions, dim=1)


In [None]:
true_labels = tweets_df['class'].tolist()

# Convert the predicted classes from a torch tensor to a NumPy array
# (moved to the CPU first so the conversion works wherever the tensor lives).
predicted_classes_list = predicted_classes.cpu().numpy()

# Report on the converted array instead of the raw tensor.
print(classification_report(true_labels, predicted_classes_list))

              precision    recall  f1-score   support

           0       0.51      0.85      0.64       406
           1       0.59      0.20      0.30       412

    accuracy                           0.53       818
   macro avg       0.55      0.53      0.47       818
weighted avg       0.55      0.53      0.47       818



# Other Misinformation Dataset

In [None]:
# Load the misinformation spreadsheet. The path is relative, so the file has to be
# present in the working directory. Then preview the first rows.
misinfo_df = pd.read_excel('Misinfo_Dataset.xlsx')
misinfo_df.head()

Unnamed: 0,Text,Translation,Class label,Unnamed: 3,Verified
0,সাধারণত প্রতি ২৮ থেকে ৩৫ দিন পর পর একজন নারীর ...,A woman usually has her period every 28 to 35 ...,Valid Information,,
1,১২ বছর থেকে ৫৫ বছর বয়সী নারীদের ক্ষেত্রে এমনটি...,The same is true for women between the ages of...,Valid Information,,
2,প্রাপ্তবয়স্ক একজন নারীর নিয়মিত ও সময়মতো মাসিক ...,Regular and timely menstruation in an adult wo...,Valid Information,,
3,"মাসিক যদি অনিয়মিত হয়ে পড়ে, তার মানে হয়তো শারীর...","If menstruation becomes irregular, it may mean...",Valid Information,,
4,বেশিরভাগ নারীর মাসিকের চক্র একই থাকে।,Most women have the same menstrual cycle.,Valid Information,,


In [None]:
# Class balance of the raw labels. The column name 'Class label ' ends with a
# trailing space. value_counts() leaves out missing values by default, so rows
# without a label are not counted here.
misinfo_df['Class label '].value_counts()

Valid Information    2469
Misinformation       2019
Name: Class label , dtype: int64

In [None]:
def get_class_label(label):
  """Map a raw class label to its numeric code.

  Returns 1 when `label` equals 'Valid Information' and 0 for any other value.
  """
  return 1 if label == 'Valid Information' else 0

# Encode the labels numerically with get_class_label: 'Valid Information' -> 1,
# every other value (a missing label included) -> 0.
# NOTE(review): confirm that rows without a label should really count as class 0.
misinfo_df['class'] = misinfo_df['Class label '].apply(get_class_label)
misinfo_df.head()

Unnamed: 0,Text,Translation,Class label,Unnamed: 3,Verified,class
0,সাধারণত প্রতি ২৮ থেকে ৩৫ দিন পর পর একজন নারীর ...,A woman usually has her period every 28 to 35 ...,Valid Information,,,1
1,১২ বছর থেকে ৫৫ বছর বয়সী নারীদের ক্ষেত্রে এমনটি...,The same is true for women between the ages of...,Valid Information,,,1
2,প্রাপ্তবয়স্ক একজন নারীর নিয়মিত ও সময়মতো মাসিক ...,Regular and timely menstruation in an adult wo...,Valid Information,,,1
3,"মাসিক যদি অনিয়মিত হয়ে পড়ে, তার মানে হয়তো শারীর...","If menstruation becomes irregular, it may mean...",Valid Information,,,1
4,বেশিরভাগ নারীর মাসিকের চক্র একই থাকে।,Most women have the same menstrual cycle.,Valid Information,,,1


In [None]:
# Build the evaluation frame: English translation as lower-cased 'text' plus the numeric 'class'.
# - Rows without a translation or without a label are dropped first; otherwise they
#   would turn into the literal text "nan" and be counted as class 0.
# - The frame is built in a single chain (no inplace rename / column assignment on a
#   slice of misinfo_df), which avoids pandas' SettingWithCopyWarning.
misinfo_data = (
    misinfo_df
    .dropna(subset=["Translation", "Class label "])
    .loc[:, ["Translation", "class"]]
    .rename(columns={"Translation": "text"})
    .assign(text=lambda frame: frame["text"].astype(str).str.lower())
    .reset_index(drop=True)
)
misinfo_data.head()

Unnamed: 0,text,class
0,a woman usually has her period every 28 to 35 ...,1
1,the same is true for women between the ages of...,1
2,regular and timely menstruation in an adult wo...,1
3,"if menstruation becomes irregular, it may mean...",1
4,most women have the same menstrual cycle.,1


In [None]:
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

# Tokenize text into fixed-length PyTorch tensors.
# `padding='max_length'` replaces the deprecated `pad_to_max_length=True`.
tokens = tokenizer.batch_encode_plus(
    misinfo_data['text'].tolist(),
    max_length = 50,  # Use the same max_length as during training
    padding='max_length',
    truncation=True,
    return_tensors="pt"
)

# Extract input IDs and attention masks
input_ids = tokens['input_ids']
attention_mask = tokens['attention_mask']

In [None]:
model.eval()  # Set the model to evaluation mode (dropout off)

with torch.no_grad():  # Deactivate gradients for inference
    # The model ends in LogSoftmax, so it returns log-probabilities rather than logits.
    log_probs = model(sent_id=input_ids, mask=attention_mask)
    # exp() converts log-probabilities back into class probabilities.
    predictions = torch.exp(log_probs)
    # Predicted class = the one with the highest probability
    predicted_classes = torch.argmax(predictions, dim=1)


In [None]:
true_labels = misinfo_data['class'].tolist()

# Convert the predicted classes from a torch tensor to a NumPy array
# (moved to the CPU first so the conversion works wherever the tensor lives).
predicted_classes_list = predicted_classes.cpu().numpy()

# Report on the converted array instead of the raw tensor.
print(classification_report(true_labels, predicted_classes_list))

              precision    recall  f1-score   support

           0       0.43      0.78      0.55      2041
           1       0.42      0.13      0.20      2469

    accuracy                           0.43      4510
   macro avg       0.42      0.46      0.37      4510
weighted avg       0.42      0.43      0.36      4510

