In [164]:
#standard data analysis libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

#natural language toolkit for processing
import nltk
from nltk.corpus import stopwords

import re
import html
import string
from sklearn import preprocessing

#for fasttext
import fasttext

#for XLNet
from transformers import XLNetTokenizer, XLNetModel
import torch
import sentencepiece as spm

# NLP models
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

import timeit

In [165]:
#importing datasets
df_train = pd.read_csv("Constraint_Train.csv")
df_test = pd.read_csv("test1.csv")

print(df_train.head())
print('\n')
print(df_test.head())


   id                                              tweet label
0   1  The CDC currently reports 99031 deaths. In gen...  real
1   2  States reported 1121 deaths a small rise from ...  real
2   3  Politically Correct Woman (Almost) Uses Pandem...  fake
3   4  #IndiaFightsCorona: We have 1524 #COVID testin...  real
4   5  Populous states can generate large case counts...  real


   id                                              tweet label
0   1  Chinese converting to Islam after realising th...  fake
1   2  11 out of 13 people (from the Diamond Princess...  fake
2   3  COVID-19 Is Caused By A Bacterium, Not Virus A...  fake
3   4  Mike Pence in RNC speech praises Donald Trump’...  fake
4   5  6/10 Sky's @EdConwaySky explains the latest #C...  real


In [166]:
#take a look at dataset
df_train.info()
print('\n')
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6420 entries, 0 to 6419
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      6420 non-null   int64 
 1   tweet   6420 non-null   object
 2   label   6420 non-null   object
dtypes: int64(1), object(2)
memory usage: 150.6+ KB


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2140 entries, 0 to 2139
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      2140 non-null   int64 
 1   tweet   2140 non-null   object
 2   label   2140 non-null   object
dtypes: int64(1), object(2)
memory usage: 50.3+ KB


In [167]:
#getting a count of fake & real entries for train dataset
print(df_train["label"].value_counts())
print('\n')
print(df_test["label"].value_counts())

label
real    3360
fake    3060
Name: count, dtype: int64


label
real    1120
fake    1020
Name: count, dtype: int64


In [168]:
#saving punctuations and stopwords into variables for preprocessing
punctuations = string.punctuation

STOP = stopwords.words("english")

In [169]:
#function to remove punctuation and stopwords from tweets and lowercasing the tweets
def cleanTweets(tweetParse):
    for i in range(0, len(tweetParse)):
        tweet = tweetParse[i]
        tweet = html.unescape(tweet)
        tweet = re.sub(r"@\w+", " ", tweet)
        tweet = re.sub(r"http\S+", " ", tweet)
        tweet = "".join([punc for punc in tweet if not punc in punctuations])
        tweet = tweet.lower()
        
        tweetWord = tweet.split()
        tweetParse[i] = "".join([word + " " for word in tweetWord if not word in STOP])
    return tweetParse

In [170]:
#clean tweets and place them into new column
df_train["cleanTweet"] = cleanTweets(df_train["tweet"].copy())
df_test["cleanTweet"] = cleanTweets(df_test["tweet"].copy())

#checking if function works for df_train
df_train.head()

Unnamed: 0,id,tweet,label,cleanTweet
0,1,The CDC currently reports 99031 deaths. In gen...,real,cdc currently reports 99031 deaths general dis...
1,2,States reported 1121 deaths a small rise from ...,real,states reported 1121 deaths small rise last tu...
2,3,Politically Correct Woman (Almost) Uses Pandem...,fake,politically correct woman almost uses pandemic...
3,4,#IndiaFightsCorona: We have 1524 #COVID testin...,real,indiafightscorona 1524 covid testing laborator...
4,5,Populous states can generate large case counts...,real,populous states generate large case counts loo...


In [171]:
#checking if function works for df_test
df_test.head()

Unnamed: 0,id,tweet,label,cleanTweet
0,1,Chinese converting to Islam after realising th...,fake,chinese converting islam realising muslim affe...
1,2,11 out of 13 people (from the Diamond Princess...,fake,11 13 people diamond princess cruise ship inti...
2,3,"COVID-19 Is Caused By A Bacterium, Not Virus A...",fake,covid19 caused bacterium virus treated aspirin
3,4,Mike Pence in RNC speech praises Donald Trump’...,fake,mike pence rnc speech praises donald trump’s c...
4,5,6/10 Sky's @EdConwaySky explains the latest #C...,real,610 skys explains latest covid19 data governme...


In [172]:
#encoding label into binary form
label_encoder = preprocessing.LabelEncoder()
dummyTrain = label_encoder.fit_transform(df_train["label"])
dummyTest = label_encoder.fit_transform(df_test["label"])

df_train["encodedLabel"] = dummyTrain
df_test["encodedLabel"] = dummyTest

#checking if encoder works
df_train.head()

Unnamed: 0,id,tweet,label,cleanTweet,encodedLabel
0,1,The CDC currently reports 99031 deaths. In gen...,real,cdc currently reports 99031 deaths general dis...,1
1,2,States reported 1121 deaths a small rise from ...,real,states reported 1121 deaths small rise last tu...,1
2,3,Politically Correct Woman (Almost) Uses Pandem...,fake,politically correct woman almost uses pandemic...,0
3,4,#IndiaFightsCorona: We have 1524 #COVID testin...,real,indiafightscorona 1524 covid testing laborator...,1
4,5,Populous states can generate large case counts...,real,populous states generate large case counts loo...,1


In [173]:
#cleaning of train dataset
#dropping tweet and label columns
df_train.drop(['id', 'tweet', 'label'], axis= 'columns', inplace=True)

df_train.head()

Unnamed: 0,cleanTweet,encodedLabel
0,cdc currently reports 99031 deaths general dis...,1
1,states reported 1121 deaths small rise last tu...,1
2,politically correct woman almost uses pandemic...,0
3,indiafightscorona 1524 covid testing laborator...,1
4,populous states generate large case counts loo...,1


In [174]:
#cleaning of test dataset
#dropping id and tweet column
df_test.drop(['id', 'tweet', 'label'], axis='columns', inplace=True)

df_test.head()

Unnamed: 0,cleanTweet,encodedLabel
0,chinese converting islam realising muslim affe...,0
1,11 13 people diamond princess cruise ship inti...,0
2,covid19 caused bacterium virus treated aspirin,0
3,mike pence rnc speech praises donald trump’s c...,0
4,610 skys explains latest covid19 data governme...,1


In [175]:
#loading of fasttext model
fasttext_model = fasttext.load_model('wiki.simple.bin')

#function to do feature extraction
def get_fasttext_features(sentence):
    words = sentence.split()
    vectors = [fasttext_model.get_word_vector(word) for word in words]
    avg_vector = sum(vectors) / len(vectors)
    return avg_vector



In [176]:
#loading of pretrained XLNet model and tokenizer
tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
xlnet_model = XLNetModel.from_pretrained('xlnet-base-cased')

#function to generate XLNet features for a sentence
def get_xlnet_features(sentence):
    inputs = tokenizer(sentence, return_tensors='pt', max_length=512, truncation=True)
    outputs = xlnet_model(**inputs)
    last_hidden_states = outputs.last_hidden_state
    avg_pooled = torch.mean(last_hidden_states, dim=1).squeeze()
    return avg_pooled.detach().numpy()

In [177]:
#applying function to train dataset
#retrieving time taken
start_time = timeit.default_timer()
df_train['fasttext_features'] = df_train['cleanTweet'].apply(get_fasttext_features)
elapsed = timeit.default_timer() - start_time
print(f'Time taken to extract features from train dataset with fasttext: {elapsed}')

Time taken to extract features from train dataset with fasttext: 0.9476410001516342


In [178]:
#applying XLNet function to train dataset
#retrieving time taken
start_time = timeit.default_timer()
df_train['xlnet_features'] = df_train['cleanTweet'].apply(get_xlnet_features)
elapsed = timeit.default_timer() - start_time
print(f'Time taken to extract features from train dataset with XLNet: {elapsed}')

Time taken to extract features from train dataset with XLNet: 289.4137211248744


In [179]:
#taking a look at 'new' train dataset
df_train.head()

Unnamed: 0,cleanTweet,encodedLabel,fasttext_features,xlnet_features
0,cdc currently reports 99031 deaths general dis...,1,"[0.15246023, -0.15250677, 0.053375464, -0.2004...","[-0.591332, 1.3220596, -1.2890038, 0.43662465,..."
1,states reported 1121 deaths small rise last tu...,1,"[0.21843885, -0.2629997, -0.04008166, -0.10784...","[0.25671953, 2.420395, -2.559547, 2.5866818, -..."
2,politically correct woman almost uses pandemic...,0,"[0.10240695, 0.05278797, 0.10026813, -0.115677...","[1.6209552, 1.4003724, -0.060141277, 1.9227209..."
3,indiafightscorona 1524 covid testing laborator...,1,"[0.15502764, -0.004772112, 0.106178775, -0.168...","[-0.8387828, 1.4871421, -1.6803803, 2.005351, ..."
4,populous states generate large case counts loo...,1,"[0.13204077, -0.10095969, 0.12423663, -0.17807...","[-0.36012238, 2.6860003, -1.738005, 2.3930886,..."


In [180]:
#applying function to test dataset
#retrieving time taken
start_time = timeit.default_timer()
df_test['fasttext_features'] = df_test['cleanTweet'].apply(get_fasttext_features)
elapsed = timeit.default_timer() - start_time
print(f'Time taken to extract features from test dataset with fasttext: {elapsed}')

Time taken to extract features from test dataset with fasttext: 1.6608560420572758


In [181]:
#applying XLNet function to test dataset
#retrieving time taken
start_time = timeit.default_timer()
df_test['xlnet_features'] = df_test['cleanTweet'].apply(get_xlnet_features)
elapsed = timeit.default_timer() - start_time
print(f'Time taken to extract features from test dataset with XLNet: {elapsed}')

Time taken to extract features from test dataset with XLNet: 104.09481154195964


In [196]:
#concatenating both features for model fitting
X = pd.concat([df_train['fasttext_features'].apply(pd.Series), df_train['xlnet_features'].apply(pd.Series)], axis=1)
y = df_train['encodedLabel']

X_test = pd.concat([df_test['fasttext_features'].apply(pd.Series), df_test['xlnet_features'].apply(pd.Series)], axis=1)
y_test = df_test['encodedLabel']


#initialize model and training
start_time = timeit.default_timer()
lr_model = LogisticRegression(solver='lbfgs', max_iter=5000)
lr_model.fit(X, y)

#prediction
predictions_lr = lr_model.predict(X_test)
elapsed = timeit.default_timer() - start_time
#checking accuracy
accuracy = accuracy_score(y_test, predictions_lr)
print(f'Accuracy of Linear Regression model: {accuracy}')
print(f'Time taken for Linear Regression model: {elapsed}')

Accuracy of Linear Regression model: 0.8686915887850467
Time taken for Linear Regression model: 15.918151166057214


In [183]:
#checking all metrics
lr_class = classification_report(y_test, predictions_lr)
print(f'Classification report of Linear Regression model')
print(f'{lr_class}')

Classification report of Linear Regression model
              precision    recall  f1-score   support

           0       0.85      0.88      0.86      1020
           1       0.89      0.86      0.87      1120

    accuracy                           0.87      2140
   macro avg       0.87      0.87      0.87      2140
weighted avg       0.87      0.87      0.87      2140



In [184]:
#NB model
scaler = MinMaxScaler()
X_nb = scaler.fit_transform(X)
X_test_nb = scaler.fit_transform(X_test)

start_time = timeit.default_timer()
nb_model = MultinomialNB()
nb_model.fit(X_nb,y)

#prediction
predictions_nb = nb_model.predict(X_test_nb)
elapsed_nb = timeit.default_timer() - start_time

accuracy_nb = accuracy_score(y_test, predictions_nb)
print(f'Accuracy of Naive Bayes model: {accuracy_nb}')
print(f'Time taken for Naive Bayes model: {elapsed_nb}')

Accuracy of Naive Bayes model: 0.7929906542056074
Time taken for Naive Bayes model: 0.07559666712768376


In [185]:
#checking all metrics
nb_class = classification_report(y_test, predictions_nb)
print(f'Classification report of Naive Bayes model')
print(f'{nb_class}')

Classification report of Naive Bayes model
              precision    recall  f1-score   support

           0       0.77      0.81      0.79      1020
           1       0.82      0.78      0.80      1120

    accuracy                           0.79      2140
   macro avg       0.79      0.79      0.79      2140
weighted avg       0.79      0.79      0.79      2140



In [186]:
#RF model
start_time_rf = timeit.default_timer()
rf_model = RandomForestClassifier(n_estimators=100, criterion='entropy', class_weight='balanced_subsample', random_state=42)
rf_model.fit(X, y)

predictions_rf = rf_model.predict(X_test)
elapsed_rf = timeit.default_timer() - start_time_rf

accuracy_rf = accuracy_score(y_test, predictions_rf)
print(f'Accuracy of Random Forest model: {accuracy_rf}')
print(f'Time taken for Random Forest model: {elapsed_rf}')

Accuracy of Random Forest model: 0.8733644859813084
Time taken for Random Forest model: 12.428017165977508


In [187]:
#checking all metrics
rf_class = classification_report(y_test, predictions_rf)
print(f'Classification report of Random Forest model')
print(f'{rf_class}')

Classification report of Random Forest model
              precision    recall  f1-score   support

           0       0.85      0.89      0.87      1020
           1       0.89      0.86      0.88      1120

    accuracy                           0.87      2140
   macro avg       0.87      0.87      0.87      2140
weighted avg       0.87      0.87      0.87      2140



In [188]:
#SVM model
start_time_svm = timeit.default_timer()
svm_model = SVC(kernel='linear', random_state=42)
svm_model.fit(X, y)

predictions_svm = svm_model.predict(X_test)
elapsed_svm = timeit.default_timer() - start_time_svm

accuracy_svm = accuracy_score(y_test, predictions_svm)
print(f'Accuracy of SVM model: {accuracy_svm}')
print(f'Time taken for SVM model: {elapsed_svm}')

Accuracy of SVM model: 0.8696261682242991
Time taken for SVM model: 64.15167300007306


In [189]:
#checking all metrics
svm_class = classification_report(y_test, predictions_svm)
print(f'Classification report of SVM model')
print(f'{svm_class}')

Classification report of SVM model
              precision    recall  f1-score   support

           0       0.85      0.88      0.87      1020
           1       0.89      0.86      0.87      1120

    accuracy                           0.87      2140
   macro avg       0.87      0.87      0.87      2140
weighted avg       0.87      0.87      0.87      2140



In [190]:
#checking all metrics
lr_report = classification_report(y_test, predictions_lr, output_dict = True)
nb_report = classification_report(y_test, predictions_nb, output_dict = True)
rf_report = classification_report(y_test, predictions_rf, output_dict = True)
svm_report = classification_report(y_test, predictions_svm, output_dict = True)

In [191]:
#function to create classification report table
def crTable(data, model):
    accuracy = data['accuracy']
    ogData = pd.DataFrame(data)
    dataT = ogData.transpose()
    finalData = pd.DataFrame(dataT, index=['macro avg'])
    finalData.loc['macro avg', 'accuracy'] = accuracy
    finalData.rename(index={'macro avg': model}, inplace=True)
    return finalData

In [192]:
#creating classification report table for all models
svm_results = crTable(svm_report, 'SVM')
lr_results = crTable(lr_report, 'LR')
rf_results = crTable(rf_report, 'RF')
nb_results = crTable(nb_report, 'NB')

In [193]:
#concatenating all tables into 1
result = [svm_results, lr_results, rf_results, nb_results]
main_table = pd.concat(result)
main_table = main_table.drop(columns='support')
main_table

Unnamed: 0,precision,recall,f1-score,accuracy
SVM,0.869328,0.870019,0.86949,0.869626
LR,0.868413,0.869126,0.868563,0.868692
RF,0.873194,0.873985,0.873268,0.873364
NB,0.793208,0.793829,0.792915,0.792991


In [197]:
print(f'Time taken for Linear Regression model: {elapsed} seconds')
print(f'Time taken for Naive Bayes model: {elapsed_nb} seconds')
print(f'Time taken for Random Forest model: {elapsed_rf} seconds')
print(f'Time taken for SVM model: {elapsed_svm} seconds')

Time taken for Linear Regression model: 15.918151166057214 seconds
Time taken for Naive Bayes model: 0.07559666712768376 seconds
Time taken for Random Forest model: 12.428017165977508 seconds
Time taken for SVM model: 64.15167300007306 seconds


In [198]:
#adding time taken to main table
main_table['time taken'] = [elapsed_svm, elapsed, elapsed_rf, elapsed_nb]
main_table

Unnamed: 0,precision,recall,f1-score,accuracy,time taken
SVM,0.869328,0.870019,0.86949,0.869626,64.151673
LR,0.868413,0.869126,0.868563,0.868692,15.918151
RF,0.873194,0.873985,0.873268,0.873364,12.428017
NB,0.793208,0.793829,0.792915,0.792991,0.075597
