In [1]:
#import libraries
import pandas as pd
import nltk
import numpy as np
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import stopwords

nltk.download('wordnet')
nltk.download('stopwords')

import itertools
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\izbaa\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\izbaa\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
#save data in dataframe
df = pd.read_csv('dataframe.csv')
df.head()

Unnamed: 0,Text,Label
0,SCARY! LEAKED EMAIL PROVES Radical Billionaire...,fake
1,Watch as Assad Destroys US Reporter Michael Is...,fake
2,UK counter-terrorism police charge 14-year-old...,real
3,The Internet Drags Trump’s Son For Saying ‘Th...,fake
4,Charles Koch Has The Sads Because He Thinks H...,fake


In [3]:
#initialize the tokenizer
tokenizer = nltk.RegexpTokenizer(r"\w+")

#initialize the lemmatizer
lemmatizer = WordNetLemmatizer()

#get all the English stopwords
stopwords = stopwords.words('english')

In [4]:
#function to lemmatize and remove stopwords
def preprocess_text(text):
    # Check if text is a string or can be converted to a string
    if not isinstance(text, str):
        text = str(text)
    
    # Make tokens of everything in the text
    tokens = tokenizer.tokenize(text)
    
    # Make them all lowercase
    lower = [text.lower() for text in tokens]
    
    # Lemmatize all the tokens and store them in a list
    lemmatized = [lemmatizer.lemmatize(text) for text in lower]
    
    # Get all the words which aren't stopwords in a list
    words = [text for text in lemmatized if text not in stopwords]
    
    #convert back to text
    sentence = ' '.join(words)
    
    return sentence
    

In [5]:
text = df['Text'].apply(preprocess_text)
text.head()

0    scary leaked email prof radical billionaire do...
1    watch assad destroys u reporter michael isikof...
2    uk counter terrorism police charge 14 year old...
3    internet drag trump son saying better patriot ...
4    charles koch ha sads think influence election ...
Name: Text, dtype: object

In [6]:
#Get the labels
labels=df.Label
labels.head()

0    fake
1    fake
2    real
3    fake
4    fake
Name: Label, dtype: object

In [7]:
#Split the dataset
x_train,x_test,y_train,y_test=train_test_split(text, labels, test_size=0.2, random_state=7)
x_train = x_train.fillna('')  # Replace NaN values with empty strings
x_test = x_test.fillna('')  # Replace NaN values with empty strings

In [8]:
#Initialize a TfidfVectorizer
tfidf_vectorizer=TfidfVectorizer(stop_words='english', max_df=0.9, min_df=0.1)

#Fit and transform train set, transform test set
tfidf_train=tfidf_vectorizer.fit_transform(x_train) 
tfidf_test=tfidf_vectorizer.transform(x_test)

In [9]:
# Initialize a LogisticRegression classifier
lr=LogisticRegression()

# Train the classifier on the training data
lr.fit(tfidf_train, y_train)

# Predict on the test set and calculate accuracy
lrpred=lr.predict(tfidf_test)
score=accuracy_score(y_test,lrpred)
print(f'Accuracy: {round(score*100,2)}%')

Accuracy: 93.21%


In [10]:
confusion_matrix(y_test,lrpred, labels=['fake','real'])

array([[5035,  286],
       [ 438, 4899]], dtype=int64)

In [11]:
# Initialize a multinomial naive bayes classifier
nb=MultinomialNB()

# Train the classifier on the training data
nb.fit(tfidf_train, y_train)

# Predict on the test set and calculate accuracy
nbpred=nb.predict(tfidf_test)
score=accuracy_score(y_test,nbpred)
print(f'Accuracy: {round(score*100,2)}%')

Accuracy: 87.69%


In [12]:
confusion_matrix(y_test,nbpred, labels=['fake','real'])

array([[4635,  686],
       [ 626, 4711]], dtype=int64)

In [13]:
#Initialize a PassiveAggressiveClassifier
pac=PassiveAggressiveClassifier(max_iter=50)
pac.fit(tfidf_train,y_train)

#Predict on the test set and calculate accuracy
pcpred=pac.predict(tfidf_test)
score=accuracy_score(y_test,pcpred)
print(f'Accuracy: {round(score*100,2)}%')

Accuracy: 92.17%


In [14]:
#Build confusion matrix
confusion_matrix(y_test,pcpred, labels=['fake','real'])

array([[5120,  201],
       [ 633, 4704]], dtype=int64)

<html>
    <body>
        <p> As it can be seen, we get the highest accuracy using passive aggressive classifier so we export that. </p>
    </body>
</html>

In [15]:
import pickle

In [16]:
# Save the trained classifier and tfidf vector to a pickle file
with open('../website/pickleFiles/tfidf.pickle', 'wb') as f:
    pickle.dump(tfidf_vectorizer, f)
    pickle.dump(lr, f)
    