In [1]:
import pandas as pd
import numpy as np
import gensim
import itertools
import nltk
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import stopwords

nltk.download('wordnet')
nltk.download('stopwords')

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\izbaa\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\izbaa\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
#save data in dataframe
df = pd.read_csv('dataframe.csv')

In [3]:
#initialize the tokenizer
tokenizer = nltk.RegexpTokenizer(r"\w+")

#initialize the lemmatizer
lemmatizer = WordNetLemmatizer()

#get all the English stopwords
stopwords = stopwords.words('english')

In [4]:
#function to lemmatize and remove stopwords
def preprocess_text(text):
    # Check if text is a string or can be converted to a string
    if not isinstance(text, str):
        text = str(text)
    
    # Make tokens of everything in the text
    tokens = tokenizer.tokenize(text)
    
    # Make them all lowercase
    lower = [text.lower() for text in tokens]
    
    # Lemmatize all the tokens and store them in a list
    lemmatized = [lemmatizer.lemmatize(text) for text in lower]
    
    # Get all the words which aren't stopwords in a list
    words = [text for text in lemmatized if text not in stopwords]
    
    #convert back to text
    sentence = ' '.join(words)
    
    return sentence

In [5]:
df['Text'] = df['Text'].apply(preprocess_text)
df.head()

Unnamed: 0,Text,Label
0,scary leaked email prof radical billionaire do...,fake
1,watch assad destroys u reporter michael isikof...,fake
2,uk counter terrorism police charge 14 year old...,real
3,internet drag trump son saying better patriot ...,fake
4,charles koch ha sads think influence election ...,fake


In [6]:
#Split the dataset
train_df , test_df=train_test_split(df, test_size=0.2, random_state=7)

In [7]:
#tokenizes each word
tokenized_text = [text.split() for text in df['Text']]
#creates a new word2vec model with size of 100
word2vec_model = gensim.models.Word2Vec(tokenized_text, vector_size=1000, window=10, min_count=10)

In [8]:
#function to get average word vector representation
#takes as input the list, pretrained model , unique words in models vocab and dimensionality of vectors
def average_word_vectors(words, model, vocabulary, num_features):
    #creates a vector with 0s of the lenght of numfeatures
    feature_vector = np.zeros((num_features,), dtype="float64")
    #nwords keep track of words
    nwords = 0.
    #iterate over list
    for word in words:
        #if word is in model, increase nword
        if word in vocabulary:
            nwords += 1
            #add the word to the vector with 0s
            feature_vector = np.add(feature_vector, model.wv[word])
    #check if nwords is now greater than 0 and if so calculate average
    if nwords > 0:
        feature_vector = np.divide(feature_vector, nwords)
    #return the vector
    return feature_vector

In [9]:
# Extract features for training data
#empty array to store
train_data = []
#iterates over train df
for _, row in train_df.iterrows():
    #apply the function of text column and adds it to the array
    features= average_word_vectors(row['Text'].split(), word2vec_model, word2vec_model.wv.key_to_index, 1000)
    train_data.append(features)
#make a new array with labels of train_df
train_labels = train_df['Label']

In [10]:
# Repeat process for test data
#empty array to store
test_data = []
#iterates over train df
for _, row in test_df.iterrows():
    #apply the function of text column and adds it to the array
    features= average_word_vectors(row['Text'].split(), word2vec_model, word2vec_model.wv.key_to_index, 1000)
    test_data.append(features)
#make a new array with labels of test_df
test_labels = test_df['Label']

In [11]:
#To ensure there are no negative values
train_data = np.clip(train_data, 0, None)
test_data = np.clip(test_data, 0, None)

In [12]:
# Initialize a LogisticRegression classifier
lr=LogisticRegression()
lr.fit(train_data, train_labels)

#Predict on the test set and calculate accuracy
lrpred=lr.predict(test_data)
score=accuracy_score(test_labels,lrpred)
print(f'Accuracy: {round(score*100,2)}%')

Accuracy: 94.98%


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [13]:
confusion_matrix(test_labels,lrpred, labels=['fake','real'])

array([[5072,  249],
       [ 286, 5051]], dtype=int64)

In [14]:
# Initialize a multinomial naive bayes classifier
nb=MultinomialNB()
nb.fit(train_data, train_labels)

#Predict on the test set and calculate accuracy
nbpred=nb.predict(test_data)
score=accuracy_score(test_labels,nbpred)
print(f'Accuracy: {round(score*100,2)}%')

Accuracy: 85.23%


In [15]:
confusion_matrix(test_labels,nbpred, labels=['fake','real'])

array([[4506,  815],
       [ 759, 4578]], dtype=int64)

In [16]:
#Initialize a PassiveAggressiveClassifier
pac=PassiveAggressiveClassifier()
pac.fit(train_data, train_labels)

#Predict on the test set and calculate accuracy
pcpred=pac.predict(test_data)
score=accuracy_score(test_labels,pcpred)
print(f'Accuracy: {round(score*100,2)}%')

Accuracy: 93.78%


In [17]:
#Build confusion matrix
confusion_matrix(test_labels,pcpred, labels=['fake','real'])

array([[5132,  189],
       [ 474, 4863]], dtype=int64)

<html>
    <body>
        <p> As it can be seen, we get the highest accuracy using logistic regression so we export that. </p>
    </body>
</html>

In [18]:
import pickle

In [19]:
# Save the trained classifier and word2vec model to a pickle file
with open('../website/pickleFiles/word2vec.pickle', 'wb') as f:
    pickle.dump(lr, f)
    pickle.dump(word2vec_model, f)