In [1]:
#import libraries
import pandas as pd
import nltk
import numpy as np
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import stopwords

nltk.download('wordnet')
nltk.download('stopwords')

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\izbaa\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\izbaa\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
#save data in dataframe
df = pd.read_csv('dataframe.csv')
df.head()

Unnamed: 0,Text,Label
0,SCARY! LEAKED EMAIL PROVES Radical Billionaire...,fake
1,Watch as Assad Destroys US Reporter Michael Is...,fake
2,UK counter-terrorism police charge 14-year-old...,real
3,The Internet Drags Trump’s Son For Saying ‘Th...,fake
4,Charles Koch Has The Sads Because He Thinks H...,fake


In [3]:
#initialize the tokenizer
tokenizer = nltk.RegexpTokenizer(r"\w+")

#initialize the lemmatizer
lemmatizer = WordNetLemmatizer()

#get all the English stopwords
stopwords = stopwords.words('english')

In [4]:
#function which will first tokenize all the words in the Body field. 
#Then these words will be lemmatized to obtain the root words for example dance and dancing will be the same. 
#After that it will remove the stop words such as 'the' to only keep important words which are useful for the analysis. 
#Lastly, this function will return these words in a list.
def preprocess_text(text):
    # Check if text is a string or can be converted to a string
    if not isinstance(text, str):
        text = str(text)
    
    # Make tokens of everything in the text
    tokens = tokenizer.tokenize(text)
    
    # Make them all lowercase
    lower = [text.lower() for text in tokens]
    
    # Lemmatize all the tokens and store them in a list
    lemmatized = [lemmatizer.lemmatize(text) for text in lower]
    
    # Get all the words which aren't stopwords in a list
    words = [text for text in lemmatized if text not in stopwords]
    
    return words


In [5]:
#split 80:20 for train and test
#find 80th percent of the length of the dataframe
split = int(len(df) * 0.8)

#split everything before that into train and the rest into test
train_df, test_df = df[:split], df[split:]

In [6]:
#Create dictionary which stores each token and its count
# Empty dictionary for storing the tokens
token_count = {}

# Tokenize the combined text
for text in train_df['Text']:
    tokens = preprocess_text(text)
    
    for token in tokens:
        if token in token_count:
            token_count[token] += 1
        else:
            token_count[token] = 1

# Print the count of unique tokens
print(len(token_count))

125962


In [7]:
#check max and min value
print(max(token_count.values()))

140366


In [8]:
print(min(token_count.values()))

1


In [9]:
#function which returns true if input word is appearing between the threshold
def common_tokens(token, lowthresh, highthresh):
    if token not in token_count:
        return False
    else:
        tokens = token_count[token] > lowthresh and token_count[token] < highthresh
        return tokens

In [10]:
#set to store all tokens apperaing inside threshold
common = set()

#loops through the token count dictionary 
for token in token_count:
    #if the value if greater than 3000 and less than 100000 adds it 
    if common_tokens(token,3000, 100000):
        common.add(token)
        
print(common)

{'victory', 'thing', 'texas', 'much', 'international', 'plan', 'capital', 'judge', 'record', 'donald', 'everyone', 'ryan', 'crisis', 'intelligence', 'given', 'community', 'known', 'others', 'service', 'message', 'army', 'militant', 'shooting', 'detail', 'special', 'wall', 'wrong', 'play', 'adviser', 'photo', 'went', 'interest', 'seems', 'opposition', 'example', 'speech', 'first', 'county', '4', 'term', 'n', 'center', 'muslim', 'facebook', 'group', 'point', 'committee', 'may', 'either', 'key', 'three', 'primary', 'november', 'simply', 'meeting', 'presidential', 'making', 'six', 'healthcare', 'convention', 'found', 'despite', 'expected', 'made', 'post', 'sander', 'well', 'even', 'view', 'al', 'fact', 'able', 'set', 'already', 'democratic', 'mexico', 'statement', 'energy', 'account', 'book', 'protest', 'secretary', 'weapon', 'hit', 'reported', '2013', 'obama', 'together', 'better', 'hand', 'work', 'increase', 'today', 'east', 'half', 'europe', 'never', 'failed', 'several', 'potential', 'i

In [11]:
#The words are then added to a list and arranged and given an index of most to least appeared
#convert to list
common = list(common)

#map them
mapping = {t:i for t,i in zip(common, range(len(common)))}
print(mapping)

{'victory': 0, 'thing': 1, 'texas': 2, 'much': 3, 'international': 4, 'plan': 5, 'capital': 6, 'judge': 7, 'record': 8, 'donald': 9, 'everyone': 10, 'ryan': 11, 'crisis': 12, 'intelligence': 13, 'given': 14, 'community': 15, 'known': 16, 'others': 17, 'service': 18, 'message': 19, 'army': 20, 'militant': 21, 'shooting': 22, 'detail': 23, 'special': 24, 'wall': 25, 'wrong': 26, 'play': 27, 'adviser': 28, 'photo': 29, 'went': 30, 'interest': 31, 'seems': 32, 'opposition': 33, 'example': 34, 'speech': 35, 'first': 36, 'county': 37, '4': 38, 'term': 39, 'n': 40, 'center': 41, 'muslim': 42, 'facebook': 43, 'group': 44, 'point': 45, 'committee': 46, 'may': 47, 'either': 48, 'key': 49, 'three': 50, 'primary': 51, 'november': 52, 'simply': 53, 'meeting': 54, 'presidential': 55, 'making': 56, 'six': 57, 'healthcare': 58, 'convention': 59, 'found': 60, 'despite': 61, 'expected': 62, 'made': 63, 'post': 64, 'sander': 65, 'well': 66, 'even': 67, 'view': 68, 'al': 69, 'fact': 70, 'able': 71, 'set':

<html>
    <body>
        <p> Now make a bag of words. Bag of words include the words and how many times it appears in a text. For example, in the text: "this year trump won the elections at this time of the year." : </p>
        <p> ["trump" , "year" , "time"] <br>[  1  ,   2   ,   1  ]
        </p>
        <p> We will make a function to create the bag of words now. </p>
    </body>
</html>

In [12]:
#make bag of words
def bag_of_words(text):
    
    #make an empty vector of the length of common list
    count_vector = np.zeros(len(common))
    
    #get the tokens list and store them
    proc_tokens = preprocess_text(text)
    
    #count them and add them to the vector
    for token in proc_tokens:
        #if not in common, ignore the tokens 
        if token not in common:
            continue
        #count the values
        index = mapping[token]
        count_vector[index] += 1
        
    return count_vector

In [13]:
#function which get the df and get the y value which is the label value. 
#It will also iterate over the text and apply the bag of words function to each text and return it as x. 
def df_xy(dataframe):
    y = dataframe['Label']
    
    text = dataframe['Text']
    vectors = []
    
    for t in text:
        vector = bag_of_words(t)
        vectors.append(vector)
        
    x = np.array(vectors).astype(int)
    
    return x,y

In [14]:
#This function will then be applied on train and test data.
x_train, y_train = df_xy(train_df)
x_test, y_test = df_xy(test_df)


x_train.shape , y_train.shape , x_test.shape , y_test.shape

((42632, 774), (42632,), (10658, 774), (10658,))

<html>
    <body>
        <p> We will now get the accuracy using the three evalualtion methods and export the one with the highest accuracy. </p>
    </body>
</html>

In [15]:
lr = LogisticRegression().fit(x_train , y_train)
lrpred = lr.predict(x_test)
lrscore =accuracy_score(y_test,lrpred)
print(f'Accuracy: {round(lrscore*100,2)}%')

Accuracy: 95.04%


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [16]:
confusion_matrix(y_test,lrpred, labels=['fake','real'])

array([[5165,  208],
       [ 321, 4964]], dtype=int64)

In [17]:
NB = MultinomialNB()
NB.fit(x_train, y_train)

nbpred = NB.predict(x_test)
nbscore=accuracy_score(y_test,nbpred)
print(f'Accuracy: {round(nbscore*100,2)}%')

Accuracy: 88.81%


In [18]:
confusion_matrix(y_test,nbpred, labels=['fake','real'])

array([[4794,  579],
       [ 614, 4671]], dtype=int64)

In [19]:
#Initialize a PassiveAggressiveClassifier
pac=PassiveAggressiveClassifier(max_iter=50)
pac.fit(x_train,y_train)

#Predict on the test set and calculate accuracy
pacpred=pac.predict(x_test)
pacscore=accuracy_score(y_test,pacpred)
print(f'Accuracy: {round(pacscore*100,2)}%')

Accuracy: 91.96%


In [20]:
confusion_matrix(y_test,pacpred, labels=['fake','real'])

array([[4789,  584],
       [ 273, 5012]], dtype=int64)

<html>
    <body>
        <p> As it can be seen, we get the highest accuracy using logistic regression so we export that. </p>
    </body>
</html>

In [21]:
import pickle

In [22]:
# Save the trained classifier and mapping dictionary to a pickle file
with open('../bagOfWords.pickle', 'wb') as f:
    pickle.dump(lr, f)
    pickle.dump(mapping, f)