In [3]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import KFold
import nltk
import numpy as np
import matplotlib.pyplot as plt

In [4]:
# Read the dataset CSV into a DataFrame.
NEWS_CSV_PATH = "/Users/mohammadanas/Desktop/Duke MIDS/Fall 2021/NLP/NLP Final Project/data/Fake_news_data.csv"
news_train = pd.read_csv(NEWS_CSV_PATH)


In [9]:
# Exploratory check on a copy of the data: apply the same normalisation that is
# used later for modelling (lower-case, keep only ASCII letters and whitespace,
# split on whitespace) and collect the ids of the entries whose text has no
# tokens left afterwards. The last line displays those entries.

news_train_eda = news_train.copy()
news_train_eda['text'] = news_train_eda['text'].str.lower()
# Raw string: "\s" inside a normal string literal is an invalid escape sequence.
news_train_eda['text'] = news_train_eda['text'].replace(r'[^A-Za-z\s]', '', regex=True)
news_train_eda['text'] = news_train_eda['text'].str.split()
ids = list(news_train_eda.loc[news_train_eda['text'].str.len() == 0, 'id'])
# Build the mask from the frame it is applied to (the copy), not from news_train.
news_train_eda.loc[news_train_eda['id'].isin(ids)]

Unnamed: 0,id,title,author,text,label
82,82,Huma’s Weiner Dogs Hillary,Steve Sailer,[],1
169,169,Mohamad Khweis: Another “Virginia Man” (Palest...,James Fulford,[],1
295,295,A Connecticut Reader Reports Record Voter Regi...,VDARE.com Reader,[],1
470,470,BULLETIN: There ARE Righteous Jews For Trump!;...,admin,[],1
492,492,Казахстан на страже ядерной безопасности | Нов...,,[],1
...,...,...,...,...,...
20141,20141,Thomas Frank Explores Whether Hillary Clinton ...,,[],1
20242,20242,Radio Derb Transcript For October 21 Up: The M...,John Derbyshire,[],1
20264,20264,Pro-sovereignty Legislators Demand That Admini...,Brenda Walker,[],1
20513,20513,SAID IN SPANISH: A Mexican Governor Meddles In...,Allan Wall,[],1


In [10]:
# The `text` column shown above comes from the pre-processed copy, so it only
# shows empty token lists. Look the same ids up in the original frame to see
# what the raw entries look like: the text is either blank or not written in
# English.
flagged_rows = news_train['id'].isin(ids)
news_train.loc[flagged_rows]

Unnamed: 0,id,title,author,text,label
82,82,Huma’s Weiner Dogs Hillary,Steve Sailer,,1
169,169,Mohamad Khweis: Another “Virginia Man” (Palest...,James Fulford,,1
295,295,A Connecticut Reader Reports Record Voter Regi...,VDARE.com Reader,,1
470,470,BULLETIN: There ARE Righteous Jews For Trump!;...,admin,,1
492,492,Казахстан на страже ядерной безопасности | Нов...,,В ноябре 2016 г. Мажилис Парламента Республики...,1
...,...,...,...,...,...
20141,20141,Thomas Frank Explores Whether Hillary Clinton ...,,,1
20242,20242,Radio Derb Transcript For October 21 Up: The M...,John Derbyshire,,1
20264,20264,Pro-sovereignty Legislators Demand That Admini...,Brenda Walker,,1
20513,20513,SAID IN SPANISH: A Mexican Governor Meddles In...,Allan Wall,,1


In [11]:
# To resolve this, use the title as the text for the flagged entries.
needs_title_as_text = news_train['id'].isin(ids)
news_train.loc[needs_title_as_text, 'text'] = news_train.loc[needs_title_as_text, 'title']

In [12]:
# Prepare the modelling frame: drop incomplete rows, split off the labels,
# index by article id and tokenise the text.
# TODO (note from the original author): move the dropna above the two previous cells.
news_train = news_train.dropna()  # no `subset` given, so a missing value in any column drops the row
label = news_train["label"]  # taken after dropna, so labels stay aligned with the remaining rows
news_train = news_train.drop("label", axis=1).set_index("id")

# Normalise the text: lower-case it for uniformity, remove everything that is
# not an ASCII letter or whitespace, then split on whitespace into a token list.
# The pattern is a raw string because "\s" in a normal literal is an invalid
# escape sequence.
news_train["text"] = (
    news_train["text"]
    .str.lower()
    .replace(r"[^A-Za-z\s]", "", regex=True)
    .str.split()
)


In [13]:
# Set-up for the clean-up helper defined in this cell: fetch the NLTK
# stop-word corpus and create the Porter stemmer. The helper removes stop
# words and stems (not lemmatizes) the remaining tokens; it takes a list of
# tokens and returns a sentence string.

nltk.download("stopwords")
ps = PorterStemmer()


def remove_stopwords_and_stem(x):
    """Drop English stop words from a token list, stem the rest and join them.

    Parameters
    ----------
    x : list of str
        Tokens of one document. The list itself is not modified.

    Returns
    -------
    str
        The stemmed non-stop-word tokens joined by single spaces (an empty
        string when no token is left).
    """
    # A set gives constant-time membership tests instead of a list scan per token.
    stopwds = set(stopwords.words("english"))
    # Build a new list instead of calling x.remove() while iterating over x:
    # removing during iteration makes the loop skip the element that follows
    # each removed one, so the second of two consecutive stop words survived.
    stems = [ps.stem(token) for token in x if token not in stopwds]
    return " ".join(stems)


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/mohammadanas/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [14]:
# Run the clean-up helper on every entry of the text column and store the
# resulting sentence strings back in that column.
cleaned_text = news_train['text'].apply(remove_stopwords_and_stem)
news_train['text'] = cleaned_text

In [15]:
# Keep the text column as a separate Series.
text_comments = news_train.loc[:, 'text']

In [16]:
# Bag-of-words model restricted to the most frequently occurring unigrams and
# bigrams (at most 10,000 features). Capping the vocabulary is meant to avoid
# the curse of dimensionality.
cv = CountVectorizer(ngram_range=(1, 2), max_features=10000).fit(text_comments)
BOG = cv.transform(text_comments).toarray()  # bag of words as a separate dense array
label = np.array(label)  # labels as a separate array

In [13]:
# We use the generative model built in Step 2 to create synthetic data.
# In our case this is a Multinomial Naive Bayes model (with two classes, as in
# our dataset, this behaves like a binomial Naive Bayes).
# We have already identified and tuned the hyperparameter of this model; building
# the model is described in "Training Naive Bayes.ipynb". For generating synthetic
# data, however, we train it on the whole dataset rather than on the training data
# only. This lets the model capture the distribution of the sentences better, so we
# can generate better synthetic data. Note that we simply reuse the hyperparameter
# value from that notebook rather than tuning it separately. For our model the
# hyperparameter is alpha, the Laplace smoothing constant.

In [17]:
# Fit the Naive Bayes model on the full bag-of-words matrix.
NB_ALPHA = 0.1  # smoothing constant passed to MultinomialNB
x_train = BOG

Model = MultinomialNB(alpha=NB_ALPHA)
Model.fit(x_train, label)

MultinomialNB(alpha=0.1)

In [18]:
# Quantities used for sampling synthetic documents from the fitted model.
words = cv.get_feature_names_out()  # feature terms behind the bag-of-words columns
doc_lengths = x_train.sum(axis=1)  # row sums: total feature count of each document
log_prob = Model.feature_log_prob_  # log-probability of each feature given a class label, log P(x|y)
prob = np.exp(log_prob)  # exponentiate the log-probabilities to get probabilities
class_prob = np.exp(Model.class_log_prior_)  # class probabilities from the fitted model's prior

In [23]:
# Sanity check: class probabilities, the order of the class labels, and the
# shape of the pre-processed frame.
print(class_prob, Model.classes_, sep="\n")
news_train.shape

[0.56663932 0.43336068]
[0 1]


(18285, 3)

In [18]:
# We use the quantities computed above to generate the synthetic data.

In [21]:
# Generates the words of a document from its bag-of-words vector, i.e. the
# row vector of counts that corresponds to one document.
def sentence(words, rep_vector, change_sequence=False):
    """Turn one bag-of-words count vector back into a space-separated string.

    Parameters
    ----------
    words : sequence of str
        Feature terms; ``words[i]`` belongs to column ``i`` of ``rep_vector``.
    rep_vector : 2-D array of int
        Count vector whose first row holds the counts (read as
        ``rep_vector[0, i]``).
    change_sequence : bool, default False
        If True, the terms are shuffled with NumPy's global random state;
        otherwise they are emitted in the order of ``words``. The order does
        not matter to a bag-of-words model; shuffling is purely cosmetic.

    Returns
    -------
    str
        Every term repeated as many times as its count, joined by single
        spaces (an empty string when all counts are zero).

    Raises
    ------
    AssertionError
        If ``change_sequence`` is not a bool.
    """
    assert isinstance(
        change_sequence, bool
    ), "Please enter a Boolean for change_sequence"
    sentence_list = []
    for i in range(len(words)):
        rep = int(rep_vector[0, i])
        # Append the whole term `rep` times. The previous implementation
        # sliced the repeated string into two-character pieces, which kept
        # only the first two letters of every term occurring more than once.
        sentence_list.extend([words[i]] * rep)
    if change_sequence:
        sen_list = np.array(sentence_list)
        np.random.shuffle(sen_list)
        return " ".join(sen_list)
    return " ".join(sentence_list)

In [22]:
# Draw the class label of every synthetic document at random, based on the
# class probabilities: each label is 1 with probability class_prob[1].
SEED = 42  # fixed seed so that this draw and the sampling that follows are reproducible
N_SYNTHETIC = 2000  # number of synthetic labels to create
np.random.seed(SEED)
synthetic_labels = np.random.binomial(1, class_prob[1], N_SYNTHETIC)

In [23]:
# Sample one synthetic document per synthetic label.
n_docs = len(synthetic_labels)
# Placeholder for the synthetic bag of words. Its shape is derived from the
# labels and the feature-probability matrix instead of hard-coded numbers, so
# it still fits when the sample size or the vocabulary size changes.
synthetic_BOG = np.zeros([n_docs, prob.shape[1]])
# Sentences are collected in a list and converted once at the end; calling
# np.append inside the loop copies the whole array on every iteration.
generated_sentences = []
for i in range(n_docs):
    current_label = synthetic_labels[i]  # class label of this document
    # Random length, drawn from the document lengths that occur in the original data.
    length_of_doc = np.random.choice(doc_lengths)
    # Multinomial draw using the feature probabilities of the class: a count
    # vector that gives how often each feature term occurs in this document.
    rep_vec = np.random.multinomial(length_of_doc, prob[current_label], size=1)

    synthetic_BOG[i] = rep_vec  # store the count vector as row i
    # Turn the count vector into a sentence (terms in shuffled order).
    generated_sentences.append(sentence(words, rep_vec, change_sequence=True))
synthetic_sentences = np.array(generated_sentences)


In [2]:
# The synthetic bag-of-words matrix and the synthetic sentences array hold the
# same sentences in two different representations. However, we only use the
# sentences data to test the models that we create in Step 2 and Step 3.

In [41]:
# Build the DataFrame of synthetic sentences together with their labels.
synthetic_Data_Sentences = pd.DataFrame(
    synthetic_sentences, columns=['sentences']
).assign(labels=pd.Series(synthetic_labels))

In [56]:
# Export the synthetic sentences to CSV so they can be tested on other models.
SYNTHETIC_CSV_PATH = "/Users/mohammadanas/Desktop/Duke MIDS/Fall 2021/NLP/NLP Final Project/Synthetic_Sentences.csv"
synthetic_Data_Sentences.to_csv(SYNTHETIC_CSV_PATH, encoding="utf-8")