# Identification of Fake News using Ensemble Methods
### Jeffrey Lin Alex Te
#### Santa Clara University
#### COEN281 Term Project 

In [1]:
#libraries
import re

import nltk
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import numpy as np
import pandas as pd
from sklearn import tree
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score
import sklearn.linear_model as lm
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

ModuleNotFoundError: No module named 'sklearn'

### Importing Dataset
Here we will import a dataset taken from: https://www.kaggle.com/clmentbisaillon/fake-and-real-news-dataset

In [None]:
# Importing fake news dataset
# Reads "Fake.csv" (path relative to the working directory) into df_false.
df_false = pd.read_csv("Fake.csv")
# Preview the first rows to sanity-check the load.
df_false.head()

In [None]:
# Importing true news dataset
# Reads "True.csv" (path relative to the working directory) into df_true.
df_true = pd.read_csv("True.csv")
# Preview the first rows to sanity-check the load.
df_true.head()

### Preliminary Data Cleaning

In [None]:
# remove brackets since true dataset has random brackets with the time in it 
# remove entries with empty features
# remove publisher identification (reuters)
# tbd

# Drop the subject and date columns from both frames; nothing below uses them.
del df_true['subject']
del df_true['date']
del df_false['subject']
del df_false['date']


def _strip_true_article_markup(text):
    """Return `text` without its publisher prefix and bracketed tags.

    Everything up to and including the first ") - " is removed when that
    separator is present, and every "[<digits> <word>]" tag is replaced by a
    single space.
    """
    # partition() cuts at the FIRST separator only, so later occurrences of
    # ") - " inside the article no longer truncate the body.
    _, separator, body = text.partition(") - ")
    if separator:
        text = body
    return re.sub(r'\[\d* \w*\]', ' ', text)


# Assign the cleaned column back explicitly: rows yielded by iterrows() may be
# copies, so writing to them is not guaranteed to change df_true.
df_true['text'] = df_true['text'].apply(_strip_true_article_markup)

df_true.head()

### Combining True/False Datasets

In [None]:
# Create the target label for every article: 1 = true news, 0 = fake news.
df_true['category'] = 1
df_false['category'] = 0

In [None]:
# Concatenate the true and fake datasets into one frame.
# ignore_index=True renumbers the rows 0..n-1; without it each frame keeps its
# own 0-based labels and the combined index contains duplicates.
df = pd.concat([df_true, df_false], ignore_index=True)

In [None]:
# Display the combined frame (true articles first, then fake) for inspection.
df

In [None]:
# Dataset balance: number of articles per label (1 = true, 0 = fake).
df.category.value_counts()

### Splitting Training/Testing Dataset

In [None]:
# Split into training and testing sets here, before the NLP processing:
# doing it later caused a memory error.
y = df["category"]
X = df["text"]

# Hold out 30% of the articles for testing. The seed is fixed (matching the
# random_state=0 used by the models below) so the split, and therefore the
# reported scores, are reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0
)
# Bar chart of the label counts in the training set.
pd.Series(y_train).value_counts().plot.bar()

In [None]:
# Wrap the training text in a DataFrame so the cleaning steps below can
# reassign its "text" column.
df_xtrain = pd.DataFrame(X_train)
df_xtrain.head()

In [None]:
# Wrap the testing text in a DataFrame so the cleaning steps below can
# reassign its "text" column.
df_xtest = pd.DataFrame(X_test)
df_xtest.head()

### Data Analysis

In [None]:
#insert analysis here
# point of question
# first check whether the data is balanced (i.e. there are as many true articles as false ones)

from collections import Counter

num_true_articles = len(df_true.index)
num_false_articles = len(df_false.index)

# num_articles holds the size of the smaller dataset, so the same number of
# articles is examined on each side.
num_articles = min(num_true_articles, num_false_articles)


def _word_stats(texts):
    """Return (words-per-article, word totals) for a Series of article texts.

    The first dict maps "Article<index label>" to the number of
    whitespace-separated words in that article. The second maps each word to
    its total number of occurrences across all the given articles.
    """
    words_per_article = {}
    word_totals = Counter()
    for label, text in texts.items():
        tokens = text.split()
        words_per_article[f"Article{label}"] = len(tokens)
        # Accumulate across articles; assigning tokens.count(word) would keep
        # only the count from the last article containing the word.
        word_totals.update(tokens)
    return words_per_article, dict(word_totals)


# Count the words in the first num_articles rows of each dataset.
# NOTE(review): assumes both frames still carry the default 0..n-1 index
# produced by read_csv - confirm if the loading code changes.
true_dataset_num_words_per_article, true_dataset_words = _word_stats(
    df_true['text'].iloc[:num_articles]
)
false_dataset_num_words_per_article, false_dataset_words = _word_stats(
    df_false['text'].iloc[:num_articles]
)

print(f"Num words in each of the true dataset: {true_dataset_num_words_per_article}")
print(f"Words in the true dataset (across all {num_articles} articles): {true_dataset_words}")

print(f"Num words in each of the false dataset: {false_dataset_num_words_per_article}")
print(f"Words in the false dataset (across all {num_articles} articles): {false_dataset_words}")

### NLP Data Processing

In [None]:
# remove punctuation from text
def clean_punc(inputString):
    """Return `inputString` with punctuation removed or turned into spaces.

    The characters ? | ! ' # are deleted outright. The characters . , ) ( /
    and the backslash are each replaced by one space, so the words they
    separated stay apart.

    Args:
        inputString: the text to clean.

    Returns:
        The cleaned string. Runs of spaces are not collapsed.
    """
    # Inside a character class "|" is a literal pipe, not an alternation, so
    # the characters are simply listed once each.
    cleaned = re.sub(r"[?|!'#]", "", inputString)
    # "\\" matches a literal backslash; the previous "\|" only escaped the
    # pipe, which left backslashes in the text.
    return re.sub(r"[.,)(\\/]", " ", cleaned)

In [None]:
# Strip punctuation from every article in the training and testing frames.
# Applying the function to the "text" column directly yields the same values
# as the row-wise DataFrame.apply, without building a Series per row.
df_xtrain["text"] = df_xtrain["text"].apply(clean_punc)
df_xtest["text"] = df_xtest["text"].apply(clean_punc)

#df["text"] = df.apply(lambda row: clean_punc(row["text"]), axis=1)

In [None]:
# lowercase
def lower_case(inputString):
    """Return a lowercase copy of `inputString`."""
    lowered = inputString.lower()
    return lowered

In [None]:
# Lowercase every article in the training and testing frames.
df_xtrain["text"] = df_xtrain["text"].apply(lower_case)
df_xtest["text"] = df_xtest["text"].apply(lower_case)

#df["text"] = df.apply(lambda row: lower_case(row["text"]), axis=1)

In [None]:
# One-time setup: run the code below once to download the WordNet corpus.
# It is wrapped in a string literal so it does not execute with the rest of
# the notebook.
# NOTE(review): the snippet installs an unverified HTTPS context as the
# default, which disables certificate verification - confirm this is acceptable.
"""
import nltk
import ssl

try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

nltk.download('wordnet')
"""

In [None]:
# lemmatization groups words with the same base meaning together
# i.e. studies studying cries cry -> study studying cry cry
def lemmatization(inputString):
    """Return `inputString` with each whitespace-separated token lemmatized.

    Every token is passed through WordNetLemmatizer.lemmatize with its default
    arguments, and the results are joined back together with single spaces.
    """
    wnl = WordNetLemmatizer()
    return " ".join(wnl.lemmatize(token) for token in inputString.split())

# test
#print(lemmatization("studies studying cries cry"))

In [None]:
# apply lemmatization to text
# Lemmatize every article in the training and testing frames.
df_xtrain["text"] = df_xtrain["text"].apply(lemmatization)
df_xtest["text"] = df_xtest["text"].apply(lemmatization)

#df["text"] = df.apply(lambda row: lemmatization(row["text"]), axis=1)

In [None]:
# remove stop words that don't provide additional meaning to text
def stopword_removal(inputString):
    """Return `inputString` without NLTK's English stop words.

    The text is split on whitespace, tokens that exactly match an entry of
    stopwords.words('english') are dropped, and the remaining tokens are
    joined back together with single spaces.
    """
    # A set gives constant-time membership tests; checking every token against
    # the original list scanned the whole stop-word list each time.
    sw = set(stopwords.words('english'))
    return " ".join(word for word in inputString.split() if word not in sw)

In [None]:
# apply stopword removal to text
# Remove stop words from every article in the training and testing frames.
df_xtrain["text"] = df_xtrain["text"].apply(stopword_removal)
df_xtest["text"] = df_xtest["text"].apply(stopword_removal)

#df["text"] = df.apply(lambda row: stopword_removal(row["text"]), axis=1)

In [None]:
# Preview the training text after the cleaning steps above.
df_xtrain.head()

In [None]:
# Preview the testing text after the cleaning steps above.
df_xtest.head()

### Vectorizing Text

In [None]:
# tfidf vectorizer
# we must .fit() the vectorizer on the training dataset so that when we use .transform()
# the dimension of the resulting df is the same for train and test (it uses the word corpus of training set)
# output_train/test is the sparse matrix, df_xtrainvectorized/df_xtest_vectorized is just for display

tfidf = TfidfVectorizer()
# Learn the vocabulary and IDF weights from the training text only.
tfidf.fit(df_xtrain["text"])
output_train = tfidf.transform(df_xtrain["text"])
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# get_feature_names_out() when available and fall back on older releases.
feature_names = (
    tfidf.get_feature_names_out()
    if hasattr(tfidf, "get_feature_names_out")
    else tfidf.get_feature_names()
)
# NOTE: .toarray() materializes the whole matrix densely; this frame is only
# for display, the models train on the sparse output_train.
df_xtrainvectorized = pd.DataFrame(output_train.toarray(), columns=feature_names)
df_xtrainvectorized

In [None]:
# Vectorize the test text with the vocabulary fitted on the training set, so
# both matrices have the same columns.
output_test = tfidf.transform(df_xtest["text"])
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# get_feature_names_out() when available and fall back on older releases.
df_xtest_vectorized = pd.DataFrame(
    output_test.toarray(),
    columns=(
        tfidf.get_feature_names_out()
        if hasattr(tfidf, "get_feature_names_out")
        else tfidf.get_feature_names()
    ),
)
df_xtest_vectorized

In [None]:
# Note: the vocabulary contains a lot of garbage tokens.
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# get_feature_names_out() when available and fall back on older releases.
(
    tfidf.get_feature_names_out()
    if hasattr(tfidf, "get_feature_names_out")
    else tfidf.get_feature_names()
)

### Decision Tree

In [None]:
# Train on the TF-IDF matrix, one numeric row per training article, exactly as
# the Random Forest and XGBoost models below do. Wrapping the raw text Series
# in a list ([X_train]) produced a single sample of strings, which the tree
# cannot fit against the per-article labels.
train_data = output_train
labels = y_train
# Fixed seed for reproducibility, matching the other models.
clf = tree.DecisionTreeClassifier(random_state=0)
clf = clf.fit(train_data, labels)

#loop through the testing data here
# https://scikit-learn.org/stable/modules/tree.html
# i needs to be 1 single list and in the square bracket
# for i in ___
#   clf.predict([i])

### Random Forest

In [None]:
# Random forest model
# Fits 20 trees (fixed seed) on the TF-IDF training matrix against the 0/1 labels.
# NOTE(review): this is a regressor used for a binary classification task; its
# continuous predictions are rounded to 0/1 in the next cell. Consider whether
# RandomForestClassifier is the intended estimator.

rf_regr = RandomForestRegressor(n_estimators=20, random_state=0)
rf_regr.fit(output_train, y_train)

In [None]:
# Predict on the test matrix and round the regressor's continuous output to
# the nearest whole number so it can be compared with the 0/1 labels.
resultRF = rf_regr.predict(output_test).round(0)

In [None]:
# Display the confusion matrix for the Random Forest model: rows are the
# actual labels (y_test), columns the rounded predictions (resultRF), and
# margins=True adds row/column totals.
print('Confusion Matrix - Random Forest')
print(pd.crosstab(y_test, resultRF, rownames = ['True'], colnames = ['Predicted'], margins = True))

In [None]:
# Macro-averaged F1 of the Random Forest predictions on the test set.
print('F1 score - Random Forest:')
print(f1_score(y_test, resultRF, average='macro'))

### XGBoost 

In [None]:
# Fit the gradient boosting classifier (20 estimators, fixed seed) on the
# TF-IDF training matrix; prediction happens in the next cell.
xgb = XGBClassifier(n_estimators=20, random_state=0)
xgb.fit(output_train, y_train)

In [None]:
# Predict on the test matrix; the rounding mirrors the Random Forest cell.
resultXGB = xgb.predict(output_test).round(0)

In [None]:
# Display the confusion matrix for the XGBoost model: rows are the actual
# labels (y_test), columns the XGBoost predictions (resultXGB).
# The title and the tabulated predictions previously referred to the Random
# Forest results, so this cell repeated the Random Forest matrix.
print('Confusion Matrix - XGBClassifier')
print(pd.crosstab(y_test, resultXGB, rownames = ['True'], colnames = ['Predicted'], margins = True))

In [None]:
# Macro-averaged F1 of the XGBoost predictions on the test set.
print('F1 score - XGBClassifier:')
print(f1_score(y_test, resultXGB, average='macro'))