# Identification of Fake News using Ensemble Methods
### Jeffrey Lin Alex Te
#### Santa Clara University
#### COEN281 Term Project 

In [10]:
#libraries
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

import re

import nltk
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import sklearn.linear_model as lm
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBClassifier

ModuleNotFoundError: No module named 'sklearn'

### Dataset
Here we will import a dataset taken from: https://www.kaggle.com/clmentbisaillon/fake-and-real-news-dataset

In [13]:
# Load the fake-news articles from Fake.csv and preview the first five rows
df_false = pd.read_csv("Fake.csv")
df_false.head(n=5)

Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017"
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017"
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017"


In [12]:
# Load the true-news articles from True.csv and preview the first five rows
df_true = pd.read_csv("True.csv")
df_true.head(n=5)

Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017"
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017"
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017"


### Data Cleaning

In [14]:
# Clean-up applied before the two datasets are combined:
#   * drop the 'subject' and 'date' columns from both frames
#   * strip the leading publisher dateline from the true articles,
#     e.g. "WASHINGTON (Reuters) - The head of ..." -> "The head of ..."
#   * strip the bracketed "[<digits> <word>]" markers (times) that appear
#     in the true articles
# TODO: remove entries with empty features

_DATELINE_SEPARATOR = ") - "
# Raw string: '\[' and '\d' are invalid escapes in a normal string literal.
_BRACKET_STAMP = re.compile(r'\[\d* \w*\]')


def _strip_publisher_artifacts(text):
    """Return `text` without its dateline prefix and bracketed stamps.

    Non-string values (e.g. NaN for a missing article body) are returned
    unchanged so that one bad row does not abort the whole clean-up.
    """
    if not isinstance(text, str):
        return text
    if _DATELINE_SEPARATOR in text:
        # maxsplit=1 keeps the full body even when ") - " occurs again later;
        # an unbounded split()[1] silently dropped everything after it.
        text = text.split(_DATELINE_SEPARATOR, 1)[1]
    # Same effect as " ".join(re.split(...)): every stamp becomes one space.
    return _BRACKET_STAMP.sub(" ", text)


# drop(..., errors='ignore') keeps this cell re-runnable; `del` raised
# KeyError when the cell was executed a second time.
df_true = df_true.drop(columns=['subject', 'date'], errors='ignore')
df_false = df_false.drop(columns=['subject', 'date'], errors='ignore')

# Rows yielded by iterrows() are copies, so assigning to row['text'] never
# reached df_true. Write the cleaned column back explicitly instead.
df_true['text'] = df_true['text'].apply(_strip_publisher_artifacts)

df_true.head()

NameError: name 're' is not defined

### Combining the data

In [None]:
# Label each source frame before merging: 1 = true article, 0 = fake article
for frame, label in ((df_true, 1), (df_false, 0)):
    frame['category'] = label

In [None]:
# Stack the true and fake articles into a single frame.
# ignore_index=True gives the result a fresh 0..n-1 index; without it each
# half keeps the labels it got from read_csv, so the same label appears once
# in the true half and once in the fake half.
df = pd.concat([df_true, df_false], ignore_index=True)

In [None]:
# Inspect the combined dataset
df

In [None]:
# Class balance: number of rows carrying each category label
df["category"].value_counts()

### Data Analysis

In [None]:
# Exploratory analysis: article lengths and word frequencies per class.
# The same number of articles is taken from each class (the size of the
# smaller dataset) so that the two sets of statistics are comparable.


def _word_stats(frame, limit):
    """Collect word statistics for the first `limit` rows of `frame`.

    Returns a pair of dicts:
      * "Article<index label>" -> number of whitespace-separated words
      * word -> total number of occurrences across all inspected articles
    """
    words_per_article = {}
    word_totals = {}
    # head(limit) bounds the scan by position instead of comparing index
    # labels against the limit.
    for index, text in frame['text'].head(limit).items():
        tokens = text.split()
        words_per_article[f"Article{index}"] = len(tokens)
        for token in tokens:
            # Accumulate across articles. Assigning tokens.count(token) kept
            # only the count from the last article containing the word and
            # rescanned the article once per token.
            word_totals[token] = word_totals.get(token, 0) + 1
    return words_per_article, word_totals


num_true_articles = len(df_true.index)
num_false_articles = len(df_false.index)

# num_articles holds the size of the smaller of the two datasets
num_articles = min(num_true_articles, num_false_articles)

true_dataset_num_words_per_article, true_dataset_words = _word_stats(df_true, num_articles)
false_dataset_num_words_per_article, false_dataset_words = _word_stats(df_false, num_articles)

print(f"Num words in each of the true dataset: {true_dataset_num_words_per_article}")
print(f"Words in the true dataset (across all {num_articles} articles): {true_dataset_words}")

print(f"Num words in each of the false dataset: {false_dataset_num_words_per_article}")
print(f"Words in the false dataset (across all {num_articles} articles): {false_dataset_words}")


### NLP Data Preparation

In [None]:
# Inspect the text column of the combined dataset
df["text"]

In [None]:
def clean_punc(inputString):
    """Strip or blank out punctuation in `inputString`.

    The characters ? ! ' # and | are deleted outright; the characters
    . , ( ) and / are each replaced by a single space. All other
    characters are returned untouched.
    """
    # Single translation table: None deletes the character, " " blanks it.
    table = dict.fromkeys("?|!'#", None)
    table.update(dict.fromkeys(".,)(/", " "))
    return inputString.translate(str.maketrans(table))

In [None]:
# Map clean_punc over the text column directly. A row-wise
# DataFrame.apply(axis=1) builds a Series for every row just to read one field.
df["text"] = df["text"].apply(clean_punc)

In [5]:
def lower_case(inputString):
    """Return a lower-cased copy of `inputString`."""
    lowered = inputString.lower()
    return lowered

In [6]:
# Map lower_case over the text column directly. A row-wise
# DataFrame.apply(axis=1) builds a Series for every row just to read one field.
df["text"] = df["text"].apply(lower_case)

NameError: name 'df' is not defined

In [None]:
# Inspect the text column again
df["text"]

In [None]:
# One-time download of the NLTK 'wordnet' resource.
# The snippet is kept inside a string literal so it does not run on every
# execution; run its contents once when setting up a new environment.
# The ssl lines in the snippet replace the default HTTPS context with an
# unverified one (certificate checks are skipped) when that attribute exists.
# NOTE(review): stopwords.words('english') is also called in this notebook;
# presumably the 'stopwords' resource needs the same one-time download. Confirm.
"""
import nltk
import ssl

try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

nltk.download('wordnet')
"""

In [None]:
def lemmatization(inputString):
    """Lemmatize every whitespace-separated token of `inputString`.

    Each token is passed to WordNetLemmatizer.lemmatize with its default
    arguments, and the results are re-joined with single spaces.
    """
    wnl = WordNetLemmatizer()
    return " ".join(wnl.lemmatize(token) for token in inputString.split())

In [None]:
# Map lemmatization over the text column directly. A row-wise
# DataFrame.apply(axis=1) builds a Series for every row just to read one field.
df["text"] = df["text"].apply(lemmatization)

In [None]:
# Stop-word sets already loaded, keyed by language name. Filled lazily so the
# word list is read once instead of once per article.
_STOPWORDS_BY_LANGUAGE = {}


def stopword_removal(inputString):
    """Remove English stop words from `inputString`.

    The input is split on whitespace, every token that exactly matches an
    entry of stopwords.words('english') is dropped (the comparison is
    case-sensitive), and the remaining tokens are re-joined with single
    spaces.
    """
    sw = _STOPWORDS_BY_LANGUAGE.get('english')
    if sw is None:
        # frozenset gives constant-time membership tests; the list returned
        # by stopwords.words() was scanned linearly for every token.
        sw = frozenset(stopwords.words('english'))
        _STOPWORDS_BY_LANGUAGE['english'] = sw
    return " ".join(word for word in inputString.split() if word not in sw)

In [None]:
# Map stopword_removal over the text column directly. A row-wise
# DataFrame.apply(axis=1) builds a Series for every row just to read one field.
df["text"] = df["text"].apply(stopword_removal)

In [None]:
# Sanity check: print what lemmatization() returns for a few inflected forms
print(lemmatization("studies studying cries cry"))

In [None]:
# Inspect the text column once more
df["text"]

In [None]:
#insert word2vec conversion