# Identification of Fake New using Ensemble Methods
### Jeffrey Lin Alex Te
#### Santa Clara University
#### COEN281 Term Project 

In [353]:
#libraries
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

import re

import nltk
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import sklearn.linear_model as lm
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBClassifier

ModuleNotFoundError: No module named 'sklearn'

### Dataset
Here we will inport a dataset taken from: https://www.kaggle.com/clmentbisaillon/fake-and-real-news-dataset

In [354]:
# Importing fake news dataset
df_false = pd.read_csv("Fake.csv")
df_false.head()

Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017"
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017"
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017"


In [355]:
# Importing true news dataset
df_true = pd.read_csv("True.csv")
df_true.head()

Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017"
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017"
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017"


### Data Clean

In [356]:
# remove brackets since true dataset has random brackets with the time in it 
# remove entries with empty features
# remove publisher identification (reuters)
# tbd

del df_true['subject']
del df_true['date']
del df_false['subject']
del df_false['date']

for index, row in df_true.iterrows():
    row_text = row['text']
    if ") - " in row_text:
        row_text = row_text.split(") - ")[1]
        
    if bool(re.search('\[\d* \w*\]', row_text)):
        row_text = " ".join(re.split('\[\d* \w*\]', row_text))
        
    row['text'] = row_text

df_true.head()

Unnamed: 0,title,text
0,"As U.S. budget fight looms, Republicans flip t...",The head of a conservative Republican faction ...
1,U.S. military to accept transgender recruits o...,Transgender people will be allowed for the fir...
2,Senior U.S. Republican senator: 'Let Mr. Muell...,The special counsel investigation of links bet...
3,FBI Russia probe helped by Australian diplomat...,Trump campaign adviser George Papadopoulos tol...
4,Trump wants Postal Service to charge 'much mor...,President Donald Trump called on the U.S. Post...


### Combining the data

In [357]:
df_true['category'] = 1
df_false['category'] = 0

In [358]:
#concat datasets into one
df = pd.concat([df_true,df_false]) 

In [359]:
df

Unnamed: 0,title,text,category,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",The head of a conservative Republican faction ...,1,,
1,U.S. military to accept transgender recruits o...,Transgender people will be allowed for the fir...,1,,
2,Senior U.S. Republican senator: 'Let Mr. Muell...,The special counsel investigation of links bet...,1,,
3,FBI Russia probe helped by Australian diplomat...,Trump campaign adviser George Papadopoulos tol...,1,,
4,Trump wants Postal Service to charge 'much mor...,President Donald Trump called on the U.S. Post...,1,,
...,...,...,...,...,...
23476,McPain: John McCain Furious That Iran Treated ...,21st Century Wire says As 21WIRE reported earl...,0,Middle-east,"January 16, 2016"
23477,JUSTICE? Yahoo Settles E-mail Privacy Class-ac...,21st Century Wire says It s a familiar theme. ...,0,Middle-east,"January 16, 2016"
23478,Sunnistan: US and Allied ‘Safe Zone’ Plan to T...,Patrick Henningsen 21st Century WireRemember ...,0,Middle-east,"January 15, 2016"
23479,How to Blow $700 Million: Al Jazeera America F...,21st Century Wire says Al Jazeera America will...,0,Middle-east,"January 14, 2016"


In [360]:
# dataset balance
df.category.value_counts()

0    23481
1    21417
Name: category, dtype: int64

### Data Analysis

In [361]:
#insert analysis here
# point of question


### NLP Data Preparation

In [363]:
df["text"]

0        The head of a conservative Republican faction ...
1        Transgender people will be allowed for the fir...
2        The special counsel investigation of links bet...
3        Trump campaign adviser George Papadopoulos tol...
4        President Donald Trump called on the U.S. Post...
                               ...                        
23476    21st Century Wire says As 21WIRE reported earl...
23477    21st Century Wire says It s a familiar theme. ...
23478    Patrick Henningsen  21st Century WireRemember ...
23479    21st Century Wire says Al Jazeera America will...
23480    21st Century Wire says As 21WIRE predicted in ...
Name: text, Length: 44898, dtype: object

In [364]:
def clean_punc(inputString):
    cleaned = re.sub(r'[?|!|\'|#]', r'', inputString)
    cleaned = re.sub(r'[.|,|)|(|\|/]', r' ', cleaned)
    return cleaned

In [365]:
df["text"] = df.apply(lambda row: clean_punc(row["text"]), axis=1)

In [366]:
def lower_case(inputString):
    return inputString.lower()

In [367]:
df["text"] = df.apply(lambda row: lower_case(row["text"]), axis=1)

In [368]:
df["text"]

0        the head of a conservative republican faction ...
1        transgender people will be allowed for the fir...
2        the special counsel investigation of links bet...
3        trump campaign adviser george papadopoulos tol...
4        president donald trump called on the u s  post...
                               ...                        
23476    21st century wire says as 21wire reported earl...
23477    21st century wire says it s a familiar theme  ...
23478    patrick henningsen  21st century wireremember ...
23479    21st century wire says al jazeera america will...
23480    21st century wire says as 21wire predicted in ...
Name: text, Length: 44898, dtype: object

In [369]:
#use this to download wordnet library (one time download)
"""
import nltk
import ssl

try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

nltk.download('wordnet')
"""

"\nimport nltk\nimport ssl\n\ntry:\n    _create_unverified_https_context = ssl._create_unverified_context\nexcept AttributeError:\n    pass\nelse:\n    ssl._create_default_https_context = _create_unverified_https_context\n\nnltk.download('wordnet')\n"

In [370]:
def lemmatization(inputString):
    lemmatizer = WordNetLemmatizer()
    words = []
    for word in inputString.split():
        words.append(lemmatizer.lemmatize(word))
    output = " ".join(words)
    return output

In [371]:
df["text"] = df.apply(lambda row: lemmatization(row["text"]), axis=1)

NameError: name 'WordNetLemmatizer' is not defined

In [None]:
def stopword_removal(inputString):
    sw = stopwords.words('english')
    words = [word for word in inputString.split() if word not in sw]
    output = " ".join(words)
    return output

In [None]:
df["text"] = df.apply(lambda row: stopword_removal(row["text"]), axis=1)

In [None]:
print(lemmatization("studies studying cries cry"))

In [None]:
df["text"]

In [None]:
#insert word2vec conversion