In [3]:
# Import libraries for data analysis and machine learning
import pandas as pd
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer

# Create one shared WordNetLemmatizer instance; tokenize() uses it to
# lemmatize every token.
lemmatizer = WordNetLemmatizer()

In [4]:
# Load both CSV files in one pass: the fake-news articles first, then the
# true-news articles.
fake_df, true_df = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/Fake.csv', '../data/True.csv')
)

In [5]:
# Preview the first two rows of the fake-news frame.
fake_df.head(2)

Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"


In [6]:
# Preview the first two rows of the true-news frame.
true_df.head(2)

Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"


In [9]:
# Fraction of missing values in each column of the fake-news frame
# (0.0 means the column has no nulls).
fake_df.isnull().mean()

title      0.0
text       0.0
subject    0.0
date       0.0
dtype: float64

In [10]:
# Fraction of missing values in each column of the true-news frame
# (0.0 means the column has no nulls).
true_df.isnull().mean()

title      0.0
text       0.0
subject    0.0
date       0.0
dtype: float64

In [11]:
# Keep only the article text and label every row as true news (type = 1).
# .assign() returns a new frame, so the label column is never written into a
# slice of the original frame (which raises pandas' SettingWithCopyWarning).
true_df = true_df[['text']].assign(type=1)
true_df

Unnamed: 0,text,type
0,WASHINGTON (Reuters) - The head of a conservat...,1
1,WASHINGTON (Reuters) - Transgender people will...,1
2,WASHINGTON (Reuters) - The special counsel inv...,1
3,WASHINGTON (Reuters) - Trump campaign adviser ...,1
4,SEATTLE/WASHINGTON (Reuters) - President Donal...,1
...,...,...
21412,BRUSSELS (Reuters) - NATO allies on Tuesday we...,1
21413,"LONDON (Reuters) - LexisNexis, a provider of l...",1
21414,MINSK (Reuters) - In the shadow of disused Sov...,1
21415,MOSCOW (Reuters) - Vatican Secretary of State ...,1


In [12]:
# Keep only the article text and label every row as fake news (type = 0).
# .assign() returns a new frame, so the label column is never written into a
# slice of the original frame (which raises pandas' SettingWithCopyWarning).
fake_df = fake_df[['text']].assign(type=0)
fake_df

Unnamed: 0,text,type
0,Donald Trump just couldn t wish all Americans ...,0
1,House Intelligence Committee Chairman Devin Nu...,0
2,"On Friday, it was revealed that former Milwauk...",0
3,"On Christmas day, Donald Trump announced that ...",0
4,Pope Francis used his annual Christmas Day mes...,0
...,...,...
23476,21st Century Wire says As 21WIRE reported earl...,0
23477,21st Century Wire says It s a familiar theme. ...,0
23478,Patrick Henningsen 21st Century WireRemember ...,0
23479,21st Century Wire says Al Jazeera America will...,0


In [15]:
# Stack the true and fake articles into one labelled frame.
# ignore_index=True renumbers the rows 0..n-1; without it both halves keep
# their own 0-based index and the combined frame has duplicate row labels.
df = pd.concat([true_df, fake_df], ignore_index=True)
df.shape

(44898, 2)

In [16]:
# Preview the first two rows of the combined frame.
df.head(2)

Unnamed: 0,text,type
0,WASHINGTON (Reuters) - The head of a conservat...,1
1,WASHINGTON (Reuters) - Transgender people will...,1


In [17]:
import re
import nltk
# Download every NLTK resource the tokenize() function relies on:
#   - 'stopwords' for stopwords.words('english')
#   - 'wordnet'   for WordNetLemmatizer
#   - 'punkt' / 'punkt_tab' for word_tokenize (newer NLTK releases look up
#     'punkt_tab', older ones 'punkt'); without them word_tokenize raises
#     LookupError on a fresh environment.
for resource in ('stopwords', 'wordnet', 'punkt', 'punkt_tab'):
    nltk.download(resource)

def tokenize(text):
    """Normalize a raw document and return its lemmatized tokens.

    The text is lowercased, every non-alphanumeric character is replaced by a
    space, runs of whitespace are collapsed, the result is split with NLTK's
    ``word_tokenize``, English stopwords are dropped, and each remaining token
    is lemmatized with the module-level ``lemmatizer``.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    list of str
        Lemmatized, stopword-free tokens in their original order.
    """
    # Build the stopword set once and cache it on the function object. The
    # previous version called stopwords.words('english') for every token and
    # scanned the resulting list linearly, which dominated the runtime.
    stop_words = getattr(tokenize, '_stop_words', None)
    if stop_words is None:
        stop_words = frozenset(stopwords.words('english'))
        tokenize._stop_words = stop_words

    # Raw strings avoid the invalid '\s' escape; r'\s+' already matches
    # newlines, so the separate '\n+' alternative is unnecessary.
    cleaned = re.sub(r'[^A-Za-z0-9]', ' ', text.lower())
    cleaned = re.sub(r'\s+', ' ', cleaned)

    tokens = word_tokenize(cleaned)
    # Filter stopwords first, then lemmatize (same order as before).
    return [lemmatizer.lemmatize(tok) for tok in tokens if tok not in stop_words]

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/kamleshkumarrangi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/kamleshkumarrangi/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [20]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.ensemble import RandomForestClassifier

# Features are the raw article text; labels are 1 (true news) / 0 (fake news).
X = df['text']
y = df['type']

# NOTE(review): test_size=0.80 holds out 80% of the rows and trains on only
# 20%. Confirm this is intentional; the conventional split is test_size=0.20.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.80)

# Token counts produced by the custom tokenize() function, to be re-weighted
# by TF-IDF.
cvec = CountVectorizer(tokenizer=tokenize)
tfid = TfidfTransformer()
# Seed the forest as well as the split; an unseeded RandomForestClassifier
# gives a different model on every run.
clf = RandomForestClassifier(random_state=42)

In [None]:
# Fit the count vectorizer on the training text and transform it into token counts.
count_vect = cvec.fit_transform(X_train)
# Fit the TF-IDF transformer on those counts and re-weight them.
tiff_vec   = tfid.fit_transform(count_vect)