In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer

lemmatizer = WordNetLemmatizer()

In [2]:
fake_df = pd.read_csv('../data/Fake.csv')
true_df = pd.read_csv('../data/True.csv')

In [3]:
fake_df.shape

(23481, 4)

In [4]:
fake_df.head()
fake_df['subject'].unique()

array(['News', 'politics', 'Government News', 'left-news', 'US_News',
       'Middle-east'], dtype=object)

In [5]:
true_df['subject'].unique()

array(['politicsNews', 'worldnews'], dtype=object)

In [6]:
true_df = true_df[['text']]
true_df['type'] = 1
true_df

Unnamed: 0,text,type
0,WASHINGTON (Reuters) - The head of a conservat...,1
1,WASHINGTON (Reuters) - Transgender people will...,1
2,WASHINGTON (Reuters) - The special counsel inv...,1
3,WASHINGTON (Reuters) - Trump campaign adviser ...,1
4,SEATTLE/WASHINGTON (Reuters) - President Donal...,1
...,...,...
21412,BRUSSELS (Reuters) - NATO allies on Tuesday we...,1
21413,"LONDON (Reuters) - LexisNexis, a provider of l...",1
21414,MINSK (Reuters) - In the shadow of disused Sov...,1
21415,MOSCOW (Reuters) - Vatican Secretary of State ...,1


In [7]:
fake_df = fake_df[['text']]
fake_df['type'] = 0
fake_df

Unnamed: 0,text,type
0,Donald Trump just couldn t wish all Americans ...,0
1,House Intelligence Committee Chairman Devin Nu...,0
2,"On Friday, it was revealed that former Milwauk...",0
3,"On Christmas day, Donald Trump announced that ...",0
4,Pope Francis used his annual Christmas Day mes...,0
...,...,...
23476,21st Century Wire says As 21WIRE reported earl...,0
23477,21st Century Wire says It s a familiar theme. ...,0
23478,Patrick Henningsen 21st Century WireRemember ...,0
23479,21st Century Wire says Al Jazeera America will...,0


In [8]:
df = pd.concat([true_df, fake_df])
df.head()
df.shape

(44898, 2)

In [9]:
import re
import nltk
nltk.download('stopwords')
nltk.download('wordnet')

def get_clean(text):
    text = text.lower()
    text = re.sub('[^A-Za-z0-9]',' ', text)
    text = re.sub('\n+|\s+',' ', text)
    
    text = word_tokenize(text)
    text = [ i for i in text if i not in stopwords.words('english') ]
    text = [ lemmatizer.lemmatize(i) for i in text ]
    return text

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/kamleshkumarrangi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/kamleshkumarrangi/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [10]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.ensemble import RandomForestClassifier

X = df['text']
y = df['type']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.80)
count_vect = CountVectorizer(tokenizer=get_clean)
tdiff_transformer = TfidfTransformer()
classifier = RandomForestClassifier()

In [11]:
tokezie = count_vect.fit_transform(X_train)

In [12]:
tfid_vec = tdiff_transformer.fit_transform(tokezie)

In [15]:
tfid_vec

<8979x55629 sparse matrix of type '<class 'numpy.float64'>'
	with 1454436 stored elements in Compressed Sparse Row format>

In [17]:
mlclf = classifier.fit(tfid_vec, y_train)

In [None]:
X_test_counts = count_vect.fit_transform(X_test)
X_test_tfidf = tdiff_transformer.fit_transform(X_test_counts)
y_pred = mlclf.predict(X_test_tfidf)