## Import dependencies

In [25]:
import numpy as np
import pandas as pd
import re
import nltk
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_recall_fscore_support as score
import pickle
import requests 
import pandas as pd 
from bs4 import BeautifulSoup 

## Read the data

In [26]:
data = pd.read_csv("data/news.csv")

In [27]:
data.label.value_counts()

REAL    3171
FAKE    3164
Name: label, dtype: int64

In [28]:
data.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


## Clean the text

In [30]:
def utils_preprocess_text(text, flg_stemm=False, flg_lemm=True, lst_stopwords=nltk.corpus.stopwords.words("english")):
    ## clean (convert to lowercase and remove punctuations and characters and then strip)
    text = re.sub(r'[^\w\s]', '', str(text).lower().strip())
            
    ## Tokenize (convert from string to list)
    lst_text = text.split()
    ## remove Stopwords
    if lst_stopwords is not None:
        lst_text = [word for word in lst_text if word not in 
                    lst_stopwords]
                
    ## Stemming (remove -ing, -ly, ...)
    if flg_stemm == True:
        ps = nltk.stem.porter.PorterStemmer()
        lst_text = [ps.stem(word) for word in lst_text]
                
    ## Lemmatisation (convert the word into root word)
    if flg_lemm == True:
        lem = nltk.stem.wordnet.WordNetLemmatizer()
        lst_text = [lem.lemmatize(word) for word in lst_text]
            
    ## back to string from list
    text = " ".join(lst_text)
    return text

In [31]:
data["text_clean"] = data["text"].apply(lambda x: 
          utils_preprocess_text(x, flg_stemm=False, flg_lemm=True))
data.head()

Unnamed: 0.1,Unnamed: 0,title,text,label,text_clean
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE,daniel greenfield shillman journalism fellow f...
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE,google pinterest digg linkedin reddit stumbleu...
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL,u secretary state john f kerry said monday sto...
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE,kaydee king kaydeeking november 9 2016 lesson ...
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL,primary day new york frontrunners hillary clin...


## Build the model

### Spliting the data set into training and testing sets

In [32]:
labels = data.label
x_train, x_test, y_train, y_test = train_test_split(data.text_clean, labels, test_size=0.2, random_state = 0)

### Bag-of-Words - TF IDF vectorizer

In [33]:
tfidf_vectorizer = TfidfVectorizer()

In [34]:
tfidf_Train = tfidf_vectorizer.fit_transform(x_train)
tfidf_Test = tfidf_vectorizer.transform(x_test)

### Random Forest

In [35]:
classifier = RandomForestClassifier(n_estimators=1000, random_state=0)
classifier.fit(tfidf_Train, y_train) 

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [36]:
y_pred = classifier.predict(tfidf_Test)

### Evaluate the model

In [37]:
print('Our model has an accuracy of', accuracy_score(y_test, y_pred),'! Yeay ! ')

Our model has an accuracy of 0.9123914759273876 ! Yeay ! 


In [38]:
precision, recall, fscore, support = score(y_test, y_pred)

print('precision: {}'.format(precision))
print('recall: {}'.format(recall))
print('fscore: {}'.format(fscore))
print('support: {}'.format(support))

precision: [0.91860465 0.90676692]
recall: [0.89918699 0.92484663]
fscore: [0.90879211 0.91571754]
support: [615 652]


### Initialize a PassiveAggressiveClassifier

In [39]:
passiveaggressive = PassiveAggressiveClassifier(C=1.0, fit_intercept=True, max_iter=1000, tol=0.001, early_stopping=False, validation_fraction=0.1, n_iter_no_change=5, shuffle=True, verbose=0, loss='hinge', n_jobs=None, random_state=None, warm_start=False, class_weight=None, average=False)
passiveaggressive.fit(tfidf_Train, y_train)

PassiveAggressiveClassifier(C=1.0, average=False, class_weight=None,
                            early_stopping=False, fit_intercept=True,
                            loss='hinge', max_iter=1000, n_iter_no_change=5,
                            n_jobs=None, random_state=None, shuffle=True,
                            tol=0.001, validation_fraction=0.1, verbose=0,
                            warm_start=False)

In [40]:
y_prediction = passiveaggressive.predict(tfidf_Test)

### Evaluate the model

In [41]:
## Generate score accuracy and confusion matrix
print('Our model has an accuracy of', accuracy_score(y_test, y_prediction),'! Yeay ! ')

Our model has an accuracy of 0.936069455406472 ! Yeay ! 


In [42]:
precision, recall, fscore, support = score(y_test, y_prediction)

print('precision: {}'.format(precision))
print('recall: {}'.format(recall))
print('fscore: {}'.format(fscore))
print('support: {}'.format(support))

precision: [0.94205298 0.9306184 ]
recall: [0.92520325 0.94631902]
fscore: [0.93355209 0.93840304]
support: [615 652]


### Save the model

In [43]:
# save the model to disk
filename = 'finalized_model.pkl'
pickle.dump(passiveaggressive, open(filename, 'wb'))