In [14]:
#Import libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import  accuracy_score, f1_score, precision_score,confusion_matrix, recall_score, roc_auc_score
import pickle
import re
import string
from matplotlib import rc,rcParams
import itertools

In [15]:
# Import data: read the two source CSV files into separate DataFrames.
# The paths are relative, so they resolve against the kernel's working directory.
fake = pd.read_csv("../data/Fake.csv")
true = pd.read_csv("../data/True.csv")

# Functions

In [16]:
# Creation of target variables
def variables(fake, true):
    """Attach the ``classement`` target column to both frames.

    Every row of ``fake`` is labelled 0 and every row of ``true`` is
    labelled 1. The column is written in place, so the objects passed in
    are modified.

    Returns:
        The same two DataFrame objects, as ``(fake, true)``.
    """
    for frame, label in ((fake, 0), (true, 1)):
        frame["classement"] = label
    return fake, true


# Removing the last rows of each frame for manual testing
def manual_testing(fake, true, n_rows=10, output_path="../data/manual_testing.csv"):
    """Hold out the last ``n_rows`` rows of each frame for manual testing.

    The held-out rows are removed from ``fake`` and ``true`` IN PLACE, so the
    caller's frames no longer contain them afterwards. They are then
    concatenated (fake rows first) and written to ``output_path`` as CSV.

    The rows to drop are taken from ``tail(n_rows).index`` rather than from
    hard-coded label ranges, so the function works for frames of any length.

    Args:
        fake: DataFrame of fake articles; mutated in place.
        true: DataFrame of true articles; mutated in place.
        n_rows: Number of trailing rows to hold out from each frame.
        output_path: Destination of the exported CSV.

    Returns:
        DataFrame with the held-out rows of ``fake`` followed by those of
        ``true``, keeping their original index labels.
    """
    held_out_parts = []
    for frame in (fake, true):
        tail_rows = frame.tail(n_rows)
        held_out_parts.append(tail_rows)
        # In-place drop on purpose: callers keep using the same objects and
        # rely on the held-out rows being gone from them.
        frame.drop(index=tail_rows.index, inplace=True)

    # Exporting the manual testing data in CSV form.
    held_out = pd.concat(held_out_parts, axis=0)
    held_out.to_csv(output_path)
    return held_out


# Merging the fake and true DataFrames into a single DataFrame
def merging(fake, true):
    """Stack ``fake`` on top of ``true`` row-wise and return the result.

    The original index labels of both inputs are kept, so labels can repeat
    in the returned frame.
    """
    frames = [fake, true]
    return pd.concat(frames, axis=0)


# Shuffling data
def shuffling_data(merge, random_state=None):
    """Return a row-shuffled copy of ``merge`` with a fresh 0..n-1 index.

    Args:
        merge: DataFrame to shuffle; it is not modified.
        random_state: Optional seed forwarded to ``DataFrame.sample``. The
            default (``None``) gives a different order on every call; pass an
            int to make the shuffle reproducible.

    Returns:
        A new DataFrame with all rows in random order. The pre-shuffle index
        labels are preserved in a leading ``"index"`` column, because the
        index is reset without ``drop=True``.
    """
    shuffled = merge.sample(frac=1, random_state=random_state)
    return shuffled.reset_index()


# Drop unneeded columns
def droping_columns(merge):
    """Return ``merge`` without the title, subject, date and index columns.

    The input frame is left untouched; a KeyError is raised by pandas if any
    of the four columns is missing.
    """
    unneeded = ["title", "subject", "date", "index"]
    return merge.drop(columns=unneeded)

# Processing words
def wordopt(text):
    """Normalise one article body before vectorisation.

    Steps, in order:
      1. lower-case the text;
      2. remove ``[...]`` bracketed spans;
      3. remove URLs (``http://``, ``https://`` or ``www.`` prefixes);
      4. remove ``<...>`` tag-like spans;
      5. replace every remaining non-word character with a space;
      6. remove leftover punctuation (only ``_`` can survive step 5);
      7. remove every token that contains a digit.

    URLs and tags must be removed BEFORE step 5: once ``:``, ``/``, ``.``,
    ``<`` and ``>`` have been turned into spaces, the URL and tag patterns
    can no longer match and their fragments would stay in the text.

    Args:
        text: The raw text as a ``str``.

    Returns:
        The cleaned text. Whitespace is not collapsed, so runs of spaces may
        remain where characters were removed.
    """
    text = text.lower()
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)  # removing URLs
    text = re.sub(r'<.*?>+', '', text)
    # Newlines are non-word characters too, so they become spaces here; no
    # separate newline substitution is needed afterwards.
    text = re.sub(r'\W', ' ', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text


# Scaling (despite the name, nothing is scaled: this separates features from target)
def scaling(merge):
    """Split ``merge`` into its independent and dependent variables.

    Returns:
        ``(x, y)`` where ``x`` is the ``"text"`` column (independent
        variable) and ``y`` is the ``"classement"`` column (dependent
        variable).
    """
    return merge["text"], merge["classement"]

# Splitting test and train
def train_test(x, y, test_size=0.25, random_state=None):
    """Randomly split features and target into train and test parts.

    Args:
        x: Features (here: the text Series).
        y: Target values aligned with ``x``.
        test_size: Share of the samples put in the test split. Defaults to
            0.25, the value previously hard-coded.
        random_state: Optional seed forwarded to ``train_test_split``. The
            default (``None``) gives a different split on every call; pass an
            int to make the split, and therefore the reported scores,
            reproducible.

    Returns:
        The tuple ``(x_train, x_test, y_train, y_test)``.
    """
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=test_size, random_state=random_state
    )
    return x_train, x_test, y_train, y_test


# Converting words to vectors
def vectorizing(x_train, x_test):
    """Turn the train and test texts into TF-IDF matrices.

    The vectorizer is fitted on the training texts only and then applied to
    the test texts, so the vocabulary and IDF weights come from the training
    split alone. The fitted vectorizer is pickled to
    ``../data/vectorizing.pickle``.

    Args:
        x_train: Training texts.
        x_test: Test texts.

    Returns:
        ``(x_train, x_test)`` as the transformed matrices.
    """
    vectorization = TfidfVectorizer()
    x_train = vectorization.fit_transform(x_train)
    x_test = vectorization.transform(x_test)
    # Context manager so the file is flushed and closed deterministically
    # instead of leaking the handle.
    with open(r'../data/vectorizing.pickle', 'wb') as handle:
        pickle.dump(vectorization, handle)
    return x_train, x_test


# Modeling
def model(x_train, x_test, y_train, y_test):
    """Fit four classifiers, score them on the test split and save the best.

    Candidates: logistic regression ('LR'), decision tree ('DT'), gradient
    boosting ('Gboost') and random forest ('RF'). Each is fitted on the
    training data and scored with ``accuracy_score`` on the test data.

    The estimator with the highest test accuracy is pickled to
    ``../data/best_model.pickle`` once, after all candidates have been
    evaluated. On ties the earlier candidate in the list wins.
    NOTE(review): the file previously always contained the decision tree,
    whatever the scores were; consumers of the pickle now get whichever
    estimator scored best.

    Args:
        x_train: Training feature matrix.
        x_test: Test feature matrix.
        y_train: Training labels.
        y_test: Test labels.

    Returns:
        DataFrame with columns ``"model"`` and ``"accuracy_score"``, one row
        per candidate, sorted by accuracy in descending order.
    """
    candidates = [
        ('LR', LogisticRegression()),
        ('DT', DecisionTreeClassifier()),
        ('Gboost', GradientBoostingClassifier(random_state=0)),
        ('RF', RandomForestClassifier(random_state=0)),
    ]

    # Evaluate each model in turn, remembering the best one seen so far.
    records = []
    best_estimator = None
    best_accuracy = float("-inf")
    for name, estimator in candidates:
        estimator.fit(x_train, y_train)
        y_pred = estimator.predict(x_test)
        accuracy = accuracy_score(y_test, y_pred)
        records.append({"model": name, "accuracy_score": accuracy})
        if accuracy > best_accuracy:
            best_estimator, best_accuracy = estimator, accuracy

    with open(r'../data/best_model.pickle', 'wb') as handle:
        pickle.dump(best_estimator, handle)

    # Build the frame once from the collected rows; the default 0..n-1 index
    # matches the order in which the candidates were evaluated.
    result_df = pd.DataFrame(records, columns=["model", "accuracy_score"])
    return result_df.sort_values("accuracy_score", ascending=False)

# Creation of target variables

In [17]:
# Add the "classement" target column: 0 for fake articles, 1 for true ones.
fake, true=variables(fake,true)

# Removing last 10 rows for manual testing

In [18]:
# Hold out the last 10 rows of each frame for manual testing. They are removed
# from `fake` and `true` in place and written to ../data/manual_testing.csv.
manual_test=manual_testing(fake, true)
manual_test.head()

Unnamed: 0,title,text,subject,date,classement
23471,Seven Iranians freed in the prisoner swap have...,"21st Century Wire says This week, the historic...",Middle-east,"January 20, 2016",0
23472,#Hashtag Hell & The Fake Left,By Dady Chery and Gilbert MercierAll writers ...,Middle-east,"January 19, 2016",0
23473,Astroturfing: Journalist Reveals Brainwashing ...,Vic Bishop Waking TimesOur reality is carefull...,Middle-east,"January 19, 2016",0
23474,The New American Century: An Era of Fraud,Paul Craig RobertsIn the last years of the 20t...,Middle-east,"January 19, 2016",0
23475,Hillary Clinton: ‘Israel First’ (and no peace ...,Robert Fantina CounterpunchAlthough the United...,Middle-east,"January 18, 2016",0


# Merging the fake and true DataFrames into a single DataFrame

In [19]:
# Stack the two labelled frames row-wise; the original index labels are kept.
merge=merging(fake, true)
merge

Unnamed: 0,title,text,subject,date,classement
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",0
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",0
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",0
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",0
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",0
...,...,...,...,...,...
21402,Exclusive: Trump's Afghan decision may increas...,ON BOARD A U.S. MILITARY AIRCRAFT (Reuters) - ...,worldnews,"August 22, 2017",1
21403,U.S. puts more pressure on Pakistan to help wi...,WASHINGTON (Reuters) - The United States sugge...,worldnews,"August 21, 2017",1
21404,Exclusive: U.S. to withhold up to $290 million...,WASHINGTON (Reuters) - The United States has d...,worldnews,"August 22, 2017",1
21405,Trump talks tough on Pakistan's 'terrorist' ha...,ISLAMABAD (Reuters) - Outlining a new strategy...,worldnews,"August 22, 2017",1


# Shuffling data

In [20]:
# Shuffle the rows; the pre-shuffle labels end up in a new "index" column.
merge=shuffling_data(merge)
merge

Unnamed: 0,index,title,text,subject,date,classement
0,17472,CNN’S FAKE NEWS BACKFIRES! CNN Legal Analyst A...,Hell has frozen over! CNN is actually reportin...,left-news,"Dec 28, 2017",0
1,10988,WATCH TUCKER CARLSON’S Heated Debate With Delu...,"Tucker Carlson interviews Jose Antonio Vargas,...",politics,"May 3, 2017",0
2,16156,AMBASSADOR JOHN BOLTON: Susan Rice Has “Real L...,Ambassador John Bolton was on Lou Dobbs tonigh...,Government News,"Apr 3, 2017",0
3,19100,Police investigating militants search Brussels...,BRUSSELS (Reuters) - Police searched eight hou...,worldnews,"September 25, 2017",1
4,1945,"Turncoat Democrats Break Ranks, Back Trump On...",When one votes for someone with a (D) next to ...,News,"March 30, 2017",0
...,...,...,...,...,...,...
44873,14054,"Saudi aims to issue tourist visas next year, o...",DUBAI (Reuters) - Saudi Arabia aims to start i...,worldnews,"November 23, 2017",1
44874,2139,House Intelligence Committee: Trump Lied Abou...,Less than 24 hours after Trump s press secreta...,News,"March 15, 2017",0
44875,20978,BETTE MIDLER Asks Obama To Release VIOLENT BLA...,Pardoning the Black Panthers will just be the ...,left-news,"Feb 19, 2016",0
44876,14301,Putin to inform Saudi king about his meeting w...,MOSCOW (Reuters) - Russian President Vladimir ...,worldnews,"November 21, 2017",1


# Drop unneeded columns

In [21]:
# Drop the title, subject, date and index columns.
merge=droping_columns(merge)
merge

Unnamed: 0,text,classement
0,Hell has frozen over! CNN is actually reportin...,0
1,"Tucker Carlson interviews Jose Antonio Vargas,...",0
2,Ambassador John Bolton was on Lou Dobbs tonigh...,0
3,BRUSSELS (Reuters) - Police searched eight hou...,1
4,When one votes for someone with a (D) next to ...,0
...,...,...
44873,DUBAI (Reuters) - Saudi Arabia aims to start i...,1
44874,Less than 24 hours after Trump s press secreta...,0
44875,Pardoning the Black Panthers will just be the ...,0
44876,MOSCOW (Reuters) - Russian President Vladimir ...,1


# Processing words

In [22]:
# Clean every article body with wordopt (lower-casing plus regex substitutions).
merge["text"] = merge["text"].apply(wordopt)

# Scaling (separating the features from the target)

In [23]:
# X is the "text" column (features); y is the "classement" column (target).
X, y =scaling(merge)

# Splitting test and train

In [24]:
# Random train/test split; train_test uses test_size=0.25 and no fixed seed.
x_train, x_test, y_train, y_test=train_test(X, y)

# Converting words to vectors

In [25]:
# Fit TF-IDF on the training texts, transform both splits, and pickle the
# fitted vectorizer to ../data/vectorizing.pickle.
x_train, x_test=vectorizing(x_train, x_test)

# Modeling

In [26]:
# Fit and score each candidate classifier; the returned table is sorted by
# test accuracy, best first.
result_model=model(x_train, x_test, y_train, y_test)
result_model

Unnamed: 0,model,accuracy_score
2,Gboost,0.995722
1,DT,0.995455
3,RF,0.986542
0,LR,0.985205


In [29]:
# Shape of the vectorized training matrix.
x_train.shape

(33658, 94909)

In [33]:
# Shape of the vectorized test matrix.
x_test.shape

(11220, 94909)

In [35]:
# Number of training labels.
y_train.shape

(33658,)