### 1. Importing Libraries


In [25]:
import pandas as pd
import string
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, f1_score, recall_score

import time 

### 2. Loading and Exploring the Data

In [3]:
# Read the two source CSVs (real and fake articles) into separate frames.
true_data, fake_data = (
    pd.read_csv(csv_path) for csv_path in ('data/True.csv', 'data/Fake.csv')
)

### 3. Data Cleaning and Preparation

#### Add a new column called `output`, 1 for true_data and 0 for fake_data

In [4]:
# Label each source before merging: 1 marks a true article, 0 a fake one.
true_data['output'], fake_data['output'] = 1, 0

#### Concatenate `true_data` and `fake_data`

In [5]:
# Stack both labelled frames into a single dataset.
# ignore_index=True gives the result a fresh 0..n-1 index: each input frame
# carries its own 0-based index from read_csv, so without it the row labels
# would repeat and any later label-based alignment would be ambiguous.
data = pd.concat([true_data, fake_data], ignore_index=True)

#### Delete duplicated rows and check for missing values

In [6]:
# Step 1: rows that are identical in every column
# (the original run reported 209 of them).
n_exact_duplicates = data.duplicated().sum()
data = data.drop_duplicates()

# Step 2: rows that repeat an already-seen article body, whatever the other
# columns contain (the original run reported 6043 of them).
n_text_duplicates = data.duplicated(subset=['text']).sum()
data = data.drop_duplicates(subset=['text'])

In [7]:
# Count the rows holding at least one missing value; the original run found none.
data.isnull().any(axis='columns').sum()

0

#### Delete blank text

In [8]:
# Order the rows by article body, then keep only the rows whose body is not
# empty once surrounding whitespace is removed.
data = data.sort_values(by='text')
has_text = data['text'].str.strip() != ''
data = data[has_text]

#### Add a new column called `complete_text`, and drop `text` and `title` from data

In [9]:
# Merge headline and body into one text field, then discard the two source columns.
headline_and_body = data['title'] + ' ' + data['text']
data['complete_text'] = headline_and_body

data = data.drop(columns=['text', 'title'])

#### Delete the punctuation from the text

In [10]:
# Characters to strip: ASCII punctuation plus the curly quotes and the hyphen.
punctuation = string.punctuation + '‘’-“”'

# Delete the characters through a translation table instead of interpolating
# them into a regex character class. Inside `[...]` the raw characters are
# regex syntax: the backslash was consumed as an escape for `]` (so
# backslashes were never removed), and `,-.` / `’-“` were read as ranges
# (the latter also matching U+201A and U+201B, which are not in the list).
# The table removes exactly the characters listed above and nothing else.
strip_table = str.maketrans('', '', punctuation)
data['complete_text'] = data['complete_text'].str.lower().str.translate(strip_table)

#### To test later

In [11]:
## Test delete duplicated rows with subset = 'title'
# data[data.duplicated(subset=['title'])]

### 4. Splitting and Vectorizing Data

#### Splitting data into train and test

In [12]:
# Features are the cleaned article strings; the target is the 0/1 `output` label.
X = data['complete_text'].values
y = data['output']

# Hold out 25% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=20
)

#### Vectorizing data

In [13]:
# Bag-of-words token counts, skipping scikit-learn's built-in English stop words.
vectorizer = CountVectorizer(stop_words='english')

# Learn the vocabulary from the training texts only, so nothing about the
# test set leaks into the features, and encode the training texts.
x_train_vect = vectorizer.fit_transform(x_train)

# Encode the test texts with the vocabulary fitted above.
x_test_vect = vectorizer.transform(x_test)

### 5. Model Building and Training


In [14]:
# Accumulates one row per evaluated model:
# [name, duration, accuracy, precision, recall, f1score].
model_description = []

def save_info_evaluation(duration, model_name, y_test, predictions):
    """Score one model's predictions and record the result.

    Computes accuracy, precision, recall and F1 of ``predictions`` against
    ``y_test`` and appends a single row to the module-level
    ``model_description`` list.

    Parameters
    ----------
    duration : elapsed time measured by the caller, stored as given.
    model_name : label identifying the model in the results.
    y_test : true labels.
    predictions : labels predicted by the model.

    Returns
    -------
    None. The only effect is the row appended to ``model_description``.
    """
    # The order here fixes the column order of the stored row.
    scorers = (accuracy_score, precision_score, recall_score, f1_score)

    row = [model_name, duration]
    row.extend(scorer(y_test, predictions) for scorer in scorers)
    model_description.append(row)

#### Naive Bayes

In [15]:
# Baseline: multinomial Naive Bayes with its default settings.
model = MultinomialNB()

# The measured time covers training plus prediction on the test set.
t_start = time.time()
predictions = model.fit(x_train_vect, y_train).predict(x_test_vect)
duration = time.time() - t_start

model_name = 'NB - Multinomial with alpha = 1 (per default)'
save_info_evaluation(duration, model_name, y_test, predictions)

##### Improve model

In [16]:
# Candidate smoothing strengths to try for the multinomial Naive Bayes model.
params = {
    'alpha': [0.1,0.01,0.001,0.0001,0.00001],
}

# Search over a fresh MultinomialNB instead of the global `model` variable:
# `model` is reassigned by every later model cell, so re-running this cell out
# of order would otherwise search over whatever estimator was fitted last.
gridsearch = GridSearchCV(MultinomialNB(), param_grid=params,scoring='accuracy', cv=5,n_jobs=-1,verbose=1)

# The measured time covers the whole 5-fold search plus prediction on the test set.
start_time = time.time()
gridsearch.fit(x_train_vect,y_train)
predictions = gridsearch.predict(x_test_vect)
end_time = time.time()

duration = end_time - start_time

Fitting 5 folds for each of 5 candidates, totalling 25 fits


In [17]:
# Record the tuned model, naming it after the winning hyper-parameters.
best_params = gridsearch.best_params_
model_name = f'NB - Multinomial with {best_params}'
save_info_evaluation(duration, model_name, y_test, predictions)

#### Decision Trees

##### `criterion = 'entropy'`

In [18]:
# Decision tree split on information gain.
# random_state is fixed (same seed as the train/test split) because the tree
# builder permutes features randomly at each split; without a seed the metrics
# change from run to run and the model comparison is not reproducible.
model = DecisionTreeClassifier(criterion='entropy', random_state=20)

# The measured time covers training plus prediction on the test set.
start_time = time.time()
model.fit(x_train_vect,y_train)
predictions = model.predict(x_test_vect)
end_time = time.time()

duration = end_time - start_time

model_name = 'Decision Tree - entropy'
save_info_evaluation(duration,model_name, y_test,predictions)

##### `criterion = 'gini'`

In [19]:
# Decision tree split on Gini impurity.
# random_state is fixed (same seed as the train/test split) because the tree
# builder permutes features randomly at each split; without a seed the metrics
# change from run to run and the model comparison is not reproducible.
model = DecisionTreeClassifier(criterion='gini', random_state=20)

# The measured time covers training plus prediction on the test set.
start_time = time.time()
model.fit(x_train_vect,y_train)
predictions = model.predict(x_test_vect)
end_time = time.time()

duration = end_time - start_time

model_name = 'Decision Tree - gini'
save_info_evaluation(duration,model_name, y_test,predictions)

#### SVM

##### `kernel: linear`

In [20]:
# Support vector classifier with a linear kernel.
model = SVC(kernel='linear')

# The measured time covers training plus prediction on the test set.
t_start = time.time()
predictions = model.fit(x_train_vect, y_train).predict(x_test_vect)
duration = time.time() - t_start

model_name = 'SVM - kernel: linear'
save_info_evaluation(duration, model_name, y_test, predictions)

##### `kernel: sigmoid`

In [23]:
# Support vector classifier with a sigmoid kernel.
model = SVC(kernel='sigmoid')

# The measured time covers training plus prediction on the test set.
t_start = time.time()
predictions = model.fit(x_train_vect, y_train).predict(x_test_vect)
duration = time.time() - t_start

model_name = 'SVM - kernel: sigmoid'
save_info_evaluation(duration, model_name, y_test, predictions)

#### Logistic Regression

##### `solver: sag`

In [27]:
# Logistic regression fitted with the stochastic average gradient solver.
# `sag` shuffles the data while fitting, so random_state is fixed (same seed
# as the train/test split) to make the reported metrics reproducible.
model = LogisticRegression(solver='sag', random_state=20)

# The measured time covers training plus prediction on the test set.
start_time = time.time()
model.fit(x_train_vect,y_train)
predictions = model.predict(x_test_vect)
end_time = time.time()

duration = end_time - start_time

model_name = 'Log. Regression - solver: sag'
save_info_evaluation(duration,model_name, y_test,predictions)



##### `solver: newton-cg`

In [28]:
# Logistic regression fitted with the Newton-CG solver.
model = LogisticRegression(solver='newton-cg')

# The measured time covers training plus prediction on the test set.
t_start = time.time()
predictions = model.fit(x_train_vect, y_train).predict(x_test_vect)
duration = time.time() - t_start

model_name = 'Log. Regression - solver: newton-cg'
save_info_evaluation(duration, model_name, y_test, predictions)

#### Conclusion

Based on the results, I prefer `Decision Tree - gini` or `Log. Regression - solver: newton-cg`. Both have similar metrics; the main difference between them is the duration. We also need to guard against overfitting: to address this, we should get more data or use cross-validation for each of these two models.

In [32]:
# One row per evaluated model, shown with the highest accuracy first.
metric_columns = ['name_model', 'duration', 'accuracy', 'precision', 'recall', 'f1score']
df = pd.DataFrame(model_description, columns=metric_columns)
df.sort_values(by='accuracy', ascending=False)

Unnamed: 0,name_model,duration,accuracy,precision,recall,f1score
3,Decision Tree - gini,16.26401,0.995756,0.994493,0.997714,0.996101
7,Log. Regression - solver: newton-cg,7.2453,0.995342,0.994113,0.997333,0.99572
4,SVM - kernel:linear,140.312686,0.995032,0.993735,0.997142,0.995436
2,Decision Tree - entropy,7.92436,0.993893,0.993158,0.995618,0.994387
6,Log. Regression - solver: sag,14.505118,0.992858,0.990902,0.995999,0.993444
1,NB - Multinomial with {'alpha': 1e-05},7.504513,0.965532,0.971062,0.965327,0.968186
0,NB - Multinomial with alpha = 1 (per default),0.11055,0.954249,0.950178,0.96647,0.958255
5,SVM - kernel: sigmoid,143.787991,0.949488,0.951451,0.955801,0.953621
