# Fake News Detector Machine Learning Model 

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.metrics import classification_report
# from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV 
from sklearn.metrics import accuracy_score 
from sklearn.metrics import precision_recall_fscore_support
# import re
# import string


# Importing Function from script.py file
from scripts import word_drop

# Loading The Datasets

For this exploratory Pipeline, I am using two separated datasets I found on Google, one CSV file is the **true news dataset** and the other file is the ****fake news dataset****.

In [2]:
df = pd.read_csv('Datasets/df.csv')
df = df.drop(['Unnamed: 0'], axis=1)
df.head(3)


Unnamed: 0,Tweets,class
0,covid deaths are starting to decline in the n...,1
1,breaking news in a reversal the fda will wai...,1
2,belgium health minister puts ban on non essent...,0


# Defining our Dependent and Independent Variables

In [3]:
x = df['Tweets']
y = df['class']

# Spliting the Data in Train and Test

In [4]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size= .25)

print('df', df.shape)
print('x_train', x_train.shape)
print('x_test', x_test.shape)
print('y_train', y_train.shape)
print('y_test', y_test.shape)

df (1386, 2)
x_train (1039,)
x_test (347,)
y_train (1039,)
y_test (347,)


# Vectorizing Data
Using sklearn.feature_extraction.text.TfidfVectorizer
// from sklearn.feature_extraction.text import TfidfVectorizer

In [5]:
vectorization = TfidfVectorizer()
x_train_vectors = vectorization.fit_transform(x_train)
x_test_vectors = vectorization.transform(x_test)

In [6]:
x_train.shape

(1039,)

# Model Selection

## Training Logistic Regression Model

In [7]:
logModel = LogisticRegression()

In [116]:
#For the gridsearch 2 parameter grids had be stablished because some of the parameters were not working one with the other.

param_grid_1 = {'penalty': ['l2'],
                'C': [1, 10, 100, 300, 400],
                'solver':[
                        'liblinear', 
                        'newton-cg'],
                'max_iter': [ 100, 1000, 3000, 5000]
}


param_grid_2 = {'penalty': ['l1'],
              'C': [1, 10, 100],
              'solver': ['liblinear'],
              'max_iter': [750]
}


clf_1 = GridSearchCV(logModel, param_grid = param_grid_1, cv = 3, verbose=True, n_jobs=-1)
clf_2 = GridSearchCV(logModel, param_grid = param_grid_2, cv = 3, verbose=True, n_jobs=-1)


In [117]:
best_clf_1 = clf_1.fit(x_train_vectors, y_train)
best_clf_2 = clf_2.fit(x_train_vectors, y_train)

Fitting 3 folds for each of 40 candidates, totalling 120 fits
Fitting 3 folds for each of 3 candidates, totalling 9 fits


In [134]:
best_clf_1.best_params_

{'C': 10, 'max_iter': 100, 'penalty': 'l2', 'solver': 'liblinear'}

In [162]:
d = {'col1': [1, 2], 'col2': [3, 4]}
df = pd.DataFrame(data=d)

data = {'C': [best_clf_1.best_params_['C'], best_clf_2.best_params_['C']], 
        'max_iter': [best_clf_1.best_params_['max_iter'], best_clf_2.best_params_['max_iter']],
        'penalty': [best_clf_1.best_params_['penalty'], best_clf_2.best_params_['penalty']],
        'solver': [best_clf_1.best_params_['solver'], best_clf_2.best_params_['solver']],
        'Score': ["{:.2f}".format(best_clf_1.best_score_*100), "{:.2f}".format(best_clf_2.best_score_*100)]}


best_hyperparameters = pd.DataFrame(data = data)
best_hyperparameters

Unnamed: 0,C,max_iter,penalty,solver,Score
0,10,100,l2,liblinear,99.81
1,1,750,l1,liblinear,99.71


In [163]:
model_1 = LogisticRegression()
model_1.fit(x_train_vectors, y_train)

LogisticRegression()

## Testing the Logistic Regression Model

In [None]:
y_predicted = model_1.predict(x_test_vectors)
accuracy = accuracy_score(y_test, y_predicted)
precision, recall, f1, support = precision_recall_fscore_support(y_test, y_predicted) 

print("Accuracy = {:.2f}".format(accuracy))
print("Precision = ", precision)
print("Recall = ", recall)
print('F1-Score', f1)

In [None]:
df_manual = pd.read_csv('Datasets/manual_testing.csv')
df_manual = df_manual.drop(['Unnamed: 0'], axis=1)
print(df_manual.shape)
df_manual.head(3)

In [None]:
df_manual_vect = vectorization.transform(df_manual.loc[0, ['Tweets']]).todense()



In [None]:
manual_prediction = model_1.predict(np.asarray(df_manual_vect))

In [None]:
manual_prediction