# Data Preprocessing

## Importing the libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.metrics import confusion_matrix
from plotly.subplots import make_subplots
import plotly.express as px
from sklearn.metrics import accuracy_score

## Importing the dataset

In [2]:
dataset = pd.read_csv('news.csv')
labels=dataset.label

## Splitting the dataset into the Training set and Test set

In [3]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(dataset['text'], labels, test_size = 0.2, random_state = 7)

## Initialize TF-IDF-vectorizer

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(stop_words='english', max_df=0.7)

## Fit and Transform training set. Transform test set.

In [5]:
v_train = vectorizer.fit_transform(X_train)
v_test = vectorizer.transform(X_test)

In [8]:
import pickle
filename = 'Application/Backend/fakeNews_vectorizer.sav'
pickle.dump(vectorizer, open(filename, 'wb'))

## Initialize and fit RandomForest

In [None]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_estimators = 100, criterion = 'entropy', max_depth=12, random_state = 0)
rfc.fit(v_train, y_train)

RandomForestClassifier(criterion='entropy', max_depth=12, random_state=0)

## Initialize DecisionTree

In [None]:
from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier(criterion = 'entropy', max_depth=10, random_state = 0)
dtc.fit(v_train, y_train)

DecisionTreeClassifier(criterion='entropy', max_depth=10, random_state=0)

## Initialize PassiveAgressiveClassifier

In [None]:
from sklearn.linear_model import PassiveAggressiveClassifier
pac=PassiveAggressiveClassifier(max_iter=25, C=0.25, random_state = 0)
pac.fit(v_train,y_train)

PassiveAggressiveClassifier(C=0.25, max_iter=25, random_state=0)

## Predict the test set results

In [None]:
y_pred_rfc=rfc.predict(v_test)
score_rfc=accuracy_score(y_test,y_pred_rfc)
print(f'Accuracy: {round(score_rfc*100,2)}%')
y_pred_dtc=dtc.predict(v_test)
score_dtc=accuracy_score(y_test,y_pred_dtc)
print(f'Accuracy: {round(score_dtc*100,2)}%')
y_pred_pac=pac.predict(v_test)
score_pac=accuracy_score(y_test,y_pred_pac)
print(f'Accuracy: {round(score_pac*100,2)}%')


Accuracy: 84.37%
Accuracy: 81.53%
Accuracy: 92.74%


## Confusion Matrix

In [None]:
cm_dtc = confusion_matrix(y_test, y_pred_dtc)
cm_rfc = confusion_matrix(y_test, y_pred_rfc)
cm_pac = confusion_matrix(y_test, y_pred_pac)

plot_labels=dict(x="Predicted", y="Actual")
x_axes=['Fake', 'Real']
y_axes=['Fake', 'Real']

fig_dtc = px.imshow(cm_dtc,labels=plot_labels, x=x_axes, y=y_axes,text_auto=True)
fig_rfc = px.imshow(cm_rfc,labels=plot_labels, x=x_axes, y=y_axes,text_auto=True)
fig_pac = px.imshow(cm_pac,labels=plot_labels, x=x_axes, y=y_axes,text_auto=True)

fig = make_subplots(rows=1, cols=3, subplot_titles=("DecisionTree", "RandomForest", "PassiveAgressive"))

fig.add_trace(fig_dtc.data[0], row=1, col=1)
fig.add_trace(fig_rfc.data[0], row=1, col=2)
fig.add_trace(fig_pac.data[0], row=1, col=3)

fig.show()

## Precision, Recal & F1

In [None]:
TN, FP, FN, TP = cm_dtc.ravel()
print(TN)
print(FN)
prec_dtc = TP/(TP+FP)
recal_dtc = TP/(TP+FN)
f1_dtc = 2/((1/prec_dtc)+(1/recal_dtc))
print("====DTC====")
print(f'Precision for Decision Tree classifier: {round(prec_dtc*100,2)}%')
print(f'Recal for Decision Tree classifier: {round(recal_dtc*100,2)}%')
print(f'F1 for Decision Tree classifier: {round(f1_dtc*100,2)}%')
print()

TN, FP, FN, TP = cm_rfc.ravel()
prec_rfc = TP/(TP+FP)
recal_rfc = TP/(TP+FN)
f1_rfc = 2/((1/prec_rfc)+(1/recal_rfc))
print("====RFC====")
print(f'Precision for Random forest classifier: {round(prec_rfc*100,2)}%')
print(f'Recal for Decision Tree classifier: {round(recal_rfc*100,2)}%')
print(f'F1 for Decision Tree classifier: {round(f1_rfc*100,2)}%')
print()

TN, FP, FN, TP = cm_pac.ravel()
prec_pac = TP/(TP+FP)
recal_pac = TP/(TP+FN)
f1_pac = 2/((1/prec_pac)+(1/recal_pac))
print("====PAC====")
print(f'Precision for Passive agressive classifier: {round(prec_pac*100,2)}%')
print(f'Recal for Decision Tree classifier: {round(recal_pac*100,2)}%')
print(f'F1 for Decision Tree classifier: {round(f1_pac*100,2)}%')
print()

553
149
====DTC====
Precision for Decision Tree classifier: 84.96%
Recal for Decision Tree classifier: 76.31%
F1 for Decision Tree classifier: 80.4%

====RFC====
Precision for Random forest classifier: 89.69%
Recal for Decision Tree classifier: 77.42%
F1 for Decision Tree classifier: 83.11%

====PAC====
Precision for Passive agressive classifier: 92.28%
Recal for Decision Tree classifier: 93.16%
F1 for Decision Tree classifier: 92.72%



## Different GridSearch for the classifiers

In [None]:
from sklearn.model_selection import GridSearchCV
###########################################
#For decision tree:
dtc_param = [{'criterion': ['gini', 'entropy'], 'max_depth' : [2,4,6,8,10,12]}]
#Best Accuracy: 81.97 %
#Best Parameters: {'criterion': 'gini', 'max_depth': 10}

###########################################
#For random forest:
rfc_param = [{'n_estimators':[10,25,50,100], 'criterion':['gini', 'entropy'], 'max_depth' : [2,4,6,8,10,12]}]
# Best Accuracy: 85.68 %
# Best Parameters: {'criterion': 'entropy', 'max_depth': 12, 'n_estimators': 100}

###########################################
#For passive agressive
pac_param = [{'C': [0.25, 0.5, 0.75, 1], 'max_iter':[25, 50, 100, 150, 250, 500, 1000]}]
# Best Accuracy: 93.94 %
# Best Parameters: {'C': 0.25, 'max_iter': 25}

grid_search = GridSearchCV(estimator = pac,
                           param_grid = pac_param,
                           scoring = 'accuracy',
                           cv = 10,
                           n_jobs = -1)
grid_search.fit(v_train, y_train)
best_accuracy = grid_search.best_score_
best_parameters = grid_search.best_params_
print("Best Accuracy: {:.2f} %".format(best_accuracy*100))
print("Best Parameters:", best_parameters)

Best Accuracy: 93.94 %
Best Parameters: {'C': 0.25, 'max_iter': 25}


# Trying out other classification methods

### Ridge Classification

In [None]:
from sklearn.linear_model import RidgeClassifier
clf = RidgeClassifier().fit(v_train, y_train)
clf.score(v_test,y_test)
y_pred = clf.predict(v_test)
# Accuracy: 92.9%
# Precision: 93.96%
# Recal: 91.57%
# F1: 92.75%

### Stochastic Gradient Descent Classification

In [6]:
from sklearn.linear_model import SGDClassifier
clf = SGDClassifier(loss="hinge", penalty="l2", alpha=0.0001, random_state=0)
clf.fit(v_train, y_train)
y_pred = clf.predict(v_test)
# Accuracy: 92.98%
# Precision: 93.69%
# Recal: 92.05%
# F1: 92.86%

In [7]:
import pickle
filename = 'Application/Backend/fakeNews_model.sav'
pickle.dump(clf, open(filename, 'wb'))

### KNN Classification

In [None]:
from sklearn.neighbors import KNeighborsClassifier
clf = KNeighborsClassifier(n_neighbors=2)
clf.fit(v_train, y_train)
y_pred = clf.predict(v_test)
# Accuracy: 58.96%
# Precision: 97.39%
# Recal: 17.81%
# F1: 30.11%

### Neural network: MLP Classifier

In [None]:
from sklearn.neural_network import MLPClassifier
clf = MLPClassifier(solver='adam', alpha=1e-5,hidden_layer_sizes=(10, 10), random_state=0)
clf.fit(v_train, y_train)
y_pred = clf.predict(v_test)
# Accuracy: 92.58%
# Precision: 93.08%
# Recal: 91.89%
# F1: 92.48%

#### Grid Searching for MLP

In [None]:
from sklearn.model_selection import GridSearchCV
clf_param = [{'solver':['adam'], 'hidden_layer_sizes':[(5,2),(10,),(5,10),(10,10)]}]
# 'lbfgs','sgd','adam'


grid_search = GridSearchCV(estimator = clf,
                           param_grid = clf_param,
                           scoring = 'accuracy',
                           cv = 10,
                           n_jobs = -1)
grid_search.fit(v_train, y_train)
best_accuracy = grid_search.best_score_
best_parameters = grid_search.best_params_
print("Best Accuracy: {:.2f} %".format(best_accuracy*100))
print("Best Parameters:", best_parameters)

Best Accuracy: 93.78 %
Best Parameters: {'hidden_layer_sizes': (10, 10), 'solver': 'adam'}


### Support vector machine SVM

In [None]:
from sklearn import svm
clf = svm.SVC(C=1, kernel='linear', random_state=0)
clf.fit(v_train, y_train)
y_pred = clf.predict(v_test)
# Accuracy: 93.05%
# Precision: 93.56%
# Recal: 92.37%
# F1: 92.96%

# Best Accuracy: 93.59 %
# Best Parameters: {'C': 1, 'kernel': 'linear'}

#### Grid Searching for best SVM

In [None]:
from sklearn.model_selection import GridSearchCV
clf_param = [{'C': [0.25, 0.5, 0.75, 1], 'kernel': ['linear']},
              {'C': [0.25, 0.5, 0.75, 1], 'kernel': ['rbf'], 'gamma': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]}]


grid_search = GridSearchCV(estimator = clf,
                           param_grid = clf_param,
                           scoring = 'accuracy',
                           cv = 10,
                           n_jobs = -1)
grid_search.fit(v_train, y_train)
best_accuracy = grid_search.best_score_
best_parameters = grid_search.best_params_
print("Best Accuracy: {:.2f} %".format(best_accuracy*100))
print("Best Parameters:", best_parameters)

### Results

In [None]:
cm_other = confusion_matrix(y_test, y_pred)

plot_labels=dict(x="Predicted", y="Actual")
x_axes=['Fake', 'Real']
y_axes=['Fake', 'Real']

fig_other = px.imshow(cm_other,labels=plot_labels, x=x_axes, y=y_axes,text_auto=True)


fig_other.show()
acc_score=accuracy_score(y_test,y_pred)
TN, FP, FN, TP = cm_other.ravel()
prec_score = TP/(TP+FP)
recal_score = TP/(TP+FN)
f1_score = 2/((1/prec_score)+(1/recal_score))
print(f'Accuracy: {round(acc_score*100,2)}%')
print(f'Precision: {round(prec_score*100,2)}%')
print(f'Recal: {round(recal_score*100,2)}%')
print(f'F1: {round(f1_score*100,2)}%')
print()

Accuracy: 92.58%
Precision: 93.08%
Recal: 91.89%
F1: 92.48%

