# Fake News Detection: Data Preprocessing and Classifier Comparison

## Importing the libraries

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing the dataset

In [None]:
# Load the news articles; the 'label' column is kept separately because it is
# used as the classification target in the cells below.
dataset = pd.read_csv('news.csv')
labels = dataset['label']

## Splitting the dataset into the Training set and Test set

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the articles for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    dataset['text'],
    labels,
    test_size=0.2,
    random_state=7,
)
print(y_train)

6237    FAKE
3722    FAKE
5774    FAKE
336     REAL
3622    REAL
        ... 
5699    FAKE
2550    REAL
537     REAL
1220    REAL
4271    REAL
Name: label, Length: 5068, dtype: object


## Initialize TF-IDF-vectorizer

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Drop English stop words and ignore terms whose document frequency
# exceeds 70% of the corpus.
vectorizer = TfidfVectorizer(
    max_df=0.7,
    stop_words='english',
)

## Fit and Transform training set. Transform test set.

In [None]:
# Learn the vocabulary and IDF weights from the training texts only, then
# reuse the fitted vectorizer to encode the held-out texts.
v_train = vectorizer.fit_transform(raw_documents=X_train)
v_test = vectorizer.transform(raw_documents=X_test)

## Initialize and fit RandomForest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Hyperparameters match the best random-forest result noted in the
# grid-search cell of this notebook.
rfc = RandomForestClassifier(
    n_estimators=100,
    criterion='entropy',
    max_depth=12,
    random_state=0,
)
rfc.fit(X=v_train, y=y_train)

RandomForestClassifier(criterion='entropy', n_estimators=10, random_state=0)

## Initialize and fit DecisionTree

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Use the best decision-tree parameters noted in the grid-search cell of this
# notebook ({'criterion': 'gini', 'max_depth': 10}). The previous value,
# criterion='entropy', did not match that recorded result.
dtc = DecisionTreeClassifier(
    criterion='gini',
    max_depth=10,
    random_state=0,  # fixed seed so the fitted tree is reproducible
)
dtc.fit(v_train, y_train)

DecisionTreeClassifier(criterion='entropy', random_state=0)

## Initialize and fit PassiveAggressiveClassifier

In [None]:
from sklearn.linear_model import PassiveAggressiveClassifier

# Hyperparameters match the best passive-aggressive result noted in the
# grid-search cell of this notebook.
pac = PassiveAggressiveClassifier(
    C=0.25,
    max_iter=25,
    random_state=0,
)
pac.fit(X=v_train, y=y_train)

PassiveAggressiveClassifier(max_iter=50, random_state=0)

## Predict the test set results

In [None]:
from sklearn.metrics import accuracy_score

# Predict the held-out set with each fitted classifier. The y_pred_* and
# score_* names are kept because later cells read them.
y_pred_rfc = rfc.predict(v_test)
y_pred_dtc = dtc.predict(v_test)
y_pred_pac = pac.predict(v_test)

score_rfc = accuracy_score(y_test, y_pred_rfc)
score_dtc = accuracy_score(y_test, y_pred_dtc)
score_pac = accuracy_score(y_test, y_pred_pac)

# Label every line with its model: three identical "Accuracy:" lines cannot
# be attributed to a classifier when reading the output.
for model_name, score in (
    ('RandomForest', score_rfc),
    ('DecisionTree', score_dtc),
    ('PassiveAggressive', score_pac),
):
    print(f'{model_name} accuracy: {round(score*100,2)}%')


Accuracy: 83.74%
Accuracy: 80.66%
Accuracy: 92.66%


## Confusion Matrix (RandomForest)

In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix and accuracy for the random-forest predictions on the
# held-out set.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred_rfc)
print(cm)
accuracy_score(y_true=y_test, y_pred=y_pred_rfc)

[[580  58]
 [148 481]]


0.8374112075769534

## Hyperparameter tuning with GridSearchCV for each classifier

In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate hyperparameter grids, one per classifier. Only the estimator/grid
# pair passed to GridSearchCV below is searched in a given run; swap both to
# tune another model. The results noted beside each grid are the author's
# recorded outcomes.

# --- Decision tree ---------------------------------------------------------
# Best Accuracy: 81.97 %
# Best Parameters: {'criterion': 'gini', 'max_depth': 10}
dtc_param = [{
    'criterion': ['gini', 'entropy'],
    'max_depth': [2, 4, 6, 8, 10, 12],
}]

# --- Random forest ---------------------------------------------------------
# Best Accuracy: 85.68 %
# Best Parameters: {'criterion': 'entropy', 'max_depth': 12, 'n_estimators': 100}
rfc_param = [{
    'n_estimators': [10, 25, 50, 100],
    'criterion': ['gini', 'entropy'],
    'max_depth': [2, 4, 6, 8, 10, 12],
}]

# --- Passive aggressive ----------------------------------------------------
# Best Accuracy: 93.94 %
# Best Parameters: {'C': 0.25, 'max_iter': 25}
pac_param = [{
    'C': [0.25, 0.5, 0.75, 1],
    'max_iter': [25, 50, 100, 150, 250, 500, 1000],
}]

# 10-fold cross-validated search over the passive-aggressive grid, scored by
# accuracy and run on all available cores.
grid_search = GridSearchCV(
    estimator=pac,
    param_grid=pac_param,
    scoring='accuracy',
    cv=10,
    n_jobs=-1,
)
grid_search.fit(v_train, y_train)

best_accuracy = grid_search.best_score_
best_parameters = grid_search.best_params_
print("Best Accuracy: {:.2f} %".format(best_accuracy*100))
print("Best Parameters:", best_parameters)

Best Accuracy: 93.94 %
Best Parameters: {'C': 0.25, 'max_iter': 25}
