In [53]:
# Import packages
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split 
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier

from sklearn import metrics
from sklearn.metrics import classification_report,confusion_matrix, ConfusionMatrixDisplay, accuracy_score

In [13]:
# Load the pre-cleaned real-news and fake-news tables.
# index_col=0 makes the first CSV column the DataFrame index instead of a data column.
true_csv_path = "../Data/dataset_1/clean_data/true_clean_data.csv"
fake_csv_path = "../Data/dataset_1/clean_data/fake_clean_data.csv"

true_data = pd.read_csv(true_csv_path, index_col=0)
fake_data = pd.read_csv(fake_csv_path, index_col=0)

# Quick look at the first rows of the real-news table
true_data.head()

Unnamed: 0,text,subject,class_label,text_preprocessed
0,"As U.S. budget fight looms, Republicans flip t...",politicsNews,0,us budget fight looms republicans flip fiscal ...
1,U.S. military to accept transgender recruits o...,politicsNews,0,us military accept transgender recruits monday...
2,Senior U.S. Republican senator: 'Let Mr. Muell...,politicsNews,0,senior us republican senator let mr mueller jo...
3,FBI Russia probe helped by Australian diplomat...,politicsNews,0,fbi russia probe helped australian diplomat ti...
4,Trump wants Postal Service to charge 'much mor...,politicsNews,0,trump wants postal service charge much amazon ...


# Train / Validation / Test Split

In [16]:
# Columns used for modelling: the preprocessed article text is the input,
# the class label is the target.
TEXT_COLUMN = "text_preprocessed"
LABEL_COLUMN = "class_label"

# Pull both columns out as plain arrays, one pair per source table
true_X, true_y = true_data[TEXT_COLUMN].values, true_data[LABEL_COLUMN].values
fake_X, fake_y = fake_data[TEXT_COLUMN].values, fake_data[LABEL_COLUMN].values

In [17]:
# Each class is split on its own (with the same seed), so real and fake
# articles are both represented in every subset.
#
# Step 1 holds out 20% of a class as the test set.
# Step 2 takes 20% of the remaining rows as the validation set, which leaves
# roughly 64% / 16% / 20% for train / validation / test.

# --- real news ---
true_X_train, true_X_test, true_y_train, true_y_test = train_test_split(
    true_X, true_y, test_size=0.2, random_state=99
)
true_X_train, true_X_val, true_y_train, true_y_val = train_test_split(
    true_X_train, true_y_train, test_size=0.2, random_state=99
)

# --- fake news ---
fake_X_train, fake_X_test, fake_y_train, fake_y_test = train_test_split(
    fake_X, fake_y, test_size=0.2, random_state=99
)
fake_X_train, fake_X_val, fake_y_train, fake_y_val = train_test_split(
    fake_X_train, fake_y_train, test_size=0.2, random_state=99
)

In [19]:
# Merge the per-class pieces into a single text array and a single label array
# for each split. Real-news rows are placed first, followed by fake-news rows;
# nothing is shuffled here, and text and labels keep the same order.

# Training split
X_train_text = np.concatenate([true_X_train, fake_X_train])
y_train = np.concatenate([true_y_train, fake_y_train])

# Validation split
X_val_text = np.concatenate([true_X_val, fake_X_val])
y_val = np.concatenate([true_y_val, fake_y_val])

# Test split
X_test_text = np.concatenate([true_X_test, fake_X_test])
y_test = np.concatenate([true_y_test, fake_y_test])

# Baseline Model (Decision Tree)
## Using CountVectorizer

In [23]:
# Bag-of-words features: unigram counts with scikit-learn's built-in English
# stop-word list removed.
vectorizer = CountVectorizer(stop_words='english', ngram_range=(1,1))

# Learn the vocabulary from the training text only and build the training
# matrix in the same pass. fit_transform yields the same vocabulary and matrix
# as fit() followed by transform(), without tokenising the corpus twice.
X_train = vectorizer.fit_transform(X_train_text)

# Validation and test text are only transformed, so the vocabulary never
# sees held-out documents.
X_val = vectorizer.transform(X_val_text)
X_test = vectorizer.transform(X_test_text)

In [25]:
# Baseline classifier: a decision tree using Gini impurity as the split
# criterion, with a fixed seed so repeated runs build the same tree.
tree_params = {"criterion": "gini", "random_state": 0}
clf = DecisionTreeClassifier(**tree_params)

# Fit on the count-vectorised training split; leaving the call as the last
# expression shows the fitted estimator as the cell output.
clf.fit(X_train, y_train)

DecisionTreeClassifier(random_state=0)

In [29]:
# Score the fitted tree on the validation split and print the per-class
# precision / recall / F1 table.
y_val_pred = clf.predict(X_val)
val_report = classification_report(y_val, y_val_pred)
print(val_report)

              precision    recall  f1-score   support

           0       0.99      1.00      1.00      3427
           1       1.00      0.99      1.00      3757

    accuracy                           1.00      7184
   macro avg       1.00      1.00      1.00      7184
weighted avg       1.00      1.00      1.00      7184



In [30]:
# Score the fitted tree on the held-out test split and print the per-class
# precision / recall / F1 table.
y_test_pred = clf.predict(X_test)
test_report = classification_report(y_test, y_test_pred)
print(test_report)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      4284
           1       1.00      1.00      1.00      4697

    accuracy                           1.00      8981
   macro avg       1.00      1.00      1.00      8981
weighted avg       1.00      1.00      1.00      8981



In [50]:
# Imports for the optional Graphviz rendering of the fitted tree below.
# StringIO comes from the standard library: on Python 3 it is the same class
# that `six` re-exports, so the extra third-party dependency is not needed.
from io import StringIO

import pydotplus
from IPython.display import Image
from sklearn.tree import export_graphviz

# The rendering itself is currently disabled.
# NOTE(review): before re-enabling, `feature_cols` needs one name per column of
# the vectorised matrix the tree was trained on (i.e. the vectorizer's
# vocabulary), not the single source column name used here -- confirm against
# the export_graphviz documentation.

# feature_cols = ['text_preprocessed']

# dot_data = StringIO()
# export_graphviz(clf, out_file = dot_data, 
#                       feature_names = feature_cols,  
#                      filled = True, rounded = True,  
#                     special_characters = True)

# graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
# Image(graph.create_png())

In [40]:
# Get test-set predictions and line them up against the true labels.
# `predictions` is also read by the accuracy / precision / recall cells below.
predictions = clf.predict(X_test)
results = pd.DataFrame({'Actual': y_test, 'Predicted': predictions})

# Spot-check ten random rows. The seed is fixed (same value as the data splits)
# so the same rows are displayed on every run.
results.sample(10, random_state=99)

Unnamed: 0,Actual,Predicted
5112,1,1
7,0,0
5221,1,1
2218,0,0
4486,1,1
4398,1,1
797,0,0
4129,0,0
3957,0,0
6541,1,1


In [51]:
# Overall accuracy on the test split: the fraction of articles whose
# predicted label equals the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=predictions)
print("Accuracy:", accuracy)

Accuracy: 0.9965482685669748


In [54]:
# Precision, recall and F1 on the test split.
# With their default arguments these scorers report the metric for the
# positive class, label 1 (presumably the fake-news class, since the
# real-news rows shown earlier carry label 0 -- confirm against fake_data).
precision = metrics.precision_score(y_test, predictions)
recall = metrics.recall_score(y_test, predictions)
f_measure = metrics.f1_score(y_test, predictions)

# Print each score on its own line, in the same order as computed
for label, score in (
    ("Precision:", precision),
    ("Recall:", recall),
    ("F-measure:", f_measure),
):
    print(label, score)

Precision: 0.9972293265132139
Recall: 0.99616776665957
F-measure: 0.9966982639258706
