In [26]:
# Packages for data 
import pandas as pd
import numpy as np
import pickle

# Packages for machine learning modelling
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import BernoulliNB

from sklearn.metrics import classification_report,confusion_matrix, ConfusionMatrixDisplay, accuracy_score
# precision_score, recall_score, f1_score

# Packages for visualisation 
import matplotlib.pyplot as plt

# Packages for NLP
import nltk


# Train-test-split

In [27]:
# Load the cleaned "true" and "fake" tables; the first CSV column is used as the index.
true_data, fake_data = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ("../Data/true_clean_data.csv", "../Data/fake_clean_data.csv")
)

In [28]:
# Pull the preprocessed text (model input) and the class label (target) out of each
# table via `.values`, in that order.
true_X, true_y = (true_data[column].values for column in ("text_preprocessed", "class_label"))
fake_X, fake_y = (fake_data[column].values for column in ("text_preprocessed", "class_label"))

In [29]:
# Step 1: within each class separately, hold out 20% of the rows as the test subset.
true_X_train, true_X_test, true_y_train, true_y_test = train_test_split(
    true_X, true_y, test_size=0.2, random_state=99
)
fake_X_train, fake_X_test, fake_y_train, fake_y_test = train_test_split(
    fake_X, fake_y, test_size=0.2, random_state=99
)

# Step 2: carve a validation subset (20%) out of the remaining training rows.
# Net effect per class: roughly 64% train / 16% validation / 20% test.
true_X_train, true_X_val, true_y_train, true_y_val = train_test_split(
    true_X_train, true_y_train, test_size=0.2, random_state=99
)
fake_X_train, fake_X_val, fake_y_train, fake_y_val = train_test_split(
    fake_X_train, fake_y_train, test_size=0.2, random_state=99
)

In [30]:
# Stack the "true" rows followed by the "fake" rows for every subset.
# Text and labels are concatenated in the same order so they stay aligned row by row.

# Text
X_train_text, X_val_text, X_test_text = (
    np.concatenate(pair)
    for pair in (
        (true_X_train, fake_X_train),
        (true_X_val, fake_X_val),
        (true_X_test, fake_X_test),
    )
)

# Labels
y_train, y_val, y_test = (
    np.concatenate(pair)
    for pair in (
        (true_y_train, fake_y_train),
        (true_y_val, fake_y_val),
        (true_y_test, fake_y_test),
    )
)

In [31]:
# Sanity check: the train / validation / test subsets together must contain exactly
# as many rows as the two source tables combined.
n_source_rows = true_data.shape[0] + fake_data.shape[0]
n_split_rows = len(X_train_text) + len(X_val_text) + len(X_test_text)

print(n_source_rows)
print(n_split_rows)

# Fail loudly instead of relying on someone comparing the two printed numbers by eye
assert n_source_rows == n_split_rows, (
    f"split subsets hold {n_split_rows} rows but the source tables hold {n_source_rows}"
)

44898
44898


# Creating the Base Model using Naive Bayes

## Using CountVectorizer with Bag of Words, Unigrams

In [32]:
# Bag-of-words features: unigram counts, with scikit-learn's built-in English stop words removed
vectorizer = CountVectorizer(stop_words='english', ngram_range=(1,1))

# Learn the vocabulary from the training text only and vectorise it in the same pass.
# fit_transform is equivalent to fit followed by transform, but avoids tokenising the
# training corpus twice.
X_train = vectorizer.fit_transform(X_train_text)

# Validation and test text are only transformed, so the vocabulary is never learned from them
X_val = vectorizer.transform(X_val_text)
X_test = vectorizer.transform(X_test_text)

In [33]:
# Sparse document-term matrix of the training text: each printed entry is
# (document row, vocabulary column) followed by how often that term occurs in the document
print(X_train)

  (0, 1020)	1
  (0, 1267)	1
  (0, 2929)	1
  (0, 3406)	1
  (0, 6442)	1
  (0, 6663)	1
  (0, 6789)	1
  (0, 10164)	1
  (0, 11894)	1
  (0, 11979)	1
  (0, 13184)	1
  (0, 14018)	1
  (0, 14352)	1
  (0, 17093)	1
  (0, 19614)	1
  (0, 20249)	1
  (0, 20911)	2
  (0, 21401)	2
  (0, 21930)	1
  (0, 22659)	2
  (0, 23359)	3
  (0, 23376)	1
  (0, 26317)	2
  (0, 28545)	4
  (0, 29933)	1
  :	:
  (28732, 168212)	1
  (28732, 170249)	1
  (28732, 170567)	1
  (28732, 170803)	1
  (28732, 173947)	2
  (28732, 174291)	1
  (28732, 174384)	2
  (28732, 174624)	1
  (28732, 175075)	2
  (28732, 175145)	1
  (28732, 175393)	1
  (28732, 175490)	1
  (28732, 175940)	1
  (28732, 176175)	1
  (28732, 178226)	1
  (28732, 178340)	1
  (28732, 178750)	1
  (28732, 178789)	1
  (28732, 178935)	1
  (28732, 178980)	1
  (28732, 179709)	2
  (28732, 179869)	1
  (28732, 180841)	1
  (28732, 181109)	1
  (28732, 181342)	1


In [34]:
# Baseline classifier: Bernoulli Naive Bayes fitted on the training features.
# NOTE(review): per the scikit-learn docs BernoulliNB binarises its input (default
# binarize=0.0), so the term counts are presumably reduced to present/absent here —
# confirm this is intended.
naive_bayes_clf = BernoulliNB()
naive_bayes_clf.fit(X_train, y_train)

BernoulliNB()

In [35]:
# Predict the validation subset and print per-class precision / recall / F1
y_val_pred = naive_bayes_clf.predict(X_val)
print(classification_report(y_true=y_val, y_pred=y_val_pred))

              precision    recall  f1-score   support

           0       0.96      0.98      0.97      3427
           1       0.98      0.96      0.97      3757

    accuracy                           0.97      7184
   macro avg       0.97      0.97      0.97      7184
weighted avg       0.97      0.97      0.97      7184



In [36]:
# Predict the held-out test subset and print per-class precision / recall / F1
y_test_pred = naive_bayes_clf.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_test_pred))

              precision    recall  f1-score   support

           0       0.96      0.98      0.97      4284
           1       0.99      0.97      0.98      4697

    accuracy                           0.98      8981
   macro avg       0.98      0.98      0.98      8981
weighted avg       0.98      0.98      0.98      8981



## Using CountVectorizer with Bag of Words, Unigrams + Bigrams

In [37]:
# Bag-of-words features: unigram + bigram counts, English stop words removed
vectorizer2 = CountVectorizer(stop_words='english', ngram_range=(1,2))

# Learn the vocabulary from the training text only and vectorise it in the same pass.
# fit_transform is equivalent to fit followed by transform, but avoids tokenising the
# training corpus twice.
X_train = vectorizer2.fit_transform(X_train_text)

# Validation and test text are only transformed with the training vocabulary
X_val = vectorizer2.transform(X_val_text)
X_test = vectorizer2.transform(X_test_text)

In [38]:
# Validation Data
# Refit the existing classifier on the unigram + bigram features (fit returns the
# estimator itself), then score the validation subset.
y_val_pred2 = naive_bayes_clf.fit(X_train, y_train).predict(X_val)
print(classification_report(y_true=y_val, y_pred=y_val_pred2))

              precision    recall  f1-score   support

           0       0.98      0.99      0.98      3427
           1       0.99      0.98      0.98      3757

    accuracy                           0.98      7184
   macro avg       0.98      0.98      0.98      7184
weighted avg       0.98      0.98      0.98      7184



In [39]:
# Test Data
# The classifier was already fitted on these same unigram + bigram training features in
# the validation cell directly above. Fitting is deterministic, so the fitted model is
# reused here instead of being refitted on identical data.
y_test_pred2 = naive_bayes_clf.predict(X_test)
print(classification_report(y_test, y_test_pred2))

              precision    recall  f1-score   support

           0       0.98      0.99      0.98      4284
           1       0.99      0.98      0.98      4697

    accuracy                           0.98      8981
   macro avg       0.98      0.98      0.98      8981
weighted avg       0.98      0.98      0.98      8981



## Using CountVectorizer with Bag of Words, Bigrams Only

In [40]:
# Bag-of-words features: bigram counts only, English stop words removed
vectorizer3 = CountVectorizer(stop_words='english', ngram_range=(2,2))

# Learn the vocabulary from the training text only and vectorise it in the same pass.
# fit_transform is equivalent to fit followed by transform, but avoids tokenising the
# training corpus twice.
X_train = vectorizer3.fit_transform(X_train_text)

# Validation and test text are only transformed with the training vocabulary
X_val = vectorizer3.transform(X_val_text)
X_test = vectorizer3.transform(X_test_text)

In [41]:
# Validation Data
# Refit the existing classifier on the bigram-only features (fit returns the
# estimator itself), then score the validation subset.
y_val_pred3 = naive_bayes_clf.fit(X_train, y_train).predict(X_val)
print(classification_report(y_true=y_val, y_pred=y_val_pred3))

              precision    recall  f1-score   support

           0       0.98      0.98      0.98      3427
           1       0.99      0.98      0.98      3757

    accuracy                           0.98      7184
   macro avg       0.98      0.98      0.98      7184
weighted avg       0.98      0.98      0.98      7184



In [42]:
# Test Data
# The classifier was already fitted on these same bigram-only training features in the
# validation cell directly above. Fitting is deterministic, so the fitted model is
# reused here instead of being refitted on identical data.
y_test_pred3 = naive_bayes_clf.predict(X_test)
print(classification_report(y_test, y_test_pred3))

              precision    recall  f1-score   support

           0       0.98      0.99      0.98      4284
           1       0.99      0.98      0.99      4697

    accuracy                           0.99      8981
   macro avg       0.99      0.99      0.99      8981
weighted avg       0.99      0.99      0.99      8981



## Using Tf-Idf with Unigrams, Unigrams + Bigrams, and Bigrams Only

In [43]:
# N-gram settings to compare: display label -> ngram_range passed to TfidfVectorizer
tfidf_params = {'unigram':(1,1), 'unigram and bigram': (1,2), 'bigram':(2,2)}

for ngram, values in tfidf_params.items():
    tfidf_vectorizer = TfidfVectorizer(analyzer='word', stop_words='english', ngram_range=values)

    # Learn vocabulary and IDF weights from the training text only and vectorise it in
    # the same pass (equivalent to fit followed by transform, without tokenising twice).
    X_train = tfidf_vectorizer.fit_transform(X_train_text)
    X_val = tfidf_vectorizer.transform(X_val_text)
    X_test = tfidf_vectorizer.transform(X_test_text)

    # NOTE(review): BernoulliNB binarises its input (default binarize=0.0), so the TF-IDF
    # weights are reduced to term present/absent before the model sees them. This loop is
    # therefore expected to reproduce the CountVectorizer scores for the same n-gram range.
    # TODO: switch to a model that uses the weights (e.g. MultinomialNB) if the goal is to
    # measure the effect of TF-IDF weighting.
    naive_bayes_clf = BernoulliNB()
    print(f"Model with {ngram}")
    naive_bayes_clf.fit(X_train, y_train)

    # Validation Data
    print("Testing using validation data:")
    y_val_pred = naive_bayes_clf.predict(X_val)
    print(classification_report(y_val, y_val_pred))
    print("------------------------------------------")

    # Test Data
    print("Testing using test data:")
    y_test_pred = naive_bayes_clf.predict(X_test)
    print(classification_report(y_test, y_test_pred))
    print("------------------------------------------")
    print("------------------------------------------")

Model with unigram
Testing using validation data:
              precision    recall  f1-score   support

           0       0.96      0.98      0.97      3427
           1       0.98      0.96      0.97      3757

    accuracy                           0.97      7184
   macro avg       0.97      0.97      0.97      7184
weighted avg       0.97      0.97      0.97      7184

------------------------------------------
Testing using test data:
              precision    recall  f1-score   support

           0       0.96      0.98      0.97      4284
           1       0.99      0.97      0.98      4697

    accuracy                           0.98      8981
   macro avg       0.98      0.98      0.98      8981
weighted avg       0.98      0.98      0.98      8981

------------------------------------------
------------------------------------------
Model with unigram and bigram
Testing using validation data:
              precision    recall  f1-score   support

           0       0.98   

In [45]:
# Draft experiment — ignore this cell for now (kept commented out)

# best_config = {
#     # 'Max features': 0, 
#     'Best model': None,
#     'Accuracy': 0, 
#     'Precision': 0, 
#     'Recall': 0, 
#     'F1': 0
# }

# # for n in range(500, 7000, 50):
# vectorizer = CountVectorizer(max_features=1650, max_df=0.15) # max_features=500, min_df=0.01
# vectorizer.fit(sentences)
# X_train = vectorizer.transform(sentences)

# validation_X_test = vectorizer.transform(validation_sentences)

# for model_name in models_dict:
#     model = models_dict[model_name]
#     model.fit(X_train, y)
#     predictions = model.predict(validation_X_test)

#     acc = accuracy_score(validation_y,predictions)  # always true label first, then your predicted labels!
#     precision = precision_score(validation_y,predictions) 
#     recall = recall_score(validation_y,predictions) 
#     f1 = f1_score(validation_y,predictions)

#     if f1 > best_config['F1']:
#         best_config['Best model'] = model_name
#         best_config['Accuracy'] = round(acc, 3)
#         best_config['Precision'] = round(precision, 3)
#         best_config['Recall'] = round(recall, 3)
#         best_config['F1'] = round(f1, 3)

# for k, v in best_config.items():
#     print(f'{k}: {v}')