In [1]:
# Import packages
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split 
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier

from sklearn import metrics
from sklearn.metrics import classification_report,confusion_matrix, ConfusionMatrixDisplay, accuracy_score

# Reading the Data

In [2]:
# Load the pre-split train / validation / test sets from CSV.
# The training preview shows the raw `text`, its `class_label`, a
# `text_preprocessed` column, and two leftover index columns
# (`Unnamed: 0.1`, `Unnamed: 0`).
train_data = pd.read_csv("../../Data/Combined data/train_data.csv")
val_data = pd.read_csv("../../Data/Combined data/validation_data.csv")
test_data = pd.read_csv("../../Data/Combined data/test_data.csv")

# Preview a few random training rows. The seed is pinned so the same rows are
# shown on every Restart & Run All instead of a different sample each time.
train_data.sample(5, random_state=0)

Unnamed: 0.1,Unnamed: 0,text,class_label,text_preprocessed
31765,2859,"Donald Trump, Fort Lauderdale: Your Friday Eve...",0,donald trump fort lauderdal friday even brief ...
19566,506,Top Democrats in Congress say won't meet with ...,0,top democrat congress say wont meet trump plan...
33123,12977,Turner Classic Movies Sets 24-Hour Debbie Reyn...,0,turner classic movi set 24hour debbi reynold m...
9561,13628,"Trump, Japan's Abe agree to boost deterrence a...",0,trump japan abe agre boost deterr north korea ...
11025,7330,Gatlinburg Wildfires Force Evacuations: ‘It Wa...,0,gatlinburg wildfir forc evacu drive hell new y...


In [3]:
# For each split, pull out the model input (the pre-processed text column) and
# the target (the class label column) as arrays.
X_train_text, y_train = train_data["text_preprocessed"].values, train_data["class_label"].values
X_val_text, y_val = val_data["text_preprocessed"].values, val_data["class_label"].values
X_test_text, y_test = test_data["text_preprocessed"].values, test_data["class_label"].values

# Baseline Model (Random Forest)
## Using CountVectorizer with Bag of Words, Unigrams

In [5]:
# Bag-of-words counts over single tokens, with English stop words removed.
# The vocabulary is learned from the training text only.
vectorizer = CountVectorizer(ngram_range=(1, 1), stop_words='english')
vectorizer.fit(X_train_text)

# Encode all three splits with that one fitted vocabulary.
X_train, X_val, X_test = map(
    vectorizer.transform,
    (X_train_text, X_val_text, X_test_text),
)

In [6]:
# Baseline classifier: a 100-tree random forest with a fixed random_state.
clf = RandomForestClassifier(
    random_state=0,
    n_estimators=100,
)

# Fit on the training split. As the cell's last expression, the fitted
# estimator is echoed as the cell output.
clf.fit(X_train, y_train)

RandomForestClassifier(random_state=0)

In [7]:
# Predict the validation split with the most recently fitted forest and print
# per-class precision / recall / F1.
y_val_pred = clf.predict(X_val)
print(classification_report(y_true=y_val, y_pred=y_val_pred))

              precision    recall  f1-score   support

           0       0.92      0.95      0.93      6361
           1       0.95      0.92      0.94      6659

    accuracy                           0.93     13020
   macro avg       0.93      0.94      0.93     13020
weighted avg       0.94      0.93      0.93     13020



In [8]:
# Same report for the test split, using the same fitted forest.
y_test_pred = clf.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_test_pred))

              precision    recall  f1-score   support

           0       0.93      0.94      0.94      6361
           1       0.95      0.93      0.94      6660

    accuracy                           0.94     13021
   macro avg       0.94      0.94      0.94     13021
weighted avg       0.94      0.94      0.94     13021



## Using CountVectorizer with Bag of Words, Unigrams + Bigrams

In [12]:
# Unigrams + bigrams with no frequency pruning
# (original note: about 4 million features).
vectorizer5 = CountVectorizer(stop_words='english', ngram_range=(1, 2)).fit(X_train_text)

# Re-encode every split with the unigram+bigram vocabulary; this overwrites the
# X_train / X_val / X_test matrices from the previous section.
X_train, X_val, X_test = (
    vectorizer5.transform(split_text)
    for split_text in (X_train_text, X_val_text, X_test_text)
)

In [13]:
# New forest for the unigram+bigram features (100 trees, fixed random_state).
# This rebinds `clf`, replacing the previously fitted classifier.
clf = RandomForestClassifier(
    random_state=0,
    n_estimators=100,
)

# Fit on the current training matrix; the fitted estimator is the cell output.
clf.fit(X_train, y_train)

RandomForestClassifier(random_state=0)

In [14]:
# Validation-split report for the forest fitted on the unpruned
# unigram+bigram counts.
y_val_pred = clf.predict(X_val)
print(classification_report(y_true=y_val, y_pred=y_val_pred))

              precision    recall  f1-score   support

           0       0.92      0.95      0.93      6361
           1       0.95      0.93      0.94      6659

    accuracy                           0.94     13020
   macro avg       0.94      0.94      0.94     13020
weighted avg       0.94      0.94      0.94     13020



In [15]:
# Test-split report for the same forest.
y_test_pred = clf.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_test_pred))

              precision    recall  f1-score   support

           0       0.93      0.95      0.94      6361
           1       0.95      0.94      0.94      6660

    accuracy                           0.94     13021
   macro avg       0.94      0.94      0.94     13021
weighted avg       0.94      0.94      0.94     13021



In [4]:
# Unigrams + bigrams, pruned with min_df=0.01: a float min_df is a proportion
# of training documents, so rarer terms are dropped
# (original note: about 3k features remain).
vectorizer2 = CountVectorizer(min_df=0.01, ngram_range=(1, 2), stop_words='english')
vectorizer2.fit(X_train_text)

# Encode the three splits with the pruned vocabulary.
X_train, X_val, X_test = [
    vectorizer2.transform(split_text)
    for split_text in (X_train_text, X_val_text, X_test_text)
]

In [5]:
# Create the forest explicitly instead of re-fitting whatever `clf` happens to
# be left in the kernel, so this cell no longer depends on an earlier cell
# having been run. Settings match the other experiments (100 trees, fixed
# seed), so the results stay comparable.
clf = RandomForestClassifier(n_estimators=100, random_state=0)

# train model
clf.fit(X_train, y_train)

RandomForestClassifier(random_state=0)

In [6]:
# Validation-split report for the forest trained on the min_df=0.01 features.
y_val_pred = clf.predict(X_val)
print(classification_report(y_true=y_val, y_pred=y_val_pred))

              precision    recall  f1-score   support

           0       0.97      0.95      0.96      6361
           1       0.95      0.97      0.96      6659

    accuracy                           0.96     13020
   macro avg       0.96      0.96      0.96     13020
weighted avg       0.96      0.96      0.96     13020



In [7]:
# Test-split report for the forest trained on the min_df=0.01 features.
y_test_pred = clf.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_test_pred))

              precision    recall  f1-score   support

           0       0.97      0.95      0.96      6361
           1       0.95      0.97      0.96      6660

    accuracy                           0.96     13021
   macro avg       0.96      0.96      0.96     13021
weighted avg       0.96      0.96      0.96     13021



In [8]:
# Unigrams + bigrams with much heavier pruning, min_df=0.15: terms must appear
# in at least that proportion of training documents
# (original note: 134 features remain).
vectorizer4 = CountVectorizer(stop_words='english', ngram_range=(1, 2), min_df=0.15).fit(X_train_text)

# Encode the three splits with this small vocabulary.
X_train, X_val, X_test = map(
    vectorizer4.transform,
    (X_train_text, X_val_text, X_test_text),
)

In [9]:
# Create the forest explicitly instead of re-fitting a `clf` inherited from
# another cell, which would make this cell depend on execution order.
# Settings match the other experiments (100 trees, fixed seed).
clf = RandomForestClassifier(n_estimators=100, random_state=0)

# train model
clf.fit(X_train, y_train)

RandomForestClassifier(random_state=0)

In [10]:
# Validation-split report for the forest trained on the min_df=0.15 features.
y_val_pred = clf.predict(X_val)
print(classification_report(y_true=y_val, y_pred=y_val_pred))

              precision    recall  f1-score   support

           0       0.96      0.90      0.93      6361
           1       0.91      0.96      0.94      6659

    accuracy                           0.94     13020
   macro avg       0.94      0.93      0.93     13020
weighted avg       0.94      0.94      0.93     13020



In [11]:
# Test-split report for the forest trained on the min_df=0.15 features.
y_test_pred = clf.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_test_pred))

              precision    recall  f1-score   support

           0       0.96      0.92      0.94      6361
           1       0.92      0.96      0.94      6660

    accuracy                           0.94     13021
   macro avg       0.94      0.94      0.94     13021
weighted avg       0.94      0.94      0.94     13021



## Using CountVectorizer with Bag of Words, Bigrams Only

In [14]:
# Bag-of-words counts over bigrams only (ngram_range=(2, 2)), with English
# stop words removed. The vocabulary is learned from the training text only.
vectorizer3 = CountVectorizer(ngram_range=(2, 2), stop_words='english')
vectorizer3.fit(X_train_text)

# Encode the three splits with the bigram vocabulary.
X_train, X_val, X_test = (
    vectorizer3.transform(split_text)
    for split_text in (X_train_text, X_val_text, X_test_text)
)

In [15]:
# New forest for the bigram-only features (100 trees, fixed random_state).
clf = RandomForestClassifier(
    random_state=0,
    n_estimators=100,
)

# Fit on the bigram training matrix; the fitted estimator is the cell output.
clf.fit(X_train, y_train)

RandomForestClassifier(random_state=0)

In [16]:
# Validation-split report for the bigram-only forest.
y_val_pred = clf.predict(X_val)
print(classification_report(y_true=y_val, y_pred=y_val_pred))

              precision    recall  f1-score   support

           0       0.92      0.96      0.94      6361
           1       0.96      0.92      0.94      6659

    accuracy                           0.94     13020
   macro avg       0.94      0.94      0.94     13020
weighted avg       0.94      0.94      0.94     13020



In [17]:
# Test-split report for the bigram-only forest.
y_test_pred = clf.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_test_pred))

              precision    recall  f1-score   support

           0       0.92      0.96      0.94      6361
           1       0.96      0.92      0.94      6660

    accuracy                           0.94     13021
   macro avg       0.94      0.94      0.94     13021
weighted avg       0.94      0.94      0.94     13021



# Using Tf-Idf with Unigrams, Unigrams + Bigrams, and Bigrams Only

In [11]:
# n-gram ranges to compare, keyed by the label printed for each run.
tfidf_params = {'unigram':(1,1), 'unigram and bigram': (1,2), 'bigram':(2,2)}

# NOTE(review): the test split is scored for every variant; if these numbers
# are used to pick a configuration, select on validation only and score the
# test split once for the chosen model.
for ngram, values in tfidf_params.items():
    # TF-IDF weighted word n-grams with English stop words removed; the
    # vectorizer is fitted on the training text only.
    tfidf_vectorizer = TfidfVectorizer(analyzer='word', stop_words='english', ngram_range=values)
    tfidf_vectorizer.fit(X_train_text)

    # Encode the three splits with the vocabulary fitted above.
    X_train = tfidf_vectorizer.transform(X_train_text)
    X_val = tfidf_vectorizer.transform(X_val_text)
    X_test = tfidf_vectorizer.transform(X_test_text)

    print(f"Model with {ngram}")
    # Build a fresh forest for every n-gram setting instead of re-fitting the
    # `clf` left over from an earlier section, so the loop runs on its own.
    # Settings match the CountVectorizer experiments (100 trees, fixed seed).
    clf = RandomForestClassifier(n_estimators=100, random_state=0)
    clf.fit(X_train, y_train)

    # Validation Data
    print("Testing using validation data:")    
    y_val_pred = clf.predict(X_val)
    print(classification_report(y_val, y_val_pred))
    print("------------------------------------------")

    # Test Data
    print("Testing using test data:")
    y_test_pred = clf.predict(X_test)
    print(classification_report(y_test, y_test_pred))
    print("------------------------------------------")
    print("------------------------------------------")

Model with unigram
Testing using validation data:
              precision    recall  f1-score   support

           0       0.93      0.94      0.93      6361
           1       0.94      0.93      0.94      6659

    accuracy                           0.94     13020
   macro avg       0.93      0.94      0.93     13020
weighted avg       0.94      0.94      0.94     13020

------------------------------------------
Testing using test data:
              precision    recall  f1-score   support

           0       0.94      0.94      0.94      6361
           1       0.94      0.94      0.94      6660

    accuracy                           0.94     13021
   macro avg       0.94      0.94      0.94     13021
weighted avg       0.94      0.94      0.94     13021

------------------------------------------
------------------------------------------
Model with unigram and bigram
Testing using validation data:
              precision    recall  f1-score   support

           0       0.93   