In [1]:
# Packages for data 
import pandas as pd
import numpy as np
import pickle

# Packages for machine learning modelling
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

from sklearn.metrics import classification_report,confusion_matrix, ConfusionMatrixDisplay, accuracy_score
# precision_score, recall_score, f1_score

# Packages for visualisation 
import matplotlib.pyplot as plt

# Packages for NLP
import nltk


# Load the pre-split train / validation / test data

In [2]:
# Read the three pre-split CSV files (train, validation, test) in that order.
train_data, val_data, test_data = map(
    pd.read_csv,
    (
        "new data/train_data.csv",
        "new data/validation_data.csv",
        "new data/test_data.csv",
    ),
)

In [3]:
# For each split, pull out the preprocessed text column (model input) and the
# class label column (target) as plain arrays.
X_train_text, y_train = train_data["text_preprocessed"].values, train_data["class_label"].values
X_val_text, y_val = val_data["text_preprocessed"].values, val_data["class_label"].values
X_test_text, y_test = test_data["text_preprocessed"].values, test_data["class_label"].values

# Creating the Base Model using Linear Support Vector Classification

## Using CountVectorizer with Bag of Words, Unigrams


In [4]:
# Learn the unigram vocabulary from the training texts only (English stop words removed),
# then encode all three splits as term counts against that same vocabulary.
vectorizer = CountVectorizer(ngram_range=(1, 1), stop_words='english').fit(X_train_text)

X_train, X_val, X_test = (
    vectorizer.transform(texts)
    for texts in (X_train_text, X_val_text, X_test_text)
)

In [5]:
# Print the sparse document-term matrix built by the CountVectorizer for the training texts.
# Each printed line has the form "(row, column)  count": the number of times the
# vocabulary term with index `column` occurs in training text number `row`.
# Only non-zero counts are listed.
print(X_train)

  (0, 516)	1
  (0, 1922)	1
  (0, 2029)	1
  (0, 4715)	1
  (0, 5598)	1
  (0, 6839)	1
  (0, 7282)	1
  (0, 7488)	1
  (0, 7630)	1
  (0, 7783)	1
  (0, 8151)	1
  (0, 8253)	1
  (0, 9085)	1
  (0, 9332)	1
  (0, 9881)	1
  (0, 11762)	1
  (0, 14195)	1
  (0, 14350)	1
  (0, 14626)	1
  (0, 15044)	1
  (0, 15420)	1
  (0, 16070)	1
  (0, 16649)	1
  (0, 16853)	2
  (0, 16959)	1
  :	:
  (39059, 207643)	1
  (39059, 207675)	1
  (39059, 207722)	8
  (39059, 209919)	1
  (39059, 211547)	1
  (39059, 212398)	1
  (39059, 213163)	1
  (39059, 213646)	1
  (39059, 213724)	3
  (39059, 213905)	1
  (39059, 214143)	1
  (39059, 214295)	1
  (39059, 214663)	2
  (39059, 216315)	1
  (39059, 217267)	1
  (39059, 217354)	1
  (39059, 217846)	1
  (39059, 217891)	1
  (39059, 218513)	1
  (39059, 218862)	1
  (39059, 218944)	1
  (39059, 219049)	1
  (39059, 219173)	1
  (39059, 219210)	1
  (39059, 220668)	1


In [6]:
# Baseline classifier: a linear SVM with default hyper-parameters, trained on the
# unigram count features. The fit call is left as the cell's last expression so the
# fitted estimator is displayed.
svm_clf = LinearSVC()
svm_clf.fit(X=X_train, y=y_train)



LinearSVC()

In [7]:
# Score the unigram model on the validation split. The report already contains an
# accuracy row, so accuracy_score is not called separately.
y_val_pred = svm_clf.predict(X=X_val)
print(classification_report(y_true=y_val, y_pred=y_val_pred))

              precision    recall  f1-score   support

           0       0.97      0.96      0.96      6361
           1       0.96      0.97      0.96      6659

    accuracy                           0.96     13020
   macro avg       0.96      0.96      0.96     13020
weighted avg       0.96      0.96      0.96     13020



In [8]:
# Score the same fitted unigram model on the test split.
y_test_pred = svm_clf.predict(X=X_test)
print(classification_report(y_true=y_test, y_pred=y_test_pred))

              precision    recall  f1-score   support

           0       0.97      0.96      0.96      6361
           1       0.96      0.97      0.96      6660

    accuracy                           0.96     13021
   macro avg       0.96      0.96      0.96     13021
weighted avg       0.96      0.96      0.96     13021



## Using CountVectorizer with Bag of Words, Unigrams + Bigrams

In [9]:
# Same bag-of-words encoding as before, but the vocabulary now holds both single
# words and two-word sequences (ngram_range 1..2). It is learned from the training texts only.
vectorizer2 = CountVectorizer(ngram_range=(1, 2), stop_words='english').fit(X_train_text)

# Re-encode every split; this replaces the unigram feature matrices.
X_train, X_val, X_test = (
    vectorizer2.transform(texts)
    for texts in (X_train_text, X_val_text, X_test_text)
)

In [10]:
# Validation Data
# Refit the shared classifier on the unigram + bigram count features, then report
# its performance on the validation split.
svm_clf.fit(X=X_train, y=y_train)

y_val_pred2 = svm_clf.predict(X=X_val)
print(classification_report(y_true=y_val, y_pred=y_val_pred2))

              precision    recall  f1-score   support

           0       0.98      0.97      0.97      6361
           1       0.97      0.98      0.97      6659

    accuracy                           0.97     13020
   macro avg       0.97      0.97      0.97     13020
weighted avg       0.97      0.97      0.97     13020





In [11]:
# Test Data
# Evaluate the classifier exactly as it was fitted for the validation report of this
# feature set. It is deliberately NOT refitted here: X_train / y_train have not changed
# since that fit, so a second fit would only repeat the training cost, and because
# LinearSVC() was created without a fixed random_state a refit is not guaranteed to
# reproduce the model that was just validated.
# Requires the preceding validation cell to have been run (it performs the fit).
y_test_pred2 = svm_clf.predict(X_test)
print(classification_report(y_test, y_test_pred2))



              precision    recall  f1-score   support

           0       0.97      0.97      0.97      6361
           1       0.97      0.98      0.97      6660

    accuracy                           0.97     13021
   macro avg       0.97      0.97      0.97     13021
weighted avg       0.97      0.97      0.97     13021



## Using CountVectorizer with Bag of Words, Bigrams Only

In [12]:
# Bag-of-words encoding restricted to two-word sequences only (ngram_range 2..2),
# with the vocabulary learned from the training texts.
vectorizer3 = CountVectorizer(ngram_range=(2, 2), stop_words='english').fit(X_train_text)

# Re-encode every split; this replaces the previous feature matrices.
X_train, X_val, X_test = (
    vectorizer3.transform(texts)
    for texts in (X_train_text, X_val_text, X_test_text)
)

In [13]:
# Validation Data
# Refit the shared classifier on the bigram-only count features, then report its
# performance on the validation split.
svm_clf.fit(X=X_train, y=y_train)

y_val_pred3 = svm_clf.predict(X=X_val)
print(classification_report(y_true=y_val, y_pred=y_val_pred3))

              precision    recall  f1-score   support

           0       0.96      0.94      0.95      6361
           1       0.94      0.96      0.95      6659

    accuracy                           0.95     13020
   macro avg       0.95      0.95      0.95     13020
weighted avg       0.95      0.95      0.95     13020





In [14]:
# Test Data
# Evaluate the classifier exactly as it was fitted for the validation report of this
# feature set. It is deliberately NOT refitted here: X_train / y_train have not changed
# since that fit, so a second fit would only repeat the training cost, and because
# LinearSVC() was created without a fixed random_state a refit is not guaranteed to
# reproduce the model that was just validated.
# Requires the preceding validation cell to have been run (it performs the fit).
y_test_pred3 = svm_clf.predict(X_test)
print(classification_report(y_test, y_test_pred3))

              precision    recall  f1-score   support

           0       0.96      0.93      0.95      6361
           1       0.94      0.97      0.95      6660

    accuracy                           0.95     13021
   macro avg       0.95      0.95      0.95     13021
weighted avg       0.95      0.95      0.95     13021





## Using Tf-Idf with Unigrams, Unigrams + Bigrams, and Bigrams Only

In [15]:
# Repeat the LinearSVC experiment with TF-IDF weighted features, once per n-gram setting.
# Keys are the labels printed in the output; values are the ngram_range passed to the vectorizer.
tfidf_params = {
    'unigram': (1, 1),
    'unigram and bigram': (1, 2),
    'bigram': (2, 2),
}

for ngram, ngram_bounds in tfidf_params.items():
    # Learn vocabulary and IDF weights from the training texts only.
    tfidf_vectorizer = TfidfVectorizer(
        analyzer='word',
        stop_words='english',
        ngram_range=ngram_bounds,
    ).fit(X_train_text)

    # Encode all three splits with the fitted vectorizer.
    X_train, X_val, X_test = (
        tfidf_vectorizer.transform(texts)
        for texts in (X_train_text, X_val_text, X_test_text)
    )

    # A fresh classifier per setting, fitted once and then scored on both held-out splits.
    svm_clf = LinearSVC()
    print(f"Model with {ngram}")
    svm_clf.fit(X=X_train, y=y_train)

    # Validation Data
    print("Testing using validation data:")
    y_val_pred = svm_clf.predict(X=X_val)
    print(classification_report(y_true=y_val, y_pred=y_val_pred))
    print("------------------------------------------")

    # Test Data
    print("Testing using test data:")
    y_test_pred = svm_clf.predict(X=X_test)
    print(classification_report(y_true=y_test, y_pred=y_test_pred))
    print("------------------------------------------")
    print("------------------------------------------")

Model with unigram
Testing using validation data:
              precision    recall  f1-score   support

           0       0.98      0.97      0.97      6361
           1       0.97      0.98      0.97      6659

    accuracy                           0.97     13020
   macro avg       0.97      0.97      0.97     13020
weighted avg       0.97      0.97      0.97     13020

------------------------------------------
Testing using test data:
              precision    recall  f1-score   support

           0       0.97      0.97      0.97      6361
           1       0.97      0.97      0.97      6660

    accuracy                           0.97     13021
   macro avg       0.97      0.97      0.97     13021
weighted avg       0.97      0.97      0.97     13021

------------------------------------------
------------------------------------------
Model with unigram and bigram
Testing using validation data:
              precision    recall  f1-score   support

           0       0.97   

In [45]:
# Ignore for now: unfinished model-comparison scratch code, kept commented out

# best_config = {
#     # 'Max features': 0, 
#     'Best model': None,
#     'Accuracy': 0, 
#     'Precision': 0, 
#     'Recall': 0, 
#     'F1': 0
# }

# # for n in range(500, 7000, 50):
# vectorizer = CountVectorizer(max_features=1650, max_df=0.15) # max_features=500, min_df=0.01
# vectorizer.fit(sentences)
# X_train = vectorizer.transform(sentences)

# validation_X_test = vectorizer.transform(validation_sentences)

# for model_name in models_dict:
#     model = models_dict[model_name]
#     model.fit(X_train, y)
#     predictions = model.predict(validation_X_test)

#     acc = accuracy_score(validation_y,predictions)  # always true label first, then your predicted labels!
#     precision = precision_score(validation_y,predictions) 
#     recall = recall_score(validation_y,predictions) 
#     f1 = f1_score(validation_y,predictions)

#     if f1 > best_config['F1']:
#         best_config['Best model'] = model_name
#         best_config['Accuracy'] = round(acc, 3)
#         best_config['Precision'] = round(precision, 3)
#         best_config['Recall'] = round(recall, 3)
#         best_config['F1'] = round(f1, 3)

# for k, v in best_config.items():
#     print(f'{k}: {v}')