In [1]:
# Packages for data 
import pandas as pd
import numpy as np
import pickle
from collections import Counter
from sklearn_pandas import DataFrameMapper

# Packages for machine learning modelling
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

from sklearn.metrics import classification_report,confusion_matrix, ConfusionMatrixDisplay, accuracy_score
# precision_score, recall_score, f1_score

from sklearn_pandas import DataFrameMapper
# from sklearn.pipeline import Pipeline
# from sklearn.pipeline import FeatureUnion

# Packages for sentiment analysis
from textblob import TextBlob


# Packages for visualisation 
import matplotlib.pyplot as plt

# Packages for NLP
import nltk

from sklearn.metrics import precision_recall_fscore_support

import matplotlib.pyplot as plt



# Reading the data

In [3]:
# Load the train / validation / test splits. index_col=1 makes the second
# CSV column (position 1) the DataFrame index for each split.
train_data, val_data, test_data = (
    pd.read_csv(csv_path, index_col=1)
    for csv_path in (
        "new data/train_data.csv",
        "new data/validation_data.csv",
        "new data/test_data.csv",
    )
)

In [4]:
# For every split, pull out the preprocessed article text (model input) and
# the class label (target) as plain arrays.
X_train_text, y_train = (
    train_data[column].values for column in ("text_preprocessed", "class_label")
)
X_val_text, y_val = (
    val_data[column].values for column in ("text_preprocessed", "class_label")
)
X_test_text, y_test = (
    test_data[column].values for column in ("text_preprocessed", "class_label")
)

# Creating a Model Using a Linear Support Vector Classifier (LinearSVC)

## Using CountVectorizer

In [5]:
# Learn the unigram vocabulary from the training text only, then encode all
# three splits with that same fitted vocabulary.
vectorizer = CountVectorizer(ngram_range=(1,1), stop_words='english').fit(X_train_text)

X_train, X_val, X_test = (
    vectorizer.transform(texts)
    for texts in (X_train_text, X_val_text, X_test_text)
)

In [6]:
# vocabulary_ maps every learned term to its column index, so its size is the
# number of features. This replaces get_feature_names(), which was deprecated
# in scikit-learn 1.0 and removed in 1.2, and avoids materialising every term.
print("number of features used:", len(vectorizer.vocabulary_))

number of features used: 238266


In [7]:
# X_train is what vectorizer.transform() returned for the training text.
# Printing its type shows the term-count features are held in a SciPy sparse
# matrix (one row per article) rather than a dense array.
print(type(X_train))

<class 'scipy.sparse.csr.csr_matrix'>


In [8]:
# Baseline linear SVM with default hyperparameters, trained on the current
# X_train features. The fit call is left as the last expression so the cell
# still displays the fitted estimator.
svm_clf = LinearSVC()
svm_clf.fit(X=X_train, y=y_train)



LinearSVC()

In [9]:
# Compare three n-gram settings for CountVectorizer. For each setting the
# vocabulary is learned from the training text, the existing svm_clf is
# refitted on the resulting counts, and a classification report is printed
# for the validation split and then for the test split.
count_vectorizer_params = {'unigram':(1,1), 'unigram and bigram': (1,2), 'bigram':(2,2)}
rule = "------------------------------------------"

for ngram, ngram_range in count_vectorizer_params.items():
    vectorizer = CountVectorizer(
        ngram_range=ngram_range, stop_words='english'
    ).fit(X_train_text)
    X_train, X_val, X_test = (
        vectorizer.transform(texts)
        for texts in (X_train_text, X_val_text, X_test_text)
    )

    print(f'CountVectorizer Model with {ngram}')
    svm_clf.fit(X_train, y_train)

    # Validation split
    print('Testing with validation data:')
    val_pred = svm_clf.predict(X_val)
    print(classification_report(y_val, val_pred), rule, sep="\n")

    # Test split (the double rule marks the end of this configuration)
    print('Testing using test data:')
    test_pred = svm_clf.predict(X_test)
    print(classification_report(y_test, test_pred), rule, rule, sep="\n")

CountVectorizer Model with unigram




Testing with validation data:
              precision    recall  f1-score   support

           0       0.97      0.96      0.96      6361
           1       0.96      0.97      0.96      6659

    accuracy                           0.96     13020
   macro avg       0.96      0.96      0.96     13020
weighted avg       0.96      0.96      0.96     13020

------------------------------------------
Testing using test data:
              precision    recall  f1-score   support

           0       0.97      0.96      0.96      6361
           1       0.96      0.97      0.96      6660

    accuracy                           0.96     13021
   macro avg       0.96      0.96      0.96     13021
weighted avg       0.96      0.96      0.96     13021

------------------------------------------
------------------------------------------
CountVectorizer Model with unigram and bigram




Testing with validation data:
              precision    recall  f1-score   support

           0       0.98      0.97      0.97      6361
           1       0.97      0.98      0.97      6659

    accuracy                           0.97     13020
   macro avg       0.97      0.97      0.97     13020
weighted avg       0.97      0.97      0.97     13020

------------------------------------------
Testing using test data:
              precision    recall  f1-score   support

           0       0.97      0.97      0.97      6361
           1       0.97      0.98      0.97      6660

    accuracy                           0.97     13021
   macro avg       0.97      0.97      0.97     13021
weighted avg       0.97      0.97      0.97     13021

------------------------------------------
------------------------------------------
CountVectorizer Model with bigram




Testing with validation data:
              precision    recall  f1-score   support

           0       0.96      0.94      0.95      6361
           1       0.94      0.96      0.95      6659

    accuracy                           0.95     13020
   macro avg       0.95      0.95      0.95     13020
weighted avg       0.95      0.95      0.95     13020

------------------------------------------
Testing using test data:
              precision    recall  f1-score   support

           0       0.96      0.93      0.95      6361
           1       0.94      0.97      0.95      6660

    accuracy                           0.95     13021
   macro avg       0.95      0.95      0.95     13021
weighted avg       0.95      0.95      0.95     13021

------------------------------------------
------------------------------------------


## Using Tf-Idf

In [12]:
# Compare three n-gram settings for TfidfVectorizer. Each iteration learns the
# TF-IDF vocabulary from the training text, trains a brand-new LinearSVC, and
# prints a classification report for the validation and test splits.
tfidf_params = {'unigram':(1,1), 'unigram and bigram': (1,2), 'bigram':(2,2)}
rule = "------------------------------------------"

for ngram, ngram_range in tfidf_params.items():
    tfidf_vectorizer = TfidfVectorizer(
        analyzer='word', stop_words='english', ngram_range=ngram_range
    ).fit(X_train_text)
    X_train, X_val, X_test = (
        tfidf_vectorizer.transform(texts)
        for texts in (X_train_text, X_val_text, X_test_text)
    )

    svm_clf = LinearSVC()
    print(f"Model with {ngram}")
    svm_clf.fit(X_train, y_train)

    # Validation split
    print("Testing using validation data:")
    y_val_pred = svm_clf.predict(X_val)
    print(classification_report(y_val, y_val_pred), rule, sep="\n")

    # Test split (the double rule marks the end of this configuration)
    print("Testing using test data:")
    y_test_pred = svm_clf.predict(X_test)
    print(classification_report(y_test, y_test_pred), rule, rule, sep="\n")

Model with unigram
Testing using validation data:
              precision    recall  f1-score   support

           0       0.98      0.97      0.97      6361
           1       0.97      0.98      0.97      6659

    accuracy                           0.97     13020
   macro avg       0.97      0.97      0.97     13020
weighted avg       0.97      0.97      0.97     13020

------------------------------------------
Testing using test data:
              precision    recall  f1-score   support

           0       0.97      0.97      0.97      6361
           1       0.97      0.97      0.97      6660

    accuracy                           0.97     13021
   macro avg       0.97      0.97      0.97     13021
weighted avg       0.97      0.97      0.97     13021

------------------------------------------
------------------------------------------
Model with unigram and bigram
Testing using validation data:
              precision    recall  f1-score   support

           0       0.97   

# Feature Selection

## CountVectorizer

### min_df = 0.01

In [10]:
# Feature selection with CountVectorizer: keep only unigrams/bigrams that
# appear in at least 1% of the training documents (min_df=0.01), then refit
# the existing svm_clf and report validation and test performance.
print('--------------------CountVectorizer--------------------')

vectorizer = CountVectorizer(stop_words='english', ngram_range=(1,2), min_df=0.01)
vectorizer.fit(X_train_text)

X_train = vectorizer.transform(X_train_text)
X_val = vectorizer.transform(X_val_text)
X_test = vectorizer.transform(X_test_text)

print('CountVectorizer Model with min_df=0.01')
svm_clf.fit(X_train, y_train)
# vocabulary_ maps each retained term to its column index, so its size is the
# feature count. (get_feature_names() was removed in scikit-learn 1.2.)
num_features = len(vectorizer.vocabulary_)
print(num_features)

# Validation split
print('Testing with validation data:')
val_pred = svm_clf.predict(X_val)
print(classification_report(y_val, val_pred))
print("------------------------------------------")

# Test split; the report text is kept in `report` for later inspection
print('Testing using test data:')
test_pred = svm_clf.predict(X_test)
report = classification_report(y_test, test_pred)
print(report)
print("------------------------------------------")
print("------------------------------------------")

--------------------CountVectorizer--------------------
CountVectorizer Model with min_df=0.01
3373
Testing with validation data:
              precision    recall  f1-score   support

           0       0.95      0.94      0.94      6361
           1       0.94      0.95      0.95      6659

    accuracy                           0.95     13020
   macro avg       0.95      0.95      0.95     13020
weighted avg       0.95      0.95      0.95     13020

------------------------------------------
Testing using test data:
              precision    recall  f1-score   support

           0       0.95      0.94      0.95      6361
           1       0.95      0.95      0.95      6660

    accuracy                           0.95     13021
   macro avg       0.95      0.95      0.95     13021
weighted avg       0.95      0.95      0.95     13021

------------------------------------------
------------------------------------------




### min_df = 0.15

In [11]:
# Feature selection with CountVectorizer: keep only unigrams/bigrams that
# appear in at least 15% of the training documents (min_df=0.15), then refit
# the existing svm_clf and report validation and test performance.
print('--------------------CountVectorizer--------------------')

vectorizer = CountVectorizer(stop_words='english', ngram_range=(1,2), min_df=0.15)
vectorizer.fit(X_train_text)

X_train = vectorizer.transform(X_train_text)
X_val = vectorizer.transform(X_val_text)
X_test = vectorizer.transform(X_test_text)

print('CountVectorizer Model with min_df=0.15')
svm_clf.fit(X_train, y_train)
# vocabulary_ maps each retained term to its column index, so its size is the
# feature count. (get_feature_names() was removed in scikit-learn 1.2.)
num_features = len(vectorizer.vocabulary_)
print(num_features)

# Validation split
print('Testing with validation data:')
val_pred = svm_clf.predict(X_val)
print(classification_report(y_val, val_pred))
print("------------------------------------------")

# Test split; the report text is kept in `report` for later inspection
print('Testing using test data:')
test_pred = svm_clf.predict(X_test)
report = classification_report(y_test, test_pred)
print(report)
print("------------------------------------------")
print("------------------------------------------")


--------------------CountVectorizer--------------------
CountVectorizer Model with min_df=0.15
134
Testing with validation data:
              precision    recall  f1-score   support

           0       0.94      0.86      0.90      6361
           1       0.88      0.95      0.91      6659

    accuracy                           0.91     13020
   macro avg       0.91      0.91      0.91     13020
weighted avg       0.91      0.91      0.91     13020

------------------------------------------
Testing using test data:
              precision    recall  f1-score   support

           0       0.94      0.87      0.90      6361
           1       0.88      0.95      0.92      6660

    accuracy                           0.91     13021
   macro avg       0.91      0.91      0.91     13021
weighted avg       0.91      0.91      0.91     13021

------------------------------------------
------------------------------------------




## TF-IDF

### min_df = 0.01

In [13]:
# Feature selection with TF-IDF: keep only unigrams/bigrams that appear in at
# least 1% of the training documents (min_df=0.01), then refit the existing
# svm_clf and report validation and test performance.
print('--------------------TF-IDF--------------------')

tfidf_vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1,2), min_df=0.01)
tfidf_vectorizer.fit(X_train_text)

X_train = tfidf_vectorizer.transform(X_train_text)
X_val = tfidf_vectorizer.transform(X_val_text)
X_test = tfidf_vectorizer.transform(X_test_text)

print('TF-IDF Model with min_df=0.01')
svm_clf.fit(X_train, y_train)
# vocabulary_ maps each retained term to its column index, so its size is the
# feature count. (get_feature_names() was removed in scikit-learn 1.2.)
num_features = len(tfidf_vectorizer.vocabulary_)
print(num_features)

# Validation split
print('Testing with validation data:')
val_pred = svm_clf.predict(X_val)
print(classification_report(y_val, val_pred))
print("------------------------------------------")

# Test split; the report text is kept in `report` for later inspection
print('Testing using test data:')
test_pred = svm_clf.predict(X_test)
report = classification_report(y_test, test_pred)
print(report)
print("------------------------------------------")
print("------------------------------------------")

--------------------TF-IDF--------------------
TF-IDF Model with min_df=0.01
3373
Testing with validation data:
              precision    recall  f1-score   support

           0       0.97      0.96      0.96      6361
           1       0.96      0.97      0.96      6659

    accuracy                           0.96     13020
   macro avg       0.96      0.96      0.96     13020
weighted avg       0.96      0.96      0.96     13020

------------------------------------------
Testing using test data:
              precision    recall  f1-score   support

           0       0.96      0.96      0.96      6361
           1       0.96      0.96      0.96      6660

    accuracy                           0.96     13021
   macro avg       0.96      0.96      0.96     13021
weighted avg       0.96      0.96      0.96     13021

------------------------------------------
------------------------------------------


### min_df = 0.15

In [14]:
# Feature selection with TF-IDF: keep only unigrams/bigrams that appear in at
# least 15% of the training documents (min_df=0.15), then refit the existing
# svm_clf and report validation and test performance.
print('--------------------TF-IDF--------------------')

tfidf_vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1,2), min_df=0.15)
tfidf_vectorizer.fit(X_train_text)

X_train = tfidf_vectorizer.transform(X_train_text)
X_val = tfidf_vectorizer.transform(X_val_text)
X_test = tfidf_vectorizer.transform(X_test_text)

print('TF-IDF Model with min_df=0.15')
svm_clf.fit(X_train, y_train)
# vocabulary_ maps each retained term to its column index, so its size is the
# feature count. (get_feature_names() was removed in scikit-learn 1.2.)
num_features = len(tfidf_vectorizer.vocabulary_)
print(num_features)

# Validation split
print('Testing with validation data:')
val_pred = svm_clf.predict(X_val)
print(classification_report(y_val, val_pred))
print("------------------------------------------")

# Test split; the report text is kept in `report` for later inspection
print('Testing using test data:')
test_pred = svm_clf.predict(X_test)
report = classification_report(y_test, test_pred)
print(report)
print("------------------------------------------")
print("------------------------------------------")


--------------------TF-IDF--------------------
TF-IDF Model with min_df=0.15
134
Testing with validation data:
              precision    recall  f1-score   support

           0       0.91      0.89      0.90      6361
           1       0.90      0.92      0.91      6659

    accuracy                           0.90     13020
   macro avg       0.90      0.90      0.90     13020
weighted avg       0.90      0.90      0.90     13020

------------------------------------------
Testing using test data:
              precision    recall  f1-score   support

           0       0.91      0.89      0.90      6361
           1       0.90      0.92      0.91      6660

    accuracy                           0.91     13021
   macro avg       0.91      0.91      0.91     13021
weighted avg       0.91      0.91      0.91     13021

------------------------------------------
------------------------------------------


### Tuning max_features instead

In [15]:
# Alternative to min_df: cap the vocabulary at 10,000 unigram/bigram features
# with max_features, then refit the existing svm_clf and report validation
# and test performance.
tfidf_vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1,2), max_features=10000)
tfidf_vectorizer.fit(X_train_text)

X_train = tfidf_vectorizer.transform(X_train_text)
X_val = tfidf_vectorizer.transform(X_val_text)
X_test = tfidf_vectorizer.transform(X_test_text)

# The label now names the setting actually used; it previously read
# "min_df=0.2", which this cell never applies.
print('TF-IDF Model with max_features=10000')
svm_clf.fit(X_train, y_train)
# Size of the learned vocabulary == number of feature columns.
# (get_feature_names() was removed in scikit-learn 1.2.)
print(len(tfidf_vectorizer.vocabulary_))

# Validation split
print('Testing with validation data:')
val_pred = svm_clf.predict(X_val)
print(classification_report(y_val, val_pred))
print("------------------------------------------")

# Test split
print('Testing using test data:')
test_pred = svm_clf.predict(X_test)
print(classification_report(y_test, test_pred))
print("------------------------------------------")
print("------------------------------------------")

TF-IDF Model with min_df=0.2
10000
Testing with validation data:
              precision    recall  f1-score   support

           0       0.97      0.97      0.97      6361
           1       0.97      0.97      0.97      6659

    accuracy                           0.97     13020
   macro avg       0.97      0.97      0.97     13020
weighted avg       0.97      0.97      0.97     13020

------------------------------------------
Testing using test data:
              precision    recall  f1-score   support

           0       0.97      0.96      0.97      6361
           1       0.97      0.97      0.97      6660

    accuracy                           0.97     13021
   macro avg       0.97      0.97      0.97     13021
weighted avg       0.97      0.97      0.97     13021

------------------------------------------
------------------------------------------


### Finding out which words were eliminated and kept

In [17]:
# List the unigrams/bigrams that survive min_df=0.01.
tfidf_vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1,2), min_df=0.01)
tfidf_vectorizer.fit(X_train_text)
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement. .tolist() turns the returned array into a plain list of
# strings so the printed output keeps the familiar ['a', 'b', ...] form.
list1 = tfidf_vectorizer.get_feature_names_out().tolist()
print(list1)

['000', '10', '10 percent', '10 year', '100', '1000', '10000', '100000', '11', '12', '13', '14', '15', '150', '16', '17', '18', '19', '1960', '1970', '1980', '1990', '20', '20 percent', '20 year', '200', '2000', '2001', '2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2016 elect', '2016 presidenti', '2017', '2018', '2019', '2020', '21', '21st', '21st centuri', '21wire', '21wiretv', '22', '23', '24', '25', '26', '27', '28', '29', '30', '300', '31', '32', '33', '34', '35', '36', '37', '38', '39', '40', '400', '41', '42', '43', '44', '45', '46', '48', '49', '50', '500', '51', '52', '55', '60', '600', '65', '70', '75', '80', '800', '90', '911', 'abandon', 'abc', 'abil', 'abl', 'abort', 'abroad', 'absenc', 'absolut', 'absurd', 'abus', 'academ', 'academi', 'acceler', 'accept', 'access', 'accid', 'accompani', 'accomplish', 'accord', 'accord report', 'account', 'accur', 'accus', 'achiev', 'acknowledg', 'acquir', 'act', 'ac

In [18]:
# tfidf_vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1,2), min_df=0.00)
# tfidf_vectorizer.fit(X_train_text)
# list2 = list(tfidf_vectorizer.get_feature_names_out())

In [19]:
# unique = set(list2) - set(list1)
# print(len(list(unique)))
# print(list(unique)[:1000])

In [21]:
# List the unigrams/bigrams that survive the stricter min_df=0.15.
tfidf_vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1,2), min_df=0.15)
tfidf_vectorizer.fit(X_train_text)
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement. .tolist() turns the returned array into a plain list of
# strings so the printed output keeps the familiar ['a', 'b', ...] form.
list3 = tfidf_vectorizer.get_feature_names_out().tolist()
print(len(list3))
print(list3)

134
['2016', 'accord', 'act', 'ad', 'administr', 'allow', 'america', 'american', 'anoth', 'appear', 'ask', 'attack', 'becom', 'believ', 'campaign', 'case', 'chang', 'citi', 'claim', 'clinton', 'close', 'come', 'comment', 'continu', 'countri', 'critic', 'day', 'democrat', 'donald', 'donald trump', 'elect', 'end', 'everi', 'face', 'fact', 'feder', 'follow', 'forc', 'gener', 'good', 'govern', 'group', 'happen', 'help', 'hillari', 'hillari clinton', 'hous', 'imag', 'includ', 'issu', 'know', 'law', 'lead', 'leader', 'live', 'long', 'look', 'major', 'make', 'mani', 'mean', 'media', 'meet', 'member', 'million', 'month', 'nation', 'need', 'new', 'new york', 'news', 'number', 'obama', 'offic', 'offici', 'order', 'parti', 'peopl', 'person', 'place', 'plan', 'point', 'polici', 'polit', 'possibl', 'post', 'power', 'presid', 'presidenti', 'public', 'question', 'realli', 'recent', 'report', 'repres', 'republican', 'respons', 'reuter', 'right', 'run', 'said', 'say', 'secur', 'senat', 'sever', 'start'

# Feature Selection for Additional Features

## With Added Features

In [2]:
# Load the train and test splits that carry the additional engineered
# features. The validation split is not loaded in this section.
train_data_features, test_data_features = (
    pd.read_csv(csv_path)
    for csv_path in (
        "Final datasets/train_data.csv",
        "Final datasets/test_data.csv",
    )
)

In [3]:
# Targets for the added-feature experiments, taken from the feature-augmented
# frames (this rebinds y_train / y_test; no validation labels are extracted).
y_train, y_test = (
    frame["class_label"].values
    for frame in (train_data_features, test_data_features)
)

### All added features for min_df = 0.01 (3k)

In [4]:
# TF-IDF over unigrams and bigrams with min_df=0.01, the threshold chosen in
# the feature-selection section.
tfidf_vectorizer = TfidfVectorizer(
    analyzer='word',
    stop_words='english',
    ngram_range=(1,2),
    min_df=0.01,
)

# All thirteen engineered columns; the mapper passes them through unchanged
# because their transformer is None.
added_feature_columns = [
    'char_count', 'word_count', 'sentence_count', 'prop_unique_words',
    'avg_sentence_length', 'prop_punctuations', 'prop_stopwords',
    'prop_words_in_quotes', 'prop_nouns', 'prop_verbs', 'prop_adjectives',
    'prop_discourse_relations', 'textblob_sentiment',
]

# The mapper places the engineered columns and the TF-IDF vectors of
# 'text_preprocessed' side by side in a single feature matrix.
mapper = DataFrameMapper([
    (added_feature_columns, None),
    ('text_preprocessed', tfidf_vectorizer),
])

# Fit on the training frame only; the test frame is encoded with the fitted mapper.
X_train_added_features = mapper.fit_transform(train_data_features)
X_test_added_features = mapper.transform(test_data_features)


In [5]:
# Create a new LinearSVC with default hyperparameters and bind it to svm_clf
# for the added-feature experiments.
svm_clf = LinearSVC()

In [6]:
# Train on the combined (engineered + TF-IDF) training matrix and score the
# held-out test matrix. fit() returns the estimator, so the calls can be chained.
y_pred = svm_clf.fit(X_train_added_features, y_train).predict(X_test_added_features)
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.97      0.95      0.96      6361
           1       0.96      0.97      0.96      6660

    accuracy                           0.96     13021
   macro avg       0.96      0.96      0.96     13021
weighted avg       0.96      0.96      0.96     13021



### Selected added features for min_df = 0.01 (3k)

In [7]:
# Build the vectorizer inside this cell rather than reusing whichever
# tfidf_vectorizer an earlier cell last defined. Otherwise running the cells
# out of order would silently switch this experiment to a different min_df.
tfidf_vectorizer = TfidfVectorizer(analyzer='word', stop_words='english', ngram_range=(1,2), min_df=0.01)

# Only the selected subset of engineered columns is passed through unchanged
# (transformer=None), next to the TF-IDF vectors of 'text_preprocessed'.
mapper = DataFrameMapper([
    (['char_count', 'word_count', 'prop_unique_words', 'avg_sentence_length', 'prop_punctuations', 'prop_stopwords', 'prop_nouns'], None), 
    ('text_preprocessed', tfidf_vectorizer)
])

# Fit the mapper on the training frame only; the test frame is encoded with the fitted mapper.
X_train_added_features = mapper.fit_transform(train_data_features)
X_test_added_features = mapper.transform(test_data_features)

# Refit the existing svm_clf on the new feature matrix and score the test split.
svm_clf.fit(X_train_added_features, y_train)
y_pred = svm_clf.predict(X_test_added_features)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.95      0.97      0.96      6361
           1       0.97      0.95      0.96      6660

    accuracy                           0.96     13021
   macro avg       0.96      0.96      0.96     13021
weighted avg       0.96      0.96      0.96     13021



### All added features for min_df = 0.15 (134)

In [8]:
# TF-IDF over unigrams and bigrams with the stricter min_df=0.15 threshold.
tfidf_vectorizer = TfidfVectorizer(
    analyzer='word',
    stop_words='english',
    ngram_range=(1,2),
    min_df=0.15,
)

# All thirteen engineered columns; the mapper passes them through unchanged
# because their transformer is None.
added_feature_columns = [
    'char_count', 'word_count', 'sentence_count', 'prop_unique_words',
    'avg_sentence_length', 'prop_punctuations', 'prop_stopwords',
    'prop_words_in_quotes', 'prop_nouns', 'prop_verbs', 'prop_adjectives',
    'prop_discourse_relations', 'textblob_sentiment',
]

# The mapper places the engineered columns and the TF-IDF vectors of
# 'text_preprocessed' side by side in a single feature matrix.
mapper = DataFrameMapper([
    (added_feature_columns, None),
    ('text_preprocessed', tfidf_vectorizer),
])

# Fit on the training frame only; the test frame is encoded with the fitted mapper.
X_train_added_features = mapper.fit_transform(train_data_features)
X_test_added_features = mapper.transform(test_data_features)

# Refit the existing svm_clf and score the test split
# (fit() returns the estimator, so the calls can be chained).
y_pred = svm_clf.fit(X_train_added_features, y_train).predict(X_test_added_features)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.96      0.85      0.90      6361
           1       0.87      0.96      0.92      6660

    accuracy                           0.91     13021
   macro avg       0.92      0.91      0.91     13021
weighted avg       0.91      0.91      0.91     13021



### Selected added features for min_df = 0.15 (134)

In [9]:
# Build the vectorizer inside this cell rather than reusing whichever
# tfidf_vectorizer an earlier cell last defined. Otherwise running the cells
# out of order would silently switch this experiment to a different min_df.
tfidf_vectorizer = TfidfVectorizer(analyzer='word', stop_words='english', ngram_range=(1,2), min_df=0.15)

# Only the selected subset of engineered columns is passed through unchanged
# (transformer=None), next to the TF-IDF vectors of 'text_preprocessed'.
mapper = DataFrameMapper([
    (['char_count', 'word_count', 'prop_unique_words', 'avg_sentence_length', 'prop_punctuations', 'prop_stopwords', 'prop_nouns'], None), 
    ('text_preprocessed', tfidf_vectorizer)
])

# Fit the mapper on the training frame only; the test frame is encoded with the fitted mapper.
X_train_added_features = mapper.fit_transform(train_data_features)
X_test_added_features = mapper.transform(test_data_features)

# Refit the existing svm_clf on the new feature matrix and score the test split.
svm_clf.fit(X_train_added_features, y_train)
y_pred = svm_clf.predict(X_test_added_features)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.84      0.95      0.89      6361
           1       0.95      0.83      0.89      6660

    accuracy                           0.89     13021
   macro avg       0.90      0.89      0.89     13021
weighted avg       0.90      0.89      0.89     13021

