In [None]:
import pandas as pd
import numpy as np
import re
import torch
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from scipy.sparse import csr_matrix
from scipy.sparse import hstack

### 1. Exploratory Data Analysis.

In [None]:
# Read the training and test CSV files from the mounted Drive folder.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/drive/MyDrive/SecondAssignmentData/train.csv',
        '/content/drive/MyDrive/SecondAssignmentData/test.csv',
    )
)

In [None]:
# Preview the last rows of the training frame as loaded, before any preprocessing.
train_df.tail()

Unnamed: 0,Index,Sentiment,Text
1048570,1048570,1,"Back home, thought I'd done for the week, but ..."
1048571,1048571,1,My GrandMa is making Dinenr with my Mum
1048572,1048572,1,Mid-morning snack time... A bowl of cheese noo...
1048573,1048573,1,@ShaDeLa same here say it like from the Termi...
1048574,1048574,1,@DestinyHope92 im great thaanks wbuu?


In [None]:
# Number of rows in the training frame.
print(train_df.shape[0])

1048575


In [None]:
# Confirm that the 'Sentiment' column only ever holds the values 0 or 1.
# The first two lines count each class; the third counts every row whose value
# is anything else. Using isin() rather than a '>= 2' comparison also catches
# negative, fractional and missing values, so a printed 0 really does mean
# "nothing but 0 and 1".
print((train_df['Sentiment'] == 0).sum())
print((train_df['Sentiment'] == 1).sum())
print((~train_df['Sentiment'].isin([0, 1])).sum())

800000
248575
0


In [None]:
# Missing values would have to be dealt with before feature extraction, so
# count the nulls in every column of both frames first.
print(train_df.isna().sum())
print(test_df.isna().sum())

# Every count reported for this run is zero, so preprocessing can continue as-is.

Index        0
Sentiment    0
Text         0
dtype: int64
Index        0
Sentiment    0
Text         0
dtype: int64


### 2. Text Preprocessing.

In [None]:
# Preprocessing plan, step 1: convert all text to lowercase.
# The same transformation is applied to the training and the test frame.
for frame in (train_df, test_df):
    frame['Text'] = frame['Text'].str.lower()

In [None]:
# Check the effect of the lowercasing step on the training text.
train_df.tail()

Unnamed: 0,Index,Sentiment,Text
1048570,1048570,1,"back home, thought i'd done for the week, but ..."
1048571,1048571,1,my grandma is making dinenr with my mum
1048572,1048572,1,mid-morning snack time... a bowl of cheese noo...
1048573,1048573,1,@shadela same here say it like from the termi...
1048574,1048574,1,@destinyhope92 im great thaanks wbuu?


In [None]:
# Check the effect of the lowercasing step on the test text.
test_df.tail()

Unnamed: 0,Index,Sentiment,Text
354,492,1,"after using latex a lot, any other typeset mat..."
355,494,0,"on that note, i hate word. i hate pages. i hat..."
356,495,1,ahhh... back in a *real* text editing environm...
357,496,0,"trouble in iran, i see. hmm. iran. iran so far..."
358,497,0,reading the tweets coming out of iran... the w...


In [None]:
# 2. Remove special characters using regular expression operations.
# The pattern matches every character that is neither a word character (\w)
# nor whitespace (\s); each match is replaced by the empty string.
#
# regex=True is passed explicitly: pandas 2.0 changed the default of
# Series.str.replace to regex=False, under which the pattern is searched for as
# literal text and nothing is stripped. The raw string (r'...') keeps Python
# from treating the backslashes as string escapes.
train_df['Text'] = train_df['Text'].str.replace(r'[^\w\s]', '', regex=True)
test_df['Text'] = test_df['Text'].str.replace(r'[^\w\s]', '', regex=True)

  train_df['Text'] = train_df['Text'].str.replace('[^\w\s]', '')
  test_df['Text'] = test_df['Text'].str.replace('[^\w\s]', '')


In [None]:
# Check the effect of the special-character removal on the training text.
train_df.tail()

Unnamed: 0,Index,Sentiment,Text
1048570,1048570,1,back home thought id done for the week but jus...
1048571,1048571,1,my grandma is making dinenr with my mum
1048572,1048572,1,midmorning snack time a bowl of cheese noodles...
1048573,1048573,1,shadela same here say it like from the termin...
1048574,1048574,1,destinyhope92 im great thaanks wbuu


In [None]:
# 3. Remove digits:
# In regular expression syntax, '\d+' matches a run of one or more digits
# (from COP4020); each run is replaced by the empty string.
#
# regex=True is passed explicitly: pandas 2.0 changed the default of
# Series.str.replace to regex=False, under which '\d+' is searched for as
# literal text and no digit is removed. The raw string (r'...') keeps Python
# from treating the backslash as a string escape.
train_df['Text'] = train_df['Text'].str.replace(r'\d+', '', regex=True)
test_df['Text'] = test_df['Text'].str.replace(r'\d+', '', regex=True)

  train_df['Text'] = train_df['Text'].str.replace('\d+', '')
  test_df['Text'] = test_df['Text'].str.replace('\d+', '')


In [None]:
# Check the effect of the digit removal on the training text.
train_df.tail()

Unnamed: 0,Index,Sentiment,Text
1048570,1048570,1,back home thought id done for the week but jus...
1048571,1048571,1,my grandma is making dinenr with my mum
1048572,1048572,1,midmorning snack time a bowl of cheese noodles...
1048573,1048573,1,shadela same here say it like from the termin...
1048574,1048574,1,destinyhope im great thaanks wbuu


### 3. Linguistic Feature Extraction.

In [None]:
# TF-IDF feature extraction scores how important each word is to a tweet;
# those scores are the inputs for the models that follow.

# Pull the two columns out as plain Python lists for the vectorizer and the estimators.
train_sentiment = train_df['Sentiment'].to_list()
train_text = train_df['Text'].to_list()

# Learn the vocabulary and IDF weights from the training text and encode it in one pass.
vectorizer = TfidfVectorizer()
features = vectorizer.fit_transform(raw_documents=train_text)

In [None]:
# Prepend the 'Sentiment' values as one leading column in front of the TF-IDF columns.
# NOTE(review): 'Sentiment' is also the target the classifiers are fitted on, so this
# column hands the label to the models as an input (target leakage). Dropping it has to
# be done together with the test-set feature construction, which builds the same
# layout for the fitted scaler and models -- TODO fix both in one change.
# NOTE(review): this cell rebinds `features`, so running it twice prepends the column twice.
sentiment = np.asarray(train_sentiment).reshape(-1, 1)
features = hstack([sentiment, csr_matrix(features)])

# Scale every column before training. with_mean=False turns centering off,
# which StandardScaler requires for sparse input.
scaler = StandardScaler(with_mean=False)
norm_features = scaler.fit_transform(features)

### 4. Sentiment Classification Model(s).

In [None]:
# Perform logistic regression on the data, holding out 20% of the rows for validation.

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# random_state pins the shuffle, so the split -- and every score computed from
# it -- is the same on each run. stratify keeps the proportion of 0 and 1
# labels equal in the training and validation parts.
X_train, X_test, y_train, y_test = train_test_split(norm_features,
                                                    train_sentiment,
                                                    test_size = 0.2,
                                                    random_state = 42,
                                                    stratify = train_sentiment
                                                    )

# max_iter is raised above the library default to give the solver room to converge.
clf = LogisticRegression(max_iter = 1500)
clf.fit(X_train, y_train)


In [None]:
# Predict labels for the held-out validation split (X_test from train_test_split)
# with the fitted logistic regression. `y_pred` is reassigned by the later
# test-set evaluation cells.
y_pred = clf.predict(X_test)

In [None]:
# Score the logistic regression on the validation split.
from sklearn.metrics import accuracy_score

# Share of validation rows whose predicted label equals the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print('Accuracy:', accuracy)

Accuracy: 1.0


In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

# ROC AUC is computed from column 1 of predict_proba rather than from the hard predictions.
positive_scores = clf.predict_proba(X_test)[:, 1]

# Validation metrics for the logistic regression; precision, recall and F1 use
# the 'weighted' average across the classes.
accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1 = (
    scorer(y_test, y_pred, average='weighted')
    for scorer in (precision_score, recall_score, f1_score)
)
roc_auc = roc_auc_score(y_test, positive_scores)
conf_matrix = confusion_matrix(y_test, y_pred)

# Report every metric, one per line.
for label, value in (
    ("Accuracy: ", accuracy),
    ("Precision: ", precision),
    ("Recall: ", recall),
    ("F1 score: ", f1),
    ("ROC AUC score: ", roc_auc),
    ("Confusion matrix: \n", conf_matrix),
):
    print(label, value)

Accuracy:  1.0
Precision:  1.0
Recall:  1.0
F1 score:  1.0
ROC AUC score:  1.0
Confusion matrix: 
 [[159925      0]
 [     0  49790]]


In [None]:
# Train Naive Bayes classifier on the full scaled training matrix.
from sklearn.naive_bayes import MultinomialNB

clf_nb = MultinomialNB()
clf_nb.fit(norm_features, train_sentiment)

# Evaluate the Naive Bayes classifier on the training data.
# Predictions are made on norm_features, the same scaled matrix the model was
# fitted on; the unscaled `features` matrix is on a different scale than the
# inputs the model learned from.
y_pred_train = clf_nb.predict(norm_features)
print('Training Accuracy:', accuracy_score(train_sentiment, y_pred_train))

Training Accuracy: 0.9992198936652124


In [None]:
# Train a Decision Tree classifier on the full scaled training matrix.
from sklearn.tree import DecisionTreeClassifier

# random_state fixes the estimator's internal randomness, so refitting on the
# same data reproduces the same tree and the same downstream scores.
clf_tree = DecisionTreeClassifier(random_state = 42)
clf_tree.fit(norm_features, train_sentiment)

### 5. Model Evaluation.

In [None]:
test_tokens = test_df['Text'].tolist()

# Extract TF-IDF features for the test data with the vocabulary learned from the training text.
# The result is kept sparse: a dense copy would allocate one float for every
# (row, vocabulary word) pair.
test_tfidf_features = vectorizer.transform(test_tokens)

# Concatenate the sentiment column with the TF-IDF matrix so that the column
# layout matches the matrix the scaler and the models were fitted on.
# NOTE(review): this column is the label itself, so the models receive the
# answer as an input; remove it here and in the training feature matrix together.
test_features = hstack(
    [test_df['Sentiment'].values.reshape(-1, 1), test_tfidf_features],
    format='csr',
)

# Standardize the feature matrix with the scaling learned on the training features.
test_features = scaler.transform(test_features)

# Evaluate the logistic regression and Naive Bayes models on the test data.
# `y_pred` is left holding the predictions of the last model evaluated.
for model_name, model in (('logistic regression', clf), ('Naive Bayes', clf_nb)):
    y_pred = model.predict(test_features)
    print(f'Accuracy for {model_name}:', accuracy_score(test_df['Sentiment'], y_pred))
    print('Confusion Matrix:\n', confusion_matrix(test_df['Sentiment'], y_pred))

Accuracy for logistic regression: 0.49303621169916434
Confusion Matrix:
 [[177   0]
 [182   0]]
Accuracy for Naive Bayes: 0.9832869080779945
Confusion Matrix:
 [[171   6]
 [  0 182]]


In [None]:
# Score the decision tree on the test features built in the previous cell.
y_pred = clf_tree.predict(test_features)
tree_accuracy = accuracy_score(test_df['Sentiment'], y_pred)
tree_confusion = confusion_matrix(test_df['Sentiment'], y_pred)
print('Accuracy:', tree_accuracy)
print('Confusion Matrix:\n', tree_confusion)

Accuracy: 0.49303621169916434
Confusion Matrix:
 [[177   0]
 [182   0]]


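We see that the Naive Bayes algorithm ultimately has the best test performance (about 98% accuracy) compared to the decision tree and logistic regression, which both reached only about 49%. Their confusion matrices are identical: both models predicted class 0 for every test tweet. This was an odd result, and it indicates that there may have been an issue with the feature extraction or data splitting. One concrete problem is visible in the feature-extraction step: the 'Sentiment' label was concatenated onto the TF-IDF matrix as an input feature, so each model could read the answer directly from its input. That is consistent with the perfect validation scores for logistic regression (accuracy, precision, recall, F1 and ROC AUC all 1.0), and it means that none of the scores above measure how well the models classify sentiment from the text alone. Furthermore, the accuracies varied wildly during testing. The logistic regression algorithm also had perfect accuracy for the first test and immediately dropped thereafter, which was an odd result; the all-zero test predictions are not explained by the label column alone and should be re-examined after removing the label from the features and re-running the notebook from a fresh kernel.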