In [16]:
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import TreebankWordTokenizer
from nltk.stem import WordNetLemmatizer
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score, confusion_matrix, classification_report
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from imblearn.over_sampling import SMOTE
from collections import Counter

In [17]:
train_df = pd.read_csv("../Data/raw-data/train.csv")
test_df = pd.read_csv("../Data/raw-data/test.csv")

In [18]:
def preprocess(text):

    tokenizer = TreebankWordTokenizer() 
    lemmatizer = WordNetLemmatizer()
    stopwords_list = stopwords.words('english')
    point_noise = string.punctuation + '0123456789'
    
    cleanText = re.sub(r'http[s]?://(?:[A-Za-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9A-Fa-f][0-9A-Fa-f]))+', "", text)
    cleanText = re.sub(r'@[a-zA-Z0-9\_\w]+', '', cleanText)
    cleanText = re.sub(r'#[a-zA-Z0-9]+', '', cleanText)
    cleanText = re.sub(r'RT', '', cleanText)
    cleanText = cleanText.lower()
    cleanText = re.sub(r'([https][http][htt][th][ht])', "", cleanText)
    cleanText = ''.join([word for word in cleanText if word not in point_noise])
    cleanText = "".join(word for word in cleanText if ord(word)<128)
    cleanText = tokenizer.tokenize(cleanText)
    cleanText = [lemmatizer.lemmatize(word) for word in cleanText if word not in stopwords_list]
    cleanText = [word for word in cleanText if len(word) >= 2]
    cleanText = ' '.join(cleanText)
    return cleanText

In [19]:
train_df['message'] = train_df['message'].apply(preprocess)
test_df['message'] = test_df['message'].apply(preprocess)

In [20]:
vector = TfidfVectorizer(ngram_range=(1,20), min_df=2)
train_features = vector.fit_transform(train_df['message'])
test_features = vector.transform(test_df['message'])

In [21]:
X_train, X_val, y_train, y_val = train_test_split(
    train_features, 
    train_df['sentiment'],
    test_size=0.2,
    shuffle=True,
    random_state=42
)

In [22]:
print("Applying SMOTE...")
sm = SMOTE(random_state=42)
X_train_sm, y_train_sm = sm.fit_resample(X_train, y_train)

print('Before SMOTE:', Counter(y_train))
print('After SMOTE:', Counter(y_train_sm))

Applying SMOTE...
Before SMOTE: Counter({1: 163, -1: 109})
After SMOTE: Counter({-1: 163, 1: 163})


In [23]:
names = ['LogisticRegression', 'ForestClassifier', 'NaiveBayes', 'LinearSVM', 'KNNClassifier']
classifiers = [
    LogisticRegression(C=10),
    RandomForestClassifier(criterion='entropy'),
    MultinomialNB(alpha=1),
    LinearSVC(C=10, class_weight=None),
    KNeighborsClassifier(n_neighbors=10)
]

results = []
models = {}

for name, clf in zip(names, classifiers):
    print(f'Training {name}...')
    
    clf.fit(X_train_sm, y_train_sm)
    
    val_pred = clf.predict(X_val)
    test_pred = clf.predict(test_features)
    
    val_accuracy = accuracy_score(y_val, val_pred)
    val_f1 = f1_score(y_val, val_pred, average='macro')
    
    models[name] = clf
    results.append([name, val_accuracy, val_f1])
    
    test_df[f'{name}_predictions'] = test_pred
    
    print(f'{name} - Validation Accuracy: {val_accuracy:.4f}, F1: {val_f1:.4f}')

Training LogisticRegression...
LogisticRegression - Validation Accuracy: 0.9130, F1: 0.9126
Training ForestClassifier...
ForestClassifier - Validation Accuracy: 0.8406, F1: 0.8315
Training NaiveBayes...
NaiveBayes - Validation Accuracy: 0.9130, F1: 0.9126
Training LinearSVM...
LinearSVM - Validation Accuracy: 0.9420, F1: 0.9419
Training KNNClassifier...
KNNClassifier - Validation Accuracy: 0.4783, F1: 0.3463


In [24]:
results_df = pd.DataFrame(results, columns=['Classifier', 'Validation Accuracy', 'Validation F1'])
results_df.set_index('Classifier', inplace=True)

print("\nModel Performance on Validation Set:")
print(results_df.sort_values('Validation F1', ascending=False))


Model Performance on Validation Set:
                    Validation Accuracy  Validation F1
Classifier                                            
LinearSVM                      0.942029       0.941919
LogisticRegression             0.913043       0.912584
NaiveBayes                     0.913043       0.912584
ForestClassifier               0.840580       0.831521
KNNClassifier                  0.478261       0.346316


In [25]:
output_file = "model_predictions.csv"
test_df.to_csv(output_file, index=False)
print(f"\nPredictions saved to {output_file}")


Predictions saved to model_predictions.csv


In [26]:
if 'sentiment' in test_df.columns:
    print("\nModel Performance on Dataset:")
    for name in names:
        predictions = test_df[f'{name}_predictions']
        accuracy = accuracy_score(test_df['sentiment'], predictions)
        f1 = f1_score(test_df['sentiment'], predictions, average='macro')
        print(f"\n{name}:")
        print(f"Accuracy: {accuracy:.4f}")
        print(f"F1 Score: {f1:.4f}")


Model Performance on Dataset:

LogisticRegression:
Accuracy: 0.8455
F1 Score: 0.8430

ForestClassifier:
Accuracy: 0.7077
F1 Score: 0.7076

NaiveBayes:
Accuracy: 0.8455
F1 Score: 0.8405

LinearSVM:
Accuracy: 0.8309
F1 Score: 0.8278

KNNClassifier:
Accuracy: 0.6221
F1 Score: 0.3937


In [27]:
df = pd.read_csv("model_predictions.csv")

selected_columns = ['LogisticRegression_predictions', 'NaiveBayes_predictions', 'LinearSVM_predictions']
df['model_label'] = df[selected_columns].mode(axis=1).iloc[:,0]

for col in df.columns:
    if '_predictions' in col:
        df = df.drop(col, axis=1)

df.to_csv("../Data/processed-data/model_predictions.csv", index=False)

print("\nDistribution of final model labels:")
print(df['model_label'].value_counts())


Distribution of final model labels:
model_label
-1    244
 1    235
Name: count, dtype: int64


In [28]:
df = pd.read_csv("../Data/processed-data/model_predictions.csv")

president_df = df[df['state'] == 'US']
senators_df = df[df['state'] != 'US']

president_df.to_csv("../Data/processed-data/President_sentiment.csv", index=False)
senators_df.to_csv("../Data/processed-data/Senators_sentiment.csv", index=False)