In [1]:
from locallyWeightedRandomForest import LocallyWeightedRandomForest
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# import seaborn
import scipy
import sklearn
from word_preprocess import *
# from sentence_transformers import SentenceTransformer
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
import statistics

from importlib import reload
import locallyWeightedRandomForest
reload(locallyWeightedRandomForest)
from locallyWeightedRandomForest import LocallyWeightedRandomForest

[nltk_data] Downloading package wordnet to /u/arinaldi/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /u/arinaldi/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


## Import Data

In [2]:
# Training sample: the first CSV column is used as the frame's index.
train_df = pd.read_csv(
    'twitter_dataset/twitter_training_sample.csv',
    index_col=0,
)
print(f'Training shape: {train_df.shape}')

# The validation file is read without a header row, so the column names are
# assigned by hand right after loading.
test_df = pd.read_csv('twitter_dataset/twitter_validation.csv', header=None)
test_df.columns = ['tweet_id', 'video_game', 'sentiment', 'text']
print(f'Testing shape: {test_df.shape}')

Training shape: (5000, 5)
Testing shape: (1000, 4)


### Remove NA

In [3]:
# Keep only fully populated rows: a single missing value disqualifies the row.
train_df = train_df.dropna(axis=0, how='any')
test_df = test_df.dropna(axis=0, how='any')

print(f'Training shape: {train_df.shape}')
print(f'Testing shape: {test_df.shape}')

Training shape: (5000, 5)
Testing shape: (1000, 4)


## Remove Duplicates

In [4]:
# np.unique reports, for every distinct tweet_id, the position of its first
# occurrence, ordered by the sorted ids. Selecting those positions therefore
# both de-duplicates the frame and leaves it sorted by tweet_id.
unique_tweet_ids, unique_train_index = np.unique(
    train_df['tweet_id'], return_index=True
)
train_df = train_df.iloc[unique_train_index]
print(f'Training shape: {train_df.shape}')

Training shape: (5000, 5)


## Pre-processing Text

In [5]:
# The training step is left disabled; the training frame shown below already
# has a clean_text column (presumably stored in the training CSV — TODO confirm).
# train_df['clean_text'] = train_df['text'].apply(clean_string)

# Clean every validation tweet with the project's clean_string helper.
test_df['clean_text'] = test_df['text'].apply(clean_string)

In [6]:
# Preview the first rows of the training frame (raw text next to clean_text).
train_df.head()

Unnamed: 0,tweet_id,video_game,sentiment,text,clean_text
4668,2,Amazon,Negative,I’m really disappointed with amazon today! I o...,’ really disappointed amazon today ordered kin...
4680,4,Amazon,Negative,@amazon probably some of the worst customer se...,amazon probably worst customer service ’ deal ...
4692,6,Amazon,Neutral,Love Speculative Fiction?,love speculative fiction
4698,7,Amazon,Neutral,Amazon I Israel. . carlnorberg.se/2019/05/01/a...,amazon israel. carlnorberg.se/2019/05/01/ama… ...
4728,12,Amazon,Positive,Amazon has the coolest shit I never thought I ...,amazon coolest shit never thought needed purch...


## Remove Empty Strings

In [7]:
# Discard rows whose cleaned text is the empty string.
train_df = train_df[train_df['clean_text'] != ""]
test_df = test_df[test_df['clean_text'] != ""]

print(f'Training shape: {train_df.shape}')
print(f'Testing shape: {test_df.shape}')

Training shape: (5000, 5)
Testing shape: (999, 5)


## Convert Text To Sentence Embedding

In [7]:
# The notebook's import cell has the SentenceTransformer import commented out,
# so this cell used to fail with a NameError on a fresh kernel. Importing it
# here keeps the (expensive, optional) dependency local to the only cell that
# needs it; the cached embeddings can still be loaded without it.
from sentence_transformers import SentenceTransformer

# Encode both the cleaned and the raw tweet text with the same SBERT model so
# the two representations can be compared downstream.
model = SentenceTransformer('all-MiniLM-L6-v2')
train_sbert_clean = model.encode(train_df['clean_text'].to_list())
test_sbert_clean = model.encode(test_df['clean_text'].to_list())
train_sbert_raw = model.encode(train_df['text'].to_list())
test_sbert_raw = model.encode(test_df['text'].to_list())

In [8]:
# Persist every embedding matrix as comma-separated text so the encoding step
# does not have to be repeated.
for csv_path, embedding_matrix in (
    ('twitter_dataset/train_sbert_clean.csv', train_sbert_clean),
    ('twitter_dataset/test_sbert_clean.csv', test_sbert_clean),
    ('twitter_dataset/train_sbert_raw.csv', train_sbert_raw),
    ('twitter_dataset/test_sbert_raw.csv', test_sbert_raw),
):
    np.savetxt(csv_path, embedding_matrix, delimiter=',')

In [8]:
# Reload the cached embedding matrices from their CSV files.
train_sbert_clean, test_sbert_clean, train_sbert_raw, test_sbert_raw = (
    np.loadtxt(csv_path, delimiter=',')
    for csv_path in (
        'twitter_dataset/train_sbert_clean.csv',
        'twitter_dataset/test_sbert_clean.csv',
        'twitter_dataset/train_sbert_raw.csv',
        'twitter_dataset/test_sbert_raw.csv',
    )
)

## Baseline

In [9]:
# Features: SBERT embeddings of the raw tweet text.
X_train, X_test = train_sbert_raw, test_sbert_raw

# Targets: integer-encode the sentiment labels. The encoder is fitted on the
# training labels only and the same mapping is reused for the test labels.
labeller = LabelEncoder()
y_train = labeller.fit_transform(train_df['sentiment'].values)
y_test = labeller.transform(test_df['sentiment'].values)

### Random Forest

In [20]:
# Hyper-parameter grid for the baseline random forest.
# NOTE: the last max_samples entry must be the float 1.0 to mean "bootstrap
# from all training rows". The integer 1 is interpreted by scikit-learn as
# "draw exactly one sample per tree", which silently adds degenerate
# candidates to the search instead of the intended 100% setting.
grid_search_parameters = {
    'max_depth': [5, 10, 20, 50],
    'n_estimators': [10, 25, 50, 100, 250, 500],
    'max_samples': [0.3, 0.5, 0.7, 0.9, 1.0],
}

# 5-fold cross-validated exhaustive search, using every available core.
random_forest_classifier = RandomForestClassifier(random_state=0)
random_forest_classifier_GS = GridSearchCV(
    random_forest_classifier,
    grid_search_parameters,
    cv=5,
    verbose=3,
    n_jobs=-1,
)
random_forest_classifier_GS.fit(X_train, y_train)

print("Best Parameters: ", random_forest_classifier_GS.best_params_)
print("Best Score: ", random_forest_classifier_GS.best_score_)


Fitting 5 folds for each of 120 candidates, totalling 600 fits
[CV 1/5] END max_depth=5, max_samples=0.3, n_estimators=10;, score=0.436 total time=   0.9s
[CV 2/5] END max_depth=5, max_samples=0.3, n_estimators=25;, score=0.446 total time=   1.1s
[CV 5/5] END max_depth=5, max_samples=0.3, n_estimators=10;, score=0.429 total time=   0.9s
[CV 4/5] END max_depth=5, max_samples=0.3, n_estimators=10;, score=0.407 total time=   1.3s
[CV 2/5] END max_depth=5, max_samples=0.3, n_estimators=10;, score=0.411 total time=   1.2s
[CV 3/5] END max_depth=5, max_samples=0.3, n_estimators=10;, score=0.428 total time=   1.2s
[CV 3/5] END max_depth=5, max_samples=0.3, n_estimators=25;, score=0.465 total time=   1.7s
[CV 1/5] END max_depth=5, max_samples=0.3, n_estimators=25;, score=0.457 total time=   2.0s
[CV 5/5] END max_depth=5, max_samples=0.3, n_estimators=25;, score=0.483 total time=   1.2s
[CV 4/5] END max_depth=5, max_samples=0.3, n_estimators=25;, score=0.448 total time=   1.8s
[CV 2/5] END max_

In [23]:
# Refit a random forest with fixed hyper-parameters and score it on the
# held-out validation set.
random_forest_classifier = RandomForestClassifier(
    n_estimators=250,
    max_depth=20,
    max_samples=0.9,
    random_state=1,
).fit(X_train, y_train)
predictions = random_forest_classifier.predict(X_test)

print(classification_report(y_test, predictions, digits=5))

              precision    recall  f1-score   support

           0    0.95652   0.38372   0.54772       172
           1    0.62849   0.84906   0.72231       265
           2    0.76768   0.53333   0.62940       285
           3    0.62567   0.84477   0.71889       277

    accuracy                        0.67768       999
   macro avg    0.74459   0.65272   0.65458       999
weighted avg    0.72389   0.67768   0.66480       999



### KNN

In [11]:
from sklearn.neighbors import KNeighborsClassifier

# Only the neighbourhood size is tuned for the k-NN baseline.
grid_search_parameters = {'n_neighbors': [2, 5, 8, 10, 15, 25]}

# 5-fold cross-validated search on all available cores.
KNN = KNeighborsClassifier()
KNN_classifier_GS = GridSearchCV(
    KNN,
    grid_search_parameters,
    cv=5,
    verbose=3,
    n_jobs=-1,
)
KNN_classifier_GS.fit(X_train, y_train)

print("Best Parameters: ", KNN_classifier_GS.best_params_)
print("Best Score: ", KNN_classifier_GS.best_score_)

Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV 2/5] END .....................n_neighbors=5;, score=0.363 total time=   1.2s
[CV 5/5] END .....................n_neighbors=2;, score=0.434 total time=   1.2s
[CV 3/5] END .....................n_neighbors=2;, score=0.338 total time=   1.4s
[CV 1/5] END .....................n_neighbors=2;, score=0.447 total time=   1.5s
[CV 4/5] END .....................n_neighbors=2;, score=0.362 total time=   1.7s
[CV 2/5] END .....................n_neighbors=2;, score=0.319 total time=   1.6s
[CV 5/5] END .....................n_neighbors=5;, score=0.486 total time=   0.8s
[CV 3/5] END .....................n_neighbors=5;, score=0.408 total time=   1.9s
[CV 1/5] END .....................n_neighbors=5;, score=0.501 total time=   2.3s
[CV 4/5] END .....................n_neighbors=5;, score=0.420 total time=   1.2s
[CV 1/5] END .....................n_neighbors=8;, score=0.514 total time=   1.0s
[CV 5/5] END .....................n_neighbors=8;,

In [17]:
# Fit k-NN with a fixed neighbourhood size and report validation metrics.
KNN_classifier = KNeighborsClassifier(n_neighbors=25).fit(X_train, y_train)
predictions = KNN_classifier.predict(X_test)

print(classification_report(y_test, predictions, digits=5))

              precision    recall  f1-score   support

           0    0.63736   0.33721   0.44106       172
           1    0.52674   0.74340   0.61659       265
           2    0.61000   0.42807   0.50309       285
           3    0.57186   0.68953   0.62520       277

    accuracy                        0.56857       999
   macro avg    0.58649   0.54955   0.54649       999
weighted avg    0.58205   0.56857   0.55638       999



### SVM

In [None]:
from sklearn.svm import SVC

# Search over kernel type, regularisation strength C and kernel coefficient
# gamma (two numeric values plus scikit-learn's 'scale' and 'auto' presets).
grid_search_parameters = {
    'kernel': ['poly', 'rbf'],
    'C': [0.25, 0.5, 1, 3, 5],
    'gamma': [0.1, 1, 'scale', 'auto'],
}

# 5-fold cross-validated search on all available cores.
svm = SVC()
svm_GS = GridSearchCV(
    svm,
    grid_search_parameters,
    cv=5,
    verbose=3,
    n_jobs=-1,
)
svm_GS.fit(X_train, y_train)

print("Best Parameters: ", svm_GS.best_params_)
print("Best Score: ", svm_GS.best_score_)

In [24]:
# Fit an RBF-kernel SVM with fixed hyper-parameters and report validation
# metrics.
SVM_classifier = SVC(kernel='rbf', C=0.5, gamma=0.1).fit(X_train, y_train)
predictions = SVM_classifier.predict(X_test)

print(classification_report(y_test, predictions, digits=5))

              precision    recall  f1-score   support

           0    0.60465   0.15116   0.24186       172
           1    0.55091   0.79623   0.65123       265
           2    0.61692   0.43509   0.51029       285
           3    0.56989   0.76534   0.65331       277

    accuracy                        0.57357       999
   macro avg    0.58559   0.53695   0.51417       999
weighted avg    0.58426   0.57357   0.54112       999



### Logistic Regression

In [14]:
from sklearn.linear_model import LogisticRegression

# Tune the penalty type and the inverse regularisation strength C.
grid_search_parameters = {
    'penalty': ['l2', 'l1'],
    'C': [0.25, 0.5, 1, 3, 5],
}

# The liblinear solver is chosen so that both the l1 and the l2 penalty can be
# part of the same grid search.
LR = LogisticRegression(solver='liblinear')
LogisticRegression_GS = GridSearchCV(
    LR,
    grid_search_parameters,
    cv=5,
    verbose=3,
    n_jobs=-1,
)
LogisticRegression_GS.fit(X_train, y_train)

print("Best Parameters: ", LogisticRegression_GS.best_params_)
print("Best Score: ", LogisticRegression_GS.best_score_)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5] END ................C=0.25, penalty=l1;, score=0.519 total time=   2.7s
[CV 5/5] END ................C=0.25, penalty=l2;, score=0.561 total time=   3.3s
[CV 1/5] END ................C=0.25, penalty=l2;, score=0.550 total time=   3.7s
[CV 3/5] END ................C=0.25, penalty=l1;, score=0.521 total time=   2.9s[CV 2/5] END ................C=0.25, penalty=l1;, score=0.517 total time=   2.5s

[CV 3/5] END ................C=0.25, penalty=l2;, score=0.532 total time=   4.2s
[CV 4/5] END ................C=0.25, penalty=l1;, score=0.500 total time=   2.2s
[CV 4/5] END ................C=0.25, penalty=l2;, score=0.501 total time=   4.8s
[CV 2/5] END ................C=0.25, penalty=l2;, score=0.514 total time=   5.0s
[CV 5/5] END ................C=0.25, penalty=l1;, score=0.539 total time=   3.0s
[CV 3/5] END .................C=0.5, penalty=l2;, score=0.524 total time=   4.1s
[CV 5/5] END .................C=0.5, penalty=l2;

In [19]:
# Fit logistic regression with the selected hyper-parameters and report
# validation metrics.
LR_classifier = LogisticRegression(
    penalty='l2',
    C=0.25,
    solver='liblinear',
).fit(X_train, y_train)
predictions = LR_classifier.predict(X_test)

print(classification_report(y_test, predictions, digits=5))

              precision    recall  f1-score   support

           0    0.59551   0.30814   0.40613       172
           1    0.57925   0.75849   0.65686       265
           2    0.62500   0.47368   0.53892       285
           3    0.59654   0.74729   0.66346       277

    accuracy                        0.59660       999
   macro avg    0.59907   0.57190   0.56634       999
weighted avg    0.59990   0.59660   0.58188       999



## Our Model

In [9]:
def euclidean_distance(x_1:np.ndarray, x_2:np.ndarray) -> float:
    """Return the Euclidean (L2) distance between two points.

    Args:
        x_1: First point.
        x_2: Second point; must be subtractable from ``x_1``.

    Returns:
        The norm of the element-wise difference ``x_1 - x_2``.
    """
    difference = x_1 - x_2
    return np.linalg.norm(difference)

def mean_distance(point:np.ndarray, dataset:np.ndarray, distance_function:callable = lambda a,b: 1)-> float:
    """Average the distance from ``point`` to every entry of ``dataset``.

    Args:
        point: The reference point.
        dataset: Collection of points; iterated entry by entry.
        distance_function: Callable ``(point, entry) -> distance``. The default
            returns 1 for every pair.

    Returns:
        The sum of all pairwise distances divided by ``len(dataset)``.
    """
    total = sum(distance_function(point, entry) for entry in dataset)
    return total/len(dataset)

def distance_to_dataset_mean(point:np.ndarray, dataset:np.ndarray, distance_function:callable = lambda a,b: 1)-> float:
    """Measure the distance from ``point`` to the mean point of ``dataset``.

    Args:
        point: The reference point.
        dataset: Collection of points; averaged along axis 0.
        distance_function: Callable ``(point, mean_point) -> distance``. The
            default returns 1 for every pair.

    Returns:
        ``distance_function`` evaluated on ``point`` and the axis-0 mean of
        ``dataset``.
    """
    return distance_function(point, np.mean(dataset, axis=0))

def median_distance(point:np.ndarray, dataset:np.ndarray, distance_function:callable = lambda a,b: 1)-> float:
    """Return the median distance from ``point`` to the entries of ``dataset``.

    Args:
        point: The reference point.
        dataset: Collection of points; iterated entry by entry.
        distance_function: Callable ``(point, entry) -> distance``. The default
            returns 1 for every pair.

    Returns:
        ``statistics.median`` of all pairwise distances.
    """
    all_distances = [distance_function(point, entry) for entry in dataset]
    return statistics.median(all_distances)

def nearest_k_distance_mean(k:int) -> callable:
    """Build an aggregation function that averages the ``k`` smallest distances.

    Args:
        k: How many of the smallest distances to average.

    Returns:
        A callable ``(point, dataset, distance_function)`` that computes the
        distance from ``point`` to every entry of ``dataset`` and returns the
        mean of the ``k`` smallest ones.
    """
    def _smallest_k_distances_mean(point:np.ndarray, dataset:np.ndarray, distance_function:callable = lambda a,b: 1) -> float:
        n_entries = len(dataset)
        # Collect every distance into a float array, indexing the dataset
        # positionally.
        distances = np.fromiter(
            (distance_function(point, dataset[i]) for i in range(n_entries)),
            dtype=float,
            count=n_entries,
        )
        # Ascending sort, then keep the first k values.
        nearest = np.sort(distances)[:k]
        return np.mean(nearest, axis=0)

    return _smallest_k_distances_mean

In [10]:
# Features: SBERT embeddings of the raw tweet text.
X_train, X_test = train_sbert_raw, test_sbert_raw

# Targets: integer-encode the sentiment labels, fitting the encoder on the
# training labels only.
labeller = LabelEncoder()
y_train = labeller.fit_transform(train_df['sentiment'].values)
y_test = labeller.transform(test_df['sentiment'].values)

# Small forest used as the base estimator handed to the grid search.
lwrf = LocallyWeightedRandomForest(
    n_estimators=5,
    max_samples=0.9,
    max_depth=5,
)

In [28]:
# Search space for the locally weighted forest. The temperature values are
# spaced geometrically between 0.1 and 10 (three points); the distance and
# aggregation functions are held fixed at a single choice each.
param_grid = {
    "n_estimators": [20, 50, 100, 500],
    "criterion": ['gini'],
    "max_depth": [25, 50],
    "max_samples": [0.6, 0.7],
    "temp": np.geomspace(1e-1, 10, num=3),
    "distance_function": [euclidean_distance],
    "distance_aggregation_function": [distance_to_dataset_mean],
}

# Cross-validated search with the default fold count, run on 10 workers.
gcv = GridSearchCV(estimator=lwrf, param_grid=param_grid, verbose=3, n_jobs=10)

gcv.fit(X_train, y_train)

Fitting 5 folds for each of 48 candidates, totalling 240 fits
[CV 5/5] END criterion=gini, distance_aggregation_function=<function distance_to_dataset_mean at 0x7faacf6913f0>, distance_function=<function euclidean_distance at 0x7faacf24fb50>, max_depth=25, max_samples=0.6, n_estimators=20, temp=0.1;, score=0.437 total time= 1.5min
[CV 1/5] END criterion=gini, distance_aggregation_function=<function distance_to_dataset_mean at 0x7fc0bd2c13f0>, distance_function=<function euclidean_distance at 0x7fc0bce27b50>, max_depth=25, max_samples=0.6, n_estimators=20, temp=0.1;, score=0.444 total time= 1.7min
[CV 2/5] END criterion=gini, distance_aggregation_function=<function distance_to_dataset_mean at 0x7fa6810e93f0>, distance_function=<function euclidean_distance at 0x7fa680cabb50>, max_depth=25, max_samples=0.6, n_estimators=20, temp=1.0;, score=0.455 total time= 1.7min
[CV 4/5] END criterion=gini, distance_aggregation_function=<function distance_to_dataset_mean at 0x7f8fde4a53f0>, distance_fu

In [29]:
# Report the winning hyper-parameter combination and its mean CV score.
print("Best Parameters: ", gcv.best_params_)
print("Best Score: ", gcv.best_score_)

Best Parameters:  {'criterion': 'gini', 'distance_aggregation_function': <function distance_to_dataset_mean at 0x7fee372d0d30>, 'distance_function': <function euclidean_distance at 0x7fee3b06b250>, 'max_depth': 25, 'max_samples': 0.6, 'n_estimators': 500, 'temp': 1.0}
Best Score:  0.48739999999999994


In [15]:
# Rebuild the locally weighted forest with fixed hyper-parameters and a fixed
# seed, then fit it on the full training set.
lwrf = LocallyWeightedRandomForest(
    n_estimators=500,
    max_depth=25,
    max_samples=0.6,
    temp=1.0,
    distance_function=euclidean_distance,
    distance_aggregation_function=distance_to_dataset_mean,
    random_state=0,
)

# The labels are passed as a single-column 2-D array here.
y_train_column = y_train.reshape(-1, 1)
lwrf.fit(X_train, y_train_column)

In [16]:
# Set the aggregation function before predicting. The estimator was built with
# this same function, so this looks like a no-op kept as a hook for trying
# other aggregations at prediction time — TODO confirm.
lwrf.set_params(distance_aggregation_function=distance_to_dataset_mean)

predictions = lwrf.predict(X_test)
print(classification_report(y_test, predictions, digits=5))

              precision    recall  f1-score   support

           0    0.93056   0.38953   0.54918       172
           1    0.62534   0.85660   0.72293       265
           2    0.76500   0.53684   0.63093       285
           3    0.63736   0.83755   0.72387       277

    accuracy                        0.67968       999
   macro avg    0.73957   0.65513   0.65673       999
weighted avg    0.72107   0.67968   0.66703       999

