In [19]:
from locallyWeightedRandomForest import LocallyWeightedRandomForest
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn
import scipy
import sklearn
from word_preprocess import *
from sentence_transformers import SentenceTransformer
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

## Import Data

In [13]:
# Neither CSV is read with a header row (header=None), so the column names
# are assigned here, once, and shared by both frames.
tweet_columns = ['tweet_id', 'video_game', 'sentiment', 'text']

train_df = pd.read_csv('twitter_dataset/twitter_training.csv', header=None)
train_df.columns = tweet_columns
print(f'Training shape: {train_df.shape}')

test_df = pd.read_csv('twitter_dataset/twitter_validation.csv', header=None)
test_df.columns = tweet_columns
print(f'Testing shape: {test_df.shape}')

Training shape: (74682, 4)
Testing shape: (1000, 4)


## Remove Rows With Missing Values

In [14]:
# Keep only the rows in which every column is populated.
train_complete_rows = train_df.notna().all(axis=1)
test_complete_rows = test_df.notna().all(axis=1)

train_df = train_df.loc[train_complete_rows]
test_df = test_df.loc[test_complete_rows]

print(f'Training shape: {train_df.shape}')
print(f'Testing shape: {test_df.shape}')

Training shape: (73996, 4)
Testing shape: (1000, 4)


## Remove Duplicates

In [None]:
# np.unique(..., return_index=True) yields, for each distinct tweet_id, the
# position of its first occurrence; only those rows are kept. The positions are
# returned in sorted-id order, so the surviving rows follow that order.
# NOTE(review): this cell has no execution count, and the head() preview further
# down shows one tweet_id attached to several different texts -- confirm that
# de-duplicating on tweet_id is really intended before running it.
# NOTE(review): the cached embedding CSVs loaded later are row-aligned with
# train_df; they presumably need regenerating if this step changes the row count.
_unique_ids, unique_train_index = np.unique(train_df['tweet_id'], return_index=True)
train_df = train_df.iloc[unique_train_index]
print(f'Training shape: {train_df.shape}')

## Pre-processing Text

In [15]:
# Add a `clean_text` column holding the output of `clean_string` for every tweet.
# `assign` returns a new frame instead of writing into the existing one; the
# frames here come from row filtering, and in-place column assignment on such
# a frame can trigger pandas' SettingWithCopyWarning.
# `clean_string` is passed directly -- wrapping it in a lambda adds nothing.
train_df = train_df.assign(clean_text=train_df['text'].apply(clean_string))
test_df = test_df.assign(clean_text=test_df['text'].apply(clean_string))

In [16]:
# Preview the first rows to compare the raw `text` column with the new `clean_text` column.
train_df.head()

Unnamed: 0,tweet_id,video_game,sentiment,text,clean_text
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...,im getting borderland murder
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...,coming border kill
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...,im getting borderland kill
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...,im coming borderland murder
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...,im getting borderland NUMBER murder


## Remove Empty Strings

In [17]:
# Drop tweets whose cleaned text is the empty string.
train_has_text = train_df['clean_text'] != ""
test_has_text = test_df['clean_text'] != ""

train_df = train_df.loc[train_has_text]
test_df = test_df.loc[test_has_text]

print(f'Training shape: {train_df.shape}')
print(f'Testing shape: {test_df.shape}')

Training shape: (72505, 5)
Testing shape: (999, 5)


## Convert Text To Sentence Embedding

In [7]:
# Encode both the cleaned and the raw tweet text with the same
# sentence-transformer model, for the training and the validation frame.
model = SentenceTransformer('all-MiniLM-L6-v2')

train_sbert_clean = model.encode(train_df['clean_text'].tolist())
test_sbert_clean = model.encode(test_df['clean_text'].tolist())

train_sbert_raw = model.encode(train_df['text'].tolist())
test_sbert_raw = model.encode(test_df['text'].tolist())

In [8]:
# Persist every embedding matrix as a comma-separated text file so the
# encoding step does not have to be repeated.
embedding_outputs = (
    ('twitter_dataset/train_sbert_clean.csv', train_sbert_clean),
    ('twitter_dataset/test_sbert_clean.csv', test_sbert_clean),
    ('twitter_dataset/train_sbert_raw.csv', train_sbert_raw),
    ('twitter_dataset/test_sbert_raw.csv', test_sbert_raw),
)
for csv_path, embeddings in embedding_outputs:
    np.savetxt(csv_path, embeddings, delimiter=',')

In [18]:
# Load the previously saved embedding matrices back from disk, in the same
# order in which they were written.
train_sbert_clean, test_sbert_clean, train_sbert_raw, test_sbert_raw = (
    np.loadtxt(csv_path, delimiter=',')
    for csv_path in (
        'twitter_dataset/train_sbert_clean.csv',
        'twitter_dataset/test_sbert_clean.csv',
        'twitter_dataset/train_sbert_raw.csv',
        'twitter_dataset/test_sbert_raw.csv',
    )
)

## Baseline: Random Forest

In [None]:
# Baseline: tune a plain scikit-learn random forest on the raw-text embeddings.
labeller = LabelEncoder()
# set up X and y variables
X_train = train_sbert_raw
X_test = test_sbert_raw
# Fit the label encoding on the training sentiments and reuse it for the test set.
y_train = labeller.fit_transform(train_df['sentiment'].values)
y_test = labeller.transform(test_df['sentiment'].values)

# max_samples: a float is a fraction of the training set, but an int is an
# absolute number of rows. The former integer `1` therefore asked for a single
# bootstrap sample per tree; `None` is scikit-learn's spelling of "use all rows".
grid_search_parameters = {'max_depth': [5, 10, 15, 30, 50, None],
                          'n_estimators': [10, 25, 50, 100, 250, 500, 1000],
                          'max_samples': [0.3, 0.5, 0.7, 0.8, 0.9, None]}

random_forest_classifier = RandomForestClassifier(random_state=0)
# Score with weighted F1, the metric used for the locally weighted forest's
# grid search, so that the two best scores can be compared directly.
random_forest_classifier_GS = GridSearchCV(random_forest_classifier, grid_search_parameters,
                                           scoring='f1_weighted', cv=10, verbose=3)
random_forest_classifier_GS.fit(X_train, y_train)

print("Best Parameters: ", random_forest_classifier_GS.best_params_)
print("Best Score: ", random_forest_classifier_GS.best_score_)

## Our Model: Locally Weighted Random Forest

In [8]:
# Reload the local module so that edits to it are picked up without restarting
# the kernel. This has to happen BEFORE the estimator is constructed: an
# instance created first stays bound to the stale class object, and so do the
# clones GridSearchCV makes from it.
from importlib import reload
import locallyWeightedRandomForest
reload(locallyWeightedRandomForest)
from locallyWeightedRandomForest import LocallyWeightedRandomForest

labeller = LabelEncoder()
# set up X and y variables
X_train = train_sbert_raw
X_test = test_sbert_raw
# Fit the label encoding on the training sentiments and reuse it for the test set.
y_train = labeller.fit_transform(train_df['sentiment'].values)
y_test = labeller.transform(test_df['sentiment'].values)

# Estimator handed to the grid search; these constructor values are only the
# starting point for the parameters that the grid overrides.
lwrf = LocallyWeightedRandomForest(n_estimators=5, max_samples = 0.9, max_depth = 5)

In [9]:
# Hyper-parameter grid for the locally weighted forest: 2 x 1 x 2 x 1 x 2 = 8
# candidate settings.
param_grid = {
    'n_estimators': [5, 10],
    'criterion': ['gini'],
    'max_depth': [5, 10],
    'max_samples': [0.8],
    'temp': [1, 5],
}

# No `cv` is given, so GridSearchCV uses its default splitting; candidates are
# ranked by weighted F1.
gcv = GridSearchCV(lwrf, param_grid, scoring='f1_weighted', verbose=3)
gcv.fit(X_train, y_train)

Fitting 5 folds for each of 8 candidates, totalling 40 fits
[CV 1/5] END criterion=gini, max_depth=5, max_samples=0.8, n_estimators=5, temp=1;, score=0.345 total time=  51.1s
[CV 2/5] END criterion=gini, max_depth=5, max_samples=0.8, n_estimators=5, temp=1;, score=0.343 total time=  52.5s
[CV 3/5] END criterion=gini, max_depth=5, max_samples=0.8, n_estimators=5, temp=1;, score=0.340 total time=  52.3s
[CV 4/5] END criterion=gini, max_depth=5, max_samples=0.8, n_estimators=5, temp=1;, score=0.319 total time=  53.5s
[CV 5/5] END criterion=gini, max_depth=5, max_samples=0.8, n_estimators=5, temp=1;, score=0.347 total time=  53.0s
[CV 1/5] END criterion=gini, max_depth=5, max_samples=0.8, n_estimators=5, temp=5;, score=0.367 total time=  53.2s
[CV 2/5] END criterion=gini, max_depth=5, max_samples=0.8, n_estimators=5, temp=5;, score=0.353 total time=  53.4s
[CV 3/5] END criterion=gini, max_depth=5, max_samples=0.8, n_estimators=5, temp=5;, score=0.351 total time=  53.2s
[CV 4/5] END criteri

In [11]:
# Best cross-validated score found by the grid search (scored with 'f1_weighted').
gcv.best_score_

0.3951096934453628