In [9]:
from locallyWeightedRandomForest import LocallyWeightedRandomForest
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn
import scipy
import sklearn
from word_preprocess import *
from sentence_transformers import SentenceTransformer
from sklearn.metrics import classification_report

## Import Data

In [3]:
# Both CSV files are read with header=None, so their first row is treated as
# data and the column names are attached afterwards from this shared list.
tweet_columns = ['tweet_id', 'video_game', 'sentiment', 'text']

train_df = pd.read_csv('twitter_dataset/twitter_training.csv', header=None)
train_df.columns = tweet_columns
print(f'Training shape: {train_df.shape}')

test_df = pd.read_csv('twitter_dataset/twitter_validation.csv', header=None)
test_df.columns = tweet_columns
print(f'Testing shape: {test_df.shape}')

Training shape: (74682, 4)
Testing shape: (1000, 4)


### Remove NA

In [4]:
# Keep only the rows in which every column holds a value (no missing entries),
# then report how many rows survive in each split.
train_df = train_df[train_df.notna().all(axis=1)]
test_df = test_df[test_df.notna().all(axis=1)]
print(f'Training shape: {train_df.shape}')
print(f'Testing shape: {test_df.shape}')

Training shape: (73996, 4)
Testing shape: (1000, 4)


## Pre-processing Text

In [5]:
# Add a `clean_text` column holding clean_string() applied to each raw tweet.
# DataFrame.assign returns a new frame instead of writing a column into
# train_df/test_df in place: both frames were produced by boolean-mask slicing
# in an earlier cell, and setting a column directly on such a slice can trigger
# pandas' SettingWithCopyWarning. clean_string is passed directly because
# wrapping it in a lambda adds nothing.
train_df = train_df.assign(clean_text=train_df['text'].apply(clean_string))
test_df = test_df.assign(clean_text=test_df['text'].apply(clean_string))

In [6]:
# Preview the first five rows to compare `text` with the derived `clean_text`.
train_df.head(n=5)

Unnamed: 0,tweet_id,video_game,sentiment,text,clean_text
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...,im getting borderland murder
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...,coming border kill
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...,im getting borderland kill
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...,im coming borderland murder
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...,im getting borderland NUMBER murder


## Remove Empty Strings

In [7]:
# Discard rows whose cleaned text is the empty string, then report the
# remaining row counts for each split.
train_df = train_df[train_df['clean_text'].ne("")]
test_df = test_df[test_df['clean_text'].ne("")]
print(f'Training shape: {train_df.shape}')
print(f'Testing shape: {test_df.shape}')

Training shape: (72505, 5)
Testing shape: (999, 5)


## Convert Text To Sentence Embedding

In [19]:
# Encode both text variants of each split with the same sentence-transformer
# model. The generators run in order: train first, then test.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Embeddings of the pre-processed text.
train_sbert_clean, test_sbert_clean = (
    model.encode(frame['clean_text'].tolist()) for frame in (train_df, test_df)
)

# Embeddings of the original, unprocessed text.
train_sbert_raw, test_sbert_raw = (
    model.encode(frame['text'].tolist()) for frame in (train_df, test_df)
)

In [20]:
# save encodings to csv
# Each destination path is paired with the array written to it; the files are
# written in the order listed, as comma-delimited text.
embedding_files = {
    'twitter_dataset/train_sbert_clean.csv': train_sbert_clean,
    'twitter_dataset/test_sbert_clean.csv': test_sbert_clean,
    'twitter_dataset/train_sbert_raw.csv': train_sbert_raw,
    'twitter_dataset/test_sbert_raw.csv': test_sbert_raw,
}
for csv_path, embedding in embedding_files.items():
    np.savetxt(csv_path, embedding, delimiter=',')

In [8]:
# read in numpy data
# Reload the four comma-delimited embedding files; the target names on the
# left line up one-to-one with the paths listed below.
train_sbert_clean, test_sbert_clean, train_sbert_raw, test_sbert_raw = (
    np.loadtxt(csv_path, delimiter=',')
    for csv_path in (
        'twitter_dataset/train_sbert_clean.csv',
        'twitter_dataset/test_sbert_clean.csv',
        'twitter_dataset/train_sbert_raw.csv',
        'twitter_dataset/test_sbert_raw.csv',
    )
)

In [10]:
# set up X and y variables
# Features are the embeddings of the raw tweet text; labels are the
# `sentiment` column of the filtered data frames.
X_train = train_sbert_raw
X_test = test_sbert_raw
y_train = train_df['sentiment'].values
y_test = test_df['sentiment'].values

# The embeddings can come from CSV files written in an earlier session, while
# train_df/test_df are rebuilt on every run. Fail loudly if the row counts
# have drifted apart, rather than training on misaligned features and labels.
assert len(X_train) == len(y_train), (
    f'X_train has {len(X_train)} rows but y_train has {len(y_train)}; '
    'regenerate the saved embeddings'
)
assert len(X_test) == len(y_test), (
    f'X_test has {len(X_test)} rows but y_test has {len(y_test)}; '
    'regenerate the saved embeddings'
)

# Fit the forest on the training split, predict the held-out split, and print
# per-class precision/recall/F1 to five decimal places.
# NOTE(review): no seed is passed here; if LocallyWeightedRandomForest accepts
# one, set it so the reported metrics are reproducible - confirm against its
# signature.
lwrf = LocallyWeightedRandomForest(n_estimators=100, max_samples = 0.9, max_depth = 10)
lwrf.fit(X_train, y_train)
pred = lwrf.predict(X_test, temperature=1)
print(classification_report(y_test, pred, digits=5))

: 

: 