In [1]:
import pandas as pd
import numpy as np

In [2]:
data = np.load('embeddings.npy')

In [3]:
labels = pd.read_csv('under-sample.csv')['stars']
embeddings = pd.DataFrame(data)

In [4]:
labels

0         1.0
1         5.0
2         5.0
3         2.0
4         5.0
         ... 
327657    5.0
327658    5.0
327659    5.0
327660    5.0
327661    5.0
Name: stars, Length: 327662, dtype: float64

In [5]:
embeddings

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,0.439325,-0.682406,-0.673594,0.476764,-0.322630,-0.289587,-0.971796,-0.243982,0.371537,0.602596,...,0.243212,-0.634313,-0.122451,0.594080,-1.049217,-0.484577,-1.020499,0.402851,-0.495940,0.531883
1,0.305087,-0.127651,-0.209986,0.404762,0.552883,-0.020585,-0.062873,-0.568029,-0.332120,-0.130250,...,-0.888367,-0.342813,0.012810,-0.498089,-0.136595,-0.548513,0.311660,0.223268,-0.226586,-0.069075
2,-0.229291,-1.034940,-0.679176,0.071638,0.251054,-0.260691,-0.454224,-0.475818,0.131180,0.564549,...,-0.418233,-0.442920,-0.332616,0.244606,-0.534785,-0.658122,-0.980320,-0.037407,-1.119320,0.326172
3,-0.983781,0.373828,0.620552,0.231366,-0.044346,0.553025,-0.247402,-0.552601,-0.215788,-1.269489,...,-0.330496,0.909710,0.801179,-0.083926,-0.032956,0.865100,-0.729443,0.222648,-0.187046,1.161810
4,-0.384294,-1.125762,-0.200058,0.229588,0.324902,0.646749,-0.449385,-1.246233,0.805815,0.503905,...,-1.041725,-0.414013,0.562926,-0.873566,0.763939,-0.514713,-0.913831,0.562077,-0.354985,-0.640136
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
327657,-0.019851,-0.341211,-0.537466,0.101686,0.230673,0.746317,-0.557398,-1.110050,0.560732,0.708126,...,-0.796369,-0.338606,0.874495,-0.813324,-0.481205,-0.664062,-0.318348,0.307294,-0.117685,0.196572
327658,-0.069315,-0.963177,-0.826324,0.384276,0.186533,0.451003,-0.623645,-0.561217,-0.052340,0.358365,...,-1.219544,-0.440254,-0.193069,-0.709089,0.027073,-0.704814,-0.147694,0.477097,-0.181089,-0.650796
327659,0.226185,-0.674745,-0.433839,0.275807,0.191941,0.682916,0.341908,-0.503522,0.176282,0.130044,...,-1.009993,-0.625784,0.129250,-0.681478,0.612080,-1.218316,-0.407127,0.154350,-0.169552,0.392742
327660,-0.221416,-1.585322,-1.337286,0.329141,-0.405752,0.360811,-0.395144,0.442768,-0.116486,0.943206,...,-1.313192,0.092170,-1.140614,0.159778,-0.166666,-0.506728,-0.045201,-0.515075,0.611105,-0.506982


In [6]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import sklearn.metrics
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix

## Random Forest Classifier

In [10]:
X_train, X_test, y_train, y_test = train_test_split(embeddings, labels, test_size = 0.2, random_state = 42)

In [8]:
rf_model = RandomForestClassifier(random_state = 42)
rf_model.fit(X_train, y_train)
y_pred = rf_model.predict(X_test)

In [9]:
f1_score = sklearn.metrics.f1_score(y_test, y_pred, average = 'weighted')
print(f1_score)

0.5851911658972694


In [10]:
print(sklearn.metrics.classification_report(y_test, y_pred))
pd.DataFrame(confusion_matrix(y_test, y_pred, normalize = 'true'), columns = rf_model.classes_, index = rf_model.classes_)

              precision    recall  f1-score   support

         1.0       0.66      0.81      0.73      9908
         2.0       0.38      0.12      0.18      5069
         3.0       0.38      0.15      0.22      6465
         4.0       0.43      0.30      0.35     13666
         5.0       0.70      0.91      0.79     30425

    accuracy                           0.63     65533
   macro avg       0.51      0.46      0.45     65533
weighted avg       0.58      0.63      0.59     65533



Unnamed: 0,1.0,2.0,3.0,4.0,5.0
1.0,0.813282,0.03805,0.026544,0.031994,0.090129
2.0,0.425725,0.120734,0.138094,0.149142,0.166305
3.0,0.139211,0.066976,0.150039,0.354679,0.289095
4.0,0.032124,0.009586,0.036221,0.302576,0.619494
5.0,0.018866,0.001676,0.003977,0.070074,0.905407


## XGBoost Classifier

In [11]:
pip install xgboost



In [12]:
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder

In [13]:
 y_train = y_train.astype(int)
y_test = y_test.astype(int)

In [14]:
encoder = LabelEncoder()
y_train_encoded = encoder.fit_transform(y_train)
y_test_encoded = encoder.fit_transform(y_test)

In [15]:
xgbclassifier = XGBClassifier(enable_categorical=True)
xgbclassifier.fit(X_train, y_train_encoded)
y_pred_xgb = xgbclassifier.predict(X_test)

In [16]:
f1_score_xgb = sklearn.metrics.f1_score(y_test, y_pred_xgb, average = 'weighted')
print(f1_score)

0.5851911658972694


In [17]:
print(sklearn.metrics.classification_report(y_test_encoded, y_pred_xgb))
pd.DataFrame(confusion_matrix(y_test_encoded, y_pred_xgb, normalize = 'true'), columns = xgbclassifier.classes_, index = xgbclassifier.classes_)

              precision    recall  f1-score   support

           0       0.70      0.82      0.76      9908
           1       0.38      0.25      0.30      5069
           2       0.42      0.26      0.32      6465
           3       0.47      0.39      0.43     13666
           4       0.75      0.87      0.80     30425

    accuracy                           0.65     65533
   macro avg       0.54      0.52      0.52     65533
weighted avg       0.62      0.65      0.63     65533



Unnamed: 0,0,1,2,3,4
0,0.816108,0.08478,0.030077,0.018672,0.050363
1,0.376011,0.249753,0.178536,0.107516,0.088183
2,0.113998,0.122351,0.261408,0.335499,0.166744
3,0.02283,0.018806,0.065125,0.394336,0.498902
4,0.014462,0.004503,0.008677,0.105538,0.86682
