# Comparing the performances of LightGBM and XGBoost models with three types of embedding (LSI, Word2Vec, FastText)

In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
import lightgbm as lgbm
from sklearn.metrics import accuracy_score

## Loading the data

In [2]:
# Feature matrices for the LSI embedding, stored as CSV files.
X_train_lsi = pd.read_csv('spark_lsi/X_train_spark_lsi.csv')
X_test_lsi = pd.read_csv('spark_lsi/X_test_spark_lsi.csv')

# Targets are stored as 2-D arrays (presumably one-hot encoded ratings — TODO confirm).
# Taking the arg-max along axis 1 gives a 0-based column index; adding 1 turns it
# into a 1-based class label.
y_train_lsi = np.argmax(np.load('spark_lsi/y_train_spark_lsi.npy'), axis=1) + 1
y_test_lsi = np.argmax(np.load('spark_lsi/y_test_spark_lsi.npy'), axis=1) + 1

In [3]:
# Feature matrices for the Word2Vec embedding, stored as CSV files.
X_train_word2vec = pd.read_csv('word2vec/X_train_word2vec.csv')
X_test_word2vec = pd.read_csv('word2vec/X_test_word2vec.csv')

# Targets are stored as 2-D arrays (presumably one-hot encoded ratings — TODO confirm).
# Taking the arg-max along axis 1 gives a 0-based column index; adding 1 turns it
# into a 1-based class label.
y_train_word2vec = np.argmax(np.load('word2vec/y_train_word2vec.npy'), axis=1) + 1
y_test_word2vec = np.argmax(np.load('word2vec/y_test_word2vec.npy'), axis=1) + 1

In [4]:
# Feature matrices for the FastText embedding, stored as CSV files.
X_train_fasttext = pd.read_csv('fasttext/X_train_fasttext.csv')
X_test_fasttext = pd.read_csv('fasttext/X_test_fasttext.csv')

# Targets are stored as 2-D arrays (presumably one-hot encoded ratings — TODO confirm).
# Taking the arg-max along axis 1 gives a 0-based column index; adding 1 turns it
# into a 1-based class label.
y_train_fasttext = np.argmax(np.load('fasttext/y_train_fasttext.npy'), axis=1) + 1
y_test_fasttext = np.argmax(np.load('fasttext/y_test_fasttext.npy'), axis=1) + 1

## Building models

### LGBM

In [5]:
# One LightGBM estimator shared by all three embeddings; each evaluation cell
# below calls fit() on it again with a different training set.
lgbm_classifier = lgbm.LGBMClassifier(
    boosting_type='dart',   # DART boosting (trees are randomly dropped during training)
    n_estimators=5000,
    learning_rate=0.1,
    max_depth=-1,           # non-positive value: no depth limit, size is bounded by num_leaves
    num_leaves=16,
    subsample=0.9,          # row subsampling ratio
    colsample_bytree=0.9,   # column subsampling ratio per tree
    subsample_freq=1,       # re-draw the row subsample at every iteration
    uniform_drop=True,      # DART-specific option, forwarded to LightGBM as an extra keyword
)

In [6]:
# Train the shared LightGBM classifier on the LSI features and predict the test split.
y_pred_lsi = lgbm_classifier.fit(X_train_lsi, y_train_lsi).predict(X_test_lsi)

# Fraction of test samples whose predicted label matches the true one.
accuracy = accuracy_score(y_true=y_test_lsi, y_pred=y_pred_lsi)
print(f'The accuracy obtained with lgbm and lsi is {accuracy}')

The accuracy obtained with lgbm and lsi is 0.5230894973436861


In [7]:
# Train the shared LightGBM classifier on the Word2Vec features and predict the test split.
y_pred_word2vec = lgbm_classifier.fit(X_train_word2vec, y_train_word2vec).predict(X_test_word2vec)

# Fraction of test samples whose predicted label matches the true one.
accuracy = accuracy_score(y_true=y_test_word2vec, y_pred=y_pred_word2vec)
print(f'The accuracy obtained with lgbm and word2vec is {accuracy}')

The accuracy obtained with lgbm and word2vec is 0.5623212096444626


In [8]:
# Train the shared LightGBM classifier on the FastText features and predict the test split.
y_pred_fasttext = lgbm_classifier.fit(X_train_fasttext, y_train_fasttext).predict(X_test_fasttext)

# Fraction of test samples whose predicted label matches the true one.
accuracy = accuracy_score(y_true=y_test_fasttext, y_pred=y_pred_fasttext)
print(f'The accuracy obtained with lgbm and fasttext is {accuracy}')

The accuracy obtained with lgbm and fasttext is 0.4495300367797303


### XGBoost

In [11]:
# One XGBoost estimator shared by all three embeddings; each evaluation cell
# below calls fit() on it again with a different training set.
xgb_classifier = xgb.XGBClassifier(
    n_estimators=1000,
    learning_rate=0.1,
    max_depth=5,
    subsample=0.9,          # row subsampling ratio
    colsample_bytree=0.9,   # column subsampling ratio per tree
)

In [12]:
# Newer XGBoost releases no longer label-encode targets internally and reject
# class labels that are not exactly 0..k-1. The labels used here start at 1, so
# encode them to consecutive 0-based indices first; `classes_lsi` keeps the
# original label for each index.
classes_lsi, y_train_lsi_encoded = np.unique(y_train_lsi, return_inverse=True)
xgb_classifier.fit(X_train_lsi, y_train_lsi_encoded)

# predict() returns encoded class indices; map them back to the original labels
# so they are comparable with y_test_lsi.
y_pred_lsi = classes_lsi[np.asarray(xgb_classifier.predict(X_test_lsi)).astype(int)]
accuracy = accuracy_score(y_test_lsi, y_pred_lsi)
print(f'The accuracy obtained with xgboost and lsi is {accuracy}')

The accuracy obtained with xgboost and lsi is 0.5190028606456886


In [13]:
# Newer XGBoost releases no longer label-encode targets internally and reject
# class labels that are not exactly 0..k-1. The labels used here start at 1, so
# encode them to consecutive 0-based indices first; `classes_word2vec` keeps the
# original label for each index.
classes_word2vec, y_train_word2vec_encoded = np.unique(y_train_word2vec, return_inverse=True)
xgb_classifier.fit(X_train_word2vec, y_train_word2vec_encoded)

# predict() returns encoded class indices; map them back to the original labels
# so they are comparable with y_test_word2vec.
y_pred_word2vec = classes_word2vec[np.asarray(xgb_classifier.predict(X_test_word2vec)).astype(int)]
accuracy = accuracy_score(y_test_word2vec, y_pred_word2vec)
print(f'The accuracy obtained with xgboost and word2vec is {accuracy}')

The accuracy obtained with xgboost and word2vec is 0.5557825909276666


In [14]:
# Newer XGBoost releases no longer label-encode targets internally and reject
# class labels that are not exactly 0..k-1. The labels used here start at 1, so
# encode them to consecutive 0-based indices first; `classes_fasttext` keeps the
# original label for each index.
classes_fasttext, y_train_fasttext_encoded = np.unique(y_train_fasttext, return_inverse=True)
xgb_classifier.fit(X_train_fasttext, y_train_fasttext_encoded)

# predict() returns encoded class indices; map them back to the original labels
# so they are comparable with y_test_fasttext.
y_pred_fasttext = classes_fasttext[np.asarray(xgb_classifier.predict(X_test_fasttext)).astype(int)]
accuracy = accuracy_score(y_test_fasttext, y_pred_fasttext)
print(f'The accuracy obtained with xgboost and fasttext is {accuracy}')

The accuracy obtained with xgboost and fasttext is 0.4446260727421332


## Conclusion

The best result in this notebook (accuracy ≈ 0.562) is achieved with a Light Gradient Boosting Machine (LightGBM) trained on the Word2Vec embedding matrix. This is consistent with the results of the `embedding_performance_comparison.ipynb` notebook, which found that the Word2Vec and LSI embeddings achieve similar performance when predicting the rating of a review.

However, the best accuracy overall (59%) is achieved with a tuned neural network trained on a Word2Vec embedding matrix.