# 5.0 Final Summary

Contents

[5.1 Introduction](#51-introduction)
* [5.1.1 Project Recap](#511-project-recap)
* [5.1.2 Imports](#512-imports)

[5.2 Final Model Comparison](#52-final-model-comparison)

## 5.1 Introduction

### 5.1.1 Project Recap

### 5.1.2 Imports

In [1]:
import pandas as pd
import numpy as np
import time
import pyarrow.parquet as pq 

from sklearn.model_selection import train_test_split, KFold

#scaling/vectorization
from sklearn.feature_extraction.text import TfidfVectorizer

# models
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
from imblearn.pipeline import Pipeline
import lightgbm as lgb

import shap

  from pandas import MultiIndex, Int64Index


In [2]:
# Location of the preprocessed fashion reviews, relative to this notebook.
FASHION_PARQUET = "../data/edited/fashion.parquet"

# Read the parquet file as an Arrow table, then convert it to a pandas DataFrame.
data = pq.read_table(FASHION_PARQUET)
fashion = data.to_pandas()

# Bare last expression so the notebook renders the frame.
fashion

Unnamed: 0,review,neg_sentiment,stars,review_length
0,exactly need,0,5,4
1,agree review opening small bent hook expensiv...,1,2,49
2,love going order pack work including losing ea...,0,4,50
3,tiny open,1,2,4
4,okay,1,3,1
...,...,...,...,...
883631,absolutely love dress sexy comfortable split ...,0,5,51
883632,lbs tall wear large ordered large comfortable...,0,5,39
883633,big chest area,1,3,6
883634,clear needs lin,1,3,7


## 5.2 Final Model Comparison

In [3]:
# Previously saved best models: load the table and discard the rows labelled
# 1 and 3 in a single chained expression (no in-place mutation).
model_df = pd.read_csv("best_models.csv").drop(index=[1, 3])

# Show the remaining rows.
model_df


Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_C,params_classifier,params_learning_rate,params_max_depth,params_n_estimators,state
0,97,0.841938536224133,2022-08-09 02:35:41.225839,2022-08-09 02:37:34.537309,0 days 00:01:53.311470,,xgboost,0.1,17.0,500.0,COMPLETE
2,106,0.8445268119737319,2022-08-09 02:37:51.645079,2022-08-09 02:38:43.209772,0 days 00:00:51.564693,,lgbm,0.01,20.0,1000.0,COMPLETE
4,35,0.9164345403899722,2022-08-09 01:40:03.147489,2022-08-09 01:40:04.929709,0 days 00:00:01.782220,0.01,logreg,,,,COMPLETE


In [4]:
def model_initialize():
    """Build the three untrained classifiers compared in this notebook.

    Hyperparameters are fixed in the function body.

    Returns:
        tuple: ``(LogisticRegression, XGBClassifier, LGBMClassifier)``, in
        that order.
    """
    # Settings passed identically to both gradient-boosting libraries.
    boosting_common = {"n_jobs": -1, "verbosity": 0}

    return (
        LogisticRegression(C=0.01),
        xgb.XGBClassifier(
            n_estimators=500,
            max_depth=17,
            learning_rate=0.1,
            use_label_encoder=False,
            **boosting_common,
        ),
        lgb.LGBMClassifier(
            n_estimators=1000,
            max_depth=20,
            learning_rate=0.01,
            force_col_wise=True,
            **boosting_common,
        ),
    )

In [5]:
def time_and_record(model_name, model, X, y, record=None):
    """Fit ``model``, time fitting and prediction, and log the results.

    The model is fitted on ``(X, y)`` and then asked to predict on the same
    ``X``; both steps are timed in seconds. One entry is appended to each of
    the ``"model_type"``, ``"fit_time"``, ``"prediction_time"`` and
    ``"feature_importance"`` lists of ``record``.

    Nothing is appended until fitting, predicting and the importance lookup
    have all succeeded, so the four lists always keep equal lengths even if
    a model raises part-way through.

    Args:
        model_name: Label stored under ``"model_type"``. The value ``"lr"``
            selects ``model.coef_[0]`` as the importance vector; any other
            label uses ``model.feature_importances_``.
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        X: Feature matrix passed to ``fit`` and ``predict``.
        y: Target passed to ``fit``.
        record: Dict of lists to append to. Defaults to the notebook-level
            ``performance`` dict.

    Returns:
        None. Results are written into ``record``.
    """
    if record is None:
        record = performance

    # perf_counter is monotonic, so elapsed times cannot be distorted by
    # system clock adjustments the way time.time() differences can.
    fit_start = time.perf_counter()
    model.fit(X, y)
    fit_time = time.perf_counter() - fit_start

    predict_start = time.perf_counter()
    model.predict(X)
    prediction_time = time.perf_counter() - predict_start

    # Only the "lr" label reads coefficients; every other label reads
    # feature_importances_.
    if model_name == "lr":
        importance = model.coef_[0]
    else:
        importance = model.feature_importances_

    # Record everything together so a failure above leaves `record` untouched.
    record["model_type"].append(model_name)
    record["fit_time"].append(fit_time)
    record["prediction_time"].append(prediction_time)
    record["feature_importance"].append(importance)

In [6]:
# Accumulator filled by time_and_record: one independent list per metric.
performance = {
    metric: []
    for metric in ("model_type", "fit_time", "prediction_time", "feature_importance")
}


In [7]:
# Untrained classifiers, in the order model_initialize() returns them.
models = [*model_initialize()]

# TF-IDF over unigrams and bigrams, with document-frequency cut-offs.
tfidf = TfidfVectorizer(
    ngram_range=(1, 2),
    min_df=5,
    max_df=0.95,
)

# Feature matrix from the review text; flattened target from the label column.
X = tfidf.fit_transform(fashion["review"])
y = np.ravel(fashion["neg_sentiment"])




In [8]:

# Labels paired positionally with the entries of `models`.
model_labels = ['lr', 'xgboost', 'lgbm']

# Fit and time every classifier on X / y; results accumulate in `performance`.
for model_name, model in zip(model_labels, models):
    time_and_record(model_name=model_name, model=model, X=X, y=y)



In [9]:
# Vocabulary terms, looked up below by the integer index of each importance
# value. Built once instead of on every loop iteration.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use its replacement when available and fall back on older installs.
if hasattr(tfidf, "get_feature_names_out"):
    feature_names = pd.Series(tfidf.get_feature_names_out())
else:
    feature_names = pd.Series(tfidf.get_feature_names())

for model_name, imp in zip(performance["model_type"], performance["feature_importance"]):
    importance = pd.Series(imp)

    if model_name != "lr":
        # Non-"lr" models: the 20 largest importance values.
        twenty_largest_importance = importance.nlargest(20)
    else:
        # "lr": coefficients are signed, so show the 10 largest followed by
        # the 10 smallest.
        twenty_largest_importance = pd.concat(
            [importance.nlargest(10), importance.nsmallest(10)], axis=0
        )

    print(feature_names.loc[twenty_largest_importance.index])




156925             not
35496            cheap
27006            broke
223351           small
223347            smal
198806          return
266586             way
61504     disappointed
181702            poor
36318          cheaply
138203            love
96781            great
172656         perfect
140798           loves
43772      comfortable
14053        beautiful
140013           loved
154140            nice
70550          exactly
173905       perfectly
dtype: object
181702                poor
160359            not wait
70783                excel
265798               waste
243525            terrible
140798               loves
70798            excellent
106556    highly recommend
157775      not disappoint
181779              poorly
172656             perfect
61495           disappoint
109253            horrible
223347                smal
36318              cheaply
179818               pleas
138203                love
9920                 awful
109252             horribl
158312       

