In [None]:
import joblib
import pandas as pd
from sklearn.model_selection import cross_val_score
import xgboost as xgb

from utils import get_competition_data_path

In [None]:
# Ask the project helper where the processed files for this competition live.
path_dict = get_competition_data_path('imdb-sentiment-analysis')

# Pull out the two locations needed below. `.get` is used for the lookup, so
# (assuming `path_dict` is dict-like -- TODO confirm) a missing key yields None
# here rather than raising.
X_train_path, y_train_path = (
    path_dict.get(key) for key in ('X_train_path', 'y_train_path')
)


Load processed data


In [None]:
# Read the processed feature matrix.
X_train = pd.read_csv(X_train_path)

# Read the labels and collapse the single-column frame into a Series.
# The `squeeze=True` keyword of `read_csv` was deprecated in pandas 1.4 and
# removed in 2.0; calling `.squeeze("columns")` on the result is the supported
# equivalent (a multi-column file is still returned as a DataFrame).
y_train = pd.read_csv(y_train_path).squeeze("columns")

# Plain NumPy views of the data, used by the XGBoost / scikit-learn calls below.
X_train_arr = X_train.to_numpy()
y_train_arr = y_train.to_numpy()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation.
# - `random_state` pins the shuffle so every "Restart & Run All" produces the
#   same split (123 is the seed already used for `xgb.cv` in this notebook).
# - `stratify` keeps the class ratio of the labels the same in both splits.
X_train_s, X_val_s, y_train_s, y_val_s = train_test_split(
    X_train_arr,
    y_train_arr,
    test_size=0.2,
    random_state=123,
    stratify=y_train_arr,
)

# DMatrix wrappers for the native XGBoost API:
#   D_train   -> all rows (used for cross-validation)
#   D_train_s -> training split
#   D_val_s   -> validation split
D_train = xgb.DMatrix(X_train_arr, label=y_train_arr)
D_train_s = xgb.DMatrix(X_train_s, label=y_train_s)
D_val_s = xgb.DMatrix(X_val_s, label=y_val_s)



Train and validate the model

In [None]:
# Booster parameters shared by the native-API calls in this notebook.
#
# Only genuine booster parameters belong here:
# - `verbosity=1` (warnings) replaces the deprecated `silent=0` flag.
# - The number of boosting rounds and the early-stopping patience are arguments
#   of `xgb.train` / `xgb.cv` (`num_boost_round`, `early_stopping_rounds`);
#   as dictionary entries they are not booster parameters and have no effect,
#   so they are not listed here.
params = dict(
    max_depth=5,
    eta=0.2,
    gamma=4,
    min_child_weight=6,
    subsample=0.8,
    verbosity=1,
    objective='binary:logistic',
)

XGBoost native API

In [None]:
# Fit a booster with XGBoost's native training API on the training split only,
# so the held-out split stays unseen for the accuracy check that follows.
# The number of boosting rounds is set by `num_boost_round` (10 here).
model = xgb.train(params=params, dtrain=D_train_s, num_boost_round=10)


In [None]:
from sklearn.metrics import accuracy_score

# Score the booster on the held-out split. The raw predictions are rounded to
# the nearest integer before being compared with the true labels.
y_pred = model.predict(D_val_s)
accuracy_score(y_true=y_val_s, y_pred=y_pred.round(0))


In [None]:
import matplotlib.pyplot as plt

# The default figure size must be set *before* the figure is created: rcParams
# are only consulted when a new figure is made, so setting it after
# `plot_tree` would leave this plot at the old size on a fresh kernel.
plt.rcParams['figure.figsize'] = [50, 10]

# Draw the first tree (index 0) of the fitted model.
xgb.plot_tree(model, num_trees=0)
plt.show()

XGBoost cross-validation


In [None]:
# 5-fold cross-validation over the full training DMatrix, tracking the
# classification error per boosting round. `as_pandas=True` requests the
# evaluation history as a DataFrame; `seed` fixes the fold assignment.
# NOTE(review): the early-stopping patience equals the round budget (10 / 10),
# so early stopping looks unlikely to ever trigger -- confirm the intended budget.
cv_results = xgb.cv(
    params=params,
    dtrain=D_train,
    num_boost_round=10,
    nfold=5,
    metrics='error',
    early_stopping_rounds=10,
    as_pandas=True,
    seed=123,
)

In [None]:
# Turn the last round's mean *error* into mean *accuracy* (accuracy = 1 - error).
# Only the "-mean" columns are converted: applying `1 - x` to the "-std"
# columns would not give a meaningful quantity.
# (Assumes the usual xgb.cv column naming, e.g. "test-error-mean" -- TODO confirm.)
final_round_mean_error = cv_results.filter(like='-mean').tail(1)
(1 - final_round_mean_error).rename(
    columns=lambda name: name.replace('error', 'accuracy')
)

Sklearn API

In [None]:
# Same kind of model, built through XGBoost's scikit-learn wrapper.
# NOTE(review): this is the *regressor* wrapper combined with a logistic
# objective, so downstream code rounds its predictions to get labels;
# `xgb.XGBClassifier` may be the more natural fit -- confirm intent.
model = xgb.XGBRegressor(
    n_estimators=10,
    max_depth=5,
    learning_rate=0.1,
    colsample_bytree=0.3,
    objective='binary:logistic',
)


In [None]:
# Fit on the training split only. `X_val_s` was carved out of `X_train_arr`,
# so fitting on the full array would let the model see the validation rows
# and make the accuracy below optimistically biased.
model.fit(X_train_s, y_train_s)

# Round the predictions to hard 0/1 labels and score them on the held-out split.
y_pred = model.predict(X_val_s)
accuracy_score(y_val_s, y_pred.round(0))


Hyperparameter tuning


In [None]:
# Search space for the grid search. Every value must be a list of candidates:
# scikit-learn rejects a bare string as a parameter grid entry, so a fixed
# setting such as the objective is wrapped in a one-element list.
grid = {
    "objective"        : ["binary:logistic"],
    "learning_rate"    : [0.1, 0.2],
    "max_depth"        : [5, 10],
    "gamma"            : [0.4],
    "colsample_bytree" : [0.3],
}

In [None]:
from sklearn.model_selection import GridSearchCV

# Exhaustive search over the parameter grid with 3-fold cross-validation,
# scored by negative log-loss and run on 4 parallel workers.
# NOTE: `grid` is rebound here from the parameter dictionary to the search
# object, so this cell can only be run once after the grid is (re)defined.
model = xgb.XGBClassifier()
grid = GridSearchCV(
    estimator=model,
    param_grid=grid,
    scoring="neg_log_loss",
    cv=3,
    n_jobs=4,
)

grid.fit(X_train_arr, y_train_arr)

In [None]:
# Report the best cross-validated score and the parameter combination behind it.
best_score, best_params = grid.best_score_, grid.best_params_
print(f"Best: {best_score} using {best_params}")


In [19]:
# Final classifier, trained on all available training rows.
# The hyperparameters are hard-coded -- presumably the best combination
# reported by the grid search; confirm they still match after a re-run.
model = xgb.XGBClassifier(
    n_estimators=10,
    max_depth=10,
    learning_rate=0.2,
    colsample_bytree=0.3,
    gamma=0.4,
    objective='binary:logistic',
)
model.fit(X_train_arr, y_train_arr)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.3, gamma=0.4,
              learning_rate=0.2, max_delta_step=0, max_depth=10,
              min_child_weight=1, missing=None, n_estimators=10, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

Save the trained model


In [20]:
# Persist the fitted classifier to disk, at a path relative to the working directory.
joblib.dump(value=model, filename='trained_model.sav')

['trained_model.sav']