In [None]:
# NOTE(review): wildcard import. Names used in later cells without a visible import
# (pd, np, time, xgb, XGBClassifier, GridSearchCV, TimeSeriesSplit, plt, shap)
# presumably all come from here -- confirm, and prefer explicit imports.
from package import *
import importlib
import utility as u
# Re-import the project utility module so edits to it are picked up without restarting the kernel.
importlib.reload(u)
# Presumably here so the cell does not end on the reload() expression and echo the module repr.
pass



In [None]:
# Load the dataset that the "Data Split" section below divides into train and test sets.
data_path = "../data/data_train_test.parquet"
df = pd.read_parquet(data_path)


## Data Split

In [None]:
# Strategy name forwarded to the splitter; the training cell also reads it to choose its CV scheme.
split_strategy = "random"

train_test_split_util = u.TrainTestSplitUtilClass(test_size=0.2, split_strategy=split_strategy)
X_train, y_train, X_test, y_test = train_test_split_util.run(data=df)

# XGBoost can consume extra memory during training when its input is not contiguous
# (https://github.com/dmlc/xgboost/issues/6908), so hand it C-contiguous arrays.
# Labels are cast to int on the way.
X_train_arr, X_test_arr = (
    np.ascontiguousarray(features) for features in (X_train, X_test)
)
y_train_arr, y_test_arr = (
    np.ascontiguousarray(labels.astype(int)) for labels in (y_train, y_test)
)


## Model Training

In [None]:
import warnings

# Process-wide filter: every warning raised from here on is suppressed.
warnings.filterwarnings('ignore')

start_time = time()

# Use time-ordered folds when the data was split with the "time_series_split" strategy;
# otherwise pass u.CV_FOLDS straight through to GridSearchCV.
if split_strategy == "time_series_split":
    cv = TimeSeriesSplit(n_splits=u.CV_FOLDS)
else:
    cv = u.CV_FOLDS

# True: tune hyper-parameters with GridSearchCV. False: fit a single fixed configuration.
grid_search = True

# Reduced grid currently in use. The wider (commented-out) grid kept for reference:
#   learning_rate    [0.01, 0.05, 0.1]
#   max_depth        [2, 3]
#   n_estimators     [50, 100, 200]
#   reg_alpha        [0, 0.1, 0.2]
#   colsample_bytree [0.8, 1.0]
#   subsample        [0.8, 1.0]
param_grid = dict(
    learning_rate=[0.01],
    max_depth=[2, 3],
    n_estimators=[50],
    reg_alpha=[0],
    colsample_bytree=[0.8],
    subsample=[0.8],
)

if grid_search:
    search = GridSearchCV(
        estimator=XGBClassifier(
            objective="binary:logistic",
            tree_method="hist",  # to speed up training
            nthread=4,
            seed=42,
            use_label_encoder=False,
        ),
        param_grid=param_grid,
        cv=cv,
        scoring="average_precision",  # alternative: "roc_auc"
        n_jobs=-1,
    ).fit(X_train_arr, y_train_arr)
    print("best params from grid search:", search.best_params_)
    # Continue with the winning estimator rather than the search wrapper.
    model = search.best_estimator_
else:
    model = XGBClassifier(
        learning_rate=0.05,
        n_estimators=200,
        max_depth=5,
        random_state=0,
        nthread=4,
        use_label_encoder=False,
    ).fit(X_train_arr, y_train_arr)

# Persist the fitted model; the Model Eval cell loads it back from the same path.
model.save_model(u.MODEL_PATH)

elapsed_minutes = round((time() - start_time) / 60, 2)
print(f"Execution time: {elapsed_minutes} mins.")

# Hyper-parameters of the final model; reused by the n_estimators learning-curve cell.
best_params = model.get_params()
best_params


### `n_estimators` learning curves

Compare the train and test evaluation metrics to diagnose overfitting or underfitting.

In [None]:
util = u.BinaryClassifierPerformanceAndDiagnosisUtilClass()

# (features, labels) pairs: train first, test second.
eval_data_sets = [
    (X_train_arr, y_train_arr),
    (X_test_arr, y_test_arr),
]

# A new, unfitted classifier carrying the hyper-parameters stored in best_params.
fresh_estimator = XGBClassifier(**best_params)
util.learning_curve_n_estimators(
    estimator=fresh_estimator,
    eval_data_sets=eval_data_sets,
    eval_metrics=["logloss", "auc", "aucpr"],
)


## Model Eval

In [None]:
import warnings
# Process-wide filter: every warning raised from here on is suppressed.
warnings.filterwarnings('ignore')

# Load the model persisted by the training cell back from disk as a raw Booster.
model = xgb.Booster()
model.load_model(u.MODEL_PATH)
predictor = u.PredictorUtilClass()

df_train = predictor.get_predictions(X=X_train, y=y_train, model=model)
df_test = predictor.get_predictions(X=X_test, y=y_test, model=model)

# For each split: if there are k positive rows, report what share of them sits among
# the k rows with the highest predicted values.
for name, data in zip(["train", "test"], [df_train, df_test]):
    print(name)
    # sort_values() returns a new frame; sorting a column subset in place is a
    # chained-assignment hazard that the blanket warning filter above would hide.
    df_tmp = data[["y", u.BINARY_PREDICTION_NAME]].sort_values(
        by=[u.BINARY_PREDICTION_NAME], ascending=False
    )

    # head() needs an integer row count, so cast explicitly in case "y" is bool or float.
    num_of_positive = int(df_tmp["y"].sum())
    percentage_of_positive_examples = round(np.mean(df_tmp["y"])*100)
    print(f"percentage of positive examples: {percentage_of_positive_examples }%")

    if num_of_positive == 0:
        # The capture rate is undefined without positives; skip it instead of dividing by zero.
        print("no positive examples in this split; capture rate not computed.")
        continue

    percentage_of_positive_examples_at_the_top = round(df_tmp.head(num_of_positive)["y"].sum()/num_of_positive*100)
    print(f"{percentage_of_positive_examples_at_the_top}% of positive examples are captured by {num_of_positive} ({percentage_of_positive_examples}%) of invoices with highest predicted probabilities.")


In [None]:
# Test-set labels and predictions, ordered from highest to lowest predicted value.
# sort_values() returns a new frame, which avoids sorting a column subset in place.
df_tmp = df_test[["y", u.BINARY_PREDICTION_NAME]].sort_values(
    by=[u.BINARY_PREDICTION_NAME], ascending=False
)
# To inspect: df_tmp.head(100).reset_index(drop=True) or df_tmp.tail(200)


In [None]:
# Rebuild the diagnosis helper around the model loaded in the Model Eval cell.
util = u.BinaryClassifierPerformanceAndDiagnosisUtilClass(model)

# NOTE(review): 'my_title' looks like a placeholder -- confirm the intended figure title.
roc_pr_options = dict(
    target_col_name="y",
    prediction_col_name=u.BINARY_PREDICTION_NAME,
    suptitle='my_title',
    save_to_path="../plot/ModelPerformance/ROC_PR_Curves.png",
)
util.roc_pr_curves_train_test(df_train, df_test, **roc_pr_options)


In [None]:
# Histograms of the predicted values on the train and test frames; the figure is also
# written to save_to_path. The title no longer carries the stray trailing ")".
util.histogram_of_predicted_prob_train_test(
    df_train,
    df_test,
    target_col_name="y",
    prediction_col_name=u.BINARY_PREDICTION_NAME,
    suptitle="Histograms of Predicted Probabilities",
    save_to_path="../plot/ModelPerformance/Histograms_Of_Predicted_Probabilities.png",
)


### Feature Importance

In [None]:
# `model` is the xgb.Booster loaded in the Model Eval cell. A raw Booster has no
# `feature_importances_` attribute (only the scikit-learn wrapper exposes it), so when
# it is missing compute normalised gain per feature from Booster.get_score() --
# the wrapper's default importance for tree models.
if hasattr(model, "feature_importances_"):
    feature_importance = np.asarray(model.feature_importances_)
else:
    # A model trained on plain arrays carries no feature names; XGBoost then uses f0, f1, ...
    booster_feature_names = model.feature_names or [f"f{i}" for i in range(model.num_features())]
    gain_by_feature = model.get_score(importance_type="gain")
    # get_score() omits features that were never used in a split; give those zero importance.
    feature_importance = np.array(
        [gain_by_feature.get(feature_name, 0.0) for feature_name in booster_feature_names],
        dtype=np.float32,
    )
    total_gain = feature_importance.sum()
    if total_gain > 0:
        feature_importance = feature_importance / total_gain

# Ascending order, so the most important feature ends up at the top of the horizontal bar chart.
sorted_idx = np.argsort(feature_importance)
pos = np.arange(sorted_idx.shape[0]) + .5
fig = plt.figure(figsize=(12, 6))
plt.subplot(1, 2, 1)
plt.barh(pos, feature_importance[sorted_idx], align='center')
# Assumes u.FEATURE_COLS lists the features in the training column order -- TODO confirm.
plt.yticks(pos, np.array(u.FEATURE_COLS)[sorted_idx])

plt.title("Feature Importance")
plt.savefig(fname="../plot/ModelPerformance/Feature_Importance.png", bbox_inches="tight")
plt.show()


### SHAP Value

[shap api doc](https://shap.readthedocs.io/en/latest/index.html)


In [None]:
# Explain the model's predictions on the test features.
explainer = shap.Explainer(model)
shap_values = explainer(X_test)

plt.figure(figsize=(14, 6))
# summary_plot resizes the current figure by default (plot_size="auto"), which would
# discard the figsize set above; plot_size=None leaves the figure size untouched.
shap.summary_plot(shap_values, X_test, plot_size=None, show=False)
plt.title("Shap Summary")
plt.savefig(fname="../plot/ModelPerformance/Shap_Summary.png", bbox_inches="tight")
plt.show()


In [None]:
# Bar chart of the SHAP values computed above, limited to 20 features.
# show=False keeps the figure open so it can be titled and saved before display.
shap.plots.bar(shap_values, max_display=20, show=False)
plt.title("Shap Bar")  # stray trailing ")" removed from the title
plt.savefig(fname="../plot/ModelPerformance/Shap_Bar.png", bbox_inches="tight")
plt.show()


In [None]:
#shap.plots.waterfall(shap_values[0])

In [None]:
# Load SHAP's JavaScript so the interactive force plot can render in the notebook.
shap.initjs()

# Force plot for the first entry of shap_values, i.e. the first explained row of X_test.
first_explanation = shap_values[0]
shap.plots.force(first_explanation)


### Learning Curve

In [None]:
import warnings
# Process-wide filter: every warning raised from here on is suppressed.
warnings.filterwarnings('ignore')

model_single = XGBClassifier(
    objective="binary:logistic",
    learning_rate=0.05,
    n_estimators=100,
    max_depth=6,
    # Exactly one seed. The original passed both random_state=0 and seed=42, two
    # spellings of the same RNG seed with conflicting values. 42 matches the
    # grid-search estimator in the training cell.
    random_state=42,
    nthread=4,
    use_label_encoder=False,
)

# Learning curve with 5-fold CV on the training data, scored by average precision.
plot = util.plot_learning_curve(
    model_single,
    X=X_train,
    y=y_train,
    cv=5,
    scoring="average_precision",
)
# NOTE(review): `plot` is presumably the pyplot module (or something with the same
# title/savefig interface) returned by the helper -- confirm.
plot.title("Learning Curve")
plt.ylim(0,1)
plot.savefig(fname="../plot/ModelPerformance/Learning_curve.png", bbox_inches="tight")
