# Machine Learning: Gradient Boosting

## 0 Imports

In [12]:
import pandas as pd
import numpy as np
import time
import warnings
warnings.filterwarnings("ignore")
from sklearn.metrics import confusion_matrix, recall_score, matthews_corrcoef
from sklearn.model_selection import train_test_split, GridSearchCV
from xgboost import XGBClassifier

In [13]:
data_change = pd.read_csv("../dat/dips/DIPS_Data_cleaned_change.csv", sep = ",", low_memory = False)

## 1 XGBoost Model

In [14]:
# Predictors
X = data_change
X = X.drop(["hpi"], axis = 1)

# Target
y = data_change["hpi"]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42, stratify = y)

print("XGBoost \n",
      "X_train shape: ", X_train.shape, round(X_train.shape[0]/len(X), 2), "\n",
      "X_test shape: ", X_test.shape, round(X_test.shape[0]/len(X), 2),  "\n",
      "y_train shape: ", y_train.shape, round(y_train.shape[0]/len(y), 2), "\n",
      "y_test shape: ", y_test.shape, round(y_test.shape[0]/len(y), 2), "\n")

XGBoost 
 X_train shape:  (983, 22) 0.8 
 X_test shape:  (246, 22) 0.2 
 y_train shape:  (983,) 0.8 
 y_test shape:  (246,) 0.2 



In [None]:
def xgboost(x_test_xgb, x_train_xgb, y_train_xgb, y_test_xgb):

    # 1 GRID SEARCH
    xgb_model = XGBClassifier(random_state = 42, use_label_encoder = False, eval_metric = 'mlogloss')

    param_grid = {
        'n_estimators': [100, 200, 300, 500, 1000],
        'max_depth': [None, 3, 6, 10],
        'learning_rate': np.logspace(-3, 0, 5),
        'subsample': [0.5, 0.8, 1.0],
        'colsample_bytree': [0.5, 0.8, 1.0]
    }

    grid_search = GridSearchCV(
        estimator = xgb_model,
        param_grid = param_grid,
        cv = 10
    )
    grid_search.fit(x_train_xgb, y_train_xgb)

    best_model = grid_search.best_estimator_
    print("Best parameters: ", grid_search.best_params_)
    print("Best score: ", grid_search.best_score_)

    # 2: FITTING THE MODEL
    model = XGBClassifier(
        n_estimators = best_model.n_estimators,
        max_depth = best_model.max_depth,
        learning_rate = best_model.learning_rate,
        subsample = best_model.subsample,
        colsample_bytree = best_model.colsample_bytree,
        use_label_encoder = False,
        eval_metric = 'mlogloss',
        random_state = 42
    )
    model.fit(x_train_xgb, y_train_xgb)

    # 3: ESTIMATING FEATURE IMPORTANCE
    weights_xgb = model.feature_importances_
    features_xgb = model.get_booster().feature_names

    # 4: COMPUTE TEST SET PREDICTIONS
    y_pred_xgb = model.predict(x_test_xgb)

    # 5: COMPUTE METRICS
    accuracy_xgb = model.score(x_test_xgb, y_test_xgb)
    macro_f1_xgb = recall_score(y_test_xgb, y_pred_xgb, average = "macro")
    micro_f1_xgb = recall_score(y_test_xgb, y_pred_xgb, average = "micro")
    mcc_xgb = matthews_corrcoef(y_test_xgb, y_pred_xgb)

    cm_xgb = confusion_matrix(y_test_xgb, y_pred_xgb)
    try:
        precision_xgb = cm_xgb[1][1] / (cm_xgb[1][1] + cm_xgb[0][1])
        recall_xgb = cm_xgb[1][1] / (cm_xgb[1][1] + cm_xgb[1][0])
    except IndexError:
        precision_xgb = recall_xgb = 0.0  # For multiclass confusion matrices

    # store metrics in a dictionary
    metrics_xgb = {
        "accuracy": round(accuracy_xgb, 4),
        "macro_f1": round(macro_f1_xgb, 4),
        "micro_f1": round(micro_f1_xgb, 4),
        "mcc": round(mcc_xgb, 4),
        "precision": round(precision_xgb, 4),
        "recall": round(recall_xgb, 4),
        "confusion_matrix": cm_xgb
    }

    return weights_xgb, y_pred_xgb, features_xgb, metrics_xgb

In [None]:
start = time.time()

weights_xgb, y_pred_xgb, features_xgb, metrics_xgb = xgboost(X_test, X_train, y_train, y_test)

end = time.time()
print("Execution time: ", {end - start}, "seconds")
time_xgb = end - start
time_xgb_df = pd.DataFrame({"time": [time_xgb]})
time_xgb_df.to_csv("../exp/times_ML/time_xgb.csv", sep = ",", index = False)

# save y_pred_ML
y_pred_xgb = pd.DataFrame(y_pred_xgb, columns = ["y_pred"])
y_pred_xgb.to_csv("../exp/y_pred_ML/y_pred_xgb.csv", sep = ",", index = False)

weights_xgb = pd.DataFrame(weights_xgb, columns = ["weights_ML"])
weights_xgb.to_csv("../exp/weights_ML/weights_xgb.csv", sep = ",", index = False)

In [17]:
metrics_xgb

{'accuracy': 0.7073,
 'macro_f1': 0.5048,
 'micro_f1': 0.7073,
 'mcc': np.float64(0.0229),
 'precision': np.float64(0.3333),
 'recall': np.float64(0.0435),
 'confusion_matrix': array([[171,   6],
        [ 66,   3]])}