# Machine Learning: Gradient Boosting

## 0 Imports

In [1]:
import pandas as pd
import numpy as np
from scipy.constants import micro
from sklearn.metrics import confusion_matrix, recall_score, matthews_corrcoef
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier

In [2]:
data = pd.read_csv("../dat/dips/DIPS_Data_cleaned.csv", sep = ",", low_memory = False)
data_change = pd.read_csv("../dat/dips/DIPS_Data_cleaned_change.csv", sep = ",", low_memory = False)
data_pred = pd.read_csv("../dat/dips/DIPS_Data_cleaned_pred.csv", sep = ",", low_memory = False)
data_pred_y = pd.read_csv("../dat/dips/DIPS_Data_cleaned_pred_y.csv", sep = ",", low_memory = False)

## 1 DNN Model

In [3]:
# Predictors
X = data_change
X = X.drop(["hpi"], axis = 1)

# Target
y = data_change["hpi"]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

print("Gradient Boosting \n",
      "X_train shape: ", X_train.shape, round(X_train.shape[0]/len(X), 2), "\n",
      "X_test shape: ", X_test.shape, round(X_test.shape[0]/len(X), 2),  "\n",
      "y_train shape: ", y_train.shape, round(y_train.shape[0]/len(y), 2), "\n",
      "y_test shape: ", y_test.shape, round(y_test.shape[0]/len(y), 2), "\n")

Gradient Boosting 
 X_train shape:  (983, 22) 0.8 
 X_test shape:  (246, 22) 0.2 
 y_train shape:  (983,) 0.8 
 y_test shape:  (246,) 0.2 



In [4]:
def sklearn_gradient_boosting(x_test_gb, x_train_gb, y_train_gb, y_test_gb):
    """Computes OLS weights for linear regression without regularization using the sklearn library on the training set and
       returns weights, testset predictions and metrics.
    """

    # 1: GRID SEARCH
    gb_model = GradientBoostingClassifier(random_state = 42)

    param_grid = {
        'n_estimators': [100, 200],
        'learning_rate': [0.01, 0.1],
        'max_depth': [3, 5],
        'min_samples_split': [2, 5],
        'min_samples_leaf': [1, 2]
    }

    grid_search = GridSearchCV(estimator = gb_model, param_grid = param_grid, cv = 10)
    grid_search.fit(x_train_gb, y_train_gb)
    best_model = grid_search.best_estimator_
    print("Best parameters: ", grid_search.best_params_)
    print("Best score: ", grid_search.best_score_)

    # 2: FITTING THE MODEL
    model = GradientBoostingClassifier(n_estimators = best_model.n_estimators,
                                       learning_rate = best_model.learning_rate,
                                       max_depth = best_model.max_depth,
                                       min_samples_split = best_model.min_samples_split,
                                       min_samples_leaf = best_model.min_samples_leaf,
                                       random_state = 42)
    model.fit(x_train_gb, y_train_gb)

    # 3: ESTIMATING WEIGHTS
    weights = model.feature_importances_
    features = x_train_gb.columns

    # 4: COMPUTE TEST SET PREDICTIONS
    y_pred = model.predict(x_test_gb)

    # 5: COMPUTE METRICS
    accuracy_gb = model.score(x_test_gb, y_test_gb)
    macro_f1_gb = recall_score(y_test_gb, y_pred, average = "macro")
    micro_f1_gb = recall_score(y_test_gb, y_pred, average = "micro")
    mcc_gb = matthews_corrcoef(y_test_gb, y_pred)

    cm = confusion_matrix(y_test_gb, y_pred)
    precision_gb = cm[1, 1] / (cm[1, 1] + cm[0, 1])
    recall_gb = cm[1, 1] / (cm[1, 1] + cm[1, 0])

    # store metrics in a dictionary
    metrics = {
        "accuracy": accuracy_gb,
        "macro_f1": macro_f1_gb,
        "micro_f1": micro_f1_gb,
        "mcc": mcc_gb,
        "precision": precision_gb,
        "recall": recall_gb,
        "confusion_matrix": cm
    }

    return weights, features, y_pred, metrics

In [7]:
weights_gb, features_gb, y_pred_gb, metrics_gb = sklearn_gradient_boosting(X_test, X_train, y_train, y_test)

# save weights and predictions
weights_gb = pd.DataFrame([weights_gb], columns = features_gb)
weights_gb.to_csv("../exp/weights/weights_gb.csv", sep = ",", index = False)

y_pred_gb = pd.DataFrame(y_pred_gb, columns = ["y_pred"])
y_pred_gb.to_csv("../exp/predictions/y_pred_gb.csv", sep = ",", index = False)

Best parameters:  {'learning_rate': 0.01, 'max_depth': 3, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 200}
Best score:  0.7396000824572254


In [8]:
metrics_gb

{'accuracy': 0.6747967479674797,
 'macro_f1': 0.510346395815963,
 'micro_f1': 0.6747967479674797,
 'mcc': np.float64(0.05146304012202291),
 'precision': np.float64(0.4444444444444444),
 'recall': np.float64(0.05063291139240506),
 'confusion_matrix': array([[162,   5],
        [ 75,   4]])}