# Machine Learning: Random Forest

## 0 Imports

In [1]:
import pandas as pd
import numpy as np
import time
from sklearn.metrics import confusion_matrix, recall_score, matthews_corrcoef
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier

In [2]:
data_change = pd.read_csv("../../dat/dips/DIPS_Data_cleaned_change.csv", sep =",", low_memory = False)

## 1 Random Forest Model

In [3]:
# Predictors
X = data_change
X = X.drop(["hpi"], axis = 1)

# Target
y = data_change["hpi"]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42, stratify = y)

print("Random Forest \n",
      "X_train shape: ", X_train.shape, round(X_train.shape[0]/len(X), 2), "\n",
      "X_test shape: ", X_test.shape, round(X_test.shape[0]/len(X), 2),  "\n",
      "y_train shape: ", y_train.shape, round(y_train.shape[0]/len(y), 2), "\n",
      "y_test shape: ", y_test.shape, round(y_test.shape[0]/len(y), 2), "\n")

Random Forest 
 X_train shape:  (983, 22) 0.8 
 X_test shape:  (246, 22) 0.2 
 y_train shape:  (983,) 0.8 
 y_test shape:  (246,) 0.2 



In [6]:
def sklearn_random_forest(x_test_rf, x_train_rf, y_train_rf, y_test_rf):

    # 1: GRID SEARCH
    rfc_model = RandomForestClassifier(random_state = 42)

    # param_grid = {
    #     'n_estimators': [100, 200, 300],
    #     'max_depth': [None, 10, 20, 30],
    #     'min_samples_split': [2, 5, 10],
    #     'min_samples_leaf': [1, 2, 4]
    # }

    param_grid = {
        'bootstrap': [True],
        'max_depth': [None, 10, 30, 50],
        'max_features': [2, 5, 10],
        'min_samples_leaf': [1, 3, 5],
        'min_samples_split': [2, 5, 10, 12],
        'n_estimators': [100, 200, 300, 500],
        'criterion': ['gini', 'entropy']
    }

    grid_search = GridSearchCV(
        estimator = rfc_model,
        param_grid = param_grid,
        cv = 10
    )
    grid_search.fit(x_train_rf, y_train_rf)
    best_model = grid_search.best_estimator_
    print("Best parameters: ", grid_search.best_params_)
    print("Best score: ", grid_search.best_score_)

    # 2: FITTING THE MODEL
    model = RandomForestClassifier(
        bootstrap = best_model.bootstrap,
        max_depth = best_model.max_depth,
        max_features = best_model.max_features,
        min_samples_leaf = best_model.min_samples_leaf,
        min_samples_split = best_model.min_samples_split,
        n_estimators = best_model.n_estimators,
        criterion = best_model.criterion,
        random_state = 42
    )
    model.fit(x_train_rf, y_train_rf)

    # 3: ESTIMATING WEIGHTS
    weights_randf = model.feature_importances_
    features_randf = model.feature_names_in_

    # 4: COMPUTE TEST SET PREDICTIONS
    y_pred_randf = model.predict(x_test_rf)
    y_pred_proba_randf = model.predict_proba(x_test_rf)

    # 5: COMPUTE METRICS
    accuracy_rf = model.score(x_test_rf, y_test_rf)
    macro_f1_rf = recall_score(y_test_rf, y_pred_randf, average = "macro")
    micro_f1_rf = recall_score(y_test_rf, y_pred_randf, average = "micro")
    mcc_rf = matthews_corrcoef(y_test_rf, y_pred_randf)

    cm_rf = confusion_matrix(y_test_rf, y_pred_randf)
    precision_rf = cm_rf[1][1] / (cm_rf[1][1] + cm_rf[0][1])
    recall_rf = cm_rf[1][1] / (cm_rf[1][1] + cm_rf[1][0])

    # store metrics in a dictionary
    metrics_randf = {
        "accuracy": round(accuracy_rf, 4),
        "macro_f1": round(macro_f1_rf, 4),
        "micro_f1": round(micro_f1_rf, 4),
        "mcc": round(mcc_rf, 4),
        "precision": round(precision_rf, 4),
        "recall": round(recall_rf, 4),
        "confusion_matrix": cm_rf
    }

    return weights_randf, y_pred_randf, y_pred_proba_randf, features_randf, metrics_randf

In [7]:
start = time.time()

weights_rf, y_pred_rf, y_pred_proba_rf, features_rf, metrics_rf = sklearn_random_forest(X_test, X_train, y_train, y_test)

end = time.time()
print("Execution time: ", {end - start}, "seconds")
time_rf = end - start
time_rf_df = pd.DataFrame({"time": [time_rf]})
# time_rf_df.to_csv("../exp/times_ML/time_rf.csv", sep = ",", index = False)

y_pred_rf = pd.DataFrame(y_pred_rf, columns = ["y_pred"])
y_pred_rf.to_csv("y_pred_ML/y_pred_rf.csv", sep = ",", index = False)

KeyboardInterrupt: 

In [6]:
metrics_rf

{'accuracy': 0.7276,
 'macro_f1': 0.5676,
 'micro_f1': 0.7276,
 'mcc': np.float64(0.1974),
 'precision': np.float64(0.5385),
 'recall': np.float64(0.2029),
 'confusion_matrix': array([[165,  12],
        [ 55,  14]])}