# Machine Learning: Random Forest

## 0 Imports

In [1]:
import pandas as pd
from sklearn.metrics import confusion_matrix, recall_score, matthews_corrcoef
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier

In [2]:
data = pd.read_csv("../dat/dips/DIPS_Data_cleaned.csv", sep = ",", low_memory = False)
data_pred = pd.read_csv("../dat/dips/DIPS_Data_cleaned_pred.csv", sep = ",", low_memory = False)
data_pred_y = pd.read_csv("../dat/dips/DIPS_Data_cleaned_pred_y.csv", sep = ",", low_memory = False)

## 1 Random Forest Model

In [3]:
# Predictors
X = data_pred
X = X.drop(["hpi"], axis = 1)

# Target
y = data["hpi"]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

print("Logistic Regression \n",
      "X_train shape: ", X_train.shape, round(X_train.shape[0]/len(X), 2), "\n",
      "X_test shape: ", X_test.shape, round(X_test.shape[0]/len(X), 2),  "\n",
      "y_train shape: ", y_train.shape, round(y_train.shape[0]/len(y), 2), "\n",
      "y_test shape: ", y_test.shape, round(y_test.shape[0]/len(y), 2), "\n")

Logistic Regression 
 X_train shape:  (1005, 15) 0.8 
 X_test shape:  (252, 15) 0.2 
 y_train shape:  (1005,) 0.8 
 y_test shape:  (252,) 0.2 



In [4]:
def sklearn_random_forest(x_test_rf, x_train_rf, y_train_rf, y_test_rf):
    """Computes OLS weights for linear regression without regularization using the sklearn library on the training set and
       returns weights, testset predictions and metrics.
    """

    # 1: GRID SEARCH
    rfc_model = RandomForestClassifier(random_state = 42)

    param_grid = {
        'n_estimators': [100, 200, 300],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    }

    grid_search = GridSearchCV(estimator = rfc_model, param_grid = param_grid, cv = 10)
    grid_search.fit(x_train_rf, y_train_rf)
    best_model = grid_search.best_estimator_
    print("Best parameters: ", grid_search.best_params_)
    print("Best score: ", grid_search.best_score_)

    # # 2: FITTING THE MODEL
    model = RandomForestClassifier(n_estimators = best_model.n_estimators,
                                   max_depth = best_model.max_depth,
                                   min_samples_split = best_model.min_samples_split,
                                   min_samples_leaf = best_model.min_samples_leaf,
                                   random_state = 42)

    # fitting it on the training data
    model.fit(x_train_rf, y_train_rf)

    # 2: ESTIMATING WEIGHTS
    weights_randf = model.feature_importances_
    features_randf = model.feature_names_in_

    # 3: COMPUTE TEST SET PREDICTIONS
    y_pred_randf = model.predict(x_test_rf)
    y_pred_proba_randf = model.predict_proba(x_test_rf)

    # 4: COMPUTE METRICS
    accuracy_rf = model.score(x_test_rf, y_test_rf)
    macro_f1_rf = recall_score(y_test_rf, y_pred_randf, average = "macro")
    micro_f1_rf = recall_score(y_test_rf, y_pred_randf, average = "micro")
    mcc_rf = matthews_corrcoef(y_test_rf, y_pred_randf)

    cm_rf = confusion_matrix(y_test_rf, y_pred_randf)
    precision_rf = cm_rf[1][1] / (cm_rf[1][1] + cm_rf[0][1])
    recall_rf = cm_rf[1][1] / (cm_rf[1][1] + cm_rf[1][0])

    # store metrics in a dictionary
    metrics_randf = {
        "accuracy": round(accuracy_rf, 4),
        "macro_f1": round(macro_f1_rf, 4),
        "micro_f1": round(micro_f1_rf, 4),
        "mcc": round(mcc_rf, 4),
        "precision": round(precision_rf, 4),
        "recall": round(recall_rf, 4),
        "confusion_matrix": cm_rf
    }

    return weights_randf, y_pred_randf, y_pred_proba_randf, features_randf, metrics_randf

In [5]:
weights_rf, y_pred_rf, y_pred_proba_rf, features_rf, metrics_rf = sklearn_random_forest(X_test, X_train, y_train, y_test)

# save weights and predictions
weights_rf = pd.DataFrame(weights_rf, columns = features_rf)
weights_rf.to_csv("../exp/weights/weights_rf.csv", sep = ",", index = False)

y_pred_rf = pd.DataFrame(y_pred_rf, columns = ["y_pred"])
y_pred_rf.to_csv("../exp/predictions/y_pred_rf.csv", sep = ",", index = False)

Best parameters:  {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 100}
Best score:  0.7373168316831682


ValueError: Shape of passed values is (15, 1), indices imply (15, 15)

In [6]:
metrics_rf

{'accuracy': 0.6944,
 'macro_f1': 0.5507,
 'micro_f1': 0.6944,
 'mcc': np.float64(0.1751),
 'precision': np.float64(0.6),
 'recall': np.float64(0.1481),
 'confusion_matrix': array([[163,   8],
        [ 69,  12]])}

In [19]:
# convert weights_rf to dataframe
weights_rf_df = pd.DataFrame(weights_rf, columns = features_rf)

ValueError: Shape of passed values is (15, 1), indices imply (15, 15)

In [14]:
features_rf

array(['whi_ges', 'soz_ges', 'gke_ges', 'lzh_ges', 'ile', 'woc_gesp',
       'woc_gese', 'asi_ges', 'bsq_ges', 'das_ges', 'scl_gsi', 'alter',
       'bild', 'bmi_kat', 'ses_kom'], dtype=object)