# Machine Learning: Deep Neural Network - Multilayer Perceptron

## 0 Imports

In [1]:
import pandas as pd
import numpy as np
from scipy.constants import micro
from sklearn.metrics import confusion_matrix, recall_score, matthews_corrcoef
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neural_network import MLPClassifier

In [2]:
data = pd.read_csv("../dat/dips/DIPS_Data_cleaned.csv", sep = ",", low_memory = False)
data_change = pd.read_csv("../dat/dips/DIPS_Data_cleaned_change.csv", sep = ",", low_memory = False)
data_pred = pd.read_csv("../dat/dips/DIPS_Data_cleaned_pred.csv", sep = ",", low_memory = False)
data_pred_y = pd.read_csv("../dat/dips/DIPS_Data_cleaned_pred_y.csv", sep = ",", low_memory = False)

## 1 DNN Model

In [3]:
# Predictors
X = data_change
X = X.drop(["hpi"], axis = 1)

# Target
y = data_change["hpi"]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42, stratify = y)

print("DNN \n",
      "X_train shape: ", X_train.shape, round(X_train.shape[0]/len(X), 2), "\n",
      "X_test shape: ", X_test.shape, round(X_test.shape[0]/len(X), 2),  "\n",
      "y_train shape: ", y_train.shape, round(y_train.shape[0]/len(y), 2), "\n",
      "y_test shape: ", y_test.shape, round(y_test.shape[0]/len(y), 2), "\n")

DNN 
 X_train shape:  (983, 22) 0.8 
 X_test shape:  (246, 22) 0.2 
 y_train shape:  (983,) 0.8 
 y_test shape:  (246,) 0.2 



In [4]:
def sklearn_dnn(x_test_dnn, x_train_dnn, y_train_dnn, y_test_dnn):
    """Computes OLS weights for linear regression without regularization using the sklearn library on the training set and
       returns weights, testset predictions and metrics.
    """

    # 1: GRID SEARCH
    dnn_model = MLPClassifier(random_state = 42)

    param_grid = {
        'hidden_layer_sizes': [(50,), (100,), (50, 50)],
        'activation': ['relu', 'tanh'],
        'solver': ['adam', 'sgd'],
        'alpha': [0.0001, 0.001],
        'learning_rate': ['constant', 'adaptive']
    }

    grid_search = GridSearchCV(estimator = dnn_model, param_grid = param_grid, cv = 10)
    grid_search.fit(x_train_dnn, y_train_dnn)
    best_model = grid_search.best_estimator_
    print("Best parameters: ", grid_search.best_params_)
    print("Best score: ", grid_search.best_score_)

    # 2: FITTING THE MODEL
    model = MLPClassifier(hidden_layer_sizes = best_model.hidden_layer_sizes,
                          activation = best_model.activation,
                          solver = best_model.solver,
                          alpha = best_model.alpha,
                          learning_rate = best_model.learning_rate,
                          random_state = 42)
    model.fit(x_train_dnn, y_train_dnn)

    # 3: ESTIMATING WEIGHTS
    weights_d = model.coefs_
    features_d = model.feature_names_in_

    # 4: COMPUTE TEST SET PREDICTIONS
    y_pred_d = model.predict(x_test_dnn)
    y_pred_proba_d = model.predict_proba(x_test_dnn)

    # 5: COMPUTE METRICS
    accuracy_dnn = model.score(x_test_dnn, y_test_dnn)
    macro_f1_dnn = recall_score(y_test_dnn, y_pred_d, average = "macro")
    micro_f1_dnn = recall_score(y_test_dnn, y_pred_d, average = "micro")
    mcc_dnn = matthews_corrcoef(y_test_dnn, y_pred_d)

    cm_dnn = confusion_matrix(y_test_dnn, y_pred_d)
    precision_dnn = cm_dnn[1, 1] / (cm_dnn[1, 1] + cm_dnn[0, 1])
    recall_dnn = cm_dnn[1, 1] / (cm_dnn[1, 1] + cm_dnn[1, 0])

    # store metrics in a dictionary
    metrics_d = {
        "accuracy": accuracy_dnn,
        "macro_f1": macro_f1_dnn,
        "micro_f1": micro_f1_dnn,
        "mcc": mcc_dnn,
        "precision": precision_dnn,
        "recall": recall_dnn,
        "confusion_matrix": cm_dnn
    }

    return weights_d, features_d, y_pred_d, y_pred_proba_d, metrics_d

In [5]:
weights_dnn, features_dnn, y_pred_dnn, y_pred_proba_dnn, metrics_dnn = sklearn_dnn(X_test, X_train, y_train, y_test)

# save weights and predictions
first_layer_weights = weights_dnn[0]
importance_scores = np.mean(np.abs(first_layer_weights), axis=1)
weights_dnn_df = pd.DataFrame([importance_scores], columns = features_dnn)
weights_dnn_df.to_csv("../exp/weights/weights_dnn.csv", sep = ",", index = False)

y_pred_dnn = pd.DataFrame(y_pred_dnn, columns = ["y_pred"])
y_pred_dnn.to_csv("../exp/predictions/y_pred_dnn.csv", sep = ",", index = False)



Best parameters:  {'activation': 'tanh', 'alpha': 0.0001, 'hidden_layer_sizes': (50, 50), 'learning_rate': 'constant', 'solver': 'sgd'}
Best score:  0.7313543599257886


In [6]:
metrics_dnn

{'accuracy': 0.7276422764227642,
 'macro_f1': 0.563129452223041,
 'micro_f1': 0.7276422764227642,
 'mcc': np.float64(0.19115738775906188),
 'precision': np.float64(0.5416666666666666),
 'recall': np.float64(0.18840579710144928),
 'confusion_matrix': array([[166,  11],
        [ 56,  13]])}