![](../images/itam_logo.png)

M. Sc. Liliana Millán Núñez liliana.millan@itam.mx

Noviembre 2020 


### Random Forest

Utilizando los mismos datos del Titanic

In [None]:
import os

import numpy as np
import pandas as pd

In [None]:
# Seed NumPy's legacy global random state so that runs are repeatable
SEED = 20201108
np.random.seed(SEED)

In [None]:
# Location of the Titanic CSV. The original absolute path stays as the default
# so existing setups keep working; set the TITANIC_CSV environment variable to
# point at the file on any other machine.
TITANIC_CSV = os.environ.get(
    "TITANIC_CSV",
    "/home/silil/Documents/itam/mineria_datos_licenciatura/data/titanic/titanic.csv",
)
titanic = pd.read_csv(TITANIC_CSV)
titanic.shape

In [None]:
def standarize_column_name(col):
    """Return ``col`` lower-cased, with every space and slash replaced by ``_``."""
    separators_to_underscore = str.maketrans({' ': '_', '/': '_'})
    return col.lower().translate(separators_to_underscore)

In [None]:
# Normalise every column label in place; rename applies the function to each label
titanic.rename(columns=standarize_column_name, inplace=True)
titanic.columns

In [None]:
### Encode the sex variable as an integer: 0 = male, 1 = female.
# The result is assigned back to the column instead of calling an in-place
# method on `titanic.sex`: under pandas Copy-on-Write such chained in-place
# edits no longer reach the parent frame.
# Any value other than 'male'/'female' (including an already encoded 0/1)
# passes through unchanged, so re-running this cell is harmless.
sex_codes = {'male': 0, 'female': 1}
titanic['sex'] = titanic['sex'].map(lambda value: sex_codes.get(value, value))

titanic.head()

In [None]:
## Split the frame into explanatory variables (X) and the target variable (y)
feature_columns = [
    'pclass',
    'sex',
    'age',
    'siblings_spouses_aboard',
    'parents_children_aboard',
    'fare',
]
X = titanic[feature_columns]
y = titanic['survived']

(X.shape, y.shape)

In [None]:
## Hold out 30% of the rows as the test set
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

## Check the sizes of the four resulting sets
for template, features, target in (
    ("entrenamiento: X: {}, y: {}", X_train, y_train),
    ("prueba: X: {}, y: {}", X_test, y_test),
):
    print(template.format(features.shape, target.shape))

In [None]:
## Random forest configuration
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
import time

# n_estimators is the number of trees we want to build
grid = {'n_estimators': [300, 500, 800, 1000], 'min_samples_leaf': [3,5,7,9,11], 
       'criterion':['gini','entropy']}

# random_state pins the forest's own sampling. Without it each fit draws from
# the global NumPy state, and fits that GridSearchCV runs in parallel worker
# processes are not guaranteed to see the seed set earlier in the notebook.
rf = RandomForestClassifier(oob_score=True, n_jobs=-1, random_state=20201108)
gs_rf = GridSearchCV(rf, grid, cv=2, scoring='precision', return_train_score=True, n_jobs=-1)

# perf_counter is the clock intended for measuring elapsed time
start = time.perf_counter()
# fit every random forest in the grid
gs_rf.fit(X_train, y_train)
print("tiempo de ejecución en segundos: {}".format(time.perf_counter() - start))

In [None]:
# best configuration found by the grid search
gs_rf.best_estimator_

In [None]:
# Number of features the best forest was fitted on. `n_features_in_` replaces
# `n_features_`, which was deprecated in scikit-learn 1.0 and removed in 1.2.
gs_rf.best_estimator_.n_features_in_

In [None]:
# out-of-bag score of the best forest (the forest was created with oob_score=True)
gs_rf.best_estimator_.oob_score_

¿Cuántos modelos estamos corriendo con esta configuración de hiperparámetros?

In [None]:
# class-label predictions on the test set using the best estimator
predicted_labels = gs_rf.predict(X_test)

In [None]:
# score (class-probability) predictions on the test set using the best estimator
predicted_scores = gs_rf.predict_proba(X_test)

In [None]:
# preview the first ten rows of predicted scores
predicted_scores[:10]

### Métricas de desempeño 

#### ROC y AUC

In [None]:
from sklearn.metrics import roc_curve, roc_auc_score

import matplotlib.pyplot as plt

%matplotlib inline

fpr, tpr, thresholds = roc_curve(y_test, predicted_scores[:,1], pos_label=1)

plt.clf()
plt.plot([0,1],[0,1], 'k--', c="red")
plt.plot(fpr, tpr)
plt.title("ROC best RF, AUC: {}".format(roc_auc_score(y_test, predicted_labels)))
plt.xlabel("fpr")
plt.ylabel("tpr")
plt.show()

#### Matriz de confusión

In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the default label predictions
# (scikit-learn convention: rows are true classes, columns are predicted classes)
confusion_matrix(y_test, predicted_labels)

#### Accuracy

In [None]:
from sklearn.metrics import accuracy_score

# fraction of test rows whose predicted label matches the true label
accuracy_score(y_test, predicted_labels)

In [None]:
#### Precision, recall, thresholds
from sklearn.metrics import precision_recall_curve

# precision/recall pairs for the positive-class scores (column 1), plus the
# thresholds at which they were evaluated
precision, recall, thresholds_2 = precision_recall_curve(y_test, predicted_scores[:,1], pos_label=1)

In [None]:
# precision_recall_curve returns one fewer threshold than precision/recall
# points; pad with 1 so the three arrays have the same length.
# The length check keeps the cell idempotent: re-running it no longer appends
# a second value and breaks the DataFrame built from these arrays.
if thresholds_2.shape[0] == precision.shape[0] - 1:
    thresholds_2 = np.append(thresholds_2, 1)

In [None]:
# the three arrays should now have the same length
(precision.shape, recall.shape, thresholds_2.shape)

#### Reporte de métricas

In [None]:
def get_metrics_report(fpr, tpr, thresholds, precision, recall, thresholds_2):
    """Combine ROC and precision-recall values into one table keyed by threshold.

    Parameters
    ----------
    fpr, tpr, thresholds : array-like
        Equal-length sequences of false-positive rate, true-positive rate and
        the threshold each pair belongs to.
    precision, recall, thresholds_2 : array-like
        Equal-length sequences of precision, recall and the threshold each
        pair belongs to.

    Returns
    -------
    pandas.DataFrame
        Columns ``threshold``, ``precision``, ``recall``, ``f1_score``,
        ``tpr``, ``fpr``, ``tnr`` and ``fnr``. The two tables are inner-joined
        on ``threshold``, so only thresholds present in both inputs are kept.
    """
    df_1 = pd.DataFrame({'threshold': thresholds_2,'precision': precision,
                    'recall': recall})
    denominator = df_1.precision + df_1.recall
    f1 = 2 * (df_1.precision * df_1.recall) / denominator
    # precision == recall == 0 gives 0/0 = NaN; F1 is defined as 0 in that case
    df_1['f1_score'] = f1.mask(denominator == 0, 0.0)

    df_2 = pd.DataFrame({'tpr': tpr, 'fpr': fpr, 'threshold': thresholds})
    df_2['tnr'] = 1 - df_2['fpr']
    df_2['fnr'] = 1 - df_2['tpr']

    df = df_1.merge(df_2, on="threshold")

    return df

In [None]:
# build the per-threshold metrics table and display it
metrics_report = get_metrics_report(fpr, tpr, thresholds, precision, recall, thresholds_2)
metrics_report

¿Cuál es el threshold que tendríamos que escoger para que la tasa de falsos positivos (FPR) no sea mayor a 6%?

In [None]:
# Keep only the operating points whose false-positive rate is at most 6%
within_fpr_budget = metrics_report["fpr"] <= 0.06
negocio = metrics_report.loc[within_fpr_budget]

In [None]:
# operating points that satisfy the false-positive-rate limit
negocio

¿Cuál es la matriz de confusión asociada a ese punto de corte? 

In [None]:
# Cut-off = threshold of the first row that satisfies the FPR limit
punto_corte = negocio["threshold"].iloc[0]
punto_corte

In [None]:
# Label 0 where the positive-class score is strictly below the cut-off, 1 otherwise
below_cutoff = predicted_scores[:,1] < punto_corte
new_labels = (~below_cutoff).astype(int).tolist()

In [None]:
# preview the first ten re-thresholded labels
new_labels[:10]

In [None]:
# number of test rows labelled positive (1) at the new cut-off
pd.Series(new_labels).sum()

In [None]:
# confusion matrix using the labels obtained with the chosen cut-off
confusion_matrix(y_test, new_labels)

In [None]:
# confusion matrix of the original default predictions, for comparison
confusion_matrix(y_test, predicted_labels)