# **Caso: Reclamos en una Empresa de Seguros**

El objetivo es construir un modelo para poder calcular la probabilidad de que un cliente presente un reclamo de seguro. Como la proporción de clientes que cumplen con esta condición es reducida, aprovecharemos este caso para probar distintos métodos de balanceo de muestras.

https://www.kaggle.com/c/porto-seguro-safe-driver-prediction/data


In [None]:
## Podemos hacer el balanceo de manera artesanal, para entender las lógicas.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

### **Carga de la base de datos**

Cargamos el conjunto de entrenamiento de la competencia Porto Seguro (enlace anterior). La variable `target` indica si el cliente presentó un reclamo de seguro y es la que buscamos predecir.

In [None]:
train = pd.read_csv('../data/train.csv')

In [None]:
train.head(5)

In [None]:
train.target.value_counts(normalize=True)

In [None]:
train.target.value_counts()

In [None]:
# Bar chart with the number of rows per target class.
# `factorplot` was renamed to `catplot` and later removed from seaborn, and the
# plotted variable must be passed by keyword in current releases.
sns.catplot(x='target', data=train, kind='count')

In [None]:
train.dtypes

In [None]:
# Reserve 10% of the rows as a hold-out set: draw 90% for modeling with a fixed
# seed and keep whatever was not drawn as the unseen data.
df = train.sample(frac=0.9, random_state=1000)
df_unseen = train.drop(df.index).reset_index(drop=True)
df = df.reset_index(drop=True)
print(f'Data for Modeling: {df.shape}')
print(f'Unseen Data For Predictions: {df_unseen.shape}')

In [None]:
df.target.value_counts()

### Modelos con la muestra completa

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb

In [None]:
# limpieza básica
df = df.drop(['id'], axis=1)

In [None]:
#Separación de predictoras y predicha
X = df.drop('target', axis=1)
y = df['target']

In [None]:
#Creación de muestras de train y test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=22)

In [None]:
X_train.shape, y_train.shape

In [None]:
X_test.shape, y_test.shape

### Modelo Decision Tree

In [None]:
tree = DecisionTreeClassifier()
tree_model = tree.fit(X_train, y_train)
Y_pred = tree_model.predict(X_test)

In [None]:
ind = pd.DataFrame(columns = ['Modelo', 'Muestra', 'Tamaño', 'Accuracy', 'Precision', 'Recall', 'F1Score'])

In [None]:
# Confusion matrix of the decision tree on the internal test split.
print("Matriz confusion: Test")
print(confusion_matrix(y_test, Y_pred))

# DataFrame.append was removed in pandas 2.0, so the metrics row is built as a
# one-row frame and concatenated onto the running summary table instead.
fila = pd.DataFrame([{
    'Modelo': 'DecisionTree',
    'Muestra': 'Test',
    'Tamaño': len(Y_pred),
    'Accuracy': accuracy_score(y_test, Y_pred),
    'Precision': precision_score(y_test, Y_pred),
    'Recall': recall_score(y_test, Y_pred),
    'F1Score': f1_score(y_test, Y_pred),
}])
# The summary table may still be empty here; using the row directly avoids
# concatenating an empty frame, which recent pandas versions warn about.
ind = fila if ind.empty else pd.concat([ind, fila], ignore_index=True)

In [None]:
# Score the hold-out rows and store the predicted probabilities.
y_real = df_unseen['target']
X_unseen = df_unseen.drop(['id', 'target'], axis=1)  # computed once, reused for both predictions
Y_pred = tree_model.predict(X_unseen)
U_pred = tree_model.predict_proba(X_unseen)[:, 1]  # second column of predict_proba

# Persist id, actual label and predicted probability side by side.
datos = np.hstack((
    df_unseen['id'].values.reshape(-1, 1),
    df_unseen['target'].values.reshape(-1, 1),
    U_pred.reshape(-1, 1),
))
df_submmit = pd.DataFrame(datos, columns=['id', 'real', 'prob'])
df_submmit.to_csv('../data/com_tree.csv', index=False)

print("Matriz confusion: Unseen")
print(confusion_matrix(y_real, Y_pred))

# DataFrame.append was removed in pandas 2.0; concatenate a one-row frame instead.
fila = pd.DataFrame([{
    'Modelo': 'DecisionTree',
    'Muestra': 'Reserva',
    'Tamaño': len(Y_pred),
    'Accuracy': accuracy_score(y_real, Y_pred),
    'Precision': precision_score(y_real, Y_pred),
    'Recall': recall_score(y_real, Y_pred),
    'F1Score': f1_score(y_real, Y_pred),
}])
ind = pd.concat([ind, fila], ignore_index=True)
ind.head()

### Modelo XGBoost

In [None]:
xgb_mod=xgb.XGBClassifier()
xgb_mod.fit(X_train, y_train)
Y_pred= xgb_mod.predict(X_test)

In [None]:
# Confusion matrix of the XGBoost model on the internal test split.
print("Matriz confusion: Test")
print(confusion_matrix(y_test, Y_pred))

# DataFrame.append was removed in pandas 2.0; concatenate a one-row frame instead.
fila = pd.DataFrame([{
    'Modelo': 'XGBoost',
    'Muestra': 'Test',
    'Tamaño': len(Y_pred),
    'Accuracy': accuracy_score(y_test, Y_pred),
    'Precision': precision_score(y_test, Y_pred),
    'Recall': recall_score(y_test, Y_pred),
    'F1Score': f1_score(y_test, Y_pred),
}])
ind = pd.concat([ind, fila], ignore_index=True)

In [None]:
# Score the hold-out rows and store the predicted probabilities.
y_real = df_unseen['target']
X_unseen = df_unseen.drop(['id', 'target'], axis=1)  # computed once, reused for both predictions
Y_pred = xgb_mod.predict(X_unseen)
U_pred = xgb_mod.predict_proba(X_unseen)[:, 1]  # second column of predict_proba

# Persist id, actual label and predicted probability side by side.
datos = np.hstack((
    df_unseen['id'].values.reshape(-1, 1),
    df_unseen['target'].values.reshape(-1, 1),
    U_pred.reshape(-1, 1),
))
df_submmit = pd.DataFrame(datos, columns=['id', 'real', 'prob'])
df_submmit.to_csv('../data/com_xgb.csv', index=False)

print("Matriz confusion: Unseen")
print(confusion_matrix(y_real, Y_pred))

# DataFrame.append was removed in pandas 2.0; concatenate a one-row frame instead.
fila = pd.DataFrame([{
    'Modelo': 'XGBoost',
    'Muestra': 'Reserva',
    'Tamaño': len(Y_pred),
    'Accuracy': accuracy_score(y_real, Y_pred),
    'Precision': precision_score(y_real, Y_pred),
    'Recall': recall_score(y_real, Y_pred),
    'F1Score': f1_score(y_real, Y_pred),
}])
ind = pd.concat([ind, fila], ignore_index=True)
ind.head()

### Modelo RandomForest

In [None]:
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
Y_pred=rf.predict(X_test)

In [None]:
# Confusion matrix of the random forest on the internal test split.
print("Matriz confusion: Test")
print(confusion_matrix(y_test, Y_pred))

# DataFrame.append was removed in pandas 2.0; concatenate a one-row frame instead.
fila = pd.DataFrame([{
    'Modelo': 'RandomForest',
    'Muestra': 'Test',
    'Tamaño': len(Y_pred),
    'Accuracy': accuracy_score(y_test, Y_pred),
    'Precision': precision_score(y_test, Y_pred),
    'Recall': recall_score(y_test, Y_pred),
    'F1Score': f1_score(y_test, Y_pred),
}])
ind = pd.concat([ind, fila], ignore_index=True)

In [None]:
# Score the hold-out rows and store the predicted probabilities.
y_real = df_unseen['target']
X_unseen = df_unseen.drop(['id', 'target'], axis=1)  # computed once, reused for both predictions
Y_pred = rf.predict(X_unseen)
U_pred = rf.predict_proba(X_unseen)[:, 1]  # second column of predict_proba

# Persist id, actual label and predicted probability side by side.
datos = np.hstack((
    df_unseen['id'].values.reshape(-1, 1),
    df_unseen['target'].values.reshape(-1, 1),
    U_pred.reshape(-1, 1),
))
df_submmit = pd.DataFrame(datos, columns=['id', 'real', 'prob'])
df_submmit.to_csv('../data/com_rf.csv', index=False)

print("Matriz confusion: Unseen")
print(confusion_matrix(y_real, Y_pred))

# DataFrame.append was removed in pandas 2.0; concatenate a one-row frame instead.
fila = pd.DataFrame([{
    'Modelo': 'RandomForest',
    'Muestra': 'Reserva',
    'Tamaño': len(Y_pred),
    'Accuracy': accuracy_score(y_real, Y_pred),
    'Precision': precision_score(y_real, Y_pred),
    'Recall': recall_score(y_real, Y_pred),
    'F1Score': f1_score(y_real, Y_pred),
}])
ind = pd.concat([ind, fila], ignore_index=True)
ind

## **Resampling**

In [None]:
# Conteo de clases
# value_counts() orders its result by frequency, not by label, so unpacking it
# positionally only works while class 0 is the majority. Looking each count up
# by its label keeps the names correct regardless of the class balance.
conteos = df['target'].value_counts()
count_class_0 = int(conteos.loc[0])
count_class_1 = int(conteos.loc[1])

In [None]:
print('Cantidades por fila de clase:')
print('Class_0:',count_class_0)
print('Class_1:',count_class_1)

In [None]:
# Dividiendo los sets por clases
df_class_0 = df[df['target'] == 0]
df_class_1 = df[df['target'] == 1]

In [None]:
print('Dimensión por tipo de clases:')
print('dim class_0:',df_class_0.shape)
print('dim class_1:',df_class_1.shape)

### 1. Random under-sampling (Submuestreo aleatorio)

In [None]:
# Draw four class-0 rows for every class-1 row and keep all class-1 rows.
# The draw is seeded (same seed as the hold-out split) so that the section
# produces the same sample, and therefore comparable metrics, on every run.
df_class_0_under = df_class_0.sample(count_class_1 * 4, random_state=1000)
df_under = pd.concat([df_class_0_under, df_class_1], axis=0)

In [None]:
print('Dimensión por tipo de clases generadas:')
print('dim df_class_0_under:',df_class_0_under.shape)
print('dim df_under:',df_under.shape)

In [None]:
print('Random under-sampling:')
print(df_under.target.value_counts())

In [None]:
df_under.target.value_counts().plot(kind='bar', title='Count (target)');

In [None]:
#Separación de predictoras y predicha
X = df_under.drop('target', axis=1)
y = df_under['target']

In [None]:
#Creación de muestras de train y test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=22)

### Modelo Decision Tree

In [None]:
tree = DecisionTreeClassifier()
tree_model = tree.fit(X_train, y_train)
Y_pred = tree_model.predict(X_test)

In [None]:
# Confusion matrix of the decision tree on the under-sampled test split.
print("Matriz confusion: Test")
print(confusion_matrix(y_test, Y_pred))

# DataFrame.append was removed in pandas 2.0; concatenate a one-row frame instead.
fila = pd.DataFrame([{
    'Modelo': 'DecisionTree Under',
    'Muestra': 'Test',
    'Tamaño': len(Y_pred),
    'Accuracy': accuracy_score(y_test, Y_pred),
    'Precision': precision_score(y_test, Y_pred),
    'Recall': recall_score(y_test, Y_pred),
    'F1Score': f1_score(y_test, Y_pred),
}])
ind = pd.concat([ind, fila], ignore_index=True)

In [None]:
# Score the hold-out rows and store the predicted probabilities.
y_real = df_unseen['target']
X_unseen = df_unseen.drop(['id', 'target'], axis=1)  # computed once, reused for both predictions
Y_pred = tree_model.predict(X_unseen)
U_pred = tree_model.predict_proba(X_unseen)[:, 1]  # second column of predict_proba

# Persist id, actual label and predicted probability side by side.
datos = np.hstack((
    df_unseen['id'].values.reshape(-1, 1),
    df_unseen['target'].values.reshape(-1, 1),
    U_pred.reshape(-1, 1),
))
df_submmit = pd.DataFrame(datos, columns=['id', 'real', 'prob'])
df_submmit.to_csv('../data/under_tree.csv', index=False)

print("Matriz confusion: Unseen")
print(confusion_matrix(y_real, Y_pred))

# DataFrame.append was removed in pandas 2.0; concatenate a one-row frame instead.
fila = pd.DataFrame([{
    'Modelo': 'DecisionTree Under',
    'Muestra': 'Reserva',
    'Tamaño': len(Y_pred),
    'Accuracy': accuracy_score(y_real, Y_pred),
    'Precision': precision_score(y_real, Y_pred),
    'Recall': recall_score(y_real, Y_pred),
    'F1Score': f1_score(y_real, Y_pred),
}])
ind = pd.concat([ind, fila], ignore_index=True)
ind

### Modelo XGBoost

In [None]:
xgb_mod=xgb.XGBClassifier()
xgb_mod.fit(X_train, y_train)
Y_pred= xgb_mod.predict(X_test)

In [None]:
# Confusion matrix of the XGBoost model on the under-sampled test split.
print("Matriz confusion: Test")
print(confusion_matrix(y_test, Y_pred))

# DataFrame.append was removed in pandas 2.0; concatenate a one-row frame instead.
fila = pd.DataFrame([{
    'Modelo': 'XGBoost Under',
    'Muestra': 'Test',
    'Tamaño': len(Y_pred),
    'Accuracy': accuracy_score(y_test, Y_pred),
    'Precision': precision_score(y_test, Y_pred),
    'Recall': recall_score(y_test, Y_pred),
    'F1Score': f1_score(y_test, Y_pred),
}])
ind = pd.concat([ind, fila], ignore_index=True)

In [None]:
# Score the hold-out rows and store the predicted probabilities.
y_real = df_unseen['target']
X_unseen = df_unseen.drop(['id', 'target'], axis=1)  # computed once, reused for both predictions
Y_pred = xgb_mod.predict(X_unseen)
U_pred = xgb_mod.predict_proba(X_unseen)[:, 1]  # second column of predict_proba

# Persist id, actual label and predicted probability side by side.
datos = np.hstack((
    df_unseen['id'].values.reshape(-1, 1),
    df_unseen['target'].values.reshape(-1, 1),
    U_pred.reshape(-1, 1),
))
df_submmit = pd.DataFrame(datos, columns=['id', 'real', 'prob'])
df_submmit.to_csv('../data/under_xgb.csv', index=False)

print("Matriz confusion: Unseen")
print(confusion_matrix(y_real, Y_pred))

# DataFrame.append was removed in pandas 2.0; concatenate a one-row frame instead.
fila = pd.DataFrame([{
    'Modelo': 'XGBoost Under',
    'Muestra': 'Reserva',
    'Tamaño': len(Y_pred),
    'Accuracy': accuracy_score(y_real, Y_pred),
    'Precision': precision_score(y_real, Y_pred),
    'Recall': recall_score(y_real, Y_pred),
    'F1Score': f1_score(y_real, Y_pred),
}])
ind = pd.concat([ind, fila], ignore_index=True)
ind

### Modelo RandomForest

In [None]:
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
Y_pred=rf.predict(X_test)

In [None]:
# Confusion matrix of the random forest on the under-sampled test split.
print("Matriz confusion: Test")
print(confusion_matrix(y_test, Y_pred))

# DataFrame.append was removed in pandas 2.0; concatenate a one-row frame instead.
fila = pd.DataFrame([{
    'Modelo': 'RandomForest Under',
    'Muestra': 'Test',
    'Tamaño': len(Y_pred),
    'Accuracy': accuracy_score(y_test, Y_pred),
    'Precision': precision_score(y_test, Y_pred),
    'Recall': recall_score(y_test, Y_pred),
    'F1Score': f1_score(y_test, Y_pred),
}])
ind = pd.concat([ind, fila], ignore_index=True)

In [None]:
# Score the hold-out rows and store the predicted probabilities.
y_real = df_unseen['target']
X_unseen = df_unseen.drop(['id', 'target'], axis=1)  # computed once, reused for both predictions
Y_pred = rf.predict(X_unseen)
U_pred = rf.predict_proba(X_unseen)[:, 1]  # second column of predict_proba

# Persist id, actual label and predicted probability side by side.
datos = np.hstack((
    df_unseen['id'].values.reshape(-1, 1),
    df_unseen['target'].values.reshape(-1, 1),
    U_pred.reshape(-1, 1),
))
df_submmit = pd.DataFrame(datos, columns=['id', 'real', 'prob'])
df_submmit.to_csv('../data/under_rf.csv', index=False)

print("Matriz confusion: Unseen")
print(confusion_matrix(y_real, Y_pred))

# DataFrame.append was removed in pandas 2.0; concatenate a one-row frame instead.
fila = pd.DataFrame([{
    'Modelo': 'RandomForest Under',
    'Muestra': 'Reserva',
    'Tamaño': len(Y_pred),
    'Accuracy': accuracy_score(y_real, Y_pred),
    'Precision': precision_score(y_real, Y_pred),
    'Recall': recall_score(y_real, Y_pred),
    'F1Score': f1_score(y_real, Y_pred),
}])
ind = pd.concat([ind, fila], ignore_index=True)
ind

### 2. Random over-sampling (Sobremuestreo aleatorio)

In [None]:
# Re-draw class-1 rows with replacement until they number a quarter of class 0,
# then stack them under the untouched class-0 rows.
# - int(...) guarantees an integer sample size: round() on a NumPy float is not
#   guaranteed to return a Python int on every NumPy version.
# - random_state pins the draw (same seed as the hold-out split) so the section
#   is reproducible.
# NOTE(review): the train/test split is taken from this frame afterwards, so
# copies of the same class-1 row can land in both parts; metrics on that test
# split are likely optimistic. The hold-out ("Reserva") rows are not resampled.
df_class_1_over = df_class_1.sample(int(round(count_class_0 / 4)), replace=True, random_state=1000)
df_over = pd.concat([df_class_0, df_class_1_over], axis=0)

In [None]:
print('Dimensión por tipo de clases generadas:')
print('dim df_class_1_over:',df_class_1_over.shape)
print('dim df_over:',df_over.shape)

In [None]:
print('Random over-sampling:')
print(df_over.target.value_counts())

In [None]:
df_over.target.value_counts().plot(kind='bar', title='Count (target)');

In [None]:
#Separación de predictoras y predicha
X = df_over.drop('target', axis=1)
y = df_over['target']

In [None]:
#Creación de muestras de train y test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=22)

### Modelo Decision Tree

In [None]:
tree = DecisionTreeClassifier()
tree_model = tree.fit(X_train, y_train)
Y_pred = tree_model.predict(X_test)

In [None]:
# Confusion matrix of the decision tree on the over-sampled test split.
print("Matriz confusion: Test")
print(confusion_matrix(y_test, Y_pred))

# DataFrame.append was removed in pandas 2.0; concatenate a one-row frame instead.
fila = pd.DataFrame([{
    'Modelo': 'DecisionTree Over',
    'Muestra': 'Test',
    'Tamaño': len(Y_pred),
    'Accuracy': accuracy_score(y_test, Y_pred),
    'Precision': precision_score(y_test, Y_pred),
    'Recall': recall_score(y_test, Y_pred),
    'F1Score': f1_score(y_test, Y_pred),
}])
ind = pd.concat([ind, fila], ignore_index=True)

In [None]:
# Score the hold-out rows and store the predicted probabilities.
y_real = df_unseen['target']
X_unseen = df_unseen.drop(['id', 'target'], axis=1)  # computed once, reused for both predictions
Y_pred = tree_model.predict(X_unseen)
U_pred = tree_model.predict_proba(X_unseen)[:, 1]  # second column of predict_proba

# Persist id, actual label and predicted probability side by side.
datos = np.hstack((
    df_unseen['id'].values.reshape(-1, 1),
    df_unseen['target'].values.reshape(-1, 1),
    U_pred.reshape(-1, 1),
))
df_submmit = pd.DataFrame(datos, columns=['id', 'real', 'prob'])
df_submmit.to_csv('../data/over_tree.csv', index=False)

print("Matriz confusion: Unseen")
print(confusion_matrix(y_real, Y_pred))

# DataFrame.append was removed in pandas 2.0; concatenate a one-row frame instead.
fila = pd.DataFrame([{
    'Modelo': 'DecisionTree Over',
    'Muestra': 'Reserva',
    'Tamaño': len(Y_pred),
    'Accuracy': accuracy_score(y_real, Y_pred),
    'Precision': precision_score(y_real, Y_pred),
    'Recall': recall_score(y_real, Y_pred),
    'F1Score': f1_score(y_real, Y_pred),
}])
ind = pd.concat([ind, fila], ignore_index=True)
ind

### Modelo XGBoost

In [None]:
xgb_mod=xgb.XGBClassifier()
xgb_mod.fit(X_train, y_train)
Y_pred= xgb_mod.predict(X_test)

In [None]:
# Confusion matrix of the XGBoost model on the over-sampled test split.
print("Matriz confusion: Test")
print(confusion_matrix(y_test, Y_pred))

# DataFrame.append was removed in pandas 2.0; concatenate a one-row frame instead.
fila = pd.DataFrame([{
    'Modelo': 'XGBoost Over',
    'Muestra': 'Test',
    'Tamaño': len(Y_pred),
    'Accuracy': accuracy_score(y_test, Y_pred),
    'Precision': precision_score(y_test, Y_pred),
    'Recall': recall_score(y_test, Y_pred),
    'F1Score': f1_score(y_test, Y_pred),
}])
ind = pd.concat([ind, fila], ignore_index=True)

In [None]:
# Score the hold-out rows and store the predicted probabilities.
y_real = df_unseen['target']
X_unseen = df_unseen.drop(['id', 'target'], axis=1)  # computed once, reused for both predictions
Y_pred = xgb_mod.predict(X_unseen)
U_pred = xgb_mod.predict_proba(X_unseen)[:, 1]  # second column of predict_proba

# Persist id, actual label and predicted probability side by side.
datos = np.hstack((
    df_unseen['id'].values.reshape(-1, 1),
    df_unseen['target'].values.reshape(-1, 1),
    U_pred.reshape(-1, 1),
))
df_submmit = pd.DataFrame(datos, columns=['id', 'real', 'prob'])
df_submmit.to_csv('../data/over_xgb.csv', index=False)

print("Matriz confusion: Unseen")
print(confusion_matrix(y_real, Y_pred))

# DataFrame.append was removed in pandas 2.0; concatenate a one-row frame instead.
fila = pd.DataFrame([{
    'Modelo': 'XGBoost Over',
    'Muestra': 'Reserva',
    'Tamaño': len(Y_pred),
    'Accuracy': accuracy_score(y_real, Y_pred),
    'Precision': precision_score(y_real, Y_pred),
    'Recall': recall_score(y_real, Y_pred),
    'F1Score': f1_score(y_real, Y_pred),
}])
ind = pd.concat([ind, fila], ignore_index=True)
ind

### Modelo RandomForest

In [None]:
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
Y_pred=rf.predict(X_test)

In [None]:
# Confusion matrix of the random forest on the over-sampled test split.
print("Matriz confusion: Test")
print(confusion_matrix(y_test, Y_pred))

# DataFrame.append was removed in pandas 2.0; concatenate a one-row frame instead.
fila = pd.DataFrame([{
    'Modelo': 'RandomForest Over',
    'Muestra': 'Test',
    'Tamaño': len(Y_pred),
    'Accuracy': accuracy_score(y_test, Y_pred),
    'Precision': precision_score(y_test, Y_pred),
    'Recall': recall_score(y_test, Y_pred),
    'F1Score': f1_score(y_test, Y_pred),
}])
ind = pd.concat([ind, fila], ignore_index=True)

In [None]:
# Score the hold-out rows and store the predicted probabilities.
y_real = df_unseen['target']
X_unseen = df_unseen.drop(['id', 'target'], axis=1)  # computed once, reused for both predictions
Y_pred = rf.predict(X_unseen)
U_pred = rf.predict_proba(X_unseen)[:, 1]  # second column of predict_proba

# Persist id, actual label and predicted probability side by side.
datos = np.hstack((
    df_unseen['id'].values.reshape(-1, 1),
    df_unseen['target'].values.reshape(-1, 1),
    U_pred.reshape(-1, 1),
))
df_submmit = pd.DataFrame(datos, columns=['id', 'real', 'prob'])
df_submmit.to_csv('../data/over_rf.csv', index=False)

print("Matriz confusion: Unseen")
print(confusion_matrix(y_real, Y_pred))

# DataFrame.append was removed in pandas 2.0; concatenate a one-row frame instead.
fila = pd.DataFrame([{
    'Modelo': 'RandomForest Over',
    'Muestra': 'Reserva',
    'Tamaño': len(Y_pred),
    'Accuracy': accuracy_score(y_real, Y_pred),
    'Precision': precision_score(y_real, Y_pred),
    'Recall': recall_score(y_real, Y_pred),
    'F1Score': f1_score(y_real, Y_pred),
}])
ind = pd.concat([ind, fila], ignore_index=True)
ind

### 3. Advanced Resampling - SMOTE

In [None]:
# !pip install imbalanced-learn
# conda install -c glemaitre imbalanced-learn
# conda install -c conda-forge imbalanced-learn

In [None]:
from imblearn.over_sampling import SMOTE

In [None]:
#Separación de predictoras y predicha
X = df.drop('target', axis=1)
y = df['target']

In [None]:
#Aplicando SMOTE
smote = SMOTE(sampling_strategy=0.2)
X_sm, y_sm = smote.fit_sample(X, y)

In [None]:
# Wrap the resampled features and target in DataFrames, reusing the feature
# names of X and naming the label column "target".
df_X_sm = pd.DataFrame(data=X_sm,columns=X.columns)
df_y_sm = pd.DataFrame(data=y_sm,columns=["target"])

# Join features and target column-wise into a single resampled dataset
df_balanceado_sm = pd.concat([df_X_sm, df_y_sm], axis=1)

In [None]:
df_balanceado_sm.target.value_counts()

In [None]:
#Separación de predictoras y predicha
X = df_balanceado_sm.drop('target', axis=1)
y = df_balanceado_sm['target']

In [None]:
#Creación de muestras de train y test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=22)

### Modelo Decision Tree

In [None]:
tree = DecisionTreeClassifier()
tree_model = tree.fit(X_train, y_train)
Y_pred = tree_model.predict(X_test)

In [None]:
# Confusion matrix of the decision tree on the SMOTE-resampled test split.
print("Matriz confusion: Test")
print(confusion_matrix(y_test, Y_pred))

# DataFrame.append was removed in pandas 2.0; concatenate a one-row frame instead.
fila = pd.DataFrame([{
    'Modelo': 'DecisionTree SMOTE',
    'Muestra': 'Test',
    'Tamaño': len(Y_pred),
    'Accuracy': accuracy_score(y_test, Y_pred),
    'Precision': precision_score(y_test, Y_pred),
    'Recall': recall_score(y_test, Y_pred),
    'F1Score': f1_score(y_test, Y_pred),
}])
ind = pd.concat([ind, fila], ignore_index=True)

In [None]:
# Score the hold-out rows and store the predicted probabilities.
y_real = df_unseen['target']
X_unseen = df_unseen.drop(['id', 'target'], axis=1)  # computed once, reused for both predictions
Y_pred = tree_model.predict(X_unseen)
U_pred = tree_model.predict_proba(X_unseen)[:, 1]  # second column of predict_proba

# Persist id, actual label and predicted probability side by side.
datos = np.hstack((
    df_unseen['id'].values.reshape(-1, 1),
    df_unseen['target'].values.reshape(-1, 1),
    U_pred.reshape(-1, 1),
))
df_submmit = pd.DataFrame(datos, columns=['id', 'real', 'prob'])
df_submmit.to_csv('../data/smote_tree.csv', index=False)

print("Matriz confusion: Unseen")
print(confusion_matrix(y_real, Y_pred))

# DataFrame.append was removed in pandas 2.0; concatenate a one-row frame instead.
fila = pd.DataFrame([{
    'Modelo': 'DecisionTree SMOTE',
    'Muestra': 'Reserva',
    'Tamaño': len(Y_pred),
    'Accuracy': accuracy_score(y_real, Y_pred),
    'Precision': precision_score(y_real, Y_pred),
    'Recall': recall_score(y_real, Y_pred),
    'F1Score': f1_score(y_real, Y_pred),
}])
ind = pd.concat([ind, fila], ignore_index=True)
ind

### Modelo XGBoost

In [None]:
xgb_mod=xgb.XGBClassifier()
xgb_mod.fit(X_train, y_train)
Y_pred= xgb_mod.predict(X_test)

In [None]:
# Confusion matrix of the XGBoost model on the SMOTE-resampled test split.
print("Matriz confusion: Test")
print(confusion_matrix(y_test, Y_pred))

# DataFrame.append was removed in pandas 2.0; concatenate a one-row frame instead.
fila = pd.DataFrame([{
    'Modelo': 'XGBoost SMOTE',
    'Muestra': 'Test',
    'Tamaño': len(Y_pred),
    'Accuracy': accuracy_score(y_test, Y_pred),
    'Precision': precision_score(y_test, Y_pred),
    'Recall': recall_score(y_test, Y_pred),
    'F1Score': f1_score(y_test, Y_pred),
}])
ind = pd.concat([ind, fila], ignore_index=True)

In [None]:
# Score the hold-out rows and store the predicted probabilities.
y_real = df_unseen['target']
X_unseen = df_unseen.drop(['id', 'target'], axis=1)  # computed once, reused for both predictions
Y_pred = xgb_mod.predict(X_unseen)
U_pred = xgb_mod.predict_proba(X_unseen)[:, 1]  # second column of predict_proba

# Persist id, actual label and predicted probability side by side.
datos = np.hstack((
    df_unseen['id'].values.reshape(-1, 1),
    df_unseen['target'].values.reshape(-1, 1),
    U_pred.reshape(-1, 1),
))
df_submmit = pd.DataFrame(datos, columns=['id', 'real', 'prob'])
df_submmit.to_csv('../data/smote_xgb.csv', index=False)

print("Matriz confusion: Unseen")
print(confusion_matrix(y_real, Y_pred))

# DataFrame.append was removed in pandas 2.0; concatenate a one-row frame instead.
fila = pd.DataFrame([{
    'Modelo': 'XGBoost SMOTE',
    'Muestra': 'Reserva',
    'Tamaño': len(Y_pred),
    'Accuracy': accuracy_score(y_real, Y_pred),
    'Precision': precision_score(y_real, Y_pred),
    'Recall': recall_score(y_real, Y_pred),
    'F1Score': f1_score(y_real, Y_pred),
}])
ind = pd.concat([ind, fila], ignore_index=True)
ind

### Modelo RandomForest

In [None]:
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
Y_pred=rf.predict(X_test)

In [None]:
# Confusion matrix of the random forest on the SMOTE-resampled test split.
print("Matriz confusion: Test")
print(confusion_matrix(y_test, Y_pred))

# DataFrame.append was removed in pandas 2.0; concatenate a one-row frame instead.
fila = pd.DataFrame([{
    'Modelo': 'RandomForest SMOTE',
    'Muestra': 'Test',
    'Tamaño': len(Y_pred),
    'Accuracy': accuracy_score(y_test, Y_pred),
    'Precision': precision_score(y_test, Y_pred),
    'Recall': recall_score(y_test, Y_pred),
    'F1Score': f1_score(y_test, Y_pred),
}])
ind = pd.concat([ind, fila], ignore_index=True)

In [None]:
# Score the hold-out rows and store the predicted probabilities.
y_real = df_unseen['target']
X_unseen = df_unseen.drop(['id', 'target'], axis=1)  # computed once, reused for both predictions
Y_pred = rf.predict(X_unseen)
U_pred = rf.predict_proba(X_unseen)[:, 1]  # second column of predict_proba

# Persist id, actual label and predicted probability side by side.
datos = np.hstack((
    df_unseen['id'].values.reshape(-1, 1),
    df_unseen['target'].values.reshape(-1, 1),
    U_pred.reshape(-1, 1),
))
df_submmit = pd.DataFrame(datos, columns=['id', 'real', 'prob'])
df_submmit.to_csv('../data/smote_rf.csv', index=False)

print("Matriz confusion: Unseen")
print(confusion_matrix(y_real, Y_pred))

# DataFrame.append was removed in pandas 2.0; concatenate a one-row frame instead.
fila = pd.DataFrame([{
    'Modelo': 'RandomForest SMOTE',
    'Muestra': 'Reserva',
    'Tamaño': len(Y_pred),
    'Accuracy': accuracy_score(y_real, Y_pred),
    'Precision': precision_score(y_real, Y_pred),
    'Recall': recall_score(y_real, Y_pred),
    'F1Score': f1_score(y_real, Y_pred),
}])
ind = pd.concat([ind, fila], ignore_index=True)
ind

Copyright 2021. Elaborado por Luis Cajachahua