# Caso: Predicción de Churn con Árboles y Ensamblados

#### Cantidad de Clientes: 400
#### Horizonte de tiempo: De Nov-2012 a Ene-2014 (15 meses)


In [None]:
## Podemos hacer el balanceo de manera artesanal , para entender las lógicas.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

### **Carga de la base de datos**

Desarrollar el mejor modelo que prediga si un cliente abandonará la empresa (churn).

In [None]:
# Load the customer data matrix from the CSV file into a DataFrame.
df = pd.read_csv(filepath_or_buffer='../data/data_matrix.csv')

In [None]:
# Preview the first five rows (head() returns 5 rows by default).
df.head()

In [None]:
# Relative frequency of each value of the churn column.
df['churn'].value_counts(normalize=True)

In [None]:
# Absolute number of rows for each value of the churn column.
df['churn'].value_counts()

In [None]:
# Bar chart with the number of rows per churn value.
# sns.factorplot was renamed to catplot and later removed from seaborn, and
# the plotted variable must now be passed by keyword (x=...).
sns.catplot(x='churn', data=df, kind="count")

In [None]:
# Show the dtype pandas assigned to each column.
df.dtypes

In [None]:
# Basic cleaning: remove the customerid column from the working DataFrame.
df = df.drop(columns=['customerid'])

In [None]:
# Pairwise correlation matrix of the DataFrame columns.
cor_mat = df.corr()

# Explicit boolean mask: True on the strict upper triangle (cells that
# heatmap will hide), False on the diagonal and lower triangle (cells shown).
# Building the mask from the correlation values themselves would leave any
# upper-triangle cell whose correlation is exactly 0 unmasked.
mask = np.triu(np.ones_like(cor_mat, dtype=bool), k=1)

# Enlarge the current figure so the annotated cells remain readable.
fig = plt.gcf()
fig.set_size_inches(30, 12)
sns.heatmap(data=cor_mat, mask=mask, square=True, annot=True, cbar=True)

### Modelos con la muestra completa

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb

In [None]:
#Separación de predictoras y predicha
X = df.drop('churn', axis=1)
y = df['churn']

In [None]:
#Creación de muestras de train y test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=22)

In [None]:
# Dimensions of the training predictors and training target.
(X_train.shape, y_train.shape)

In [None]:
# Dimensions of the test predictors and test target.
(X_test.shape, y_test.shape)

### Modelo Decision Tree

In [None]:
# Fit a decision tree on the training sample and predict the test sample.
# The estimator is seeded (same seed as the train/test split) so that the
# metrics computed from Y_pred are reproducible from one run to the next.
tree = DecisionTreeClassifier(random_state=22)
tree_model = tree.fit(X_train, y_train)
Y_pred = tree_model.predict(X_test)

In [None]:
# Empty results table; one row of test metrics is added per fitted model.
ind = pd.DataFrame(
    columns=[
        'Modelo',
        'Muestra',
        'Tamaño',
        'Accuracy',
        'Precision',
        'Recall',
        'F1Score',
    ]
)

In [None]:
# Confusion matrix of the decision tree predictions on the test sample.
print("Matriz confusion: Test")
print(confusion_matrix(y_test, Y_pred))

# Add the decision tree's test metrics as a new row of the results table.
# DataFrame.append was removed in pandas 2.0; pd.concat with a one-row
# DataFrame is the supported replacement and keeps the same column order.
ind = pd.concat(
    [
        ind,
        pd.DataFrame([{
            'Modelo': 'DecisionTree',
            'Muestra': 'Test',
            'Tamaño': len(Y_pred),
            'Accuracy': accuracy_score(y_test, Y_pred),
            'Precision': precision_score(y_test, Y_pred),
            'Recall': recall_score(y_test, Y_pred),
            'F1Score': f1_score(y_test, Y_pred),
        }]),
    ],
    ignore_index=True,
)
ind.head()

### Modelo XGBoost

In [None]:
# Fit an XGBoost classifier with its default hyperparameters on the training
# sample (fit returns the estimator itself), then predict the test sample.
xgb_mod = xgb.XGBClassifier().fit(X_train, y_train)
Y_pred = xgb_mod.predict(X_test)

In [None]:
# Confusion matrix of the XGBoost predictions on the test sample.
print("Matriz confusion: Test")
print(confusion_matrix(y_test, Y_pred))

# Add the XGBoost test metrics as a new row of the results table.
# DataFrame.append was removed in pandas 2.0; pd.concat with a one-row
# DataFrame is the supported replacement and keeps the same column order.
ind = pd.concat(
    [
        ind,
        pd.DataFrame([{
            'Modelo': 'XGBoost',
            'Muestra': 'Test',
            'Tamaño': len(Y_pred),
            'Accuracy': accuracy_score(y_test, Y_pred),
            'Precision': precision_score(y_test, Y_pred),
            'Recall': recall_score(y_test, Y_pred),
            'F1Score': f1_score(y_test, Y_pred),
        }]),
    ],
    ignore_index=True,
)
ind.head()

### Modelo RandomForest

In [None]:
# Fit a random forest on the training sample and predict the test sample.
# The estimator is seeded (same seed as the train/test split) so that the
# metrics and feature importances are reproducible from one run to the next.
rf = RandomForestClassifier(random_state=22)
rf.fit(X_train, y_train)
Y_pred = rf.predict(X_test)

In [None]:
# Confusion matrix of the random forest predictions on the test sample.
print("Matriz confusion: Test")
print(confusion_matrix(y_test, Y_pred))

# Add the random forest test metrics as a new row of the results table.
# DataFrame.append was removed in pandas 2.0; pd.concat with a one-row
# DataFrame is the supported replacement and keeps the same column order.
ind = pd.concat(
    [
        ind,
        pd.DataFrame([{
            'Modelo': 'RandomForest',
            'Muestra': 'Test',
            'Tamaño': len(Y_pred),
            'Accuracy': accuracy_score(y_test, Y_pred),
            'Precision': precision_score(y_test, Y_pred),
            'Recall': recall_score(y_test, Y_pred),
            'F1Score': f1_score(y_test, Y_pred),
        }]),
    ],
    ignore_index=True,
)
ind.head()

In [None]:
# Pair every predictor column name with the importance the fitted random
# forest assigned to it.
TablaImportancia = pd.DataFrame({
    'Feature': list(df.drop(columns=['churn']).columns),
    'Importance': list(rf.feature_importances_),
})
# Display the table ordered from most to least important feature.
TablaImportancia.sort_values('Importance', ascending=False)[['Feature', 'Importance']].reset_index(drop=True)

Copyright 2021. Elaborado por Luis Cajachahua