In [None]:
# Importamos las librerias mínimas necesarias
import numpy as np
import plotly.graph_objects as go
import dash
from dash import dcc
import dash_html_components as html
from dash import html

# Building a data-visualisation application means combining HTML and CSS elements
# with Dash's own components. The first step is always to create the Dash
# application instance.

app = dash.Dash()

# With the application created, we now define its layout.

# IMPORTANT: keep this code extremely tidy so that it is clear what each part is
# doing. The Black library is worth a look for code formatting.

# Dash style dictionaries use camelCased CSS property names (textAlign rather than
# text-align); every key below follows that single convention.

# First dashboard
app.layout = html.Div(  # Top-level component that wraps the whole dashboard
    children = [
        html.H1( # First row: page title
            children = [
                "Introducción a Dash"
            ],
            id = "titulo",
            style = {  # All the CSS this component needs goes here
                "textAlign": "center", # Centre the text
                "color": "lightsteelblue", # Font colour; hexadecimal codes can be used too
                "fontFamily": "Arial", # Font family
                "backgroundColor": "darkslategray", # Background colour
                "textDecoration": "underline" # Underline the text
            }
        ),
        html.Div( # Second row: subtitle and paragraph side by side
            children = [
                html.H2(
                    children = [
                        "Esto es un subtítulo H2 "
                    ],
                    id = "primer_subtitulo",
                    style = {
                        "textAlign": "left",
                        "color": "lightsteelblue",
                        "backgroundColor": "darkslategray",
                        "width": "250px", # Fixed width to constrain this component
                        "display": "inline-block"
                    }
                ),
                html.P(
                    children = [
                        "Esto es un párrafo escrito a continuación en la misma linea que el título anterior"
                    ],
                    id = "componentes_css",
                    style = {
                        "fontFamily": "Arial",
                        "display": "inline-block",
                        "width": "600px",
                        "marginLeft": "100px" # Left margin; good spot to compare padding, border and margin
                    }
                )
            ],
            id = "segunda_fila"
        ),
        html.Div( # Third row: a single bar chart
            children = [
                dcc.Graph(
                    figure = go.Figure(
                        data = [
                            go.Bar(
                                x = ["Clase 1", "Clase 2", "Clase 3"],
                                y = [10,6,13],
                                marker_color = ["gold","darkorange","firebrick"],
                            )
                        ],
                        layout = go.Layout(
                            title = "Primer gráfico de prueba",
                            xaxis_title = "Clases",
                            yaxis_title = "Elementos",
                            width = 600,
                            height = 600
                        )
                    ),
                    id = "primera_figura",
                    style = {
                        "display": "block", # Compare block, inline-block and inline
                        "marginLeft": "25%",
                        "marginRight": "30%", # Alternative to try: margin: auto
                    }
                )
            ],
            id = "tercera_fila"
        ),
        html.Div( # Fourth row: two histograms placed side by side
            children = [
                dcc.Graph(
                    figure = go.Figure(
                        data = [
                            go.Histogram(
                                x = np.random.normal(size = 1000),
                                marker_color = "steelblue",
                                name = "Histograma",
                                histnorm = "probability"
                            ),
                        ],
                        layout = go.Layout(
                            title = "Histograma de valores",
                            xaxis_title = "Valores de una normal de media 0 y std 1",
                            width = 600,
                            height = 600,
                            bargap = 0.1
                        )
                    ),
                    id = "segunda_figura",
                    style = {
                        "display": "inline-block",
                    }
                ),

                dcc.Graph(
                    figure = go.Figure(
                        data = [
                            go.Histogram(
                                x = np.random.gamma(shape = 1/2, scale = 1/2, size = 1000),
                                marker_color = "indigo",
                                name = "Histograma",
                                histnorm = "probability"
                            ),
                        ],
                        layout = go.Layout(
                            title = "Histograma de valores",
                            xaxis_title = "Valores de una gamma",
                            width = 600,
                            height = 600,
                            bargap = 0.1
                        )
                    ),
                    id = "tercera_figura",
                    style = {
                        "display": "inline-block"
                    }
                ),
            ],
            id = "cuarta_fila",
        )

    ],
    id = "primera_fila",  # id of the outer container itself
    style = {
        "marginRight": "125px",
        "marginLeft": "125px",
        "marginTop": "100px",
        "borderStyle": "groove",
    }
)

# Start the Dash server on localhost:8005, but only when this code is executed as
# the main program.
# NOTE(review): recent Dash releases offer `app.run` in place of `run_server` —
# confirm against the installed version before switching.
if __name__ == '__main__':
  app.run_server(host='localhost',port=8005)

In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np #linear algebra
import seaborn as sns #data visualization

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree

In [2]:
# Load the bank customer churn dataset from its raw GitHub URL.
DATA_URL = "https://raw.githubusercontent.com/lastriita/DataVisualization2022/main/Bank%20Customer%20Churn%20Prediction.csv"
df = pd.read_csv(DATA_URL)

In [3]:
# Display the first rows as a quick check that the file loaded as expected.
df.head()

Unnamed: 0,customer_id,credit_score,country,gender,age,tenure,balance,products_number,credit_card,active_member,estimated_salary,churn
0,15634602,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,15647311,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,15619304,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,15701354,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,15737888,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [4]:
# Remove the customer_id column (presumably a pure identifier — confirm) before
# the analysis. drop(..., errors="ignore") keeps this cell re-runnable, whereas
# `del df["customer_id"]` raises KeyError the second time the cell is executed.
df = df.drop(columns=["customer_id"], errors="ignore")
# Number of missing values in each remaining column.
df.isna().sum()

credit_score        0
country             0
gender              0
age                 0
tenure              0
balance             0
products_number     0
credit_card         0
active_member       0
estimated_salary    0
churn               0
dtype: int64

In [7]:
# One row with three panels: gender composition (pie), client counts by churn and
# gender (bars) and estimated salary by gender, split by churn (violins).
# NOTE(review): the second subplot title mentions "%" but the bars show raw
# counts — confirm which one is intended.
fig = make_subplots(rows = 1,
                    cols = 3,
                    specs =[[{"type": "pie"}, {"type": "bar"},{"type": "bar"}]],
                    subplot_titles=("Composición de los datos por Género", "Churn % por Género",
                                    "Violin Plot: Salario por Género"))


# First chart
# value_counts() orders its result by descending frequency, not by category, so
# labels and colours are looked up from the index of the counts. Hard-coding them
# by position attaches them to the wrong gender whenever the order differs.
gender_labels = {"Female": "Mujeres", "Male": "Hombres"}
gender_colors = {"Female": "lightblue", "Male": "mediumseagreen"}
level_count = df["gender"].value_counts()

fig.add_trace(
        go.Pie(
        # Unknown categories fall back to their raw value and a neutral colour.
        labels=[gender_labels.get(gender, gender) for gender in level_count.index],
        values=level_count,
        textinfo='label + percent',
        insidetextorientation='radial',
        marker_colors = [gender_colors.get(gender, "lightgray") for gender in level_count.index],
        showlegend = False,
        domain=dict(x=[0, 0.5])
    ),
    row = 1,
    col = 1
)

# Second chart
# Row counts per (churn, gender) pair; groupby sorts its keys, so the order is
# (0, Female), (0, Male), (1, Female), (1, Male), matching the x labels below.
groups = df.groupby(["churn","gender"])["churn"].count()

fig.add_trace(
    go.Bar(
        y = [groups.values[0],groups.values[1],groups.values[2],groups.values[3]],
        x = ["Female Clients that Never Left the Bank","Male Clients that Never Left the Bank","Female Clients that Have Left the Bank", "Male Clients that Have Left the Bank"],
        showlegend = False,
        marker_color = ["lightblue", "mediumseagreen", "gold", "darkorange"],
    ),
    row = 1,
    col = 2
)

# Third chart
# Two half violins per gender: churned clients on the negative side...
fig.add_trace(
    go.Violin(
        x=df['gender'][ df['churn'] == 1 ],
        y=df['estimated_salary'][ df['churn'] == 1 ],
        legendgroup=' Has Left the Bank', scalegroup=' Have Left the Bank', name=' Have Left the Bank',
        side='negative',
        line_color='blue'
     ),
    row = 1,
    col = 3
)


# ...and clients that stayed on the positive side.
fig.add_trace(
    go.Violin(
        x=df['gender'][ df['churn'] == 0 ],
        y=df['estimated_salary'][ df['churn'] == 0],
        legendgroup=' Never Left the Bank', scalegroup=' Never Left the Bank', name=' Never Left the Bank',
        side='positive',
        line_color='orange'
    ),
    row = 1,
    col = 3
)
# Set the overall title and the gap between bars
fig.update_layout(title = "Información sobre los clientes del banco", bargap = 0.1)

fig.show()


In [6]:
# Share of clients per churn outcome.
# value_counts() orders by descending frequency, so labels and colours are looked
# up from the index of the counts instead of relying on 0 happening to come first.
churn_labels = {0: "Never Left the Bank", 1: "Has Left the Bank at Some Point"}
churn_colors = {0: "lightblue", 1: "mediumseagreen"}
level_count_churn = df["churn"].value_counts()
data = [
    go.Pie(
        # Unexpected codes fall back to their raw value and a neutral colour.
        labels=[churn_labels.get(code, str(code)) for code in level_count_churn.index],
        values=level_count_churn,
        textinfo='percent',
        insidetextorientation='radial',
        marker_colors = [churn_colors.get(code, "lightgray") for code in level_count_churn.index],
        rotation = -55,
    )
]

layout = go.Layout(title = "Representación del Churn del Banco")

fig = go.Figure(data = data, layout = layout)

fig.show()

In [53]:
# Account balance distribution for each churn outcome, drawn as two box plots
# (boxmean=True asks plotly to mark the mean as well).
stayed_mask = df["churn"]==0
churned_mask = df["churn"]==1

box_stayed = go.Box(
    y = df["balance"].loc[stayed_mask],
    name = "Clients that Never Left the Bank",
    marker_color = "firebrick",
    boxmean=True,
)
box_churned = go.Box(
    y = df["balance"].loc[churned_mask],
    name = "Client that has Left the Bank at Some Point",
    marker_color = "lightblue",
    boxmean=True,
)

data = [box_stayed, box_churned]
layout = go.Layout(
    title = "Balance en Cuenta vs Churn",
    yaxis_title = "Balance en Cuenta",
)

fig = go.Figure(data = data, layout = layout)
fig.show()

In [54]:
import plotly.figure_factory as ff  # provides create_distplot

# Tenure of the clients that stayed (x1) and of those that churned (x2).
x1 = df.loc[df["churn"] == 0, "tenure"]
x2 = df.loc[df["churn"] == 1, "tenure"]
hist_data = [x1, x2]

group_labels = ['Churn no', 'Churn yes']
colors = ['#DC3912', '#FFA15A']

# Distribution plot with a fitted normal curve per group and no histogram bars.
distplot1 = ff.create_distplot(
    hist_data,
    group_labels,
    curve_type = 'normal',
    show_hist = False,
    colors = colors,
)

# Figure title and axis titles.
distplot1.update_layout(title_text='Churn Distribution based on Tenure')
distplot1.update_xaxes(title_text='Years')
distplot1.update_yaxes(title_text='Density')
distplot1.show()

Al ser un dataset muy común en trabajos de este tipo, se le ha aplicado una gran cantidad de modelos. Tras estudiar los resultados obtenidos en diferentes estudios, utilizaremos dos modelos de éxito extraídos de internet. Por un lado, siguiendo el estudio de Raphael Marconato (https://www.kaggle.com/code/raphaelmarconato/churn-eda-balancing-and-machine-learning), aplicaremos el algoritmo KNN a nuestro dataset. Además, para dar mayor profundidad al análisis, también se utilizará el algoritmo de gradient boosting que aplicó Salma Khaleed (https://www.kaggle.com/code/salmakhaleed/bank-customer-churn).

In [55]:
# The categorical columns must be turned into numbers before fitting any model.
# LabelEncoder replaces every category with an integer code (the result is an
# arbitrary integer code, not a continuous measurement).
# Positional slicing: the first ten columns are the features, the eleventh is the
# churn target.
features = df.iloc[:, 0:10]
X = features.values
y = df.iloc[:, 10].values

label_encoder_country = LabelEncoder()
label_encoder_gender = LabelEncoder()

# Column 1 holds the country and column 2 the gender; encode each one in place.
for column, encoder in ((1, label_encoder_country), (2, label_encoder_gender)):
    X[:, column] = encoder.fit_transform(X[:, column])

In [56]:
# Hold out 30% of the rows as a test set so that the models can be scored on data
# they have not seen; random_state fixes which rows end up in each part.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size = 0.3,
    random_state = 0,
)

In [57]:
# Base KNN estimator handed to the grid search that picks the number of
# neighbours. Minkowski distance with p = 2 is the Euclidean distance.
knn = KNeighborsClassifier(
    p = 2,
    metric = 'minkowski',
)

In [58]:
# 5-fold cross-validated search over k = 1..30, scored by accuracy.
k_list = [k for k in range(1, 31)]
k_values = {"n_neighbors": k_list}
grid = GridSearchCV(estimator = knn, param_grid = k_values, scoring = 'accuracy', cv = 5)
grid.fit(X_train, y_train)
# Best k and its mean cross-validated accuracy.
(grid.best_params_, grid.best_score_)

({'n_neighbors': 28}, 0.7977142857142857)

In [59]:
# Fit KNN on the unscaled data with the k chosen by the preceding grid search.
# Reading it from grid.best_params_ rather than hard-coding it keeps this cell in
# step with the search if the data or the split change (28 on the recorded run).
# This cell must run right after that search, before `grid` is re-fitted.
best_k = grid.best_params_["n_neighbors"]
knn = KNeighborsClassifier(n_neighbors = best_k, metric = 'minkowski', p = 2)
knn.fit(X_train, y_train)
predictions = knn.predict(X_test)
# Per-class precision/recall/F1 followed by the overall accuracy on the test set.
classification_knn = (classification_report(y_test, predictions))
print(classification_knn)
acc = accuracy_score(y_test, predictions)
print(acc)

              precision    recall  f1-score   support

           0       0.79      1.00      0.88      2379
           1       0.25      0.00      0.00       621

    accuracy                           0.79      3000
   macro avg       0.52      0.50      0.44      3000
weighted avg       0.68      0.79      0.70      3000

0.7923333333333333


Por lo general, en bases de datos con información tan dispar (Género vs. Balance en Cuenta), los modelos tienden a dar demasiado peso a las variables con valores altos. Por lo tanto, para que el modelo pueda interpretar las características de cada cliente bajo la misma escala, es importante escalar previamente los datos. En este caso, probaremos dos métodos de escalado y analizaremos los resultados obtenidos.

In [60]:
#MinMax Scaler
# Split first and fit the scaler on the training rows only: fitting it on the
# whole matrix lets the test rows influence the scaling (data leakage). The same
# test_size and random_state as the first split select the same rows.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0)
obj_norm = MinMaxScaler().fit(X_train_raw)
X_train_n = obj_norm.transform(X_train_raw)
X_test_n = obj_norm.transform(X_test_raw)
# Whole matrix under the training-fitted scaling, kept for code that still reads it.
X_normalization = obj_norm.transform(X)
# Re-run the k search on the scaled training data; this overwrites the results
# previously stored in `grid`. (The CV folds inside the search still share this
# scaler; wrapping scaler and model in a Pipeline would remove that as well.)
grid.fit(X_train_n, y_train)
grid.best_params_, grid.best_score_


({'n_neighbors': 5}, 0.8119999999999999)

In [61]:
# KNN on the min-max scaled data, with the k chosen by the grid search that was
# just re-fitted on X_train_n. Reading it from grid.best_params_ avoids a
# hand-copied constant going stale (5 on the recorded run); this cell must run
# right after that search.
best_k = grid.best_params_["n_neighbors"]
knn_2 = KNeighborsClassifier(n_neighbors = best_k, metric = 'minkowski', p = 2)
knn_2.fit(X_train_n, y_train)
predictions_n = knn_2.predict(X_test_n)
# Per-class precision/recall/F1 followed by the overall accuracy on the test set.
classification_knn = (classification_report(y_test, predictions_n))
print(classification_knn)
acc_n = accuracy_score(y_test, predictions_n)
print(acc_n)

              precision    recall  f1-score   support

           0       0.85      0.95      0.89      2379
           1       0.63      0.36      0.46       621

    accuracy                           0.82      3000
   macro avg       0.74      0.65      0.68      3000
weighted avg       0.80      0.82      0.80      3000

0.8236666666666667


In [62]:
#Standard Scaler
# Split first and fit the scaler on the training rows only: fitting it on the
# whole matrix lets the test rows influence the mean and standard deviation used
# for scaling (data leakage). The same test_size and random_state as the first
# split select the same rows.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0)
scaler = StandardScaler()
scaler.fit(X_train_raw)
X_train_s = scaler.transform(X_train_raw)
X_test_s = scaler.transform(X_test_raw)
# Whole matrix under the training-fitted scaling, kept for code that still reads it.
X_standard = scaler.transform(X)
# Re-run the k search on the standardised training data; this overwrites the
# results previously stored in `grid`.
grid.fit(X_train_s, y_train)
grid.best_params_, grid.best_score_

({'n_neighbors': 17}, 0.8315714285714286)

In [63]:
# KNN on the standardised data, with the k chosen by the grid search that was
# just re-fitted on X_train_s. Reading it from grid.best_params_ avoids a
# hand-copied constant going stale (17 on the recorded run); this cell must run
# right after that search.
best_k = grid.best_params_["n_neighbors"]
knn_3 = KNeighborsClassifier(n_neighbors = best_k, metric = 'minkowski', p = 2)
knn_3.fit(X_train_s, y_train)
predictions_s = knn_3.predict(X_test_s)
# Per-class precision/recall/F1 followed by the overall accuracy on the test set.
classification_knn = (classification_report(y_test, predictions_s))
print(classification_knn)
acc_s = accuracy_score(y_test, predictions_s)
print(acc_s)

              precision    recall  f1-score   support

           0       0.85      0.98      0.91      2379
           1       0.81      0.32      0.46       621

    accuracy                           0.84      3000
   macro avg       0.83      0.65      0.69      3000
weighted avg       0.84      0.84      0.82      3000

0.8443333333333334


In [64]:
# Gradient boosting on the unscaled features with default hyper-parameters.
# random_state is fixed (same seed as the train/test split) so that the accuracy
# and the feature importances are reproducible from one run to the next.
gbc = GradientBoostingClassifier(random_state = 0)
gbc.fit(X_train, y_train)
predictions_gbc = gbc.predict(X_test)
acc_gbc = accuracy_score(y_test, predictions_gbc)
print(acc_gbc)
# One importance value per feature column of X, in column order.
print(gbc.feature_importances_)

0.864
[0.0266611  0.04662705 0.01519627 0.39014235 0.00391091 0.07246232
 0.31951837 0.00054916 0.10796953 0.01696294]


In [75]:
# Bar chart of the gradient-boosting feature importances.
# The model is trained on X = df.iloc[:, 0:10], so the labels must come from those
# same ten columns. A 1:10 slice skips credit_score, which shifts every label by
# one position and leaves the last importance without a name.
feature_names = df.iloc[:, 0:10].columns

data = [
    go.Bar(
        y = gbc.feature_importances_,
        x = feature_names,
        name = "Importancia de las Variables para el Gradient Boost Classifier",
       )
]

layout = go.Layout(title = "Importancia de las Variables para el Gradient Boost Classifier", yaxis_title = "Importancia",
                   xaxis_title = "Variable",
                  )

fig = go.Figure(data = data, layout = layout)

fig.show()


In [72]:
# Show the labels of the columns at positions 1 to 9 of df.
# NOTE(review): the feature matrix X is built from df.iloc[:, 0:10], so this slice
# leaves out the first feature column — confirm which range is intended.
df.iloc[:,1:10].columns

Index(['country', 'gender', 'age', 'tenure', 'balance', 'products_number',
       'credit_card', 'active_member', 'estimated_salary'],
      dtype='object')

In [None]:
from xgboost import XGBClassifier

# XGBoost with default hyper-parameters, fitted on the unscaled training split and
# scored by accuracy on the test split.
xgb = XGBClassifier()
xgb.fit(X_train, y_train)

predictions_xgb = xgb.predict(X_test)
acc_xgb = accuracy_score(y_true = y_test, y_pred = predictions_xgb)

print(acc_xgb)
print(xgb.feature_importances_)

In [None]:
dtc = DecisionTreeClassifier()
dtc.fit(X_train, y_train)
predictions_dtc = dtc.predict(X_test)
acc_dtc = accuracy_score(y_test, predictions_dtc)
print(acc_dtc)
print(dtc.feature_importance