# **Modelamiento**

## Importamos librerías necesarias

In [1]:
# importemos las librerías básicas a usar

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

warnings.filterwarnings('ignore')

## Lectura de datos

In [2]:
df = pd.read_csv('datasets/data_to_model.csv', index_col=0)
df.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [3]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA

scaler = MinMaxScaler()
aux = pd.get_dummies(df)
scaled = pd.DataFrame(scaler.fit_transform(aux.drop(columns=['Exited'])), \
                      columns= aux.drop(columns=['Exited']).columns.to_list())
pca = PCA(n_components=scaled.shape[1])
pca.fit(scaled)

In [4]:
X = pd.get_dummies(df.drop(columns=['Exited']), dtype=float)

In [5]:
from sklearn import cluster

clusterers = [cluster.KMeans, cluster.SpectralClustering]

for cluster in clusterers:
    if cluster.__name__ == 'KMeans':
        clusterer = cluster(n_clusters=2)
        scaled['kmeans_label'] = clusterer.fit_predict(scaled)
    else:
        clusterer = cluster(n_clusters=2, n_jobs=-1, affinity= 'rbf', gamma = 2, random_state = 1234, assign_labels = 'discretize')
        scaled['spectral_label'] = clusterer.fit_predict(scaled)

scaled['exited'] = df.Exited
scaled.head()

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Geography_France,Geography_Germany,Geography_Spain,Gender_Female,Gender_Male,kmeans_label,spectral_label,exited
0,0.538,0.324324,0.2,0.0,0.0,1.0,1.0,0.506735,1.0,0.0,0.0,1.0,0.0,0,1,1
1,0.516,0.310811,0.1,0.334031,0.0,0.0,1.0,0.562709,0.0,0.0,1.0,1.0,0.0,0,1,0
2,0.304,0.324324,0.8,0.636357,0.666667,1.0,0.0,0.569654,1.0,0.0,0.0,1.0,0.0,0,1,1
3,0.698,0.283784,0.1,0.0,0.333333,0.0,0.0,0.46912,1.0,0.0,0.0,1.0,0.0,0,1,0
4,1.0,0.337838,0.2,0.500246,0.0,1.0,1.0,0.3954,0.0,0.0,1.0,1.0,0.0,0,1,0


In [16]:
from sklearn.metrics import accuracy_score

print('Con KMeans se obtuvieron los siguientes resultados:')
print('----------------------------------------------------------------')
print(f"De los que no cancelaron que son {scaled[(scaled.exited == 0)].shape[0]}, agrupó correctamente {scaled[(scaled.exited == 0) & (scaled.kmeans_label == 0)]['exited'].count()}")
print(f"De los que sí cancelaron que son {scaled[(scaled.exited == 1)].shape[0]}, agrupó correctamente {scaled[(scaled.exited == 1) & (scaled.kmeans_label == 1)]['exited'].count()}")
print(f"Accuracy: {accuracy_score(scaled.exited.values, scaled.kmeans_label):.2%}")
print('\nCon Spectral Clustering se obtuvieron los siguientes resultados:')
print('----------------------------------------------------------------')
print(f"De los que no cancelaron que son {scaled[(scaled.exited == 0)].shape[0]}, agrupó correctamente {scaled[(scaled.exited == 0) & (scaled.spectral_label == 0)]['exited'].count()}")
print(f"De los que sí cancelaron que son {scaled[(scaled.exited == 1)].shape[0]}, agrupó correctamente {scaled[(scaled.exited == 1) & (scaled.spectral_label == 1)]['exited'].count()}")
print(f"Accuracy: {accuracy_score(scaled.exited.values, scaled.spectral_label):.2%}")

Con KMeans se obtuvieron los siguientes resultados:
----------------------------------------------------------------
De los que no cancelaron que son 7963, agrupó correctamente 3404
De los que sí cancelaron que son 2037, agrupó correctamente 898
Accuracy: 43.02%

Con Spectral Clustering se obtuvieron los siguientes resultados:
----------------------------------------------------------------
De los que no cancelaron que son 7963, agrupó correctamente 4559
De los que sí cancelaron que son 2037, agrupó correctamente 1139
Accuracy: 56.98%


In [14]:
from sklearn.metrics import accuracy_score

print(accuracy_score(scaled.exited.values, scaled.spectral_label))
print(accuracy_score(scaled.exited.values, scaled.kmeans_label))

0.5698
0.4302
