## 1. Aquisição dos Dados

In [None]:
import pandas as pd

data = pd.read_csv('CC GENERAL.csv')
data.head()

Unnamed: 0,CUST_ID,BALANCE,BALANCE_FREQUENCY,PURCHASES,ONEOFF_PURCHASES,INSTALLMENTS_PURCHASES,CASH_ADVANCE,PURCHASES_FREQUENCY,ONEOFF_PURCHASES_FREQUENCY,PURCHASES_INSTALLMENTS_FREQUENCY,CASH_ADVANCE_FREQUENCY,CASH_ADVANCE_TRX,PURCHASES_TRX,CREDIT_LIMIT,PAYMENTS,MINIMUM_PAYMENTS,PRC_FULL_PAYMENT,TENURE
0,C10001,40.900749,0.818182,95.4,0.0,95.4,0.0,0.166667,0.0,0.083333,0.0,0,2,1000.0,201.802084,139.509787,0.0,12
1,C10002,3202.467416,0.909091,0.0,0.0,0.0,6442.945483,0.0,0.0,0.0,0.25,4,0,7000.0,4103.032597,1072.340217,0.222222,12
2,C10003,2495.148862,1.0,773.17,773.17,0.0,0.0,1.0,1.0,0.0,0.0,0,12,7500.0,622.066742,627.284787,0.0,12
3,C10004,1666.670542,0.636364,1499.0,1499.0,0.0,205.788017,0.083333,0.083333,0.0,0.083333,1,1,7500.0,0.0,,0.0,12
4,C10005,817.714335,1.0,16.0,16.0,0.0,0.0,0.083333,0.083333,0.0,0.0,0,1,1200.0,678.334763,244.791237,0.0,12


## 2. Pré-processamento dos Dados

In [48]:
data.drop(columns=["CUST_ID", "TENURE"], inplace=True)

data.fillna(data.median(), inplace=True)

data.isnull().sum()

BALANCE                             0
BALANCE_FREQUENCY                   0
PURCHASES                           0
ONEOFF_PURCHASES                    0
INSTALLMENTS_PURCHASES              0
CASH_ADVANCE                        0
PURCHASES_FREQUENCY                 0
ONEOFF_PURCHASES_FREQUENCY          0
PURCHASES_INSTALLMENTS_FREQUENCY    0
CASH_ADVANCE_FREQUENCY              0
CASH_ADVANCE_TRX                    0
PURCHASES_TRX                       0
CREDIT_LIMIT                        0
PAYMENTS                            0
MINIMUM_PAYMENTS                    0
PRC_FULL_PAYMENT                    0
dtype: int64

In [49]:
## Feature Scaling

from sklearn.preprocessing import Normalizer

normalizer = Normalizer()
data_normalized = normalizer.fit_transform(data)
pd.DataFrame(data_normalized).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,0.039356,0.000787,0.091796,0.0,0.091796,0.0,0.00016,0.0,8e-05,0.0,0.0,0.001924,0.962221,0.194178,0.134239,0.0
1,0.293876,8.3e-05,0.0,0.0,0.0,0.59124,0.0,0.0,0.0,2.3e-05,0.000367,0.0,0.642358,0.376517,0.098404,2e-05
2,0.310798,0.000125,0.096307,0.096307,0.0,0.0,0.000125,0.000125,0.0,0.0,0.0,0.001495,0.934207,0.077485,0.078135,0.0
3,0.208887,8e-05,0.187872,0.187872,0.0,0.025792,1e-05,1e-05,0.0,1e-05,0.000125,0.000125,0.939988,0.0,0.039147,0.0
4,0.504298,0.000617,0.009867,0.009867,0.0,0.0,5.1e-05,5.1e-05,0.0,0.0,0.0,0.000617,0.74006,0.41834,0.150967,0.0


## 3. Modelagem

In [50]:
## Model Building

from sklearn.cluster import KMeans

kmeans = KMeans(n_clusters=6, n_init=10, max_iter=300, random_state=1)
kmeans.fit(data_normalized)

clusters = kmeans.predict(data_normalized)

### Validação dos clusters

In [51]:
## Cluster Validation

from sklearn import metrics

silhouette = metrics.silhouette_score(data_normalized, clusters)
davies_bouldin = metrics.davies_bouldin_score(data_normalized, clusters)
calinski_harabasz = metrics.calinski_harabasz_score(data_normalized, clusters)

print(f"Silhouette Score: {silhouette}")
print(f"David Bouldin Score: {davies_bouldin}")
print(f"Calinski Harabasz Score: {calinski_harabasz}")

Silhouette Score: 0.3648647701235422
David Bouldin Score: 1.045372580434123
Calinski Harabasz Score: 3523.514774549864


### Cluster Tuning

In [52]:
## Cluster Tuning

def clustering_algorithm(n_clusters, dataset):
    kmeans = KMeans(n_clusters=n_clusters, n_init=10, max_iter=300, random_state=1)
    clusters = kmeans.fit_predict(dataset)
    silhouette = metrics.silhouette_score(dataset, clusters)
    davies_bouldin = metrics.davies_bouldin_score(dataset, clusters)
    calinski_harabasz = metrics.calinski_harabasz_score(dataset, clusters)
    return silhouette, davies_bouldin, calinski_harabasz

n_clusters = [2, 3, 4, 5, 6, 7, 8, 9, 10]

for n in n_clusters:
    silhouette, davies_bouldin, calinski_harabasz = clustering_algorithm(n, data_normalized)
    print(f"Number of Clusters: {n}")
    print(f"Silhouette Score: {silhouette}")
    print(f"David Bouldin Score: {davies_bouldin}")
    print(f"Calinski Harabasz Score: {calinski_harabasz}")
    print("\n")

Number of Clusters: 2
Silhouette Score: 0.299138657814818
David Bouldin Score: 1.5180889522508143
Calinski Harabasz Score: 3321.846819585206


Number of Clusters: 3
Silhouette Score: 0.3272203126696238
David Bouldin Score: 1.3096073640088426
Calinski Harabasz Score: 3526.440519908274


Number of Clusters: 4
Silhouette Score: 0.34815517629328857
David Bouldin Score: 1.2215290915603927
Calinski Harabasz Score: 3528.683142637313


Number of Clusters: 5
Silhouette Score: 0.3644845919974304
David Bouldin Score: 1.0757138590613295
Calinski Harabasz Score: 3431.79374284143


Number of Clusters: 6
Silhouette Score: 0.3648647701235422
David Bouldin Score: 1.045372580434123
Calinski Harabasz Score: 3523.514774549864


Number of Clusters: 7
Silhouette Score: 0.32882534690828996
David Bouldin Score: 1.1505381282320164
Calinski Harabasz Score: 3398.2556006396167


Number of Clusters: 8
Silhouette Score: 0.31980663388027036
David Bouldin Score: 1.1506241484683115
Calinski Harabasz Score: 3213.878643

## 4. Análise gráfica

In [53]:
## Graphical Representation | Scatter 3d

import plotly.express as px
from sklearn.decomposition import PCA

pca = PCA(n_components=3)
data_pca = pca.fit_transform(data_normalized)

fig = px.scatter_3d(data_pca, x=0, y=1, z=2, color=clusters)
fig.update_traces(marker_size=2)
fig.show()

In [54]:
## Graphical Representation | Scatter 2d (Payments vs Purchases)

fig = px.scatter(data, x="PURCHASES", y="PAYMENTS", color=clusters)
fig.update_traces(marker_size=5)
fig.show()

In [55]:
import seaborn as sns

data["cluster"] = clusters
# sns.pairplot(data, hue="cluster")

### Interpretação dos clusters

In [59]:
## Cluster Interpretation
## Obtemos os centroides de cada cluster e calculamos a variância de cada um deles

centroids = kmeans.cluster_centers_ 

for i in range(len(centroids[0])):
    label = data.columns.values[i]
    variance = centroids[:, i].var()
    print(f"{label} | {variance:.4f}")

BALANCE | 0.0254
BALANCE_FREQUENCY | 0.0000
PURCHASES | 0.0183
ONEOFF_PURCHASES | 0.0070
INSTALLMENTS_PURCHASES | 0.0035
CASH_ADVANCE | 0.0345
PURCHASES_FREQUENCY | 0.0000
ONEOFF_PURCHASES_FREQUENCY | 0.0000
PURCHASES_INSTALLMENTS_FREQUENCY | 0.0000
CASH_ADVANCE_FREQUENCY | 0.0000
CASH_ADVANCE_TRX | 0.0000
PURCHASES_TRX | 0.0000
CREDIT_LIMIT | 0.0348
PAYMENTS | 0.0300
MINIMUM_PAYMENTS | 0.0539
PRC_FULL_PAYMENT | 0.0000


In [75]:
## Escolhemos as features com maior variância para interpretar os clusters
## Balance, Purchases, Cash Advance, Credit Limit, Payments

grouped = data.groupby("cluster")[["BALANCE", "PURCHASES", "CASH_ADVANCE", "CREDIT_LIMIT", "PAYMENTS"]]
n_clients = grouped.size()

Unnamed: 0_level_0,BALANCE,PURCHASES,CASH_ADVANCE,CREDIT_LIMIT,PAYMENTS
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,2429.505074,238.797229,3109.017091,4474.213287,976.915841
1,418.043217,628.78167,119.403237,5132.733927,816.706836
2,1765.701095,528.570286,3085.881331,3951.29848,4999.034523
3,3242.269276,519.334337,762.755415,4418.179287,1052.052473
4,1118.416733,3330.059495,185.882816,4111.615176,3052.726585
5,1959.062058,856.281725,451.824341,2198.684211,1339.886001


CLUSTER 0: Clientes que gastam pouco. Clientes com o maior limite. Bons pagadores. Maior número de clientes.

CLUSTER 1: Clientes que mais gastam. O foco deles é o saque. Piores pagadores. Boa quantidade de clientes.

CLUSTER 2: Clientes que gastam muito com compras. Melhores pagadores.

CLUSTER 3: Clientes que gastam muito com saques. Pagam as vezes.

CLUSTER 4: Clientes com o menor limite. Não são bons pagadores. Menor quantidade de clientes.