# Algoritmo K-Means

In [52]:
#!pip install plotly --upgrade

In [53]:
# gráficos e gerenciamento de arrays
import plotly.express as px
import plotly.graph_objects as go
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# padronização
from sklearn.preprocessing import StandardScaler

# algoritmo k-means
from sklearn.cluster import KMeans

# faz a geração de dados aleatórios
from sklearn.datasets import make_blobs

# fazer redução de dimensionalidade
from sklearn.decomposition import PCA

# Base idade e salário

In [54]:
x = [20,  27,  21,  37,  46, 53, 55,  47,  52,  32,  39,  41,  39,  48,  48]  # idade
y = [1000,1200,2900,1850,900,950,2000,2100,3000,5900,4100,5100,7000,5000,6500]  # salário

In [55]:
# Visualizando os valores em gráfico, idade e salário

grafico = px.scatter(x = x, y = y)
grafico.show()

In [56]:
# Criando uma base de dados
base_salario = np.column_stack((x, y))
base_salario

array([[  20, 1000],
       [  27, 1200],
       [  21, 2900],
       [  37, 1850],
       [  46,  900],
       [  53,  950],
       [  55, 2000],
       [  47, 2100],
       [  52, 3000],
       [  32, 5900],
       [  39, 4100],
       [  41, 5100],
       [  39, 7000],
       [  48, 5000],
       [  48, 6500]])

In [57]:
# Fazendo a padronização usando StandardScaler

scaler_salario = StandardScaler()
base_salario = scaler_salario.fit_transform(base_salario)
base_salario

array([[-1.87963884, -1.11413572],
       [-1.23255006, -1.01725435],
       [-1.78719758, -0.19376273],
       [-0.30813751, -0.70238991],
       [ 0.52383377, -1.1625764 ],
       [ 1.17092255, -1.13835606],
       [ 1.35580506, -0.62972888],
       [ 0.61627503, -0.5812882 ],
       [ 1.0784813 , -0.14532205],
       [-0.77034379,  1.25945777],
       [-0.12325501,  0.38752547],
       [ 0.0616275 ,  0.8719323 ],
       [-0.12325501,  1.79230528],
       [ 0.70871628,  0.82349162],
       [ 0.70871628,  1.55010187]])

In [58]:
# Criando o algortimo K-Means e fazendo treinamento

kmeans_salario = KMeans(n_clusters=3, n_init=10)
kmeans_salario.fit(base_salario)

In [59]:
# Visualizando os centroides com valores padronizados

centroides = kmeans_salario.cluster_centers_
centroides

# lembrando que esses valores estão padronizados

array([[ 0.73953003, -0.72661025],
       [-1.63312883, -0.77505093],
       [ 0.07703438,  1.11413572]])

In [60]:
# Visualizando os centroides com os valores reais

scaler_salario.inverse_transform(kmeans_salario.cluster_centers_) # revertendo novamente para o normal

array([[  48.33333333, 1800.        ],
       [  22.66666667, 1700.        ],
       [  41.16666667, 5600.        ]])

In [61]:
# Verificando os rótulos atribuídos a cada ponto de dados.

rotulos = kmeans_salario.labels_
rotulos

array([1, 1, 1, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2], dtype=int32)

In [62]:
# Gráfico de visualização que mostra os dados dividos e os centróides

grafico1 = px.scatter(x = base_salario[:, 0], y = base_salario[:, 1], color = rotulos)
grafico2 = px.scatter(x = centroides[:, 0], y = centroides[:, 1], size = [12, 12, 12])
grafico3 = go.Figure(data = grafico1.data + grafico2.data)
grafico3.show()

# Base de dados aleatórios

In [63]:
# Criando dados aleatórios

x_random, y_random = make_blobs(n_samples=200, centers=5, random_state=1) # center = cluster

In [64]:
# Gerando gráfico para mostrar a disposição dos dados

grafico = px.scatter(x = x_random[:, 0], y = x_random[:, 1])
grafico.show()

In [65]:
# Criando K-Means e fazendo treino

kmeans_blobs = KMeans(n_clusters= 5, n_init=10)
kmeans_blobs.fit(x_random)

In [66]:
# Fazendo previsão

rotulos = kmeans_blobs.predict(x_random)
rotulos

array([1, 3, 0, 3, 3, 0, 2, 0, 3, 1, 0, 0, 0, 4, 0, 1, 1, 4, 3, 3, 1, 0,
       2, 0, 2, 4, 3, 2, 2, 4, 3, 2, 1, 0, 3, 1, 4, 3, 4, 1, 4, 4, 2, 2,
       3, 2, 4, 0, 3, 2, 1, 0, 0, 1, 0, 4, 1, 0, 3, 3, 4, 0, 0, 0, 3, 2,
       1, 4, 2, 3, 4, 2, 3, 2, 2, 4, 3, 4, 1, 2, 4, 4, 0, 3, 0, 1, 3, 3,
       3, 4, 2, 0, 1, 2, 0, 4, 2, 1, 3, 4, 1, 3, 2, 4, 2, 3, 4, 3, 0, 1,
       0, 0, 3, 4, 0, 1, 4, 2, 1, 1, 3, 4, 4, 0, 1, 4, 3, 2, 3, 2, 1, 2,
       0, 0, 4, 4, 0, 1, 1, 0, 0, 0, 1, 1, 2, 3, 2, 1, 3, 0, 2, 0, 1, 2,
       0, 4, 4, 4, 1, 3, 2, 1, 4, 1, 2, 4, 4, 4, 0, 2, 4, 2, 2, 0, 1, 3,
       2, 0, 3, 1, 3, 1, 1, 3, 4, 2, 4, 1, 1, 0, 3, 2, 1, 4, 4, 2, 0, 2,
       3, 4], dtype=int32)

In [67]:
# Visualizando os centróides

centroides = kmeans_blobs.cluster_centers_
centroides

array([[-5.90368078, -3.04489641],
       [-1.58338528,  4.50520457],
       [-2.17069756,  1.02591979],
       [-6.87958999, -8.11648104],
       [-9.85620522, -3.91021738]])

In [68]:
centroides.shape

(5, 2)

In [69]:
# Gráfico para fazer a verificação

grafico1 = px.scatter(x = x_random[:, 0], y= x_random[:, 1], color = rotulos) # mostrando gráfico e divivindo os clusters por cores
grafico2 = px.scatter(x= centroides[:, 0], y= centroides[:, 1], size = [5, 5, 5, 5, 5])# mostrando os centróides no gráfico
grafico3 = go.Figure(data = grafico1.data + grafico2.data)
grafico3.show()

# Base de dados de cartão de crédito - 1 atibuto

In [70]:
base_cartao = pd.read_csv('/content/drive/MyDrive/datasets/credit_card_clients.csv', header=1)
base_cartao

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
0,1,20000,2,2,1,24,2,2,-1,-1,...,0,0,0,0,689,0,0,0,0,1
1,2,120000,2,2,2,26,-1,2,0,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,1
2,3,90000,2,2,2,34,0,0,0,0,...,14331,14948,15549,1518,1500,1000,1000,1000,5000,0
3,4,50000,2,2,1,37,0,0,0,0,...,28314,28959,29547,2000,2019,1200,1100,1069,1000,0
4,5,50000,1,2,1,57,-1,0,-1,0,...,20940,19146,19131,2000,36681,10000,9000,689,679,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29995,29996,220000,1,3,1,39,0,0,0,0,...,88004,31237,15980,8500,20000,5003,3047,5000,1000,0
29996,29997,150000,1,3,2,43,-1,-1,-1,-1,...,8979,5190,0,1837,3526,8998,129,0,0,0
29997,29998,30000,1,2,2,37,4,3,2,-1,...,20878,20582,19357,0,0,22000,4200,2000,3100,1
29998,29999,80000,1,3,1,41,1,-1,0,0,...,52774,11855,48944,85900,3409,1178,1926,52964,1804,1


In [71]:
base_cartao.columns

Index(['ID', 'LIMIT_BAL', 'SEX', 'EDUCATION', 'MARRIAGE', 'AGE', 'PAY_0',
       'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6', 'BILL_AMT1', 'BILL_AMT2',
       'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1',
       'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6',
       'default payment next month'],
      dtype='object')

In [72]:
# Somando todas as colunas BILL
base_cartao['BILL_TOTAL'] = base_cartao['BILL_AMT1'] + base_cartao['BILL_AMT2'] + base_cartao['BILL_AMT3'] + base_cartao['BILL_AMT4'] + base_cartao['BILL_AMT5'] + base_cartao['BILL_AMT6']
base_cartao

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month,BILL_TOTAL
0,1,20000,2,2,1,24,2,2,-1,-1,...,0,0,0,689,0,0,0,0,1,7704
1,2,120000,2,2,2,26,-1,2,0,0,...,3455,3261,0,1000,1000,1000,0,2000,1,17077
2,3,90000,2,2,2,34,0,0,0,0,...,14948,15549,1518,1500,1000,1000,1000,5000,0,101653
3,4,50000,2,2,1,37,0,0,0,0,...,28959,29547,2000,2019,1200,1100,1069,1000,0,231334
4,5,50000,1,2,1,57,-1,0,-1,0,...,19146,19131,2000,36681,10000,9000,689,679,0,109339
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29995,29996,220000,1,3,1,39,0,0,0,0,...,31237,15980,8500,20000,5003,3047,5000,1000,0,725349
29996,29997,150000,1,3,2,43,-1,-1,-1,-1,...,5190,0,1837,3526,8998,129,0,0,0,21182
29997,29998,30000,1,2,2,37,4,3,2,-1,...,20582,19357,0,0,22000,4200,2000,3100,1,70496
29998,29999,80000,1,3,1,41,1,-1,0,0,...,11855,48944,85900,3409,1178,1926,52964,1804,1,266611


In [73]:
# Pegando os previsores

# peguei somente o limite e a soma
x_cartao = base_cartao.iloc[:, [1, 25]].values # .values, para converter no formato de numpy array
x_cartao

array([[ 20000,   7704],
       [120000,  17077],
       [ 90000, 101653],
       ...,
       [ 30000,  70496],
       [ 80000, 266611],
       [ 50000, 230874]])

In [74]:
# Fazendo a padronização ou escalonamento

scaler_cartao = StandardScaler()
x_cartao = scaler_cartao.fit_transform(x_cartao)
x_cartao

array([[-1.13672015, -0.69069198],
       [-0.3659805 , -0.66599747],
       [-0.59720239, -0.44316987],
       ...,
       [-1.05964618, -0.52525745],
       [-0.67427636, -0.00856436],
       [-0.90549825, -0.10271861]])

In [75]:
# Testando com varios clusters

wcss = []
for i in range(1, 11):
  #print(i)
  kmeans_cartao = KMeans(n_clusters=i, random_state=0, n_init=10)
  kmeans_cartao.fit(x_cartao)
  wcss.append(kmeans_cartao.inertia_)
wcss

[59999.99999999978,
 35196.57080583435,
 20128.132558347686,
 14943.809173865899,
 10707.537016753624,
 8603.395384848965,
 7400.069599689482,
 6353.473303353463,
 5665.823857740179,
 5052.188166408734]

In [76]:
# Plotando gráfico para decidir qual o melhor

grafico = px.line(x = range(1,11), y = wcss)
grafico.show()

In [77]:
# Criando o algoritmo e fazendo o treinamento com a melhor valor de cluster

kmeans_cartao = KMeans(n_clusters=4, random_state=0, n_init=10)
rotulos = kmeans_cartao.fit_predict(x_cartao) # fit_predict, faz o treinamento e obtém os resultados que forem atribuídos a uma variável
rotulos

array([0, 0, 0, ..., 0, 0, 0], dtype=int32)

In [78]:
x_cartao.shape

(30000, 2)

In [79]:
# Gerando gráficos

grafico = px.scatter(x = x_cartao[:, 0], y = x_cartao[:, 1], color = rotulos)
grafico.show()

In [80]:
# Criando lista de clientes por grupo

lista_clientes = np.column_stack((base_cartao, rotulos))
lista_clientes

array([[     1,  20000,      2, ...,      1,   7704,      0],
       [     2, 120000,      2, ...,      1,  17077,      0],
       [     3,  90000,      2, ...,      0, 101653,      0],
       ...,
       [ 29998,  30000,      1, ...,      1,  70496,      0],
       [ 29999,  80000,      1, ...,      1, 266611,      0],
       [ 30000,  50000,      1, ...,      1, 230874,      0]])

In [81]:
# Ordenando esses dados pelos grupos (coluna 26)

lista_clientes = lista_clientes[lista_clientes[:, 26].argsort()]
lista_clientes

array([[     1,  20000,      2, ...,      1,   7704,      0],
       [ 16789,  30000,      1, ...,      1,  14274,      0],
       [ 16788,  30000,      1, ...,      1,  90453,      0],
       ...,
       [ 20882, 110000,      1, ...,      0, 554308,      3],
       [ 20744, 220000,      1, ...,      0, 452961,      3],
       [ 20270, 150000,      2, ...,      0, 546939,      3]])

# Base de dados cartão de crédito - vários atributos

In [82]:
# Verifica os nomes das colunas

base_cartao.columns

Index(['ID', 'LIMIT_BAL', 'SEX', 'EDUCATION', 'MARRIAGE', 'AGE', 'PAY_0',
       'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6', 'BILL_AMT1', 'BILL_AMT2',
       'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1',
       'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6',
       'default payment next month', 'BILL_TOTAL'],
      dtype='object')

In [83]:
# Criando uma array numpy com as colunas escolhidas

x_cartao_mais = base_cartao.iloc[:, [1, 2, 3, 4, 5, 25]].values
x_cartao_mais

array([[ 20000,      2,      2,      1,     24,   7704],
       [120000,      2,      2,      2,     26,  17077],
       [ 90000,      2,      2,      2,     34, 101653],
       ...,
       [ 30000,      1,      2,      2,     37,  70496],
       [ 80000,      1,      3,      1,     41, 266611],
       [ 50000,      1,      2,      1,     46, 230874]])

In [84]:
# Deixando todos os valores na mesma escala

scaler_cartao_mais = StandardScaler()
x_cartao_mais = scaler_cartao_mais.fit_transform(x_cartao_mais)
x_cartao_mais

array([[-1.13672015,  0.81016074,  0.18582826, -1.05729503, -1.24601985,
        -0.69069198],
       [-0.3659805 ,  0.81016074,  0.18582826,  0.85855728, -1.02904717,
        -0.66599747],
       [-0.59720239,  0.81016074,  0.18582826,  0.85855728, -0.16115646,
        -0.44316987],
       ...,
       [-1.05964618, -1.23432296,  0.18582826,  0.85855728,  0.16430256,
        -0.52525745],
       [-0.67427636, -1.23432296,  1.45111372, -1.05729503,  0.59824792,
        -0.00856436],
       [-0.90549825, -1.23432296,  0.18582826, -1.05729503,  1.14067961,
        -0.10271861]])

In [85]:
# Função para encontrar cluster

def find_num_cluster(data, num_cluster):
    # Testando com varios clusters

    wcss = []
    for i in range(1, num_cluster+1):
        kmeans_cartao = KMeans(n_clusters=i, random_state=0, n_init=10)
        kmeans_cartao.fit(x_cartao)
        wcss.append(kmeans_cartao.inertia_)

    # Fazendo o plot do gráfico
    grafico = px.line(x = range(1,11), y = wcss)
    grafico.show()

    return wcss

In [86]:
find_num_cluster(x_cartao_mais, 10)

[59999.99999999978,
 35196.57080583435,
 20128.132558347686,
 14943.809173865899,
 10707.537016753624,
 8603.395384848965,
 7400.069599689482,
 6353.473303353463,
 5665.823857740179,
 5052.188166408734]

In [91]:
kmeans_cartao_mais = KMeans(n_clusters=4, n_init=10, random_state=1)
rotulos = kmeans_cartao_mais.fit_predict(x_cartao_mais)
rotulos

array([3, 0, 0, ..., 2, 3, 3], dtype=int32)

In [92]:
# Fazendo a redução de dimensionalidade para conseguir plotar o gráfico

pca = PCA(n_components=2) # quero obter como retorno apenas 2 atributos
x_cartao_mais_pca = pca.fit_transform(x_cartao_mais)
x_cartao_mais_pca.shape # todos os registros e somente 2 colunas o necessário para fazer um gráfico

(30000, 2)

In [93]:
# Plotando o gráfico

grafico = px.scatter(x = x_cartao_mais_pca[:, 0], y = x_cartao_mais_pca[:, 1], color = rotulos)
grafico.show()

In [94]:
# Criando lista de clientes por grupos

lista_cliente = np.column_stack((base_cartao, rotulos))
lista_clientes = lista_clientes[lista_clientes[:, 26].argsort()]
lista_clientes

array([[     1,  20000,      2, ...,      1,   7704,      0],
       [  6525,  30000,      1, ...,      0, 152514,      0],
       [  6414,  70000,      1, ...,      1, 316564,      0],
       ...,
       [  7159, 180000,      2, ...,      0, 355103,      3],
       [  6938, 200000,      1, ...,      0, 702756,      3],
       [ 20270, 150000,      2, ...,      0, 546939,      3]])