In [None]:
# Enable IPython's autoreload extension in mode 2 so that imported modules are
# reloaded before each cell executes and edits to project code are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import os

# When the kernel starts inside a folder named "notebooks", move one level up
# so that relative paths and imports are resolved from the parent directory.
working_dir_name = os.path.basename(os.getcwd())
if working_dir_name == "notebooks":
    os.chdir(os.pardir)

In [None]:
from src.prepare import prepare_data

# Keyword arguments identifying the source table handed to prepare_data.
data_source = {
    "project_id": "ca-churn-project",
    "database_name": "customer_churn",
    "table_name": "customer_churn_data",
}

# prepare_data yields the train and test frames used by the cells below.
train, test = prepare_data(**data_source)

In [None]:
# Names of the train columns stored with the pandas "category" dtype.
categoric_columns = train.select_dtypes(include="category").columns


def extract_code(df, column):
    """Return the integer category codes of a categorical column.

    Args:
        df: DataFrame holding the column.
        column: Name of a column with categorical dtype.

    Returns:
        A Series of integer codes, one per row (pandas uses -1 for missing
        values).
    """
    categorical_accessor = df[column].cat
    return categorical_accessor.codes


for column in categoric_columns:
    # Encode test with train's category list so that a given label maps to the
    # same integer code in both frames. Encoding each frame independently can
    # assign different codes to the same label when their category lists
    # differ. Labels that do not exist in train become -1 (missing) in test.
    train_categories = train[column].cat.categories
    test[column] = test[column].cat.set_categories(train_categories).cat.codes
    train[column] = extract_code(train, column)


# Where receita_total is missing, fall back to the row's receita_mensal value.
train["receita_total"] = train["receita_total"].fillna(train["receita_mensal"])
test["receita_total"] = test["receita_total"].fillna(test["receita_mensal"])

# train.set_index("id", inplace=True)
# test.set_index("id", inplace=True)

In [None]:
# Detach the "churn" column from both frames: pop removes it from the frame in
# place and returns it as a Series.
y_train, y_test = train.pop("churn"), test.pop("churn")

In [None]:
import pandas as pd
# Convert receita_mensal and receita_total into quartile buckets (codes 0-3).
# The bin edges are learned on train and reused for test so that both frames
# share the same encoding, in line with the other preprocessing cells that
# transform train and test together.
for revenue_column in ("receita_mensal", "receita_total"):
    train[revenue_column], bin_edges = pd.qcut(
        train[revenue_column], 4, labels=False, retbins=True
    )
    # Open the outer edges so test values outside train's observed range fall
    # into the first/last bucket instead of becoming NaN.
    bin_edges[0], bin_edges[-1] = float("-inf"), float("inf")
    test[revenue_column] = pd.cut(test[revenue_column], bins=bin_edges, labels=False)

## Raw data

In [None]:
from sklearn.cluster import KMeans
from tqdm.auto import tqdm
import matplotlib.pyplot as plt


# calcular o ponto de inflexão (Elbow)
def elbow_point(sse, max_k=15):
    x1, y1 = 2, sse[0]
    x2, y2 = max_k, sse[-1]

    distances = []
    for i in range(len(sse)):
        x0 = i + 2
        y0 = sse[i]
        numerator = abs((y2 - y1) * x0 - (x2 - x1) * y0 + x2 * y1 - y2 * x1)
        denominator = ((y2 - y1) ** 2 + (x2 - x1) ** 2) ** 0.5
        distances.append(numerator / denominator)

    return distances.index(max(distances)) + 2


# Fit one KMeans model per candidate k in [2, max_k] and collect each model's
# SSE (sum of squared errors, exposed by scikit-learn as `inertia_`).
sse = []
max_k = 15
candidate_ks = range(2, max_k + 1)
for k in tqdm(candidate_ks):
    kmeans = KMeans(n_clusters=k, random_state=0).fit(train)
    sse.append(kmeans.inertia_)

best_k = elbow_point(sse, max_k)

# Plot the SSE curve, the chord between its end points (dashed green) and the
# selected elbow (red dot).
fig, ax = plt.subplots(figsize=(8, 6))

ax.plot(candidate_ks, sse, marker="o")
ax.plot([2, max_k], [sse[0], sse[-1]], "g--")
ax.plot(best_k, sse[best_k - 2], "ro")

ax.set_xlabel("Número de clusters")
ax.set_ylabel("SSE (Inertia)")
ax.set_title("Método Elbow para seleção do número de clusters")

ax.grid(True)
ax.set_xticks(candidate_ks)
plt.show()

In [None]:
from sklearn.decomposition import PCA
#

# Work on the feature columns only. Dropping an existing "cluster" column keeps
# this cell re-runnable: otherwise a second run would feed the previous labels
# back into KMeans as a feature.
cluster_features = train.drop(columns=["cluster"], errors="ignore")

kmeans = KMeans(n_clusters=best_k, random_state=0)
kmeans.fit(cluster_features)

train["cluster"] = kmeans.predict(cluster_features)

# Project the same features to 2D. The cluster label is left out so that it
# only colors the points and does not influence the projection.
pca = PCA(n_components=2)
train_pca = pca.fit_transform(cluster_features)

plt.figure(figsize=(8, 6))

plt.scatter(train_pca[:, 0], train_pca[:, 1], c=train["cluster"], cmap="viridis")
plt.xlabel("Componente Principal 1")
plt.ylabel("Componente Principal 2")

plt.title("Clusters em 2D")

plt.show()

In [None]:
# Display the training frame, which at this point includes the "cluster" column.
train

In [None]:
# Shape of the fitted KMeans centroid matrix.
kmeans.cluster_centers_.shape

In [None]:
# Shape of train's values with the first column sliced off — presumably checked
# against the centroid matrix shape above.
train.values[:,1:].shape

In [None]:
# Print the 5 samples closest to each centroid.
# The centroids live in the feature space KMeans was fitted on, i.e. every
# column except the "cluster" label that was appended afterwards, so that label
# column (and not the first column) is what must be left out here.
feature_values = train.drop(columns=["cluster"], errors="ignore").to_numpy()
for i in range(best_k):
    print(f"Cluster {i}")
    # Squared Euclidean distance to the centroid; a plain signed sum of the
    # differences would let positive and negative offsets cancel out.
    squared_distance = ((feature_values - kmeans.cluster_centers_[i]) ** 2).sum(axis=1)
    print(train.iloc[squared_distance.argsort()[:5]])

## Gower Distance

In [None]:
# pip install gower scikit-learn-extra

In [None]:
# Index train by y_train and show the resulting shape.
# NOTE(review): presumably y_train is a boolean churn flag, which makes this a
# row filter keeping only the churned customers — confirm. With non-boolean
# values pandas would interpret them as column labels instead.
# Reassigning `train` also means this cell cannot be re-run without re-running
# the cells above it.
train=train[y_train]
train.shape

In [None]:
# Cast every column to float64 (not int8) before computing the distance matrix.
train = train.astype('float64')

In [None]:
import gower

# Pairwise Gower distance matrix over the rows of train.
# NOTE(review): every column was cast to float64 beforehand, so presumably
# gower treats all features as numeric, including the category codes — confirm
# against the gower docs and consider passing `cat_features` explicitly.
dist_matrix = gower.gower_matrix(train)
dist_matrix.shape

In [None]:
from sklearn_extra.cluster import KMedoids

# Cluster with K-Medoids (PAM) directly on the precomputed distance matrix and
# keep the per-row cluster labels.
kmedoids = KMedoids(
    n_clusters=5,
    metric="precomputed",
    method="pam",
    init="build",
    max_iter=300,
    random_state=123,
)
kmedoids.fit(dist_matrix)
clusters = kmedoids.labels_

In [None]:
#pca
from sklearn.decomposition import PCA
# Reduce the distance matrix to 2D for plotting.
# Note that PCA is fitted on the rows of the distance matrix, i.e. each row of
# distances is used as that sample's feature vector.
pca = PCA(n_components=2)
dist_matrix_pca = pca.fit_transform(dist_matrix)
dist_matrix_pca.shape

In [None]:
#visualize distance matrix and clusters
import matplotlib.pyplot as plt
# Scatter plot of the 2D PCA projection, colored by K-Medoids cluster label.
fig, ax = plt.subplots(figsize=(8, 6))

ax.scatter(dist_matrix_pca[:, 0], dist_matrix_pca[:, 1], c=clusters, cmap="viridis")
ax.set_xlabel("Componente Principal 1")
ax.set_ylabel("Componente Principal 2")

ax.set_title("Clusters em 2D")

In [None]:
#umap
import umap
# dist_matrix already holds pairwise distances (it is passed to KMedoids as a
# precomputed metric), so UMAP must be told to use it as such; otherwise each
# row of distances is treated as a feature vector and a second distance is
# computed on top of it. random_state makes the embedding reproducible.
reducer = umap.UMAP(
    n_neighbors=5,
    min_dist=0.3,
    n_components=2,
    metric="precomputed",
    random_state=123,
)
embedding = reducer.fit_transform(dist_matrix)
embedding.shape

In [None]:
#visualize distance matrix and clusters
import matplotlib.pyplot as plt
# Scatter plot of the 2D UMAP embedding, colored by K-Medoids cluster label.
plt.figure(figsize=(8, 6))

plt.scatter(embedding[:, 0], embedding[:, 1], c=clusters, cmap="viridis")
# These axes are UMAP dimensions, not principal components; the previous labels
# were carried over from the PCA plot.
plt.xlabel("Dimensão UMAP 1")
plt.ylabel("Dimensão UMAP 2")

plt.title("Clusters em 2D")