In [None]:
from pathlib import Path

import pandas as pd
import numpy as np
from sklearn.decomposition import TruncatedSVD, PCA
from sklearn.cluster import KMeans, DBSCAN
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import Normalizer, MaxAbsScaler, MinMaxScaler, RobustScaler, StandardScaler

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

In [None]:
# Read the two processed Excel workbooks used throughout the notebook.
profile_path = Path("../data/processed/clientes_perfil.xlsx")
features_path = Path("../data/processed/clientes_features.xlsx")

data_profile = pd.read_excel(profile_path)
data_features = pd.read_excel(features_path)

# Cluster grupo 0 - Clientes Cativos

In [None]:
# Keep only the feature rows whose "grupo" value is 0, then display them.
is_group_0 = data_features["grupo"] == 0
data_features_0 = data_features.loc[is_group_0]
data_features_0

In [None]:
# Remove the (now constant) "grupo" column. errors="ignore" keeps this cell
# idempotent: it reassigns data_features_0 from itself, so without it a second
# run would raise KeyError because "grupo" is already gone.
data_features_0 = data_features_0.drop(columns="grupo", errors="ignore")

# Keep only columns that contain at least one value different from 0.
has_nonzero_value = (data_features_0 != 0).any()
data_features_0 = data_features_0.loc[:, has_nonzero_value]

In [None]:
# Summary statistics, transposed so that each feature becomes a row.
summary_stats_0 = data_features_0.describe()
summary_stats_0.transpose()

In [None]:
# Build the clustering feature matrix: every column of data_features_0 whose
# name does NOT contain "count", "initial", "last" or "cliente"
# (case-insensitive match).
excluded_columns = data_features_0.columns.str.contains(
    "count|initial|last|cliente", regex=True, case=False
)

# .copy() makes X0 an independent frame. Later cells assign X0["cluster"];
# on a bare .loc slice that assignment triggers SettingWithCopyWarning.
X0 = data_features_0.loc[:, ~excluded_columns].copy()
X0

In [None]:
# Elbow analysis: scale the features, fit KMeans for k = 2..10 and plot the
# inertia of each fit to help choose the number of clusters.

# Later cells store their labels in X0["cluster"]; drop that column if it is
# already present so re-running this cell never uses old labels as a feature.
X_features = X0.drop(columns="cluster", errors="ignore")

# StandardScaler is the scaler in use. Normalizer, MaxAbsScaler, MinMaxScaler
# and RobustScaler are imported as well and can be swapped in on this line.
normalizer = StandardScaler()
X_normalized = normalizer.fit_transform(X_features)

inertias = []
k_range = list(range(2, 11))
for k in k_range:
    # Fixed random_state keeps the curve reproducible between runs.
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(X_normalized)
    inertias.append(kmeans.inertia_)

plt.plot(k_range, inertias)
plt.ylabel("Inercias")
plt.xlabel("Numero Clusters")
plt.show()

In [None]:
# KMeans clustering of the scaled features, with a 2-component PCA projection
# used only for visualisation. X_normalized is the scaled matrix produced by
# the preceding elbow cell.
reduction = PCA(n_components=2, random_state=42)
X_reduced = reduction.fit_transform(X_normalized)
print(f"Explained Variance Ratio: {reduction.explained_variance_ratio_}")

k = 5  # based on elbow plot
kmeans = KMeans(n_clusters=k, random_state=42)
# Clustering runs on the full scaled matrix, not on the 2-D projection.
labels = kmeans.fit_predict(X_normalized)
X0["cluster"] = labels

df = pd.DataFrame(X_reduced)
df["cluster"] = labels
print("Clusters count")
print(df["cluster"].value_counts())

# Plot reduction: column 0 (first component) on x and column 1 (second
# component) on y, so the plotted data matches the axis labels below.
fig1, ax1 = plt.subplots(figsize=(10, 6))
sns.scatterplot(df, x=0, y=1, hue="cluster", palette="Set1", ax=ax1)
ax1.set_xlabel('Feature 1')
ax1.set_ylabel('Feature 2')

# Plot original (unscaled) data, coloured by the assigned cluster.
fig2, ax2 = plt.subplots(figsize=(10, 6))
sns.scatterplot(X0, x="mean_Consumo", y="mean_Demanda", hue="cluster", palette="Set1", ax=ax2)
plt.show()

In [None]:
# DBSCAN clustering on robust-scaled features, with a 2-component PCA
# projection used only for visualisation.

# X0 may already carry a "cluster" column written by the KMeans cell (or by a
# previous run of this one). Drop it before scaling so earlier labels are not
# fed to DBSCAN as if they were a feature.
X_features = X0.drop(columns="cluster", errors="ignore")

# RobustScaler is the scaler in use. Normalizer, MaxAbsScaler, MinMaxScaler
# and StandardScaler are imported as well and can be swapped in on this line.
normalizer = RobustScaler()
X_normalized = normalizer.fit_transform(X_features)

reduction = PCA(n_components=2, random_state=42)
X_reduced = reduction.fit_transform(X_normalized)
print(f"Explained Variance Ratio: {reduction.explained_variance_ratio_}")

# eps / min_samples are hand-picked values; DBSCAN labels noise points as -1.
clusterizer = DBSCAN(eps=4, min_samples=10)
labels = clusterizer.fit_predict(X_normalized)
X0["cluster"] = labels

df = pd.DataFrame(X_reduced)
df["cluster"] = labels
print("Clusters count")
print(df["cluster"].value_counts())

# Plot reduction: column 0 (first component) on x and column 1 (second
# component) on y, so the plotted data matches the axis labels below.
fig1, ax1 = plt.subplots(figsize=(10, 6))
sns.scatterplot(df, x=0, y=1, hue="cluster", palette="Set1", ax=ax1)
ax1.set_xlabel('Feature 1')
ax1.set_ylabel('Feature 2')

# Plot original (unscaled) data, coloured by the assigned cluster.
fig2, ax2 = plt.subplots(figsize=(10, 6))
sns.scatterplot(X0, x="mean_Consumo", y="mean_Demanda", hue="cluster", palette="Set1", ax=ax2)
plt.show()

In [None]:
# Show the group-0 rows whose "count_Consumo" is below 12.
fewer_than_12 = data_features_0["count_Consumo"].lt(12)
data_features_0.loc[fewer_than_12]

In [None]:
# Restrict the profile table to grupo 0 and list the distinct "grupo_fatura"
# values found in that subset.
profile_is_group_0 = data_profile["grupo"].eq(0)
data_profile_0 = data_profile.loc[profile_is_group_0]
print(data_profile_0["grupo_fatura"].unique())

In [None]:
# Create a 1x3 grid of axes for plots of the group-0 profile data.
fig, ax = plt.subplots(1,3,figsize=(10,4))
# NOTE(review): this call looks unfinished - x is an empty string, no y/hue is
# given and none of the axes in `ax` is passed via `ax=`. TODO: choose the
# columns to plot and target ax[0], ax[1], ax[2] explicitly.
sns.scatterplot(data_profile_0, x="")