In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.cluster import MiniBatchKMeans
import matplotlib.pyplot as plt
import seaborn as sns

# Cargar dataset
df = pd.read_csv('../data/ventas_featurev4.csv', sep=',')



In [2]:
# Generar estadísticas históricas de tn
lag_cols = [f'tn_lag_{i}' for i in range(1, 37)]
df['tn_mean'] = df[lag_cols].mean(axis=1)
df['tn_std'] = df[lag_cols].std(axis=1)
df['tn_max'] = df[lag_cols].max(axis=1)
df['tn_min'] = df[lag_cols].min(axis=1)

In [3]:
# Codificar variables categóricas relevantes
le_cat1 = LabelEncoder()
le_brand = LabelEncoder()
df['cat1_enc'] = le_cat1.fit_transform(df['cat1'].astype(str))
df['brand_enc'] = le_brand.fit_transform(df['brand'].astype(str))

In [4]:
# Seleccionar columnas de lags y deltas
columnas_cluster = [col for col in df.columns if col.startswith('tn_lag_') or col.startswith('delta_tn_lag_')]
columnas_cluster += ['tn_mean', 'tn_std', 'tn_max', 'tn_min',
                     'sku_size', 'productos_estrella', 'cliente_estrella',
                     'cat1_enc', 'brand_enc']

# Eliminar filas con nulos
df_filtrado = df.dropna(subset=columnas_cluster)

# Tomar una muestra para análisis (por rendimiento)
df_sample = df_filtrado.sample(n=100_000, random_state=42)



In [5]:
# Escalado
scaler = StandardScaler()
X_scaled = scaler.fit_transform(df_sample[columnas_cluster])



In [6]:
# Clustering
k = 6
mbk = MiniBatchKMeans(n_clusters=k, batch_size=10000, random_state=42)
df_sample['cluster'] = mbk.fit_predict(X_scaled)


In [7]:
# Ver resumen
resumen_clusters = df_sample.groupby('cluster')[columnas_cluster].mean()
print(resumen_clusters)



         tn_lag_1  tn_lag_2  tn_lag_3  tn_lag_4  tn_lag_5  tn_lag_6  tn_lag_7  \
cluster                                                                         
0        0.075793  0.089259  0.095387  0.094208  0.107327  0.119904  0.116771   
1        1.696072  1.629516  1.398339  1.478072  1.383248  1.368870  0.889433   
2        0.004472  0.004497  0.005070  0.004820  0.005393  0.006109  0.004857   
3        0.012874  0.013639  0.012956  0.011654  0.012788  0.013977  0.014156   
4        0.016531  0.016232  0.017577  0.017124  0.018280  0.019048  0.017225   
5        0.014797  0.014453  0.015571  0.015859  0.015465  0.016104  0.017456   

         tn_lag_8  tn_lag_9  tn_lag_10  ...  delta_tn_lag_35   tn_mean  \
cluster                                 ...                              
0        0.117737  0.123790   0.133275  ...        -0.178423  0.158026   
1        1.115618  0.846360   0.623461  ...         2.424598  0.548946   
2        0.006366  0.005624   0.006255  ...        -0.0

In [8]:
# Guardar modelo para aplicar luego
import joblib
joblib.dump(mbk, 'modelo_cluster.joblib')
joblib.dump(scaler, 'scaler_cluster.joblib')


['scaler_cluster.joblib']

In [9]:
# Cargar modelo y scaler
mbk = joblib.load('modelo_cluster.joblib')
scaler = joblib.load('scaler_cluster.joblib')

# Escalar todo el dataset
X_all = scaler.transform(df_filtrado[columnas_cluster])

# Predecir
df_filtrado['cluster_predicho'] = mbk.predict(X_all)

# Ver distribución
print(df_filtrado['cluster_predicho'].value_counts())


cluster_predicho
0    5057408
4    3554049
2    2961130
5    2917318
3    1934861
1     310914
Name: count, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtrado['cluster_predicho'] = mbk.predict(X_all)


In [10]:
df_filtrado.to_csv('../data/ventas_featurev4_cluster.csv', index=False)

