### Imports

In [1]:
import pandas as pd
import polars as pl
import numpy as np
#import matplotlib.pyplot as plt
#from feature_baggingV2 import FeatureBaggingWithHyperparamTuning
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
#import lightgbm as lgb
import datetime
from sklearn.preprocessing import RobustScaler
#from keras.models import Sequential
#from keras.layers import LSTM, Dense
import labolibrary as labo

import os, sys, gc, time, warnings, pickle, psutil, random

import warnings
warnings.filterwarnings("ignore")


from sklearn.model_selection import train_test_split
from sktime.clustering.k_means import TimeSeriesKMeans
from sktime.clustering.utils.plotting._plot_partitions import plot_cluster_algorithm
from sktime.datasets import load_arrow_head

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



### Función para escalar/desescalar y métrica

In [2]:
#DATOS_DIR = '~/buckets/b1/datasets/'
DATOS_DIR = '../data/'

# Leer datos
df_final = pd.read_parquet(DATOS_DIR+'FE_09_dataset.parquet') 
df_final.columns = df_final.columns.str.replace(' ', '_').str.replace(r'[^A-Za-z0-9_]', '', regex=True)


In [3]:

df_final.reset_index(inplace=True)


In [4]:
df_final['periodo'] = df_final['periodo'].dt.to_timestamp()
df_final = pl.from_pandas(df_final)

In [5]:
# Ensure 'periodo' column is string type, truncate to fit the datetime format, and convert to datetime
df_final = df_final.with_columns([
    pl.col("periodo").cast(pl.Utf8).str.slice(0, 19).str.strptime(pl.Datetime, format="%Y-%m-%d %H:%M:%S", strict=False).alias("periodo_dt")
])


In [6]:
primer_periodo = df_final['periodo_dt'].min()
df_final['periodo','periodo_dt']
 
df_final = df_final.with_columns(
    ((pl.col('periodo_dt').dt.year() - primer_periodo.year) * 12 +
    (pl.col('periodo_dt').dt.month() - primer_periodo.month)).alias('mes_indice')
)

In [7]:
df_prod = df_final.group_by("product_id").agg(
    pl.sum("tn").alias("total_tn")
)

In [8]:
fijos = 150
sample = 150
 
productos_fijos = df_prod.select(['product_id', 'total_tn']).sort('total_tn',descending = True)[:fijos][:,0]
productos_sample = df_prod.select(['product_id', 'total_tn']).sort('total_tn',descending = True)[fijos:][:,0].sample(sample, seed=42)
productos_otros = df_prod['product_id'].filter(~df_prod['product_id'].is_in(pl.concat([productos_fijos, productos_sample])))

In [9]:
df_final = df_final.sort(by=['product_id', 'customer_id', 'periodo'])

# Concatenar los productos fijos y de muestra en una sola serie
productos_fijos_sample = pl.concat([productos_fijos, productos_sample])

# Filtrar para el conjunto de entrenamiento y test
df_train = df_final.filter(pl.col('product_id').is_in(productos_fijos_sample))
df_fit = df_final.filter(pl.col('product_id').is_in(productos_otros))

print(f"df_final shape: ({df_final.shape[0]},{df_final.shape[1]})")
print(f"df_train shape: ({df_train.shape[0]},{df_final.shape[1]})")
print(f"df_fit shape: ({df_fit.shape[0]},{df_final.shape[1]})")

df_final shape: (4835264,212)
df_train shape: (2354909,212)
df_fit shape: (2480355,212)


In [10]:
df_train = df_train.to_pandas()
df_fit = df_fit.to_pandas()

In [11]:
prodcust_train = df_train[['product_id', 'customer_id']].drop_duplicates().reset_index(drop=True)
prodcust_fit = df_fit[['product_id', 'customer_id']].drop_duplicates().reset_index(drop=True)
meses = pd.DataFrame({'mes_indice': df_train['mes_indice'].unique()})
 
print(f"prodcust_train series:  ({prodcust_train.shape[0]:>9_d},{df_train.shape[1]:_d}, productos unicos: {df_train['product_id'].nunique()})")
print(f"prodcust_fit series:    ({prodcust_fit.shape[0]:>9_d},{df_fit.shape[1]:_d}, productos unicos: {df_fit['product_id'].nunique()})")

prodcust_train series:  (   71_455,212, productos unicos: 300)
prodcust_fit series:    (   85_175,212, productos unicos: 480)


In [12]:
X_train = prodcust_train.merge(meses, how='cross')
X_train = X_train.merge(df_train[['product_id', 'customer_id', 'mes_indice','tn']], on=['product_id', 'customer_id', 'mes_indice'], how='left')
X_train['prodcust'] = X_train['product_id'].astype(str) + '_' + X_train['customer_id'].astype(str)
X_train = X_train.drop(columns=['product_id', 'customer_id'])
X_train.set_index(['prodcust', 'mes_indice'], inplace=True)
print(f"Nulos en X_train: {X_train['tn'].isna().sum()}. Se reemplazan por ceros")
X_train['tn'] = X_train['tn'].fillna(0)
print(f"X_train series:    ({X_train.shape[0]:>9_d},{X_train.shape[1]:_d})")


Nulos en X_train: 217471. Se reemplazan por ceros
X_train series:    (2_572_380,1)


In [30]:
X_fit = prodcust_fit.merge(meses, how='cross')
X_fit = X_fit.merge(df_train[['product_id', 'customer_id', 'mes_indice','tn']], on=['product_id', 'customer_id', 'mes_indice'], how='left')
X_fit['prodcust'] = X_fit['product_id'].astype(str) + '_' + X_fit['customer_id'].astype(str)
X_fit = X_fit.drop(columns=['product_id', 'customer_id'])
X_fit.set_index(['prodcust', 'mes_indice'], inplace=True)
print(f"Nulos en X_fit: {X_fit['tn'].isna().sum()}. Se reemplazan por ceros")
X_fit['tn'] = X_fit['tn'].fillna(0)
print(f"X_fit series:    ({X_fit.shape[0]:>9_d},{X_fit.shape[1]:_d})")

Nulos en X_fit: 3066300. Se reemplazan por ceros
X_fit series:    (3_066_300,1)


In [14]:
model = TimeSeriesKMeans(n_clusters=10, metric="dtw",
                        init_algorithm = 'kmeans++',
                        n_init=2, max_iter=200, tol=10000,
                        verbose = True,
                        random_state=42
                        )
x_clusters_dtw = model.fit_predict(X_train)


Iteration 0, inertia 224611320.21662325.
Iteration 1, inertia 252704555.12117383.
Iteration 2, inertia 261174107.936738.
Iteration 3, inertia 259088476.8021542.
Iteration 4, inertia 260287534.11396164.
Iteration 5, inertia 265722763.53666273.
Iteration 6, inertia 268357600.49698058.
Iteration 7, inertia 271555400.70724875.
Iteration 8, inertia 269771780.25965273.
Iteration 9, inertia 270291663.40666276.
Iteration 10, inertia 274191204.17505556.
Iteration 11, inertia 270991415.1435683.
Iteration 12, inertia 272398512.10443896.
Iteration 13, inertia 280531005.4046988.
Iteration 14, inertia 282560191.3438333.
Iteration 15, inertia 291649751.08353573.
Iteration 16, inertia 300423998.68496025.
Iteration 17, inertia 305132737.33123547.
Iteration 18, inertia 309733716.020664.
Iteration 19, inertia 315200986.4326564.
Iteration 20, inertia 319899917.8793718.
Iteration 21, inertia 323401324.40343434.
Iteration 22, inertia 326938772.5515343.
Iteration 23, inertia 332667029.6376064.
Iteration 24, 

AttributeError: module 'datetime' has no attribute 'now'

In [15]:
model.save(serialization_format='pickle')

(sktime.clustering.k_means._k_means.TimeSeriesKMeans,