In [None]:
from tslearn.metrics import cdist_dtw
import pandas as pd
import numpy as np
# Load the preprocessed base and reduce it to one total per
# (product_id, periodo), ordered by product and then by month.
df = (
    pd.read_csv("../../data/preprocessed/base.csv", sep=",")
    .assign(periodo=lambda frame: pd.to_datetime(frame["periodo"], format="%Y%m"))
    .groupby(["product_id", "periodo"])["tn"]
    .sum()
    .reset_index()
    .sort_values(["product_id", "periodo"])
)

# Wide layout: one row per product, one column per month; (product, month)
# combinations that are absent from the data become 0.
df_pivot = df.pivot(index="product_id", columns="periodo", values="tn").fillna(0)

# Plain array of shape (n_series, n_timestamps) for tslearn.
series = df_pivot.to_numpy()

# Pairwise DTW distances between all product series.
dist_matrix = cdist_dtw(series)

# Label both axes with the product ids so the matrix can be inspected.
product_ids = df_pivot.index
df_dist = pd.DataFrame(dist_matrix, index=product_ids, columns=product_ids)
print(df_dist.head())

In [None]:
# Reference products: the DTW distance from every product to these ids is
# summarised into features.
# NOTE(review): the ids below are marked as an example in the original cell;
# replace them with ids that actually exist in df_pivot.index.
top_product_ids = [1001, 1002, 1003]

# Report every unknown id at once instead of failing with a bare KeyError on
# the first one that get_loc cannot find.
missing_ids = [pid for pid in top_product_ids if pid not in df_pivot.index]
if missing_ids:
    raise KeyError(f"top_product_ids not present in df_pivot.index: {missing_ids}")
top_idx = [df_pivot.index.get_loc(pid) for pid in top_product_ids]

# Columns of the distance matrix restricted to the reference products,
# shape (n_products, n_top); sliced once instead of once per product.
dist_to_top = dist_matrix[:, top_idx]

# Per product: distance to the closest reference product and average distance
# to all of them.
dist_features = {
    pid: {
        'dist_to_top_min': np.min(distances),
        'dist_to_top_mean': np.mean(distances)
    }
    for pid, distances in zip(df_pivot.index, dist_to_top)
}

df_dtw_feats = pd.DataFrame.from_dict(dist_features, orient='index').reset_index().rename(columns={'index': 'product_id'})

# Attach the per-product features to every (product, period) row of df.
df_final = df.merge(df_dtw_feats, on='product_id', how='left')

Muy buena pregunta: encontrar correlaciones entre series de productos que **se reemplazan o se complementan** a lo largo del tiempo es clave para entender dinámicas de **sustitución**, **moda**, o **canibalización**. Te dejo un enfoque completo dividido en tres niveles, desde lo más básico hasta lo más potente:

---

### 🔹 Nivel 1: Correlación Pearson o Spearman entre series

#### ➤ ¿Cómo hacerlo?

1. Convertí tus datos a una matriz `periodo x product_id`, con toneladas (`tn`) como valores.
2. Calculá la matriz de correlación entre productos.
3. Filtrá por las categorías que te interesen (`cat1 == "JABONES"` por ejemplo).

#### ✅ Ejemplo en código:

```python
# Pivot para crear la matriz periodo x producto
df_pivot = df.pivot_table(index='periodo', columns='product_id', values='tn', aggfunc='sum').fillna(0)

# Correlación entre productos (Pearson por default, podés usar Spearman)
cor_matrix = df_pivot.corr()

# Ver productos más correlacionados con un jabón específico
jabón_id = 12345
correlados = cor_matrix[jabón_id].sort_values(ascending=False)
print(correlados.head(10))
```

Esto te muestra **qué productos evolucionaron históricamente igual** al jabón dado.

---

### 🔹 Nivel 2: Correlaciones *en el tiempo* (no estáticas)

A veces la correlación entre dos productos **cambia a lo largo del tiempo**. Podés:

* Calcular rolling correlation (ventana móvil de 6 o 12 meses).
* Detectar **inversiones o rupturas en la relación**.

```python
window = 6
prod1 = df[df['product_id'] == 12345].set_index('periodo')['tn']
prod2 = df[df['product_id'] == 67890].set_index('periodo')['tn']
rolling_corr = prod1.rolling(window).corr(prod2)
rolling_corr.plot(title='Rolling Correlation entre productos')
```

---

### 🔹 Nivel 3: Detección de sustitución / moda con técnicas más avanzadas

#### ✅ Opción A: **Granger Causality**

Determina si **una serie anticipa el comportamiento de otra**. Muy útil para identificar productos "líderes" o que **predicen** tendencias.

```python
from statsmodels.tsa.stattools import grangercausalitytests

data = df_pivot[[12345, 67890]].dropna()
grangercausalitytests(data, maxlag=3)
```

#### ✅ Opción B: **Dynamic Time Warping (DTW)**

Detecta similitudes en la forma de las series aunque estén desplazadas en el tiempo (ideal para modas que se mueven).

#### ✅ Opción C: **Cross-correlation con lag**

Podés encontrar si un producto reacciona con delay a otro. Por ejemplo, si al bajar el jabón A, sube el jabón B dos meses después.

```python
from scipy.signal import correlate

serie_a = df_pivot[12345]
serie_b = df_pivot[67890]
corr = correlate(serie_a - serie_a.mean(), serie_b - serie_b.mean(), mode='full')
lags = np.arange(-len(serie_a)+1, len(serie_a))
```

---

### 🔹 Bonus: Agrupar por categoría jerárquica

Podés hacer lo mismo a nivel:

* `cat1`, `cat2`, o `cat3`, agregando toneladas y observando si una **subcategoría se canibaliza con otra**.
* O incluso usar embeddings (Word2Vec) de productos entrenados con secuencias de compras.

---

### 🎯 Conclusión

* Para **reemplazos/modas**, te conviene usar **correlación temporal + Granger + DTW**.
* Para clusters de comportamiento común, podés aplicar **clustering sobre la matriz de correlación** o usar la matriz DTW como hiciste antes.

¿Querés que te arme un pipeline completo para una de estas opciones?


In [7]:
import pandas as pd
import numpy as np
from tslearn.metrics import cdist_dtw, dtw
from tslearn.preprocessing import TimeSeriesScalerMeanVariance
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# Load the preprocessed base and aggregate it to one total per (product, month).
df = pd.read_csv("./../data/preprocessed/base.csv", sep=",")
df["periodo"] = pd.to_datetime(df["periodo"], format="%Y%m")
df = df.groupby(["product_id", "periodo"])["tn"].sum().reset_index()
df = df.sort_values(["product_id", "periodo"])

# Wide layout: one row per product, one column per month (absent months -> 0).
pivot = df.pivot(index="product_id", columns="periodo", values="tn").fillna(0)

# Standardise the series with tslearn's mean/variance scaler before comparing
# them, so the clustering is not dominated by sales volume.
scaler = TimeSeriesScalerMeanVariance()
series_scaled = scaler.fit_transform(pivot.values)

# Pairwise DTW distances between the scaled series.
dtw_matrix = cdist_dtw(series_scaled)

# KMeans over the rows of the distance matrix: each product is represented by
# its vector of DTW distances to every product.
n_clusters = 50
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
kmeans.fit(dtw_matrix)

# Cluster assignment per product.
clusters = pd.DataFrame({
    "product_id": pivot.index,
    "dtw_cluster": kmeans.labels_
})

# "dist_to_centroid" is the mean DTW distance from a product to the members of
# its own cluster (itself included, at distance 0); kmeans.cluster_centers_ is
# not involved. Member positions are computed once per cluster rather than
# once per product.
members_by_cluster = {
    label: np.flatnonzero(kmeans.labels_ == label)
    for label in np.unique(kmeans.labels_)
}
dist_to_centroid = [
    dtw_matrix[i, members_by_cluster[label]].mean()
    for i, label in enumerate(kmeans.labels_)
]
clusters["dist_to_centroid"] = dist_to_centroid

# Reference products: the 20 products with the largest total tn.
top_product_ids = df.groupby("product_id")["tn"].sum().sort_values(ascending=False).head(20).index
key_products_series = pivot.loc[top_product_ids].values

# Smallest DTW distance from each product to any reference product, computed
# on the raw (unscaled) series with a single cdist_dtw call. A reference
# product is at distance 0 from itself.
# NOTE(review): the clustering above uses scaled series while this feature
# uses raw volumes, so it largely reflects magnitude — confirm this is intended.
similarity_to_top = cdist_dtw(pivot.values, key_products_series).min(axis=1)
clusters["simil_to_top"] = similarity_to_top

# Export the per-product features and display them.
features_dtw = clusters
features_dtw.to_csv("dtw_features.csv", index=False)
print("‚úÖ Features DTW generadas y guardadas en 'dtw_features.csv'")
features_dtw




‚úÖ Features DTW generadas y guardadas en 'dtw_features.csv'


Unnamed: 0,product_id,dtw_cluster,dist_to_centroid,simil_to_top
0,20001,26,4.051282,0.000000
1,20002,18,3.813902,0.000000
2,20003,24,4.232970,0.000000
3,20004,7,4.166777,0.000000
4,20005,7,4.145783,0.000000
...,...,...,...,...
1228,21295,23,1.098818,2258.565544
1229,21296,10,0.816134,2258.565181
1230,21297,23,1.098818,2258.565720
1231,21298,10,0.816134,2258.565347


In [6]:
def dwt_features_serie(df, n_clusters=50, n_top=20, output_path="dtw_features.csv", random_state=42):
    """
    Build per-product DTW and correlation features and merge them onto ``df``.

    Despite the ``dwt`` in the name (kept for backward compatibility) the
    features rely on Dynamic Time Warping (DTW), not on a wavelet transform.

    Steps:
      1. Pivot ``df`` to one row per product and one column per period,
         filling absent (product, period) combinations with 0.
      2. Scale the series with ``TimeSeriesScalerMeanVariance`` and compute
         their pairwise DTW distance matrix.
      3. Cluster the rows of that matrix with KMeans.
      4. Per product, compute the mean DTW distance to the members of its
         cluster, the minimum DTW distance (raw series) to the top products,
         the maximum Pearson/Spearman/Kendall correlation with the top
         products, and the mean of those correlations with the other members
         of its cluster.
      5. Optionally write the per-product table to CSV and left-merge it onto
         the sorted ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Long table with columns ``product_id``, ``periodo`` and ``tn``. Each
        (product_id, periodo) pair must appear once; ``DataFrame.pivot``
        raises otherwise.
    n_clusters : int, default 50
        Requested number of KMeans clusters. Capped at the number of products,
        because KMeans cannot fit more clusters than samples.
    n_top : int, default 20
        Number of reference products, chosen by largest total ``tn``.
    output_path : str, path-like or None, default "dtw_features.csv"
        Where the per-product feature table is written. ``None`` skips the
        export (and the confirmation message).
    random_state : int, default 42
        Seed forwarded to KMeans.

    Returns
    -------
    pandas.DataFrame
        ``df`` sorted by ``product_id`` and ``periodo`` with the columns
        ``dtw_cluster``, ``dist_to_centroid``, ``simil_to_top_dtw``,
        ``max_pearson_top``, ``max_spearman_top``, ``max_kendall_top``,
        ``mean_pearson_cluster``, ``mean_spearman_cluster`` and
        ``mean_kendall_cluster`` added (constant within a product).

    Raises
    ------
    ValueError
        If ``n_top`` is smaller than 1.

    Notes
    -----
    * A reference product is compared with itself, so its
      ``simil_to_top_dtw`` is 0 and its ``max_*_top`` values are 1.
    * Correlations that are NaN (scipy returns NaN for a constant series) are
      ignored when aggregating; a feature is NaN only when no valid
      correlation is available, or when the product is alone in its cluster.
    """
    from scipy.stats import pearsonr, spearmanr, kendalltau
    from tslearn.preprocessing import TimeSeriesScalerMeanVariance
    from tslearn.metrics import cdist_dtw
    from sklearn.cluster import KMeans
    import numpy as np
    import pandas as pd

    if n_top < 1:
        raise ValueError("n_top must be at least 1")

    def _correlations(current, others):
        """Pearson, Spearman and Kendall of `current` against each row of `others` -> (len(others), 3)."""
        return np.array(
            [
                [
                    pearsonr(current, other)[0],
                    spearmanr(current, other)[0],
                    kendalltau(current, other)[0],
                ]
                for other in others
            ],
            dtype=float,
        )

    def _reduce_ignoring_nan(reducer, matrix):
        """Apply `reducer` to every column of `matrix` after dropping NaN; NaN when nothing is left."""
        reduced = []
        for column in matrix.T:
            valid = column[~np.isnan(column)]
            reduced.append(reducer(valid) if valid.size else np.nan)
        return reduced

    df = df.sort_values(["product_id", "periodo"])

    # One row per product, one column per period; absent periods count as 0.
    pivot = df.pivot(index="product_id", columns="periodo", values="tn").fillna(0)
    raw_series = pivot.to_numpy()
    n_series = raw_series.shape[0]

    # Pairwise DTW distances between the scaled series.
    series_scaled = TimeSeriesScalerMeanVariance().fit_transform(raw_series)
    dtw_matrix = cdist_dtw(series_scaled)

    # KMeans over the rows of the distance matrix. The cluster count is capped
    # so that small inputs do not make KMeans fail.
    kmeans = KMeans(n_clusters=min(n_clusters, n_series), random_state=random_state)
    kmeans.fit(dtw_matrix)
    labels = kmeans.labels_

    clusters = pd.DataFrame({
        "product_id": pivot.index,
        "dtw_cluster": labels
    })

    # Row positions of the members of each cluster, computed once.
    members_by_cluster = {label: np.flatnonzero(labels == label) for label in np.unique(labels)}

    # Mean DTW distance to the members of the product's own cluster (itself
    # included); this is what the "dist_to_centroid" column has always held.
    clusters["dist_to_centroid"] = [
        dtw_matrix[i, members_by_cluster[label]].mean()
        for i, label in enumerate(labels)
    ]

    # Reference products: largest total tn. Their raw series are extracted once
    # instead of being looked up again for every product.
    top_product_ids = df.groupby("product_id")["tn"].sum().sort_values(ascending=False).head(n_top).index
    top_series = raw_series[pivot.index.get_indexer(top_product_ids)]

    # Minimum DTW distance (raw, unscaled series) to any reference product.
    # NOTE(review): clustering uses scaled series while this uses raw volumes,
    # so the value largely reflects magnitude — confirm this is intended.
    simil_to_top_dtw = cdist_dtw(raw_series, top_series).min(axis=1)

    # Columns: pearson, spearman, kendall.
    max_top = np.full((n_series, 3), np.nan)
    mean_cluster = np.full((n_series, 3), np.nan)
    for i in range(n_series):
        current = raw_series[i]

        # Best correlation with any reference product.
        max_top[i] = _reduce_ignoring_nan(np.max, _correlations(current, top_series))

        # Average correlation with the other members of the same cluster;
        # stays NaN for a product that is alone in its cluster.
        mates = members_by_cluster[labels[i]]
        mates = mates[mates != i]
        if mates.size:
            mean_cluster[i] = _reduce_ignoring_nan(np.mean, _correlations(current, raw_series[mates]))

    clusters["simil_to_top_dtw"] = simil_to_top_dtw
    clusters["max_pearson_top"] = max_top[:, 0]
    clusters["max_spearman_top"] = max_top[:, 1]
    clusters["max_kendall_top"] = max_top[:, 2]
    clusters["mean_pearson_cluster"] = mean_cluster[:, 0]
    clusters["mean_spearman_cluster"] = mean_cluster[:, 1]
    clusters["mean_kendall_cluster"] = mean_cluster[:, 2]

    # Export the per-product table unless the caller opted out.
    features_dtw = clusters
    if output_path is not None:
        features_dtw.to_csv(output_path, index=False)
        print(f"‚úÖ Features DTW + correlaciones generadas y guardadas en '{output_path}'")

    df = df.merge(features_dtw, on="product_id", how="left")

    return df


# Load the preprocessed base, aggregate it to one total per (product, month),
# order it by product and month, and run the DTW/correlation feature pipeline.
df = (
    pd.read_csv("./../data/preprocessed/base.csv", sep=",")
    .assign(periodo=lambda frame: pd.to_datetime(frame["periodo"], format="%Y%m"))
    .groupby(["product_id", "periodo"])["tn"]
    .sum()
    .reset_index()
    .sort_values(["product_id", "periodo"])
)
dwt_features_serie(df)



‚úÖ Features DTW + correlaciones generadas y guardadas en 'dtw_features.csv'


Unnamed: 0,product_id,periodo,tn,dtw_cluster,dist_to_centroid,simil_to_top_dtw,max_pearson_top,max_spearman_top,max_kendall_top,mean_pearson_cluster,mean_spearman_cluster,mean_kendall_cluster
0,20001,2017-01-01,934.77222,26,4.051282,0.000000,1.000000,1.000000,1.000000,0.225197,0.249685,0.172559
1,20001,2017-02-01,798.01620,26,4.051282,0.000000,1.000000,1.000000,1.000000,0.225197,0.249685,0.172559
2,20001,2017-03-01,1303.35771,26,4.051282,0.000000,1.000000,1.000000,1.000000,0.225197,0.249685,0.172559
3,20001,2017-04-01,1069.96130,26,4.051282,0.000000,1.000000,1.000000,1.000000,0.225197,0.249685,0.172559
4,20001,2017-05-01,1502.20132,26,4.051282,0.000000,1.000000,1.000000,1.000000,0.225197,0.249685,0.172559
...,...,...,...,...,...,...,...,...,...,...,...,...
31238,21295,2017-01-01,0.00699,23,1.098818,2258.565544,0.102406,0.187120,0.154890,0.920446,0.717738,0.711495
31239,21296,2017-08-01,0.00651,10,0.816134,2258.565181,0.352790,0.284747,0.235702,0.477303,0.552538,0.550304
31240,21297,2017-01-01,0.00579,23,1.098818,2258.565720,0.102406,0.187120,0.154890,0.920446,0.717738,0.711495
31241,21298,2017-08-01,0.00573,10,0.816134,2258.565347,0.352790,0.284747,0.235702,0.477303,0.552538,0.550304


In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from tslearn.metrics import cdist_dtw
from tslearn.clustering import TimeSeriesKMeans

# Load the base and aggregate it to one total per (product, month).
# NOTE(review): this is an absolute path, unlike the relative paths used by
# the other cells of this notebook — confirm where base.csv lives.
df = pd.read_csv("/mnt/data/base.csv")
df["periodo"] = pd.to_datetime(df["periodo"], format="%Y%m")
df = df.groupby(["product_id", "periodo"])["tn"].sum().reset_index()
df = df.sort_values(["product_id", "periodo"])

# One row per product, one column per month (absent months -> 0).
pivot = df.pivot(index="product_id", columns="periodo", values="tn").fillna(0)

# Scale per product. StandardScaler standardises each column, and the columns
# of `pivot` are periods, so the matrix is transposed before scaling and
# transposed back afterwards; each row (product) ends up with mean 0 / std 1.
scaler = StandardScaler()
pivot_scaled = np.ascontiguousarray(scaler.fit_transform(pivot.to_numpy().T).T)

# --- DTW clustering ---
# Number of clusters chosen by hand; adjust as needed.
n_clusters = 10
model = TimeSeriesKMeans(n_clusters=n_clusters, metric="dtw", random_state=42)
cluster_labels = model.fit_predict(pivot_scaled)

# Mean absolute point-wise difference between each series and the centre of
# its cluster. tslearn stores centres as (n_clusters, n_timestamps, 1), so the
# centre is flattened first: subtracting a (n_timestamps, 1) centre from a
# (n_timestamps,) series would broadcast into a square matrix and average over
# every pair of timestamps.
distancias = [
    np.mean(np.abs(serie - model.cluster_centers_[label].ravel()))
    for serie, label in zip(pivot_scaled, cluster_labels)
]

# Per-product feature table.
df_features = pd.DataFrame({
    "product_id": pivot.index,
    "cluster_dtw": cluster_labels,
    "distancia_centroide": distancias
})

# --- Similarity to key products (5 largest by total tn) ---
productos_top = df.groupby("product_id")["tn"].sum().sort_values(ascending=False).head(5).index.tolist()
pivot_top = pivot_scaled[np.isin(pivot.index, productos_top)]

# Similarity to the closest top product: 1 / (1 + minimum DTW distance), so a
# smaller distance gives a larger value and a top product scores 1 against
# itself.
dist_matrix = cdist_dtw(pivot_scaled, pivot_top)
min_similitud = 1 / (1 + dist_matrix.min(axis=1))

df_features["similitud_productos_top"] = min_similitud


