In [None]:
import numpy as np
import pandas as pd
from fastdtw import fastdtw
from scipy.sparse import lil_matrix
from scipy.spatial.distance import euclidean
from sklearn.cluster import KMeans
from sklearn.neighbors import KNeighborsClassifier
from tqdm import tqdm


In [None]:
# Alternative data location (currently disabled):
#DATOS_DIR = '~/buckets/b1/datasets/'
DATOS_DIR = '../data/'

# Load the input dataset from the data directory.
dataset_path = DATOS_DIR + 'FE_dataset-CARLA.parquet'
df = pd.read_parquet(dataset_path)

# Normalise the column names in two passes: spaces become underscores, then
# every character outside [A-Za-z0-9_] is removed.
underscored_columns = df.columns.str.replace(' ', '_')
df.columns = underscored_columns.str.replace(r'[^A-Za-z0-9_]', '', regex=True)



In [None]:


# Reshape to wide form: one column per (product_id, customer_id) pair and one
# row per distinct value of df's index, holding 'tn_2' (combined with
# pivot_table's default aggregation when a pair repeats at an index value).
pivoted_df = (
    df.pivot_table(values='tn_2', index=df.index, columns=['product_id', 'customer_id'])
    .fillna(0)  # a pair with no record at a given index value counts as 0
)



In [None]:
def compute_dtw(ts1, ts2, dist=2, radius=1):
    """Return the approximate DTW distance between two time series.

    Thin wrapper around ``fastdtw`` that keeps only the distance and discards
    the warping path it also returns.

    Parameters
    ----------
    ts1, ts2 : array-like
        The two series to compare.
    dist : int or callable, default 2
        Forwarded unchanged to ``fastdtw`` as its ``dist`` argument. The
        default of 2 is the value this notebook has always used.
    radius : int, default 1
        Forwarded unchanged to ``fastdtw`` as its ``radius`` argument. The
        default matches ``fastdtw``'s own default, so existing two-argument
        calls behave exactly as before.

    Returns
    -------
    The distance value reported by ``fastdtw``.
    """
    # The second element of the result is the warping path, which is not needed here.
    distance, _ = fastdtw(ts1, ts2, radius=radius, dist=dist)
    return distance



# Pairwise DTW distances between every two (product_id, customer_id) series.
# The number of pairs grows quadratically with the number of series, so this
# cell can run for a long time; the outer loop reports progress through tqdm.
columns = pivoted_df.columns

# Extract the values once as a contiguous (n_series, n_periods) array so that
# no column is re-sliced from the DataFrame for every pair it takes part in.
series_values = np.ascontiguousarray(pivoted_df.to_numpy().T)

# One record per unordered pair (i < j); a series is never compared with itself.
dtw_features = []
for i in tqdm(range(len(columns)), desc='DTW pairs'):
    product_1, customer_1 = columns[i]
    ts1 = series_values[i]
    for j in range(i + 1, len(columns)):
        product_2, customer_2 = columns[j]
        dtw_features.append({
            'product_id_1': product_1,
            'customer_id_1': customer_1,
            'product_id_2': product_2,
            'customer_id_2': customer_2,
            'dtw_distance': compute_dtw(ts1, series_values[j]),
        })

# Convert the pair records to a DataFrame and persist them next to the input data.
dtw_df = pd.DataFrame(dtw_features)

dtw_df.to_parquet(DATOS_DIR+'dtw_features.parquet')

In [None]:
# Cluster the series with k-means, using each series' row of pairwise DTW
# distances as its feature vector.

# Series identifiers, in the same order as the columns of the pivoted frame.
keys = pivoted_df.columns

# Rebuild the square distance matrix from the long table of pairs in dtw_df.
# Each pair is stored once there, so it is written to both (i, j) and (j, i);
# the diagonal stays 0 (a series compared with itself).
first_pos = keys.get_indexer(
    pd.MultiIndex.from_frame(dtw_df[['product_id_1', 'customer_id_1']])
)
second_pos = keys.get_indexer(
    pd.MultiIndex.from_frame(dtw_df[['product_id_2', 'customer_id_2']])
)
# get_indexer returns -1 for unknown keys, which would silently index the last row.
assert (first_pos >= 0).all() and (second_pos >= 0).all(), \
    "dtw_df references a series that is not a column of pivoted_df"

pair_distances = dtw_df['dtw_distance'].to_numpy()
dtw_distances = np.zeros((len(keys), len(keys)))
dtw_distances[first_pos, second_pos] = pair_distances
dtw_distances[second_pos, first_pos] = pair_distances

n_clusters = 3  # Choose the number of clusters
# n_init is set explicitly because its default differs between scikit-learn
# versions; fixing it (together with random_state) keeps labels reproducible.
kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=0)
kmeans.fit(dtw_distances)

# One row per series: its distances to every series (dtw_feature_0 ... dtw_feature_{n-1})
# plus the cluster label assigned by k-means.
dtw_features_df = pd.DataFrame(
    dtw_distances,
    index=keys,
    columns=[f'dtw_feature_{i}' for i in range(len(keys))],
)
dtw_features_df['predicted_class'] = kmeans.labels_


In [None]:

# Attach the cluster label and every DTW feature column to each row of df,
# matching rows on their (product_id, customer_id) pair.
# A single merge replaces the previous one-merge-per-feature loop, which copied
# the whole frame once per column. The column order is unchanged:
# 'predicted_class' first, then the dtw_feature_* columns.
feature_cols = [
    col for col in dtw_features_df.columns
    if str(col).startswith('dtw_feature_')
]
df_merged = df.reset_index().merge(
    dtw_features_df[['predicted_class'] + feature_cols],
    left_on=['product_id', 'customer_id'],
    right_index=True,
)

# Put 'periodo' back as the index (assumes df's index is named 'periodo', so
# that reset_index() above exposed it as a column).
df_merged = df_merged.set_index('periodo')

In [None]:
# Persist the enriched dataset. DATOS_DIR already ends with '/', so no extra
# separator is added (the other paths in this notebook are built the same way).
df_merged.to_parquet(DATOS_DIR + 'FE_dataset-DTW.parquet', engine='pyarrow')

# Display results
print(df_merged.head())

# ace_tools is not among this notebook's regular dependencies (it appears to be
# a sandbox-only helper -- TODO confirm). Use it when present; otherwise the
# preview printed above is the result display and the cell still completes.
try:
    import ace_tools as tools
except ImportError:
    pass
else:
    tools.display_dataframe_to_user(name="Time Series with DTW Features and Predicted Classes", dataframe=df_merged)