In [12]:
import pandas as pd
import numpy as np
import fastdtw as dtw
from sklearn.cluster import AgglomerativeClustering
from joblib import Parallel, delayed
from tqdm import tqdm
from tqdm.notebook import tqdm
from sklearn.decomposition import PCA
import multiprocessing

In [3]:
# Folder holding the input/output parquet files (alternative location kept for reference).
#DATOS_DIR = '~/buckets/b1/datasets/'
DATOS_DIR = '../data/'

# Read the dataset, then sanitise the column names: spaces become underscores
# and every character outside [A-Za-z0-9_] is removed.
df = pd.read_parquet(DATOS_DIR+'FE_dataset-CARLA.parquet')
df.columns = (
    df.columns
    .str.replace(' ', '_')
    .str.replace(r'[^A-Za-z0-9_]', '', regex=True)
)

# Restrict the data to index labels from '2019-01-01' through '2019-11-01'.
df = df.loc['2019-01-01':'2019-11-01']

# Non-buyer filter: keep only customers with at least one row near the end of the data.
# The index is converted to timestamps first so a date offset can be subtracted from it.
df.index = df.index.to_timestamp()

# The cut-off is three calendar months before the latest date found in the index.
ls_date = df.index.max()
three_months_prior = ls_date - pd.DateOffset(months=3)

# Rows dated on or after the cut-off, and the customers that appear in them.
last_3_months_df = df.loc[df.index >= three_months_prior]
active_clients = last_3_months_df['customer_id'].unique()

# Keep every row (whatever its date) that belongs to one of those customers.
df = df.loc[df['customer_id'].isin(active_clients)]

# Convert the index to monthly periods.
df.index = pd.PeriodIndex(df.index, freq='M')

In [4]:


# Build one time series per (product_id, customer_id) pair: rows follow df's
# index, columns are the pairs, and cells hold 'tn_2' combined with
# pivot_table's default aggregation. Periods without data for a pair become 0.
pivoted_df = (
    df.pivot_table(index=df.index, columns=['product_id', 'customer_id'], values='tn_2')
    .fillna(0)
)



In [14]:
# Project every (product_id, customer_id) series onto a single principal component.
# fit_transform receives one row per series and returns one column per component;
# transposing back leaves reduced_data with one row and one column per series.
# NOTE(review): later code slices reduced_data[:, i], which is therefore a
# length-1 vector per series -- confirm that is the intended DTW input.
pca = PCA(n_components=1)
series_as_rows = pivoted_df.T
reduced_data = pca.fit_transform(series_as_rows).T

# Compute the DTW distance for one pair of series; designed to be dispatched with joblib.
def compute_dtw_parallel(ts1, ts2, col1, col2, window=5):
    """Return the FastDTW distance between two series as a flat record.

    Parameters
    ----------
    ts1, ts2 : array-like
        The two sequences to align.
    col1, col2 : sequence
        Labels of the two series; element 0 is stored as the product id and
        element 1 as the customer id.
    window : int, default 5
        Forwarded to ``fastdtw`` as its ``radius`` argument, i.e. the size of
        the neighbourhood explored around the coarse warp path. It was
        previously accepted but ignored.

    Returns
    -------
    dict
        Keys ``product_id_1``, ``customer_id_1``, ``product_id_2``,
        ``customer_id_2`` and ``dtw_distance``.
    """
    # The notebook binds the package as ``dtw`` only, so the bare name
    # ``fastdtw`` was undefined; importing the function here makes the name
    # resolve wherever this function runs.
    from fastdtw import fastdtw

    # dist=2 selects the 2-norm as the point-wise distance; the warp path is not needed.
    distance, _path = fastdtw(ts1, ts2, radius=window, dist=2)
    return {
        'product_id_1': col1[0],
        'customer_id_1': col1[1],
        'product_id_2': col2[0],
        'customer_id_2': col2[1],
        'dtw_distance': distance
    }

# Every column of pivoted_df is one series; each unordered pair is compared once.
columns = pivoted_df.columns
n_series = len(columns)
total_combinations = n_series * (n_series - 1) // 2  # Number of unique pairs

num_cores = multiprocessing.cpu_count()  # Get the number of available CPU cores

# NOTE(review): the recorded run of this cell shows 197231 series, i.e. roughly
# 1.9e10 pairs; materialising every result in memory is unlikely to be feasible
# at that size -- consider sampling or pre-grouping before the full run.
# The progress bar tracks the outer loop only (one tick per first series).
dtw_features = Parallel(n_jobs=num_cores)(
    delayed(compute_dtw_parallel)(
        reduced_data[:, i], reduced_data[:, j], columns[i], columns[j]
    )
    for i in tqdm(range(n_series), desc="Computing DTW distances", total=n_series)
    for j in range(i + 1, n_series)
)

# Parallel returns a plain list of dicts, which has no to_parquet method.
# Build the frame that the following cell reads as `dtw_df`; the explicit
# column list keeps the schema stable even when there are no pairs.
dtw_df = pd.DataFrame(
    dtw_features,
    columns=['product_id_1', 'customer_id_1', 'product_id_2', 'customer_id_2', 'dtw_distance'],
)
dtw_df.to_parquet(DATOS_DIR+'dtw_features.parquet')

Computing DTW distances:   0%|          | 0/197231 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:

# Collect the (product_id, customer_id) label on each side of every DTW record.
# Reading the id columns directly (rather than whole rows) keeps their dtype.
left_pairs = list(
    dtw_df[['product_id_1', 'customer_id_1']].itertuples(index=False, name=None)
)
right_pairs = list(
    dtw_df[['product_id_2', 'customer_id_2']].itertuples(index=False, name=None)
)

# Sorted list of the unique pairs; a pair's position is its row/column in the matrix.
pairs = sorted(set(left_pairs) | set(right_pairs))

# Dictionary lookup replaces the linear list.index() scan done for every record.
pair_position = {pair: position for position, pair in enumerate(pairs)}

# Symmetric distance matrix; the diagonal and any pair without a record stay 0.
distance_matrix = np.zeros((len(pairs), len(pairs)))

row_idx = np.fromiter(
    (pair_position[pair] for pair in left_pairs), dtype=np.intp, count=len(left_pairs)
)
col_idx = np.fromiter(
    (pair_position[pair] for pair in right_pairs), dtype=np.intp, count=len(right_pairs)
)
pair_distances = dtw_df['dtw_distance'].to_numpy()

# Write each distance to both (i, j) and (j, i).
distance_matrix[row_idx, col_idx] = pair_distances
distance_matrix[col_idx, row_idx] = pair_distances

# Average-linkage agglomerative clustering on the precomputed DTW distance matrix.
num_clusters = 3  # Example number of clusters
try:
    # scikit-learn >= 1.2 names this parameter `metric`; `affinity` was
    # deprecated in 1.2 and removed in 1.4.
    clustering_model = AgglomerativeClustering(
        n_clusters=num_clusters, metric='precomputed', linkage='average'
    )
except TypeError:
    # Older scikit-learn releases only accept the previous keyword.
    clustering_model = AgglomerativeClustering(
        n_clusters=num_clusters, affinity='precomputed', linkage='average'
    )

# One cluster label per row of distance_matrix.
clusters = clustering_model.fit_predict(distance_matrix)


# Map each (product_id, customer_id) pair to its cluster label; `pairs` and
# `clusters` are positionally aligned.
pair_to_cluster = dict(zip(pairs, clusters))

# Label every row of the original dataset by looking up its pair. Iterating the
# two id columns avoids building a Series per row as df.apply(axis=1) does.
# A pair missing from the mapping still raises KeyError, as before.
df['cluster'] = [
    pair_to_cluster[pair]
    for pair in zip(df['product_id'], df['customer_id'])
]

# Display the original dataframe with the cluster labels
df.head()

In [None]:
# Persist the dataset with its new 'cluster' column. DATOS_DIR already ends with
# a separator, so no extra '/' is added (it used to yield '../data//FE_dataset-DTW.parquet').
df.to_parquet(DATOS_DIR+'FE_dataset-DTW.parquet', engine='pyarrow')
