In [5]:
import pandas as pd
import numpy as np
import fastdtw as dtw
from sklearn.cluster import AgglomerativeClustering
from joblib import Parallel, delayed
from tqdm import tqdm
from tqdm.notebook import tqdm
from sklearn.decomposition import PCA
import multiprocessing
import polars as pl
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense

In [6]:
#DATOS_DIR = '~/buckets/b1/datasets/'
DATOS_DIR = '../data/'

# Leer datos
df_orig = pd.read_parquet(DATOS_DIR+'FE_07_dataset.parquet') 
df_orig.columns = df_orig.columns.str.replace(' ', '_').str.replace(r'[^A-Za-z0-9_]', '', regex=True)



In [7]:

df = df_orig.reset_index()
df = df[df['product_id'] < 20002]

df['periodo'] = pd.to_datetime(df['periodo'].astype(str))
df.drop(columns=['descripcion'], inplace=True)
df['periodo'] = pd.to_datetime(df['periodo'].astype(str))
df = pl.from_pandas(df)


In [8]:
# Ensure the selected columns are valid
selected_columns = ['periodo', 'customer_id', 'product_id', 'tn', 'tn_2', 'diff_tn_tn2']  


# Select and sort the dataframe
df = df.select(selected_columns).sort(['product_id', 'customer_id', 'periodo'])


In [11]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
import polars as pl
from tqdm import tqdm
from joblib import Parallel, delayed


def create_lstm_features(customer_data):
    if customer_data.shape[0] < 2:
        return [], []

    # Prepare the data for LSTM
    series = customer_data.drop(['tn_2', 'periodo']).to_numpy().astype(np.float32)
    target = customer_data['tn_2'].to_numpy().astype(np.float32)
    periodos = customer_data['periodo'].to_numpy()
    
    n_features = series.shape[1]
    
    X, y, time_indexes = [], [], []
    for i in range(len(series) - 1):
        seq_x, seq_y = series[i:i + 1], target[i + 1]
        X.append(seq_x)
        y.append(seq_y)
        time_indexes.append(periodos[i + 1])
    
    X, y = np.array(X), np.array(y)
    X = X.reshape((X.shape[0], X.shape[1], X.shape[2]))
    
    # Define LSTM model
    model = Sequential()
    model.add(LSTM(10, activation='relu', input_shape=(X.shape[1], X.shape[2])))
    model.add(Dense(1))
    model.compile(optimizer='adam', loss='mse')
    
    # Fit the model
    model.fit(X, y, epochs=10,batch_size=32, verbose=1)
    
    # Generate LSTM features
    lstm_features = model.predict(X, verbose=0)
    
    return lstm_features, time_indexes

def process_customer(product_id, customer_id):
    df_p = df.filter((pl.col('product_id') == product_id) & (pl.col('customer_id') == customer_id))
    customer_data_count = df_p.shape[0]
    if customer_data_count < 2:
        return []
    lstm_features, time_indexes = create_lstm_features(df_p)
    return [(product_id, customer_id, feature[0], time_index) for feature, time_index in zip(lstm_features, time_indexes)]

# Apply the function to all customers
customer_ids = df.select(pl.col('customer_id')).unique().to_series().to_list()
product_ids = df.select(pl.col('product_id')).unique().to_series().to_list()
lstm_features_list = []
for proid in tqdm(product_ids, desc="Processing products"):
    results = Parallel(n_jobs=-1)(
        delayed(process_customer)(proid, cid) for cid in customer_ids
    )
    for result in results:
        if result:
            lstm_features_list.extend(result)

# Convert list to DataFrame
lstm_features_df = pl.DataFrame({
    'product_id': [item[0] for item in lstm_features_list],
    'customer_id': [item[1] for item in lstm_features_list],
    'lstm_feature': [item[2] for item in lstm_features_list],
    'periodo': [item[3] for item in lstm_features_list]
})


Processing products: 100%|██████████| 1/1 [01:49<00:00, 109.17s/it]


In [12]:

# Join the new columns to the original dataframe
df = df.join(lstm_features_df, on=['product_id', 'customer_id', 'periodo'], how='left')



ComputeError: datatypes of join keys don't match - `product_id`: i32 on left does not match `product_id`: null on right

In [None]:
df = df.to_pandas()
df.set_index('periodo', inplace=True)
df.index = df.index.to_period('M')

df.to_parquet(DATOS_DIR+'/FE_10_dataset-LSTM.parquet', engine='pyarrow')  
