In [1]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from statsmodels.tsa.seasonal import seasonal_decompose
from tqdm.notebook import tqdm

In [2]:
# Data location. Alternative location kept for reference:
#   DATOS_DIR = '~/buckets/b1/datasets/'
DATOS_DIR = '../data/'

# Read the input dataset
dataset_path = DATOS_DIR + 'FE_02_dataset.parquet'
df = pd.read_parquet(dataset_path)

# Normalize column names: spaces become underscores, then every character
# outside [A-Za-z0-9_] is removed.
sanitized_columns = df.columns.str.replace(' ', '_')
sanitized_columns = sanitized_columns.str.replace(r'[^A-Za-z0-9_]', '', regex=True)
df.columns = sanitized_columns

# Filter data: label-based slice on the index, from 2018-01-01 to 2019-11-01.
# NOTE(review): this assumes the index is a sorted, date-like `periodo`
# index -- confirm against the parquet file.
df = df.loc['2018-01-01':'2019-11-01']


In [3]:
# Reshape to wide format: one row per `periodo` and one column per
# (product_id, customer_id) pair holding `tn` (pivot_table's default
# aggregation applies); combinations with no data are filled with 0.
pivot_df = df.pivot_table(
    values='tn',
    index='periodo',
    columns=['product_id', 'customer_id'],
    fill_value=0,
)



In [4]:
# Collect one plain-dict record per product-customer series. Building the
# frame once from these records avoids creating and concatenating one
# single-row DataFrame per column.
feature_list = []

# Seasonal period used for the decomposition, and the minimum number of
# observations it needs (two complete cycles of that period).
seasonal_period = 12
min_observations = 2 * seasonal_period  # 24

# Every column of the pivot shares the same row index, so whether the series
# are long enough to be decomposed can be decided once, outside the loop.
n_periods = len(pivot_df)
can_decompose = n_periods >= min_observations
if not can_decompose:
    # Make the fallback visible: otherwise 'trend' and 'seasonality' silently
    # end up as a constant 0 for every series after the fillna below.
    print(
        f"Warning: only {n_periods} periods available, fewer than the "
        f"{min_observations} required for seasonal decomposition; "
        "'trend' and 'seasonality' will be 0 for every series."
    )

# Calculate features for each product-customer time series with progress bar.
# `items()` yields (column label, Series) pairs directly, which avoids a
# label lookup on the column index for every iteration.
for col, series in tqdm(pivot_df.items(), total=pivot_df.shape[1], desc="Calculating features"):
    # Series with fewer than `min_observations` non-null values get their
    # gaps filled by linear interpolation before any statistic is computed.
    if series.count() < min_observations:
        series = series.interpolate(method='linear')

    # Default to NaN; only overwritten when the decomposition succeeds.
    trend_val = np.nan
    seasonality_val = np.nan

    if can_decompose:
        try:
            decomposition = seasonal_decompose(series, model='additive', period=seasonal_period)
            trend_val = decomposition.trend.mean()
            # NOTE(review): the additive seasonal component is expected to be
            # centred around zero, so its mean is likely ~0 for every series
            # and may carry little information -- verify, and consider a
            # spread measure instead.
            seasonality_val = decomposition.seasonal.mean()
        except ValueError:
            # Decomposition rejected this series; keep trend and seasonality
            # as NaN for it.
            trend_val = np.nan
            seasonality_val = np.nan

    # `col` is the (product_id, customer_id) column label of the pivot.
    feature_list.append({
        'product_id': col[0],
        'customer_id': col[1],
        'mean': series.mean(),
        'std': series.std(),
        'trend': trend_val,
        'seasonality': seasonality_val,
    })

# Build the feature table in a single step, one row per series.
feature_columns = ['mean', 'std', 'trend', 'seasonality']
features = pd.DataFrame(feature_list, columns=['product_id', 'customer_id'] + feature_columns)

# Fill NaN values (e.g. failed decompositions) with 0
features = features.fillna(0)

# Scale the features
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features[feature_columns])


Calculating features:   0%|          | 0/237102 [00:00<?, ?it/s]

In [5]:

# Fit a 4-component PCA on the standardized feature matrix and keep the
# projected coordinates of every series.
pca = PCA(n_components=4)
pca_features = pca.fit_transform(X=scaled_features)


In [6]:

# Store the four principal-component scores next to the raw features
features[['pca1', 'pca2', 'pca3', 'pca4']] = pca_features

# Clustering (example with KMeans)
from sklearn.cluster import KMeans

# KMeans starts from a random initialization; fixing `random_state` makes the
# cluster labels written to the output dataset reproducible across runs.
kmeans = KMeans(n_clusters=3, random_state=42)
clusters = kmeans.fit_predict(pca_features)

# Add cluster information to the dataframe
features['cluster'] = clusters

# Prepare the merge of the features back into the original dataframe:
# reset its index so the index becomes a regular column again.
df_reset = df.reset_index()


In [7]:

# Attach the per-series features (and cluster label) to every row of the
# original data by joining on the series key, then restore `periodo` as
# the index.
merge_keys = ['product_id', 'customer_id']
result_df = df_reset.merge(features, on=merge_keys).set_index('periodo')

In [8]:
# Persist the enriched dataset. DATOS_DIR already ends with '/', so the file
# name is appended directly (same convention as the read step) to avoid a
# doubled path separator.
result_df.to_parquet(DATOS_DIR + 'FE_dataset-PCA-Decompose.parquet', engine='pyarrow')
