In [1]:
import pandas as pd
import numpy as np
import scipy
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Grouping columns for each of the 12 aggregation levels, ordered from the
# overall total (no grouping columns) down to individual item-store series.
# The entry at index i describes level i + 1.
levels = [
    # Level 1: Total
    [],
    # Level 2: State
    ['state_id'],
    # Level 3: Store
    ['store_id'],
    # Level 4: Category
    ['cat_id'],
    # Level 5: Department
    ['dept_id'],
    # Level 6: State-Category
    ['state_id', 'cat_id'],
    # Level 7: State-Department
    ['state_id', 'dept_id'],
    # Level 8: Store-Category
    ['store_id', 'cat_id'],
    # Level 9: Store-Department
    ['store_id', 'dept_id'],
    # Level 10: Item
    ['item_id'],
    # Level 11: Item-State
    ['item_id', 'state_id'],
    # Level 12: Individual
    ['item_id', 'store_id'],
]

# Names of the scipy.stats continuous distributions tried by default.
# Stored as a tuple so the default argument below cannot be mutated by a call.
_DEFAULT_DISTRIBUTIONS = (
    'weibull_min',
    'loggamma',
    'pareto',
    'exponnorm',
    'tukeylambda',
    'genlogistic',
    'pearson3',
    'truncexpon',
    'levy',
    't',
    'rice',
    'halfcauchy',
    'halfnorm',
    'norm',
    'gumbel_l',
    'halflogistic',
    'burr',
    'lognorm',
    'beta',
    'nct',
    'f',
    'gumbel_r',
    'laplace',
    'johnsonsu',
    'fatiguelife',
    'truncnorm',
    'exponweib',
    'wald',
    'chi2',
    'levy_l',
    'vonmises_line',
    'invgauss',
    'skewnorm',
    'powernorm',
    'powerlognorm',
    'gamma',
    'cauchy',
    'genexpon',
    'genhalflogistic',
    'loglaplace',
    'vonmises',
    'genextreme',
    'gengamma',
    'gennorm',
    'genpareto',
)

# Column order of the frame returned by fit_distributions.
_RESULT_COLUMNS = ['level', 'group', 'distribution', 'parameters', 'p_value', 'significant', 'aic', 'bic']


def fit_distributions(data, key, distributions=_DEFAULT_DISTRIBUTIONS):
    """Fit each candidate distribution to ``data`` and rank the fits.

    For every name in ``distributions`` the matching ``scipy.stats``
    distribution is fitted with its ``fit`` method, then scored with a
    Kolmogorov-Smirnov test against the fitted CDF and with AIC / BIC computed
    from the fitted log-likelihood.

    Args:
        data: One-dimensional sample (e.g. a pandas Series or NumPy array).
        key: Identifier of the form ``'level_<n>'`` or ``'level_<n>_<group>'``.
            The first two ``'_'``-separated tokens become the ``level`` column;
            the remaining tokens, re-joined with ``'_'``, become ``group``
            (``'Total'`` when there are none).
        distributions: Iterable of ``scipy.stats`` distribution names to try.

    Returns:
        A DataFrame with one row per distribution and the columns ``level``,
        ``group``, ``distribution``, ``parameters`` (the fitted parameter tuple
        rendered as a string), ``p_value`` (KS test), ``significant``
        (``p_value > 0.01``), ``aic`` and ``bic``.  Rows are sorted with
        ``significant`` rows first, then by ascending ``aic`` and ``bic``.

    Raises:
        AttributeError: If a name in ``distributions`` is not an attribute of
            ``scipy.stats``.
        Exception: Any error raised by SciPy while fitting or evaluating a
            distribution propagates unchanged.

    NOTE: the KS test is run against parameters estimated from the same
    sample, so the p-values are optimistic and best used for ranking only.
    """
    # Import the submodule explicitly: a bare ``import scipy`` does not
    # guarantee that ``scipy.stats`` is loaded on every SciPy version.
    from scipy import stats

    # The level / group labels depend only on ``key``; compute them once.
    key_parts = key.split('_')
    level = '_'.join(key_parts[:2])
    group = '_'.join(key_parts[2:]) if len(key_parts) > 2 else 'Total'
    n_samples = len(data)

    # Collect plain dict rows and build the frame once at the end.
    # ``DataFrame.append`` was removed in pandas 2.0 and copied the whole
    # frame on every call.
    rows = []
    for dist_name in distributions:
        dist = getattr(stats, dist_name)
        params = dist.fit(data)

        fitted_dist = dist(*params)
        _, ks_p_value = stats.kstest(data, fitted_dist.cdf)

        loglik = np.sum(fitted_dist.logpdf(data))
        n_params = len(params)

        rows.append({
            'level': level,
            'group': group,
            'distribution': dist_name,
            # Plain floats keep the text identical across NumPy versions
            # (NumPy 2 would otherwise render each value as ``np.float64(...)``).
            'parameters': str(tuple(float(p) for p in params)),
            'p_value': ks_p_value,
            'significant': ks_p_value > 0.01,
            'aic': 2 * n_params - 2 * loglik,
            'bic': n_params * np.log(n_samples) - 2 * loglik,
        })

    results = pd.DataFrame(rows, columns=_RESULT_COLUMNS)
    return results.sort_values(['significant', 'aic', 'bic'], ascending=[False, True, True])

# def plot_distribution_fit(data, dist_name, params, save_path, level=None, group=None):
#     plt.figure(figsize=(15, 8))
    
#     plt.hist(data, bins='auto', density=True, alpha=0.7, color='skyblue', label='data')

#     dist = getattr(scipy.stats, dist_name)
#     x = np.linspace(min(data), max(data), 100)
#     y = dist.pdf(x, *params)
#     plt.plot(x, y, 'r-', label=f'{dist_name}')

#     if level is not None and group is not None:
#         title = f'Level {level} {group}'
#     elif level is not None:
#         title = f'Level {level}'
        
#     plt.title(title)
#     plt.xlabel('Value')
#     plt.ylabel('Density')
#     plt.legend(loc='upper right')
#     plt.savefig(save_path)
#     plt.close()

# Driver: for every aggregation level, load its pre-aggregated CSV, fit the
# candidate distributions to the 'sales_sum' column (once for the whole frame
# at level 1, once per group otherwise) and write each ranking to its own
# Excel workbook.
# NOTE: plotting of the best fit for each series is currently disabled.
for level_idx, group_cols in enumerate(levels):
    level_no = level_idx + 1
    print(f"\nProcessing Level {level_no}")
    df = pd.read_csv(f'../data/preprocessed/agg_df_level_{level_no}.csv')

    if group_cols:  # Level 2~12: one fit and one workbook per group
        for group_name, group_data in df.groupby(group_cols):
            # Multi-column groupings yield tuple keys; flatten them into one label.
            if isinstance(group_name, tuple):
                group_key = '_'.join(map(str, group_name))
            else:
                group_key = str(group_name)

            dist_results = fit_distributions(
                group_data['sales_sum'],
                f'level_{level_no}_{group_key}',
            )
            dist_results.to_excel(
                f'../result/distribution_fitting/distribution_fit_level_{level_no}_{group_key}.xlsx',
                index=False,
            )

    else:  # Level 1: the whole frame is a single series
        data = df['sales_sum']
        dist_results = fit_distributions(data, f'level_{level_no}')
        dist_results.to_excel(
            f'../result/distribution_fitting/distribution_fit_level_{level_no}.xlsx',
            index=False,
        )


Processing Level 1

Processing Level 2

Processing Level 3

Processing Level 4

Processing Level 5

Processing Level 6

Processing Level 7

Processing Level 8

Processing Level 9

Processing Level 10

Processing Level 11
