In [1]:
import warnings

warnings.filterwarnings("ignore")

In [2]:
import pandas as pd

print('Loading data...')
sell_price = pd.read_csv('../data/sell_prices.csv')
calendar = pd.read_csv('../data/calendar.csv')
sales = pd.read_csv('../data/sales_train_evaluation.csv')
sales['id'] = sales['id'].str.replace('_evaluation', '', regex=False)
sales.set_index('id', inplace=True)

cat_cols = ['item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']
ts_cols = [col for col in sales.columns if col not in cat_cols]
ts_dict = {t: int(t[2:]) for t in ts_cols}

print('   N_unique %s: %i' % ('id', len(sales)))
for col in cat_cols:
    print('   N_unique %s: %i' % (col, sales[col].nunique()))

Loading data...
   N_unique id: 30490
   N_unique item_id: 3049
   N_unique dept_id: 7
   N_unique cat_id: 3
   N_unique store_id: 10
   N_unique state_id: 3


In [3]:
ids = ['dept_id', 'store_id']

### aggregate and plot

In [4]:
# aggregated_sales = sales.groupby(ids)[ts_cols].sum()

In [5]:
%matplotlib inline
from utils import plot_with_matplotlib_1, plot_with_plotly

# plot_with_matplotlib_1(aggregated_sales)
# plot_with_plotly(sales)

### filter and group

In [6]:
from utils import filter_sales_after_launch_and_group_sales

filtered_and_grouped_sales = filter_sales_after_launch_and_group_sales(sales, ids)

for group, sale in filtered_and_grouped_sales.items():
    print(f"Group: {group}, Sales: {sale}")
    break

30490
20879
Group: ('FOODS_1', 'CA_1'), Sales: {0: 297, 1: 284, 2: 214, 3: 175, 4: 182, 5: 191, 6: 224, 7: 263, 8: 245, 9: 176, 10: 217, 11: 156, 12: 242, 13: 327, 14: 409, 15: 446, 16: 269, 17: 169, 18: 164, 19: 167, 20: 226, 21: 301, 22: 303, 23: 228, 24: 160, 25: 156, 26: 193, 27: 238, 28: 327, 29: 264, 30: 177, 31: 276, 32: 203, 33: 216, 34: 230, 35: 355, 36: 283, 37: 184, 38: 232, 39: 204, 40: 187, 41: 252, 42: 354, 43: 304, 44: 239, 45: 186, 46: 218, 47: 188, 48: 266, 49: 306, 50: 266, 51: 239, 52: 187, 53: 149, 54: 161, 55: 207, 56: 252, 57: 218, 58: 152, 59: 155, 60: 141, 61: 142, 62: 188, 63: 206, 64: 225, 65: 161, 66: 117, 67: 164, 68: 166, 69: 183, 70: 205, 71: 191, 72: 144, 73: 116, 74: 135, 75: 132, 76: 182, 77: 217, 78: 246, 79: 141, 80: 169, 81: 157, 82: 183, 83: 358, 84: 452, 85: 186, 86: 121, 87: 130, 88: 114, 89: 125, 90: 188, 91: 205, 92: 209, 93: 167, 94: 215, 95: 163, 96: 185, 97: 235, 98: 272, 99: 211, 100: 175, 101: 162, 102: 135, 103: 167, 104: 179, 105: 274, 10

In [7]:
%matplotlib inline
from utils import plot_with_matplotlib_2

# plot_with_matplotlib_2(filtered_and_grouped_sales)

### analyze seasonality

https://otexts.com/fpp2/seasonal-strength.html

- ACF ?

def estimate_period_acf(time_series, max_lag=50):
    from statsmodels.tsa.stattools import acf
    acf_values = acf(time_series, nlags=max_lag)
    lags = np.arange(len(acf_values))
    peaks = lags[(acf_values > 0.5) & (lags > 0)] 
    if len(peaks) > 0:
        return peaks[0]
    return None

In [8]:
from utils import process_each_group

seasonality_results = []
for group in filtered_and_grouped_sales.keys():
    group_label = "_".join(group) if isinstance(filtered_and_grouped_sales.keys(), pd.MultiIndex) and isinstance(group, tuple) else group
    group_values = filtered_and_grouped_sales[group].values()

    dominant_period, seasonal_strength = process_each_group(group_label, group_values, visualize=False)

    seasonality_results.append({"Group": group_label, "Dominant Period": dominant_period, "Seasonal Strength": seasonal_strength})

seasonality_results = pd.DataFrame(seasonality_results)

1941
1641
FOODS_1_CA_1's Dominant Frequency: 0.14259597806215724 1/days
FOODS_1_CA_1's Dominant Period: 7.012820512820512 days
FOODS_1_CA_1's Seasonal Strength (0~1): 0.48035873485106584
1941
1641
FOODS_1_CA_2's Dominant Frequency: 0.14259597806215724 1/days
FOODS_1_CA_2's Dominant Period: 7.012820512820512 days
FOODS_1_CA_2's Seasonal Strength (0~1): 0.5493381719149644
1941
1641
FOODS_1_CA_3's Dominant Frequency: 0.14259597806215724 1/days
FOODS_1_CA_3's Dominant Period: 7.012820512820512 days
FOODS_1_CA_3's Seasonal Strength (0~1): 0.3914848077947135
1941
1641
FOODS_1_CA_4's Dominant Frequency: 0.14259597806215724 1/days
FOODS_1_CA_4's Dominant Period: 7.012820512820512 days
FOODS_1_CA_4's Seasonal Strength (0~1): 0.32982512973062394
1941
1641
FOODS_1_TX_1's Dominant Frequency: 0.14320536258379038 1/days
FOODS_1_TX_1's Dominant Period: 6.982978723404255 days
FOODS_1_TX_1's Seasonal Strength (0~1): 0.31800195956879995
1941
1641
FOODS_1_TX_2's Dominant Frequency: 0.14259597806215724 1/

In [9]:
seasonality_results.to_excel("seasonality_results.xlsx")