In [1]:
import pandas as pd
import numpy as np
from PyEMD import CEEMDAN

In [18]:
def TStoSazEMD(dataframe, col_name, freq, epsilon=0.05, seed=None):
    '''
    Decompose one column of a time series with CEEMDAN and group the
    resulting IMFs, by position, into named components.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Input data. Its index is reused as the index (named 'ds') of the
        returned dataframe.
    col_name : str
        Name of the column of `dataframe` to decompose.
    freq : str
        Time frequency of the input data: 'D' and '30min' available.
    epsilon : float, default 0.05
        Forwarded to the CEEMDAN constructor as `epsilon`.
    seed : int or None, default None
        When not None, forwarded to `CEEMDAN.noise_seed` before decomposing,
        so the noise-assisted decomposition can be made reproducible.
        None leaves the noise generator untouched (previous behaviour).

    Returns
    -------
    pandas.DataFrame or int
        -1 when `freq` is not supported (the offending value and a message
        are printed). Otherwise a dataframe indexed by 'ds' with columns:
        - col_name: the original series
        - imf_trend: the last IMF
        - imf_annual: sum of the second-to-last and third-to-last IMFs
        - imf_weekly: 'D': sum of the first two IMFs;
          '30min': sum of the two IMFs starting at position len(imfs)//2
        - imf_daily ('30min' only): sum of the first three IMFs
        - imf_remain: original series minus the grouped components above
        - imf01 ... imfNN: all N CEEMDAN imfs

    Raises
    ------
    IndexError
        If CEEMDAN returns fewer than 3 IMFs, which is the minimum the
        positional grouping can index.
    '''

    # check for valid freq (sentinel return kept for existing callers)
    if freq not in ('D', '30min'):
        print(freq)
        print("TStoSazEMD is cannot deal with the defined frequency!")
        return -1

    s = dataframe[col_name].to_numpy()

    ceemdan = CEEMDAN(epsilon=epsilon)
    if seed is not None:
        ceemdan.noise_seed(seed)
    imfs = ceemdan.ceemdan(s)

    # The grouping below indexes imfs[-3], imfs[2] and imfs[n//2 + 1]; all of
    # these need at least 3 IMFs. Fail with a readable message instead of an
    # opaque indexing error (same exception type as the raw indexing raised).
    n_imfs = len(imfs)
    if n_imfs < 3:
        raise IndexError(
            "TStoSazEMD needs at least 3 IMFs to build the grouped "
            "components, but CEEMDAN returned %d" % n_imfs
        )

    # create new dataframe indexed by the original index, named 'ds'
    new_df = pd.DataFrame({'ds': dataframe.index}).set_index('ds')
    new_df[col_name] = s

    # Components shared by both frequencies.
    # NOTE(review): the groups are chosen purely by IMF position; with few
    # IMFs the same IMF can land in more than one group - not checked here.
    imf_trend = imfs[-1]
    imf_annual = imfs[-2] + imfs[-3]

    if freq == 'D':
        imf_weekly = imfs[0] + imfs[1]
        imf_remain = s - imf_annual - imf_weekly - imf_trend
        grouped = [
            ('imf_trend', imf_trend),
            ('imf_annual', imf_annual),
            ('imf_weekly', imf_weekly),
            ('imf_remain', imf_remain),
        ]
    else:  # '30min' (freq was validated above)
        mid = n_imfs // 2
        imf_weekly = imfs[mid] + imfs[mid + 1]
        # not 100% sure about imfs[0], could only be noise; it is mostly half-daily
        imf_daily = imfs[1] + imfs[2] + imfs[0]
        imf_remain = s - imf_annual - imf_weekly - imf_daily - imf_trend
        grouped = [
            ('imf_trend', imf_trend),
            ('imf_annual', imf_annual),
            ('imf_weekly', imf_weekly),
            ('imf_daily', imf_daily),
            ('imf_remain', imf_remain),
        ]

    # add grouped components to the dataframe
    for name, values in grouped:
        new_df[name] = values

    # add every individual IMF as its own column: imf01, imf02, ...
    for i, imf in enumerate(imfs, start=1):
        new_df['imf%02d' % i] = imf

    return new_df

In [19]:
# Load the CSV; the 'ds' column is used as the index and pandas is asked to parse it as dates.
df = pd.read_csv(
    'linear_interp_store4969.csv',
    sep=',',
    index_col='ds',
    parse_dates=True,
)
# Alternative input kept for reference (disabled; points at a local absolute path):
#df = pd.read_csv('C:\\Users\\BrunoMendes\\Desktop\\bolsa\\git\\SchTrmt\\Treatment\\new_stores\\store_0002.csv', header=0, infer_datetime_format=True, parse_dates=['ds'], index_col=['ds'])

# Optional daily sample (disabled):
#df = df[['sales','n_clients']]#.resample('D').sum()
df

Unnamed: 0_level_0,sales,n_clients,imputed
ds,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-06-12 09:00:00,3.00,2.0,yes
2017-06-12 09:30:00,3.00,2.0,yes
2017-06-12 10:00:00,3.00,2.0,yes
2017-06-12 10:30:00,3.00,2.0,yes
2017-06-12 11:00:00,3.00,2.0,yes
...,...,...,...
2020-10-18 19:00:00,103.44,22.0,no
2020-10-18 19:30:00,162.26,35.0,no
2020-10-18 20:00:00,138.26,24.0,no
2020-10-18 20:30:00,53.57,16.0,no


In [24]:
# Decompose the 'sales' column using the '30min' grouping and display the result.
imf_data = TStoSazEMD(dataframe=df, col_name='sales', freq='30min')
imf_data

Unnamed: 0_level_0,sales,imf_trend,imf_annual,imf_weekly,imf_daily,imf_remain,imf01,imf02,imf03,imf04,...,imf06,imf07,imf08,imf09,imf10,imf11,imf12,imf13,imf14,imf15
ds,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2017-06-12 09:00:00,3.00,108.335121,-34.309854,-37.940202,5.655688,-38.740753,0.075105,-0.002918,5.583501,-20.956894,...,8.915567,-5.815568,-15.804342,-22.135860,-5.423453,-0.727433,1.711279,-0.816399,-33.493455,108.335121
2017-06-12 09:30:00,3.00,108.335562,-34.312258,-38.250272,7.339452,-40.112484,-0.870547,0.805531,7.404468,-21.650847,...,9.501019,-6.219053,-16.094076,-22.156196,-5.414799,-0.724401,1.715410,-0.817906,-33.494352,108.335562
2017-06-12 10:00:00,3.00,108.336004,-34.314661,-38.537412,8.892953,-41.376884,-0.059783,0.021959,8.930777,-22.444917,...,9.868739,-6.610047,-16.367040,-22.170372,-5.405524,-0.721361,1.719544,-0.819411,-33.495250,108.336004
2017-06-12 10:30:00,3.00,108.336445,-34.317062,-38.801289,10.343671,-42.561766,0.172746,-0.130089,10.301014,-23.397522,...,10.006392,-6.987727,-16.622955,-22.178333,-5.395626,-0.718312,1.723681,-0.820914,-33.496148,108.336445
2017-06-12 11:00:00,3.00,108.336887,-34.319462,-39.041569,11.691830,-43.667686,0.324481,-0.223484,11.590832,-24.533937,...,9.906161,-7.351099,-16.861546,-22.180023,-5.385108,-0.715253,1.727821,-0.822415,-33.497047,108.336887
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-10-18 19:00:00,103.44,148.453046,-74.592616,2.447868,32.973963,-5.842261,-37.460157,20.838071,49.596050,-45.689631,...,6.346890,10.758683,2.565697,-0.117830,4.628492,-10.891217,28.611746,-71.235521,-3.357095,148.453046
2020-10-18 19:30:00,162.26,148.454118,-74.555550,2.420617,95.628198,-9.687383,30.716744,16.903877,48.007577,-49.555854,...,6.350104,10.872168,2.546788,-0.126171,4.628069,-10.878990,28.560078,-71.197776,-3.357774,148.454118
2020-10-18 20:00:00,138.26,148.455190,-74.518470,2.393174,72.975504,-11.045399,32.776067,1.936221,38.263216,-50.931709,...,6.319869,10.978752,2.527717,-0.134543,4.627599,-10.866739,28.508399,-71.160018,-3.358451,148.455190
2020-10-18 20:30:00,53.57,148.456262,-74.481376,2.365541,-12.625701,-10.144726,-23.118826,-12.284903,22.778028,-50.051163,...,6.257594,11.078205,2.508484,-0.142943,4.627084,-10.854463,28.456709,-71.122248,-3.359129,148.456262


In [25]:
#imf_data['imputed'] = df['imputed']

In [26]:
#imf_data.to_csv('store4969_sales_imf_data30min.csv')
#imf_data.to_csv('imf_data.csv')