### Purpose:

1) Merge the raw data files into a single feature table

2) Split the sales table into per-store subsets to avoid running out of memory

In [2]:
import pandas as pd

# load the four raw input tables
# (the calendar, price and event files are read again inside the helper functions below)
df_train, df_items, df_cal, df_events = map(pd.read_csv, (
    '../../data/raw/sales_train.csv',
    '../../data/raw/items_weekly_sell_prices.csv',
    '../../data/raw/calendar.csv',
    '../../data/raw/calendar_events.csv',
))

In [3]:
## distinct store identifiers, in order of first appearance
stores = pd.unique(df_train['store_id'])

In [4]:
## write one interim CSV per store so each store can be processed on its own
for store in stores:
    df = df_train[df_train['store_id'].eq(store)]
    df.to_csv(f'../../data/interim/sales_train_{store}.csv', index=False)

In [116]:
## drop the full sales table; from here on only the per-store files are used
del df_train

In [38]:
## develop functions to clean df_train
def convert_train(df):
    """Reshape a wide sales table into long format.

    Every column other than ``id`` and the five identifier columns (the
    ``d_*`` day columns) is unpivoted into a ``d`` / ``items_sold`` pair.
    The ``id`` column is left out of the result.

    The input frame is not modified, so the function can be called again on
    the same raw frame. (Popping ``id`` from the argument used to raise
    ``KeyError`` on a second call.)

    Parameters
    ----------
    df : pandas.DataFrame
        Wide table containing ``item_id``, ``dept_id``, ``cat_id``,
        ``store_id``, ``state_id`` and one column per day. An ``id`` column
        is ignored if present.

    Returns
    -------
    pandas.DataFrame
        Long table with the five identifier columns, ``d`` and ``items_sold``.

    Raises
    ------
    KeyError
        If one of the identifier columns is missing.
    """
    id_cols = ['item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']
    skipped = set(id_cols) | {'id'}
    # Listing the value columns explicitly leaves `id` out without copying
    # or mutating the (large) wide frame.
    day_cols = [col for col in df.columns if col not in skipped]

    return pd.melt(df, id_vars=id_cols, value_vars=day_cols,
                   var_name='d', value_name='items_sold')

def merge_cal(df, df_cal=None):
    """Attach the calendar columns to each sales row and drop the ``d`` key.

    Parameters
    ----------
    df : pandas.DataFrame
        Long sales table with a ``d`` column.
    df_cal : pandas.DataFrame, optional
        Calendar table keyed by ``d``. When omitted it is read from
        ``../../data/raw/calendar.csv``; pass it in to avoid re-reading the
        file for every store.

    Returns
    -------
    pandas.DataFrame
        New frame with the calendar columns appended and ``d`` removed.

    Raises
    ------
    pandas.errors.MergeError
        If ``d`` is not unique in the calendar; duplicate keys would
        silently multiply sales rows.
    """
    if df_cal is None:
        df_cal = pd.read_csv('../../data/raw/calendar.csv')

    merged = pd.merge(df, df_cal, on='d', how='left', validate='many_to_one')
    return merged.drop(columns='d')

def merge_item_prices(df, df_items=None):
    """Convert unit sales into revenue using the weekly sell prices.

    Prices are joined on ``store_id``, ``item_id`` and ``wm_yr_wk``. Missing
    unit counts and missing prices are treated as 0, so rows without a price
    contribute no revenue. The ``wm_yr_wk``, ``items_sold`` and ``sell_price``
    columns are removed from the result.

    Parameters
    ----------
    df : pandas.DataFrame
        Sales table with ``store_id``, ``item_id``, ``wm_yr_wk`` and
        ``items_sold`` columns.
    df_items : pandas.DataFrame, optional
        Price table with the three key columns and ``sell_price``. When
        omitted it is read from
        ``../../data/raw/items_weekly_sell_prices.csv``; pass it in to avoid
        re-reading the file for every store.

    Returns
    -------
    pandas.DataFrame
        New frame with a ``revenue`` column appended.

    Raises
    ------
    pandas.errors.MergeError
        If the price table has duplicate keys, which would inflate revenue.
    """
    if df_items is None:
        df_items = pd.read_csv('../../data/raw/items_weekly_sell_prices.csv')

    price_keys = ['store_id', 'item_id', 'wm_yr_wk']
    merged = pd.merge(df, df_items, on=price_keys, how='left', validate='many_to_one')

    revenue = merged['items_sold'].fillna(0) * merged['sell_price'].fillna(0)
    merged = merged.drop(columns=['wm_yr_wk', 'items_sold', 'sell_price'])
    return merged.assign(revenue=revenue)

def merge_events(df, df_events=None):
    """Add per-date event counts, one column per event type.

    The event list is pivoted to one row per ``date`` holding the number of
    events of each ``event_type``, then left-joined onto ``df``. Dates
    without any event get 0 in every event column.

    Only the event columns are filled with 0. Missing values in other columns
    are left as they are rather than being silently turned into 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a ``date`` column in the same format as the event file.
    df_events : pandas.DataFrame, optional
        Raw event list with ``date``, ``event_name`` and ``event_type``
        columns. When omitted it is read from
        ``../../data/raw/calendar_events.csv``.

    Returns
    -------
    pandas.DataFrame
        New frame with ``event_cultural``, ``event_national``,
        ``event_religious`` and ``event_sport`` columns (plus a column for any
        other event type present in the event list).
    """
    if df_events is None:
        df_events = pd.read_csv('../../data/raw/calendar_events.csv')

    events = pd.pivot_table(df_events, index='date', values='event_name',
                            columns='event_type', aggfunc='count', fill_value=0)
    events = events.rename_axis(columns=None).reset_index()
    ### rename cols for better readability
    events = events.rename(columns={"Cultural": "event_cultural", "National": "event_national",
                                    "Religious": "event_religious", "Sporting": "event_sport"})

    # Keep the output schema stable even when an event type never occurs in
    # the event list; later steps group by all four columns.
    for col in ('event_cultural', 'event_national', 'event_religious', 'event_sport'):
        if col not in events.columns:
            events[col] = 0
    event_cols = [col for col in events.columns if col != 'date']

    merged = pd.merge(df, events, on='date', how='left')
    merged[event_cols] = merged[event_cols].fillna(0)
    return merged

def separate_dates(df):
    """Replace the ``date`` column with ``day_of_year``, ``month`` and ``year``.

    The ``date`` values are parsed with ``pd.to_datetime``. The frame is
    modified in place and the same object is returned.
    """
    stamps = pd.to_datetime(df.pop('date')).dt
    df['day_of_year'] = stamps.day_of_year
    df['month'] = stamps.month
    df['year'] = stamps.year
    return df

def sum_revenue(df):
    """Sum the remaining columns per event/date combination.

    The five identifier columns (including ``store_id``) are dropped first, so
    the result carries no store or item information. Rows are then grouped by
    the four event columns plus ``day_of_year``, ``month`` and ``year``, and
    every other column is summed.
    """
    id_cols = ['store_id', 'item_id', 'dept_id', 'cat_id', 'state_id']
    group_keys = ['event_cultural', 'event_national', 'event_religious', 'event_sport',
                  'day_of_year', 'month', 'year']

    totals = df.drop(columns=id_cols).groupby(group_keys).sum()
    return totals.rename_axis(columns=None).reset_index()


In [28]:
## load one store's interim file (CA_1) to run the helper functions on
df_train = pd.read_csv('../../data/interim/sales_train_CA_1.csv')

In [29]:
## preview the wide per-store table (one d_* column per day)
df_train

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d_1,d_2,d_3,d_4,...,d_1532,d_1533,d_1534,d_1535,d_1536,d_1537,d_1538,d_1539,d_1540,d_1541
0,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,1,1,1,0,1,0,1,0,0,1
1,HOBBIES_1_002_CA_1_evaluation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,HOBBIES_1_003_CA_1_evaluation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
3,HOBBIES_1_004_CA_1_evaluation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,8,2,0,8,2,3,1,1,3,8
4,HOBBIES_1_005_CA_1_evaluation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,2,0,1,3,2,1,1,2,2,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3044,FOODS_3_823_CA_1_evaluation,FOODS_3_823,FOODS_3,FOODS,CA_1,CA,0,0,0,0,...,1,0,0,2,0,0,1,0,0,3
3045,FOODS_3_824_CA_1_evaluation,FOODS_3_824,FOODS_3,FOODS,CA_1,CA,1,0,5,0,...,0,0,0,0,0,0,0,0,0,0
3046,FOODS_3_825_CA_1_evaluation,FOODS_3_825,FOODS_3,FOODS,CA_1,CA,0,0,0,0,...,0,4,1,0,1,1,1,1,0,2
3047,FOODS_3_826_CA_1_evaluation,FOODS_3_826,FOODS_3,FOODS,CA_1,CA,0,0,0,0,...,3,2,0,2,0,0,1,4,2,0


In [30]:
## run every preparation step except the final aggregation on the loaded sample
df_train = (
    df_train
    .pipe(convert_train)
    .pipe(merge_cal)
    .pipe(merge_item_prices)
    .pipe(merge_events)
    .pipe(separate_dates)
)

In [31]:
## collapse the item-level rows into summed revenue per event/date combination
## NOTE(review): the recorded output of the following cells still shows a store_id column,
## which the sum_revenue defined above drops — the outputs appear to come from an earlier
## version of the function; re-run from a fresh kernel to confirm.
df_train = sum_revenue(df_train)

In [32]:
## inspect the aggregated result for the single-store sample
df_train

Unnamed: 0,store_id,event_cultural,event_national,event_religious,event_sport,day_of_year,month,year,revenue
0,CA_1,0.0,0.0,0.0,0.0,2,1,2012,11013.95
1,CA_1,0.0,0.0,0.0,0.0,2,1,2013,10151.68
2,CA_1,0.0,0.0,0.0,0.0,2,1,2014,12371.62
3,CA_1,0.0,0.0,0.0,0.0,2,1,2015,15226.02
4,CA_1,0.0,0.0,0.0,0.0,3,1,2012,9352.61
...,...,...,...,...,...,...,...,...,...
1536,CA_1,1.0,0.0,0.0,0.0,305,10,2012,6332.10
1537,CA_1,1.0,0.0,0.0,1.0,166,6,2014,14698.71
1538,CA_1,1.0,0.0,1.0,0.0,110,4,2014,13308.95
1539,CA_1,1.0,0.0,1.0,0.0,114,4,2011,8946.23


In [33]:
## check row count, dtypes and null counts of the aggregated sample
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1541 entries, 0 to 1540
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   store_id         1541 non-null   object 
 1   event_cultural   1541 non-null   float64
 2   event_national   1541 non-null   float64
 3   event_religious  1541 non-null   float64
 4   event_sport      1541 non-null   float64
 5   day_of_year      1541 non-null   int32  
 6   month            1541 non-null   int32  
 7   year             1541 non-null   int32  
 8   revenue          1541 non-null   float64
dtypes: float64(5), int32(3), object(1)
memory usage: 90.4+ KB


In [47]:
def pre_engineering(df):
    """Run the whole preparation pipeline on one wide sales table.

    Applies, in order: ``convert_train``, ``merge_cal``, ``merge_item_prices``,
    ``merge_events``, ``separate_dates`` and ``sum_revenue``, feeding each
    step's result into the next, and returns the final frame.
    """
    steps = (
        convert_train,
        merge_cal,
        merge_item_prices,
        merge_events,
        separate_dates,
        sum_revenue,
    )
    for step in steps:
        df = step(df)
    return df


In [35]:
## NOTE(review): the next cell assigns df_train from scratch, so this reset looks
## redundant — confirm and remove.
df_train = pd.DataFrame()

In [48]:
import pandas as pd
from glob import glob
## run the full pipeline on every per-store interim file and stack the results
## NOTE(review): sum_revenue drops store_id, so the stacked table holds one row per store
## for each event/date combination with nothing to tell the stores apart — confirm that
## downstream code aggregates these rows (or keep store_id) before relying on it.
store_frames = []

## sorted() keeps the row order reproducible; glob() returns files in arbitrary order
for f in sorted(glob('../../data/interim/sales_train_*.csv')):
    df = pd.read_csv(f)
    store_frames.append(pre_engineering(df))
    del df      ## release the wide per-store frame before loading the next file

## concatenate once instead of re-copying df_train on every iteration
df_train = pd.concat(store_frames, ignore_index=True) if store_frames else pd.DataFrame()
del store_frames
        

In [49]:
## inspect the combined table built from all interim files
df_train

Unnamed: 0,event_cultural,event_national,event_religious,event_sport,day_of_year,month,year,revenue
0,0.0,0.0,0.0,0.0,2,1,2012,11013.95
1,0.0,0.0,0.0,0.0,2,1,2013,10151.68
2,0.0,0.0,0.0,0.0,2,1,2014,12371.62
3,0.0,0.0,0.0,0.0,2,1,2015,15226.02
4,0.0,0.0,0.0,0.0,3,1,2012,9352.61
...,...,...,...,...,...,...,...,...
15405,1.0,0.0,0.0,0.0,305,10,2012,6638.20
15406,1.0,0.0,0.0,1.0,166,6,2014,9737.29
15407,1.0,0.0,1.0,0.0,110,4,2014,7459.14
15408,1.0,0.0,1.0,0.0,114,4,2011,6690.13


In [50]:
## check row count, dtypes and null counts of the combined table
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15410 entries, 0 to 15409
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   event_cultural   15410 non-null  float64
 1   event_national   15410 non-null  float64
 2   event_religious  15410 non-null  float64
 3   event_sport      15410 non-null  float64
 4   day_of_year      15410 non-null  int32  
 5   month            15410 non-null  int32  
 6   year             15410 non-null  int32  
 7   revenue          15410 non-null  float64
dtypes: float64(5), int32(3)
memory usage: 782.7 KB


In [51]:
## write the combined table to the processed-data folder
df_train.to_parquet('../../data/processed/df_forecast.parquet')