In [1]:
import warnings
import pandas as pd

# Suppress every warning for the rest of the session.
# NOTE(review): this also hides pandas warnings (e.g. SettingWithCopyWarning,
# FutureWarning) raised by the cells below -- consider narrowing the filter.
warnings.filterwarnings("ignore")

In [2]:
'''
id (item_id + store_id)
item_id
dept_id: ['HOBBIES_1', 'HOBBIES_2', 'HOUSEHOLD_1', 'HOUSEHOLD_2', 'FOODS_1', 'FOODS_2', 'FOODS_3']
cat_id: ['HOBBIES', 'HOUSEHOLD', 'FOODS']
store_id: ['CA_1', 'CA_2', 'CA_3', 'CA_4', 'TX_1', 'TX_2', 'TX_3', 'WI_1', 'WI_2', 'WI_3']
state_id: ['CA', 'TX', 'WI']
'''

# Column layout: the two grouping keys followed by the day columns d_1 .. d_1941.
key_cols = ['state_id', 'item_id']
day_cols = [f'd_{n}' for n in range(1, 1941 + 1)]

# Load the sales table, keep only the keys and the day columns, then sum the
# rows that share the same (state_id, item_id) pair.
sales = (
    pd.read_csv('../data/original/sales_train_evaluation.csv')
    .loc[:, key_cols + day_cols]
    .groupby(key_cols, as_index=False)
    .sum()
)

In [4]:
# Day labels d_1 .. d_1941 that the rest of the notebook works with.
day_cols = [f'd_{n}' for n in range(1, 1941 + 1)]

calendar = pd.read_csv('../data/original/calendar.csv')

# Keep only the rows for d_1 .. d_1941. The explicit .copy() makes the filtered
# frame independent of the one read from disk, so adding a column below is a
# plain assignment rather than a write into a slice (which pandas flags with
# SettingWithCopyWarning -- silenced here by the global warning filter).
calendar = calendar[calendar['d'].isin(day_cols)].copy()

# 1 for Saturday/Sunday, 0 for any other weekday value.
calendar['is_weekend'] = calendar['weekday'].isin(['Saturday', 'Sunday']).astype('int64')

# (wm_yr_wk, d) pairs; used later when processing sell_prices.
week_and_days = calendar[['wm_yr_wk', 'd']]

In [5]:
day_cols = [f'd_{n}' for n in range(1, 1941 + 1)]

# Weekend flag (0 = weekday, 1 = weekend) pivoted so that every day label
# becomes a column, then put into d_1 .. d_1941 order.
is_weekend = (
    calendar[['d', 'is_weekend']]
    .pivot_table(values='is_weekend', columns='d')
    .reindex(columns=day_cols)
)

In [6]:
day_cols = [f'd_{n}' for n in range(1, 1941 + 1)]
event_types = ["event_type_Cultural", "event_type_National", "event_type_Religious", "event_type_Sporting"]

# Build one 0/1 flag per event type: a day is flagged when either event_type_1
# or event_type_2 carries that type.
# Comparing the columns directly (instead of one-hot encoding and adding
# hard-coded dummy columns) avoids a KeyError when a type is absent from one of
# the two columns, and gives a 0/1 result regardless of the dtype that
# pd.get_dummies produces in the installed pandas version.
# Missing values compare unequal to every type, so they yield 0.
events = pd.DataFrame({'d': calendar['d']})
for event_type in event_types:
    kind = event_type[len('event_type_'):]
    in_first = calendar['event_type_1'] == kind
    in_second = calendar['event_type_2'] == kind
    events[event_type] = (in_first | in_second).astype(int)

# Pivot so that each day label becomes a column and each event type a row,
# then put the columns into d_1 .. d_1941 order.
events = events.pivot_table(values=event_types, columns='d')
events = events.reindex(columns=day_cols)

# Write one CSV per event type (the row label is not written).
for event_type in event_types:
    event = events.loc[[event_type]]
    event.to_csv(f'../data/preprocessed/{event_type}.csv', index=False)

In [12]:
key_cols = ['state_id', 'item_id']
day_cols = [f'd_{n}' for n in range(1, 1941 + 1)]

sell_prices = pd.read_csv('../data/original/sell_prices.csv')

# Expand weekly prices to daily prices.
# Only weeks that occur in week_and_days can survive the left merge below, so
# pre-filter on exactly those weeks instead of a hard-coded week number.
sell_prices = sell_prices[sell_prices['wm_yr_wk'].isin(week_and_days['wm_yr_wk'])]
sell_prices = pd.merge(week_and_days, sell_prices, on=['wm_yr_wk'], how='left')

# Derive the state from the first two characters of store_id.
# .str[:2] passes missing store_id values through as NaN (a left merge can
# produce them for weeks without prices) instead of raising a TypeError.
sell_prices['state_id'] = sell_prices['store_id'].str[:2]

# Average the price over the stores of a state, pivot the day labels into
# columns, and order the columns as state_id, item_id, d_1 .. d_1941.
sell_prices = (
    sell_prices
    .groupby(['state_id', 'item_id', 'd'])['sell_price']
    .mean()
    .reset_index()
    .pivot_table(values='sell_price', index=key_cols, columns='d')
    .reset_index()
    .reindex(key_cols + day_cols, axis=1)
)