In [157]:
import gc
import warnings

import numpy as np
import pandas as pd

# Directory that holds the competition CSV files, relative to this notebook.
data_path = '../data/competitive-data-science-predict-future-sales/'

# Blanket filter: every warning is silenced for the rest of the session.
warnings.filterwarnings(action='ignore')

# Read the six CSV files in order; the names on the left line up one-to-one
# with the file names listed on the right.
sales_train, shops, items, item_categories, test, submission = (
    pd.read_csv(data_path + csv_name)
    for csv_name in (
        'sales_train.csv',
        'shops.csv',
        'items.csv',
        'item_categories.csv',
        'test.csv',
        'sample_submission.csv',
    )
)

In [158]:
# Give every frame Korean column names. Each mapping lists only the columns
# that are renamed; any other column keeps its original name.
sales_train = sales_train.rename(
    {
        'date': '날짜',
        'date_block_num': '월ID',
        'shop_id': '상점ID',
        'item_id': '상품ID',
        'item_price': '판매가',
        'item_cnt_day': '판매량',
    },
    axis='columns',
)

shops = shops.rename(
    {
        'shop_name': '상점명',
        'shop_id': '상점ID',
    },
    axis='columns',
)

items = items.rename(
    {
        'item_name': '상품명',
        'item_id': '상품ID',
        'item_category_id': '상품분류ID',
    },
    axis='columns',
)

item_categories = item_categories.rename(
    {
        'item_category_name': '상품분류명',
        'item_category_id': '상품분류ID',
    },
    axis='columns',
)

test = test.rename(
    {
        'shop_id': '상점ID',
        'item_id': '상품ID',
    },
    axis='columns',
)

## Baseline: LightGBM

#### 데이터 다운캐스팅

In [159]:
def downcast(df, verbose=True):
    """Shrink the numeric columns of ``df`` to the smallest dtype that fits.

    The frame is modified IN PLACE and also returned, so both
    ``downcast(df)`` and ``df = downcast(df)`` leave ``df`` compressed.

    Per-column rules:
      * ``bool``            -> ``int8``
      * integer             -> smallest integer dtype that holds the values
      * float, all integral -> smallest integer dtype that holds the values
      * float, otherwise    -> smallest float dtype (``float32`` at minimum)
      * anything else (object/string, datetime, category, ...) is untouched

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to compress; its columns are reassigned in place.
    verbose : bool, default True
        When true, print the percentage of memory saved.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    start_mem = df.memory_usage().sum() / 1024 ** 2
    for col in df.columns:
        column = df[col]
        dtype = column.dtype
        if dtype.name == 'bool':
            df[col] = column.astype('int8')
        elif pd.api.types.is_integer_dtype(dtype):
            df[col] = pd.to_numeric(column, downcast='integer')
        elif pd.api.types.is_float_dtype(dtype):
            # NaN never equals itself, so a float column that contains
            # missing values fails this check and stays a float.
            is_integral = (column.round() == column).all()
            df[col] = pd.to_numeric(
                column, downcast='integer' if is_integral else 'float')
        # Any other dtype is left alone: rounding or numeric conversion is
        # not meaningful for it.

    end_mem = df.memory_usage().sum() / 1024 ** 2

    if verbose:
        # Guard the ratio so a frame reporting zero bytes prints 0.0%
        # instead of dividing by zero.
        saved_pct = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print("{:.1f}% 압축됨".format(saved_pct))

    return df

In [160]:
# Compress every loaded frame. downcast() rewrites the columns of the frame
# it is given and returns that same frame, so the result need not be kept.
all_df = [sales_train, shops, items, item_categories, test]
for df in all_df:
    downcast(df)

54.2% 압축됨
38.5% 압축됨
54.2% 압축됨
39.8% 압축됨
70.8% 압축됨


#### 원형 데이터 확보하기

In [161]:
from itertools import product

idx_features = ['월ID', '상점ID', '상품ID']

# For each month, pair every shop seen in that month with every item seen in
# that month, so combinations without sales still get a row.
monthly_grids = []
for i in sales_train['월ID'].unique():
    in_month = sales_train['월ID'] == i
    all_shop = sales_train.loc[in_month, '상점ID'].unique()
    all_item = sales_train.loc[in_month, '상품ID'].unique()
    monthly_grids.append(np.array(list(product([i], all_shop, all_item))))

# Stack the per-month blocks into one frame keyed by (month, shop, item).
train = pd.DataFrame(np.vstack(monthly_grids), columns=idx_features)

# Only the stacked frame is needed from here on.
del monthly_grids, in_month

#### 타깃 관련 값 추가하기

In [162]:
# Add the target: sales summed per (month, shop, item).
group = (
    sales_train
    .groupby(idx_features)
    .agg({'판매량': 'sum'})
    .reset_index()
    .rename(columns={'판매량': '월간 판매량'})
)

# Left join keeps every grid row; combinations with no sales rows get NaN.
train = train.merge(group, how='left', on=idx_features)

In [163]:
# Display the grid after the left merge; NaN in the target column marks
# (month, shop, item) combinations that had no matching sales rows.
train

Unnamed: 0,월ID,상점ID,상품ID,월간 판매량
0,0,59,22154,1.0
1,0,59,2552,
2,0,59,2554,
3,0,59,2555,
4,0,59,2564,
...,...,...,...,...
10913845,33,21,7635,
10913846,33,21,7638,
10913847,33,21,7640,
10913848,33,21,7632,


In [164]:
# Append the test rows.
# They are tagged with month 34, the id this notebook later uses to select
# the rows to predict (months below 33 train, month 33 validates).
test['월ID'] = 34

# `keys` is intentionally not passed: with ignore_index=True the keys are
# discarded anyway, and a keys list whose length differs from the number of
# concatenated objects is deprecated in newer pandas.
all_data = pd.concat([train, test.drop('ID', axis=1)],
                     ignore_index=True)

In [165]:
# Replace every NaN in the frame with 0, then preview the first rows.
all_data = all_data.fillna(value=0)
all_data.head(n=5)

Unnamed: 0,월ID,상점ID,상품ID,월간 판매량
0,0,59,22154,1.0
1,0,59,2552,0.0
2,0,59,2554,0.0
3,0,59,2555,0.0
4,0,59,2564,0.0


#### 값 추가하기

In [166]:
# Compare the column sets to confirm the join key before merging.
print(shops.columns, all_data.columns, sep='\n')

Index(['상점명', '상점ID'], dtype='object')
Index(['월ID', '상점ID', '상품ID', '월간 판매량'], dtype='object')


In [167]:
# Attach shop, item and item-category attributes. Left joins keep every row
# of all_data even when a key has no match in a lookup table.
all_data = (
    all_data
    .merge(shops, how='left', on='상점ID')
    .merge(items, how='left', on='상품ID')
    .merge(item_categories, how='left', on='상품분류ID')
)

# Compress the dtypes of the merged frame.
all_data = downcast(all_data)

31.1% 압축됨


In [168]:
# Size of the merged frame as (rows, columns).
all_data.shape

(11128050, 8)

In [169]:
# Remove the free-text name columns; the ID columns remain in the frame.
all_data = all_data.drop(columns=['상점명', '상품명', '상품분류명'])

In [170]:
# The lookup tables were already merged into all_data, so drop them and
# trigger a garbage-collection pass. gc.collect() is the last expression,
# so its return value is shown as the cell output.
del shops, items, item_categories
gc.collect()

231

#### 최종 데이터 분류

In [171]:
# Row selectors by month id: below 33 trains, 33 validates, 34 is the test set.
in_train = all_data['월ID'] < 33
in_valid = all_data['월ID'] == 33
in_test = all_data['월ID'] == 34

# Features: every column except the target.
X_train = all_data.loc[in_train].drop(columns=['월간 판매량'])
X_valid = all_data.loc[in_valid].drop(columns=['월간 판매량'])
X_test = all_data.loc[in_test].drop(columns=['월간 판매량'])

# Targets, clipped to [0, 20]: the minimum and maximum set by the
# competition rules.
y_train = all_data.loc[in_train, '월간 판매량'].clip(0, 20)
y_valid = all_data.loc[in_valid, '월간 판매량'].clip(0, 20)

# The selectors are only needed for the split above.
del in_train, in_valid, in_test


In [172]:
# all_data has been split into the X_* / y_* objects above, so release it.
# The trailing semicolon suppresses the gc.collect() return value.
del all_data
gc.collect();

### 모델 훈련

In [173]:
import lightgbm as lgb

# Training parameters; no objective is set, so LightGBM uses its default.
params = {
    'metric': 'rmse',
    'num_leaves': 255,
    'learning_rate': 0.01,
    'force_col_wise': True,
    'random_state': 10
}

# Columns to be treated as categorical rather than ordinal.
cat_features = ['상점ID', '상품분류ID']

# Declare the categorical columns on the Dataset itself: passing
# `categorical_feature` to lgb.train() is deprecated in LightGBM 4.x.
dtrain = lgb.Dataset(X_train, label=y_train, categorical_feature=cat_features)
# Tie the validation set to the training set so it reuses the training
# set's feature binning and categorical settings.
dvalid = lgb.Dataset(X_valid, label=y_valid, reference=dtrain)

lgb_model = lgb.train(params=params,
                      train_set=dtrain,
                      num_boost_round=500,
                      valid_sets=(dtrain, dvalid))

# Predictions are clipped to the same [0, 20] range as the training target.
preds = lgb_model.predict(X_test).clip(0, 20)

submission['item_cnt_month'] = preds
submission.to_csv(data_path + 'submission.csv', index=False)

[LightGBM] [Info] Total Bins 426
[LightGBM] [Info] Number of data points in the train set: 10675678, number of used features: 4
[LightGBM] [Info] Start training from score 0.299125


In [174]:
# Release the training/validation/test splits, the LightGBM datasets and the
# fitted model, then trigger a garbage-collection pass. gc.collect() is the
# last expression, so its return value is shown as the cell output.
del X_train, y_train, X_valid, y_valid, X_test, lgb_model, dvalid, dtrain
gc.collect()

22