# 0. 개요

- 시계열 데이터

# 1. 라이브러리 로드 및 데이터 불러오기

## 1.1 라이브러리 로드 및 메모리 감소 함수 정의

In [None]:
import pandas as pd
import numpy as np 
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor
from sklearn import metrics
from sklearn.model_selection import TimeSeriesSplit

import matplotlib.pyplot as plt
import seaborn as sns

import time
import datetime
import os

pd.set_option('display.max_columns', None)
# pd.set_option('display.max_rows', None)
# pd.set_option('display.max_colwidth', None)

def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` to the narrowest dtype that holds their range.

    Columns whose dtype is one of int16/int32/int64/float16/float32/float64 are
    converted, based on their observed min and max, to the smallest integer type
    (int8..int64) or float type (float16, float32, else float64) that contains
    both bounds. The bounds check is inclusive, so a column whose extreme value
    sits exactly on a type limit (e.g. max == 127) still gets that type.
    All other columns are left untouched.

    Note: float columns may end up as float16, which stores far fewer
    significant digits than float64; values are rounded accordingly.

    Args:
        df: DataFrame to shrink. It is modified in place.
        verbose: when True, print the final memory usage and the reduction.

    Returns:
        The same DataFrame object that was passed in.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # Candidates are ordered narrowest first; stop at the first that fits.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Nothing narrower fits (or min/max are NaN/inf): use float64.
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the percentage against a zero starting size.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

def write_record(features, params):
    """Append a timestamped record of the features and model parameters to the log file.

    The record is appended to ``record model and features.txt`` in the current
    working directory: a blank line, the current timestamp, the feature names
    separated by two spaces (a line break is inserted before every 5th name),
    one ``(key, value)`` tuple per line for ``params``, and a dashed separator.

    Args:
        features: iterable of feature-name strings.
        params: mapping of parameter names to values.
    """
    # The context manager closes the file even if one of the writes raises.
    with open("record model and features.txt", 'a') as record:
        record.write("\n")
        record.write(str(datetime.datetime.now()) + "\n")

        for position, feature in enumerate(features, start=1):
            if position % 5 == 0:
                record.write("\n")
            record.write(feature + "  ")
        record.write("\n")

        for item in params.items():
            record.write(str(item) + "\n")

        record.write('--------------------------------\n')

## 1.2 데이터 불러오기 및 pd.melt를 활용해 데이터 정렬

prepare training and test data.
- 2011-01-29 ~ 2016-04-24 : d_1    ~ d_1913
- 2016-04-25 ~ 2016-05-22 : d_1914 ~ d_1941 (public)
- 2016-05-23 ~ 2016-06-19 : d_1942 ~ d_1969 (private)

In [None]:
# Wide -> long: one row per (series id, day label) with the value in `target`.
train = pd.read_csv('inputs/sales_train_validation.csv')
train = pd.melt(train, id_vars=['id','item_id','dept_id','cat_id','store_id','state_id'], var_name='d', value_name='target')

test = pd.read_csv('inputs/sample_submission.csv')
# Rows from position 30490 onwards are set aside; only the first 30490 are reshaped below.
test2 = test[30490:]

test = test[:30490]
test = pd.melt(test, id_vars=['id'], var_name='d', value_name='target')
# Rename the labels F1..F28 to d_1914..d_1941 with one mapping on the `d`
# column instead of 28 separate replace passes over the whole frame.
test['d'] = test['d'].replace({f'F{i}': f'd_{1913+i}' for i in range(1, 29)})

# Rebuild the hierarchy columns from the six underscore-separated parts of `id`.
test[['cat_id', 'dept_id', 'item_id', 'state_id', 'store_id', 'tmp']] = test['id'].str.split('_', expand=True)
del test['tmp']
test['store_id'] = test['state_id'] + '_' + test['store_id']
test['dept_id'] = test['cat_id'] + '_' + test['dept_id']
test['item_id'] = test['dept_id'] + '_' + test['item_id']

# Same column order as train.
test = test[train.columns]

calendar = pd.read_csv('inputs/calendar.csv')

sell_prices = pd.read_csv('inputs/sell_prices.csv')

In [None]:
test.tail()

# 2. 데이터 탐색

## 2.1 train

In [None]:
train.head(2)

In [None]:
# 아웃라이어를 제거해서 boxplot 더 잘보이게끔
train200 = train[train['target'] < 10]

### 2.1.1 target

In [None]:
pd.set_option('display.max_rows', 419)
pd.DataFrame(train['target'].value_counts())

In [None]:
plt.figure(figsize=(10,7))
display(
    train[train['target'] == 763],
    train[train['id'] == 'FOODS_3_090_CA_3_validation'],
    sns.distplot(train[train['id'] == 'FOODS_3_090_CA_3_validation']['target'])    
)

In [None]:
plt.figure(figsize=(10,7))
display(
    train[train['target'] == 370],
    train[train['id'] == 'FOODS_3_318_CA_3_validation'],
    sns.distplot(train[train['id'] == 'FOODS_3_318_CA_3_validation']['target'])    
)

In [None]:
train[train['target'] == 763]

In [None]:
# 시간 오래걸림

plt.figure(figsize=(10,7))
# sns.distplot(train['target'])

### 2.1.2 id

In [None]:
train.groupby('id')['target'].mean()

### 2.1.3 item_id

In [None]:
train.groupby('item_id')['target'].mean()

In [None]:
train['item_id'].value_counts()

### 2.1.4 dept_id

In [None]:
train.groupby('dept_id')['target'].mean()

In [None]:
train['dept_id'].value_counts()

In [None]:
plt.figure(figsize=(10,7))
# Take both axes from train200 (the target < 10 subset) so every x value belongs
# to the same row as its y value, and pass them by keyword.
sns.boxplot(x='dept_id', y='target', data=train200)

### 2.1.5 cat_id

In [None]:
train.groupby('cat_id')['target'].mean()

In [None]:
train['cat_id'].value_counts()

### 2.1.6 store_id

In [None]:
train.groupby('store_id')['target'].mean()

In [None]:
train['store_id'].value_counts()

store_id 를 기준으로 10가지 모델을 만들어봐도 좋을 듯 싶다.  
우선 검증을 해야함. 각각이 많이 다른지부터

### 2.1.7 state_id

In [None]:
train.groupby('state_id')['target'].mean()

In [None]:
train['state_id'].value_counts()

### 2.1.8 d

In [None]:
train.groupby('d')['target'].mean()

## 2.2 test

In [None]:
test.head(2)

## 2.3 calendar

- 대부분 date 데이터고, event 및 snap(정부보조금 적용되는 날)에 집중해보자.
- event_name_2가 너무 적다. 우선은 빼고 모델 만들어 볼 것임.
- 이벤트 유무(is_event) 컬럼 만들 예정

In [None]:
calendar.head()

In [None]:
calendar = calendar.drop(['event_name_2', 'event_type_2'], axis=1)
calendar['is_event'] = calendar['event_name_1'].notna().astype('int8')
del calendar['wday']
del calendar['wm_yr_wk'] # 모든 주차에 인덱스 붙여놓음. 282개

In [None]:
# Parse the date strings once. pd.to_datetime replaces the unit-less
# astype('datetime64') cast, and isocalendar().week replaces .dt.week;
# neither of the old forms is accepted by pandas 2.x.
calendar_dates = pd.to_datetime(calendar['date'])
calendar['day'] = calendar_dates.dt.day
calendar['week'] = calendar_dates.dt.isocalendar().week.astype('int64')


In [None]:
calendar.head()

In [None]:
train = train.merge(calendar, how='left')
test = test.merge(calendar, how='left')

In [None]:
train.head()

In [None]:
# Parse each frame's date column once and reuse it for both features.
# pd.to_datetime / isocalendar().week replace astype('datetime64') / .dt.week,
# which pandas 2.x no longer accepts.
train_dates = pd.to_datetime(train['date'])
test_dates = pd.to_datetime(test['date'])

train['day'] = train_dates.dt.day
test['day'] = test_dates.dt.day

train['week'] = train_dates.dt.isocalendar().week.astype('int64')
test['week'] = test_dates.dt.isocalendar().week.astype('int64')

## 2.4 sell_prices

In [None]:
sell_prices.info()

In [None]:
sell_prices.nunique()

In [None]:
for i in sell_prices['wm_yr_wk'].unique():
    print(i)

In [None]:
sell_prices.groupby('wm_yr_wk')['sell_price'].mean().index

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Mean sell price per wm_yr_wk, computed once and plotted against its position.
weekly_mean_price = sell_prices.groupby('wm_yr_wk')['sell_price'].mean()

plt.figure(figsize=(10,7))
# x and y are passed by keyword, as current seaborn requires.
sns.lineplot(x=np.arange(len(weekly_mean_price)), y=weekly_mean_price.to_numpy())

# 3. 피쳐 엔지니어링

## 3.1 train, test

prepare training and test data.
- 2011-01-29 ~ 2016-04-24 : d_1    ~ d_1913
- 2016-04-25 ~ 2016-05-22 : d_1914 ~ d_1941 (public)
- 2016-05-23 ~ 2016-06-19 : d_1942 ~ d_1969 (private)

In [None]:
import pandas as pd
import numpy as np 
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor
from sklearn import metrics
from sklearn.model_selection import TimeSeriesSplit, KFold

import matplotlib.pyplot as plt
import seaborn as sns

import time
import datetime
import os

pd.set_option('display.max_columns', None)
# pd.set_option('display.max_rows', None)
# pd.set_option('display.max_colwidth', None)

def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` to the narrowest dtype that holds their range.

    Columns whose dtype is one of int16/int32/int64/float16/float32/float64 are
    converted, based on their observed min and max, to the smallest integer type
    (int8..int64) or float type (float16, float32, else float64) that contains
    both bounds. The bounds check is inclusive, so a column whose extreme value
    sits exactly on a type limit (e.g. max == 127) still gets that type.
    All other columns are left untouched.

    Note: float columns may end up as float16, which stores far fewer
    significant digits than float64; values are rounded accordingly.

    Args:
        df: DataFrame to shrink. It is modified in place.
        verbose: when True, print the final memory usage and the reduction.

    Returns:
        The same DataFrame object that was passed in.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # Candidates are ordered narrowest first; stop at the first that fits.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Nothing narrower fits (or min/max are NaN/inf): use float64.
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the percentage against a zero starting size.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

def write_record(features, params):
    """Append a timestamped record of the features and model parameters to the log file.

    The record is appended to ``record model and features.txt`` in the current
    working directory: a blank line, the current timestamp, the feature names
    separated by two spaces (a line break is inserted before every 5th name),
    one ``(key, value)`` tuple per line for ``params``, and a dashed separator.

    Args:
        features: iterable of feature-name strings.
        params: mapping of parameter names to values.
    """
    # The context manager closes the file even if one of the writes raises.
    with open("record model and features.txt", 'a') as record:
        record.write("\n")
        record.write(str(datetime.datetime.now()) + "\n")

        for position, feature in enumerate(features, start=1):
            if position % 5 == 0:
                record.write("\n")
            record.write(feature + "  ")
        record.write("\n")

        for item in params.items():
            record.write(str(item) + "\n")

        record.write('--------------------------------\n')

In [None]:
train = pd.read_csv('inputs/sales_train_validation.csv')
train = pd.melt(train, id_vars=['id','item_id','dept_id','cat_id','store_id','state_id'], var_name='d', value_name='sales')
train = reduce_mem_usage(train)

In [None]:
test = pd.read_csv('inputs/sample_submission.csv')
# Only the first 30490 rows of the sample submission are reshaped here.
test = test[:30490]
test = pd.melt(test, id_vars=['id'], var_name='d', value_name='sales')
# Rename the labels F1..F28 to d_1914..d_1941 with one mapping on the `d`
# column instead of 28 separate replace passes over the whole frame.
test['d'] = test['d'].replace({f'F{i}': f'd_{1913+i}' for i in range(1, 29)})

# Rebuild the hierarchy columns from the six underscore-separated parts of `id`.
test[['cat_id', 'dept_id', 'item_id', 'state_id', 'store_id', 'tmp']] = test['id'].str.split('_', expand=True)
del test['tmp']
test['store_id'] = test['state_id'] + '_' + test['store_id']
test['dept_id'] = test['cat_id'] + '_' + test['dept_id']
test['item_id'] = test['dept_id'] + '_' + test['item_id']

# Same column order as train, then shrink the numeric dtypes.
test = test[train.columns]
test = reduce_mem_usage(test)

## 3.2 calendar (date 데이터)

In [None]:
calendar = pd.read_csv('inputs/calendar.csv')

# Only the first event columns are kept; is_event flags rows that have an event name.
calendar = calendar.drop(['event_name_2', 'event_type_2'], axis=1)
calendar['is_event'] = calendar['event_name_1'].notna().astype('int8')
del calendar['wday']  # same information as the weekday column.

# Parse the date strings once. pd.to_datetime replaces the unit-less
# astype('datetime64') cast, and isocalendar().week replaces .dt.week;
# neither of the old forms is accepted by pandas 2.x.
calendar_dates = pd.to_datetime(calendar['date'])
calendar['day'] = calendar_dates.dt.day
calendar['week'] = calendar_dates.dt.isocalendar().week.astype('int64')

In [None]:
train = train.merge(calendar, how='left')
test = test.merge(calendar, how='left')

## 3.3 sell_prices

In [None]:
sell_prices = pd.read_csv('inputs/sell_prices.csv')

In [None]:
train = train.merge(sell_prices, on = ['store_id', 'item_id', 'wm_yr_wk'], how = 'left')
test = test.merge(sell_prices, on = ['store_id', 'item_id', 'wm_yr_wk'], how = 'left')

In [None]:
train.head()

## 3.4 라벨인코딩

In [None]:
all_df = pd.concat([train, test])
all_df['revenue'] = all_df['sales'] * all_df['sell_price']

from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

for i in all_df.columns[all_df.dtypes == 'object']:
    if i == 'id' or i == 'date':
        continue
    all_df[i] = le.fit_transform(list(all_df[i]))



In [None]:
all_df = reduce_mem_usage(all_df)

In [None]:
all_df.head()

## 3.5 lag 데이터 제작

In [None]:
for i in range(28, 35):
    all_df[f'lag_t{i}'] = all_df.groupby(['id'])['sales'].transform(lambda x: x.shift(i))



In [None]:
all_df = reduce_mem_usage(all_df)

In [None]:
for i in range(28, 35):
    all_df[f'revenue_lag_t{i}'] = all_df.groupby(['id'])['revenue'].transform(lambda x: x.shift(i))


In [None]:
all_df = reduce_mem_usage(all_df)

In [None]:
all_df.columns

In [None]:
import pickle

def make_sales_lag(all_df, lag_day):
    """Add lagged and rolling sales features to ``all_df`` and pickle the result.

    Within each ``id`` group this adds 14 lag columns, ``lag_t{lag_day}`` to
    ``lag_t{lag_day + 13}``, and for each window of 7/28/56/112/168 rows the
    columns ``rolling_max_t*``, ``rolling_min_t*``, ``rolling_mean_t*`` and
    ``rolling_std_t*``, computed on sales shifted by ``lag_day`` rows.
    The frame is modified in place, downcast with ``reduce_mem_usage`` and
    written to ``inputs/lag_rolling_{lag_day}.pickle``.

    Args:
        all_df: frame with at least ``id`` and ``sales`` columns. Rows are
            assumed to be in chronological order within each id — TODO confirm.
        lag_day: number of rows to shift sales by before any statistic is taken.

    Returns:
        The same ``all_df`` object, so ``all_df = make_sales_lag(all_df, n)``
        works; callers that ignore the result still see the new columns.
    """
    for i in range(lag_day, lag_day + 14):
        all_df[f'lag_t{i}'] = all_df.groupby(['id'])['sales'].transform(lambda x, shift_by=i: x.shift(shift_by))
        if i % 7 == 0:
            # Shrink dtypes periodically while the columns accumulate.
            all_df = reduce_mem_usage(all_df)

    weeks = [7, 28, 56, 112, 168] # 7 30 60 120 180

    for window in weeks:
        # Same column order as before: max, min, mean, std for each window.
        for stat in ('max', 'min', 'mean', 'std'):
            all_df[f'rolling_{stat}_t{window}'] = all_df.groupby(['id'])['sales'].transform(
                lambda x, w=window, s=stat: getattr(x.shift(lag_day).rolling(w), s)()
            )

    all_df = reduce_mem_usage(all_df)
    with open(f'inputs/lag_rolling_{lag_day}.pickle', 'wb') as f:
        pickle.dump(all_df, f, protocol=4)

    # Returning the frame costs no copy and lets callers rebind the result.
    return all_df

In [None]:
make_sales_lag(all_df, 1)

In [None]:
# make_target_lag is not defined in this notebook; the lag builder defined here
# is make_sales_lag. It adds its columns to all_df in place, so the result is
# deliberately not rebound.
make_sales_lag(all_df, 1)

In [None]:
# for i in range(1, 29):
#     all_df[f'lag_t{i}'] = all_df.groupby(['id'])['target'].transform(lambda x: x.shift(i)).fillna(-i)
    
#     if i % 10 == 7:
#         all_df = reduce_mem_usage(all_df)

In [None]:


# all_df['lag_t28'] = all_df.groupby(['id'])['target'].transform(lambda x: x.shift(28))
# all_df['lag_t29'] = all_df.groupby(['id'])['target'].transform(lambda x: x.shift(29))
# all_df['lag_t30'] = all_df.groupby(['id'])['target'].transform(lambda x: x.shift(30))

# # 새롭게 만들 거
# all_df['lag_t24'] = all_df.groupby(['id'])['target'].transform(lambda x: x.shift(24))
# all_df['lag_t25'] = all_df.groupby(['id'])['target'].transform(lambda x: x.shift(25))
# all_df['lag_t26'] = all_df.groupby(['id'])['target'].transform(lambda x: x.shift(26))
# all_df['lag_t27'] = all_df.groupby(['id'])['target'].transform(lambda x: x.shift(27))

## 3.6 이동평균 피처

In [None]:
all_df.head()

In [None]:
# The plan is to build 28 models, one for each shift from 1 to 28.

# Rolling statistics of sales shifted by 1 row within each id. Each statistic
# gets its own column name (max and min used to be written to rolling_mean_t*
# and then overwritten by the mean). The column is read as 'sales', the name
# this pipeline gives it when melting train and test.
weeks = [7, 28, 56, 84, 112, 168]
for i in weeks:
    all_df[f'rolling_max_t{i}'] = all_df.groupby(['id'])['sales'].transform(lambda x: x.shift(1).rolling(i).max())
    all_df[f'rolling_min_t{i}'] = all_df.groupby(['id'])['sales'].transform(lambda x: x.shift(1).rolling(i).min())
    all_df[f'rolling_mean_t{i}'] = all_df.groupby(['id'])['sales'].transform(lambda x: x.shift(1).rolling(i).mean())
    all_df[f'rolling_std_t{i}'] = all_df.groupby(['id'])['sales'].transform(lambda x: x.shift(1).rolling(i).std())
#     all_df = reduce_mem_usage(all_df)
print('finish!!')

In [None]:
all_df.head()

In [None]:
# Rolling statistics of sales shifted by 28 rows within each id. Each statistic
# gets its own column name (max and min used to be written to rolling_mean_t*
# and then overwritten by the mean).
weeks = [7, 28, 56, 84, 112, 168] # 7, 30, 60, 90, 120, 180
for i in weeks:
    all_df[f'rolling_max_t{i}'] = all_df.groupby(['id'])['sales'].transform(lambda x: x.shift(28).rolling(i).max())
    all_df[f'rolling_min_t{i}'] = all_df.groupby(['id'])['sales'].transform(lambda x: x.shift(28).rolling(i).min())
    all_df[f'rolling_mean_t{i}'] = all_df.groupby(['id'])['sales'].transform(lambda x: x.shift(28).rolling(i).mean())
    all_df[f'rolling_std_t{i}'] = all_df.groupby(['id'])['sales'].transform(lambda x: x.shift(28).rolling(i).std())
#     all_df = reduce_mem_usage(all_df)
print('finish!!')


In [None]:
all_df = reduce_mem_usage(all_df)

In [None]:
all_df.head()

In [None]:
all_df[(all_df.week == 17) & (all_df.year == 2015)]

### 연도별 판매량 그래프 분석중

- 2016년 4월을 맞춰야 해서, 2015년, 2014년, 2013년을 보고 있는데, 2015년의 그래프가 2014, 2013의 추이에 비해 다른 거 같다.

In [None]:
plt.figure(figsize = (15,10))
plt.plot(all_df[(all_df.week == 17) & (all_df.year == 2013)].groupby('item_id')['target'].mean()[:50])

In [None]:
plt.figure(figsize = (15,10))
plt.plot(all_df[(all_df.week == 17) & (all_df.year == 2014)].groupby('item_id')['target'].mean()[:50])

In [None]:
plt.figure(figsize = (15,10))
plt.plot(all_df[(all_df.week == 17) & (all_df.year == 2015)].groupby('item_id')['target'].mean()[:50])

In [None]:
all_df[all_df.week == 17].groupby('year')['target'].mean()

In [None]:
all_df[all_df.week == 18].groupby('year')['target'].mean()

In [None]:
all_df[all_df.week == 19].groupby('year')['target'].mean()

In [None]:
all_df[all_df.week == 20].groupby('year')['target'].mean()

## 3.7 price 통계량 피쳐

In [None]:
for i in range(28, 35):
        all_df[f'price_lag_t{i}'] = all_df.groupby(['id'])['sell_price'].transform(lambda x: x.shift(i))

In [None]:
all_df['rolling_price_std_t14'] = all_df.groupby(['id'])['sell_price'].transform(lambda x: x.rolling(14).std())


In [None]:
all_df['lag_price_t1'] = all_df.groupby(['id'])['sell_price'].transform(lambda x: x.shift(1))

all_df['price_change_t1'] = (all_df['lag_price_t1'] - all_df['sell_price']) / (all_df['lag_price_t1'])

all_df['rolling_price_max_t365'] = all_df.groupby(['id'])['sell_price'].transform(lambda x: x.shift(1).rolling(365).max())

all_df['price_change_t365'] = (all_df['rolling_price_max_t365'] - all_df['sell_price']) / (all_df['rolling_price_max_t365'])

all_df['rolling_price_std_t7'] = all_df.groupby(['id'])['sell_price'].transform(lambda x: x.rolling(7).std())

# 새롭게 만들거
# all_df['rolling_price_std_t32'] = all_df.groupby(['id'])['sell_price'].transform(lambda x: x.rolling(32).std())

all_df['rolling_price_std_t28'] = all_df.groupby(['id'])['sell_price'].transform(lambda x: x.rolling(28).std())
# all_df = reduce_mem_usage(all_df)


all_df = all_df.drop(['rolling_price_max_t365', 'lag_price_t1'], axis = 1)

In [None]:
all_df = reduce_mem_usage(all_df)

In [None]:
all_df.head()

In [None]:
import pickle

with open('inputs/all_df6.pickle', 'wb') as f:
    pickle.dump(all_df, f, protocol=4)


In [None]:
import pickle


with open('inputs/all_df.pickle', 'rb') as f:
    all_df = pickle.load(f)


# 4. 모델 제작

In [None]:
# from sklearn.ensemble import RandomForestRegressor

# rf = RandomForestRegressor(n_estimators=100, n_jobs=-1)
# rf.fit(train2, np.log(train['target'] + 1))
# result = rf.predict(test2)

In [None]:
# from lightgbm import LGBMRegressor
# # lgb = LGBMRegressor(num_leaves=2000, colsample_bytree=0.6, subsample=0.6, n_estimators=600, learning_rate=0.025, n_jobs=-1, device='gpu', max_bin = 63)
# lgb = LGBMRegressor(num_leaves=20, colsample_bytree=0.6, subsample=0.6, n_estimators=60, learning_rate=0.02, n_jobs=-1, device='cpu')

# lgb.fit(train, target)
# result = lgb.predict(test)

## 4.1 KFold - LGBM 모델

In [None]:
import lightgbm as lgb
import gc

# NOTE(review): several names below (the 'target' column, rolling_*_t30/t90/t180,
# rolling_price_std_t30) are not created by the feature cells in this notebook,
# which produce 'sales' and t28/t84/t168 windows. Presumably they exist in the
# pickled all_df loaded just before this section — confirm before running.
features = ['item_id', 'dept_id', 'cat_id', 'store_id', 'state_id', 'year', 'month', 'week', 'day', 'weekday', 'event_name_1', 'event_type_1',  
            'snap_CA', 'snap_TX', 'snap_WI', 'sell_price', 'lag_t28', 'lag_t29', 'lag_t30', 'rolling_mean_t7', 'rolling_std_t7', 'rolling_mean_t30', 'rolling_mean_t90', 
            'rolling_mean_t180', 'rolling_std_t30', 'price_change_t1', 'price_change_t365', 'rolling_price_std_t7', 'rolling_price_std_t30']

# Kept as `test` because it is needed later when the predictions are assembled.
# It is an explicit copy so the prediction column assigned later lands on an
# independent frame rather than on a slice of all_df.
test = all_df[len(train):].copy()

train_set_X = all_df[:len(train)]
train_set_y = train_set_X['target']

train_set_X = train_set_X[features]

# Test set: the same rows as `test`, restricted to the model features.
test_set = test[features]

del all_df

gc.collect()

In [None]:
pip uninstall pandas_profiling

In [None]:
n_fold = 2
# Fixed seed so the shuffled fold assignment, and therefore the scores, can be reproduced.
folds = KFold(n_splits=n_fold, shuffle=True, random_state=42)
splits = folds.split(train_set_X, train_set_y)

# Test predictions averaged over folds, and out-of-fold predictions for train.
y_preds = np.zeros(test.shape[0])
y_oof = np.zeros(train.shape[0])

feature_importances = pd.DataFrame()
feature_importances['feature'] = train_set_X.columns
mean_score = []
eval_results = []

for fold_n, (train_index, valid_index) in enumerate(splits):
    
    print('Fold:',fold_n+1)
    
    X_train, X_valid = train_set_X.iloc[train_index], train_set_X.iloc[valid_index]
    y_train, y_valid = train_set_y.iloc[train_index], train_set_y.iloc[valid_index]
    
    lgb = LGBMRegressor(
        boosting_type = 'gbdt',
        num_leaves = 400,
        colsample_bytree = 0.8,
        subsample = 0.8,
        n_estimators = 20,
        learning_rate = 0.01,
        n_jobs = -1,
        device = 'gpu'
    )
    # NOTE(review): confirm the installed LightGBM still accepts early_stopping_rounds / verbose in fit().
    lgb.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], early_stopping_rounds = 50, verbose = True)
    eval_results.append(lgb.evals_result_)
    # Record this fold's feature importances.
    feature_importances[f'fold_{fold_n + 1}'] = lgb.feature_importances_
    
    # Predict the validation rows and store them as out-of-fold predictions.
    y_pred_valid = lgb.predict(X_valid, num_iteration=lgb.best_iteration_)

    y_oof[valid_index] = y_pred_valid
    
    # Arguments in scikit-learn's (y_true, y_pred) order; the value is the same either way for MSE.
    val_score = np.sqrt(metrics.mean_squared_error(y_valid, y_pred_valid))
    
    print(f'val rmse score is {val_score}')
    
    mean_score.append(val_score)
    
    # Each fold contributes an equal share of the final test prediction.
    y_preds += lgb.predict(test_set, num_iteration=lgb.best_iteration_) / n_fold
    
    del X_train, X_valid, y_train, y_valid

print('mean rmse score over folds is',np.mean(mean_score))
test['target'] = y_preds



In [None]:
tmp1 = list(eval_results[0]['valid_0'].values())[0]

In [None]:
tmp = list(eval_results[1]['valid_0'].values())[0]

In [None]:
# dict.values() returns a view with no .values() of its own, so unpack the
# inner metric dictionaries one level at a time.
[list(metric_history.values()) for metric_history in eval_results[0].values()]

In [None]:
eval_results[0]

In [None]:
features = train_set_X.columns
params = lgb.get_params()

## 4.2 KFold - XGB 모델

In [None]:
# NOTE(review): the LGBM preparation cell ends with `del all_df`, so all_df has
# to be reloaded or rebuilt before this cell can run.
all_df = reduce_mem_usage(all_df)

# Explicit copy so the prediction column assigned later lands on an independent
# frame rather than on a slice of all_df.
test = all_df[len(train):].copy()

features = ['item_id', 'dept_id', 'cat_id', 'store_id', 'state_id', 'year', 'month', 'week', 'day', 'weekday', 'event_name_1', 'event_type_1',  
            'snap_CA', 'snap_TX', 'snap_WI', 'sell_price']

train_set_X = all_df[:len(train)]
train_set_y = train_set_X['target']

train_set_X = train_set_X[features]

# Test set: the same rows as `test`, restricted to the model features.
test_set = test[features]

del all_df

In [None]:
# Missing prices become -1 in both frames so the model sees the same encoding
# at training and prediction time. assign() builds new frames, so nothing is
# written into a slice of another DataFrame.
train_set_X = train_set_X.assign(sell_price=train_set_X['sell_price'].fillna(-1))
test_set = test_set.assign(sell_price=test_set['sell_price'].fillna(-1))

In [None]:
train_set_X.head()

In [None]:
import xgboost as xgb

n_fold = 5
# n_splits follows n_fold so the per-fold averaging below always matches the
# number of folds; the seed makes the shuffled split reproducible.
folds = KFold(n_splits=n_fold, shuffle=True, random_state=42)
splits = folds.split(train_set_X, train_set_y)

y_preds = np.zeros(test.shape[0])
# y_oof = np.zeros(train.shape[0])

feature_importances = pd.DataFrame()
feature_importances['feature'] = train_set_X.columns
mean_score = []

# dtest = xgb.DMatrix(data=test_set)


for fold_n, (train_index, valid_index) in enumerate(splits):
    print('Fold:',fold_n+1)
    
    X_train, X_valid = train_set_X.iloc[train_index], train_set_X.iloc[valid_index]
    y_train, y_valid = train_set_y.iloc[train_index], train_set_y.iloc[valid_index]
    
    
    xgb_model = xgb.XGBRegressor(colsample_bytree = 0.8, learning_rate = 0.02,subsample=0.8,
                max_depth = 12, n_estimators = 4000, tree_method='gpu_hist')
    
    xgb_model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], early_stopping_rounds=50)
    
    # Record this fold's feature importances.
    feature_importances[f'fold_{fold_n + 1}'] = xgb_model.feature_importances_
    
    # Predict the validation rows and score them.
    y_pred_valid = xgb_model.predict(X_valid)
    # y_oof[valid_index] = y_pred_valid
    # Arguments in scikit-learn's (y_true, y_pred) order; the value is the same either way for MSE.
    val_score = np.sqrt(metrics.mean_squared_error(y_valid, y_pred_valid))
    print(f'val rmse score is {val_score}')
    # Keep the fold score; without this the mean printed after the loop is taken over an empty list.
    mean_score.append(val_score)

    # Each fold contributes an equal share of the final test prediction.
    y_preds += xgb_model.predict(test_set) / n_fold
    del X_train, X_valid, y_train, y_valid, y_pred_valid, val_score

print('mean rmse score over folds is',np.mean(mean_score))
test['target'] = y_preds

features = train_set_X.columns
params = xgb_model.get_params()

## 4.3 TimeSeriesSplit - LGBM 모델

In [None]:
# n_fold = 10
# folds = TimeSeriesSplit(n_splits=n_fold)
# splits = folds.split(train_set_X, train_set_y)

# y_preds = np.zeros(test.shape[0])
# y_oof = np.zeros(train.shape[0])

# feature_importances = pd.DataFrame()
# feature_importances['feature'] = train_set_X.columns
# mean_score = []

# for fold_n, (train_index, valid_index) in enumerate(splits):
#     print('Fold:',fold_n+1)
    
#     X_train, X_valid = train_set_X.iloc[train_index], train_set_X.iloc[valid_index]
#     y_train, y_valid = train_set_y.iloc[train_index], train_set_y.iloc[valid_index]
    
#     lgb = LGBMRegressor(
#         num_leaves = 1000,
#         colsample_bytree = 0.8,
#         subsample = 0.8,
#         n_estimators = 2500,
#         learning_rate = 0.01,
#         n_jobs = -1,
#         device = 'cpu'
#     )
    
#     lgb.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], early_stopping_rounds = 50, verbose = False)

#     feature_importances[f'fold_{fold_n + 1}'] = lgb.feature_importances_
    
#     y_pred_valid = lgb.predict(X_valid, num_iteration=lgb.best_iteration_)
    
#     y_oof[valid_index] = y_pred_valid
    
#     val_score = np.sqrt(metrics.mean_squared_error(y_pred_valid, y_valid))
    
#     print(f'val rmse score is {val_score}')
    
#     mean_score.append(val_score)
    
#     y_preds += lgb.predict(test_set, num_iteration=lgb.best_iteration_) / n_fold
    
#     del X_train, X_valid, y_train, y_valid

# print('mean rmse score over folds is',np.mean(mean_score))

# test['target'] = y_preds

## 4.4 feature_importance 확인

In [None]:
feature_importances

# 5. 예측 및 제출

In [None]:
# Load the submission template that defines the expected ids.
sub = pd.read_csv('inputs/sample_submission.csv')

# Long -> wide: one row per id and one column per date, then rename the
# date columns to F1..F28.
long_predictions = test[['id', 'date', 'target']]
wide_predictions = pd.pivot(long_predictions, index = 'id', columns = 'date', values = 'target')
predictions = wide_predictions.reset_index()
predictions.columns = ['id'] + ['F' + str(step) for step in range(1, 29)]

# Template rows whose id contains 'evaluation' are carried over unchanged.
evaluation_rows = [sample_id for sample_id in sub['id'] if 'evaluation' in sample_id]
evaluation = sub[sub['id'].isin(evaluation_rows)]

# Rows that have predictions, in template order, followed by the evaluation rows.
validation = sub[['id']].merge(predictions, on = 'id')
final = pd.concat([validation, evaluation])
final.to_csv('submissions/submission.csv', index = False)

In [None]:
# Scale every forecast column F1..F28 by the same factor in one operation.
forecast_columns = ['F' + str(i) for i in range(1, 29)]
final[forecast_columns] *= 1.04

# The scaled frame is `final`; no `submission` variable exists in this notebook.
final.to_csv('sub.csv', index=False)

In [None]:
test.head()

In [None]:
final.head()

https://www.kaggle.com/c/m5-forecasting-accuracy/submit

In [None]:
time.sleep(2)
os.chdir("submissions")
!kaggle competitions submit -c m5-forecasting-accuracy -f submission.csv -m lgb
os.chdir("../")

# 6. 모델 파라미터, 피처 기록 및 모델 저장하기

In [None]:
write_record(features, params)

In [None]:
import joblib
# save model
# joblib.dump(lgb, 'models/lgb1.pkl')
# load model
# lgb = joblib.load('models/lgb.pkl')

In [None]:
import pandas as pd
tmp1 = pd.read_csv('submissions/submission.csv')
tmp2 = pd.read_csv('submissions/fnu050/submission.csv')



In [None]:
tmp3 = tmp1[:30490].sort_values('id').reset_index(drop=True)

In [None]:
tmp4 = tmp2[:30490]

In [None]:
# tmp3 is sorted by id while tmp4 keeps its file order, so tmp4's rows are
# looked up by id rather than blended by position. A missing id raises a
# KeyError instead of silently mixing two different series.
blend_columns = [f'F{i}' for i in range(1, 29)]
tmp4_aligned = tmp4.set_index('id').loc[tmp3['id'], blend_columns].to_numpy()
tmp3[blend_columns] = tmp3[blend_columns].to_numpy() * 0.55 + tmp4_aligned * 0.45

In [None]:
tmp5 = pd.concat([tmp3, tmp2[30490:]])

In [None]:
tmp5.to_csv('sub.csv', index=False)