* ## 기본 그래픽 설정

In [1]:
import pandas as pd
import numpy as np
import sklearn
import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec
import seaborn as sns

def get_font_family():
    import platform
    system_name = platform.system()
    # colab 사용자는 system_name이 'Linux'로 확인
    if system_name == "Darwin" :
        font_family = "AppleGothic"
    elif system_name == "Windows":
        font_family = "Malgun Gothic"
    else:
        !apt-get update -qq
        !apt-get install fonts-nanum -qq  > /dev/null
        import matplotlib.font_manager as fm
        fontpath = '/usr/share/fonts/truetype/nanum/NanumBarunGothic.ttf'
        font = fm.FontProperties(fname=fontpath, size=9)
        fm._rebuild()
        font_family = "NanumBarunGothic"
    return font_family
get_font_family()
# 시각화를 위한 폰트설정
# 위에서 만든 함수를 통해 시스템 폰트를 불러와서 font_family 라는 변수에 할당
font_family = get_font_family()
# 폰트설정
plt.rc('font', family=font_family)
# 마이너스폰트 설정
plt.rc('axes', unicode_minus=False)
# 그래프 스타일 설정
plt.style.use('ggplot')
# 그래프에 retina display 적용
from IPython.display import set_matplotlib_formats
set_matplotlib_formats('retina')

* ## 랜덤 시드 설정 

In [2]:
np.random.seed(42)

* ## 데이터 불러오기

In [3]:
df = pd.read_csv('./data/2nd/실전db.csv')
df

Unnamed: 0,USER_ID,JOIN_DATE,D_TYPE,STORE_ID,GOODS_TYPE,DATE,COUNT,AD1
0,2858,2014-01-07,AA,1892,A,2020-01-01,1,GN
1,5647,2014-02-14,BB,182009,A,2020-01-01,1,J
2,33314,2014-11-20,BB,82431,A,2020-01-01,1,SC
3,37001,2014-12-04,BB,725,C,2020-01-01,1,MP
4,37819,2014-12-07,AA,220691,C,2020-01-01,1,JRR
...,...,...,...,...,...,...,...,...
879266,1830551,2020-12-31,BB,219886,B,2020-12-31,1,GN
879267,1830570,2020-12-31,BB,82433,B,2020-12-31,1,CY
879268,1830580,2020-12-31,AA,92020,B,2020-12-31,1,JRR
879269,1830589,2020-12-31,BB,92437,B,2020-12-31,1,J


* ## 1970-01-01 전처리

In [4]:
# 전처리
# JOIN_DATE가 1970-01-01인 경우 => 2013-12-16
df.loc[df['JOIN_DATE']=='1970-01-01', 'JOIN_DATE'] = '2013-12-16'

In [5]:
df['JOIN_DATE'].sort_values()

176320    2013-12-16
624928    2013-12-16
625539    2013-12-16
226658    2013-12-16
226415    2013-12-16
             ...    
879214    2020-12-31
879213    2020-12-31
879212    2020-12-31
879230    2020-12-31
879270    2020-12-31
Name: JOIN_DATE, Length: 879271, dtype: object

* ## 파생변수 생성

In [6]:
df['JOIN_DATE'] = pd.to_datetime(df['JOIN_DATE'])
df['DATE'] = pd.to_datetime(df['DATE'])
df['ELAPSED_DAY'] = (df['DATE']-df['JOIN_DATE']).dt.days
df['ELAPSED_TIME'] = df['ELAPSED_DAY']*24 *60
df['ELAPSED_WEEK'] = (df['ELAPSED_DAY'] -1)//7 + 1

df

Unnamed: 0,USER_ID,JOIN_DATE,D_TYPE,STORE_ID,GOODS_TYPE,DATE,COUNT,AD1,ELAPSED_DAY,ELAPSED_TIME,ELAPSED_WEEK
0,2858,2014-01-07,AA,1892,A,2020-01-01,1,GN,2185,3146400,313
1,5647,2014-02-14,BB,182009,A,2020-01-01,1,J,2147,3091680,307
2,33314,2014-11-20,BB,82431,A,2020-01-01,1,SC,1868,2689920,267
3,37001,2014-12-04,BB,725,C,2020-01-01,1,MP,1854,2669760,265
4,37819,2014-12-07,AA,220691,C,2020-01-01,1,JRR,1851,2665440,265
...,...,...,...,...,...,...,...,...,...,...,...
879266,1830551,2020-12-31,BB,219886,B,2020-12-31,1,GN,0,0,0
879267,1830570,2020-12-31,BB,82433,B,2020-12-31,1,CY,0,0,0
879268,1830580,2020-12-31,AA,92020,B,2020-12-31,1,JRR,0,0,0
879269,1830589,2020-12-31,BB,92437,B,2020-12-31,1,J,0,0,0


In [7]:
df['DoW'] = df['DATE'].dt.day_name()
df['MONTH'] = df['DATE'].dt.month
df['DAY'] = df['DATE'].dt.day

df

Unnamed: 0,USER_ID,JOIN_DATE,D_TYPE,STORE_ID,GOODS_TYPE,DATE,COUNT,AD1,ELAPSED_DAY,ELAPSED_TIME,ELAPSED_WEEK,DoW,MONTH,DAY
0,2858,2014-01-07,AA,1892,A,2020-01-01,1,GN,2185,3146400,313,Wednesday,1,1
1,5647,2014-02-14,BB,182009,A,2020-01-01,1,J,2147,3091680,307,Wednesday,1,1
2,33314,2014-11-20,BB,82431,A,2020-01-01,1,SC,1868,2689920,267,Wednesday,1,1
3,37001,2014-12-04,BB,725,C,2020-01-01,1,MP,1854,2669760,265,Wednesday,1,1
4,37819,2014-12-07,AA,220691,C,2020-01-01,1,JRR,1851,2665440,265,Wednesday,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
879266,1830551,2020-12-31,BB,219886,B,2020-12-31,1,GN,0,0,0,Thursday,12,31
879267,1830570,2020-12-31,BB,82433,B,2020-12-31,1,CY,0,0,0,Thursday,12,31
879268,1830580,2020-12-31,AA,92020,B,2020-12-31,1,JRR,0,0,0,Thursday,12,31
879269,1830589,2020-12-31,BB,92437,B,2020-12-31,1,J,0,0,0,Thursday,12,31


# Training / Test

* ## 기본 feature

In [8]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error

feature = ['D_TYPE', 'GOODS_TYPE', 'AD1', 'COUNT']
df_new = df.loc[:,feature]

x = df_new.drop('COUNT', axis=1)
t = df_new['COUNT'].values.ravel()

ct = ColumnTransformer([('label', OrdinalEncoder(), ['AD1', 'D_TYPE', 'GOODS_TYPE'])], remainder='passthrough')
trans = ct.fit_transform(x)

x_train, t_train = trans[:623579,:], t[:623579]
x_test, t_test = trans[623579:,:], t[623579:]
model = RandomForestRegressor()
kfold = 5
kfold_score = cross_val_score(model, x_train, t_train, cv=kfold, scoring='neg_mean_squared_error')
print(f'mse : {np.mean(-kfold_score)}')
print(f'rmse : {np.sqrt(np.mean(-kfold_score))}')

mse : 0.1520321794137789
rmse : 0.38991304083574696


In [9]:
model = RandomForestRegressor()
model.fit(x_train, t_train)
pred = model.predict(x_test)
mse = mean_squared_error(pred, t_test)
rmse = np.sqrt(mse)
print(f'mse : {mse}')
print(f'rmse : {rmse}')

mse : 0.37979799279371995
rmse : 0.6162775290351904


* ## 기본 feature + 경과 시간

In [10]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error

feature = ['D_TYPE', 'GOODS_TYPE', 'AD1','ELAPSED_TIME', 'COUNT']
df_new = df.loc[:,feature]

x = df_new.drop('COUNT', axis=1)
t = df_new['COUNT'].values.ravel()

ct = ColumnTransformer([('label', OrdinalEncoder(), ['AD1', 'D_TYPE', 'GOODS_TYPE'])], remainder='passthrough')
trans = ct.fit_transform(x)

x_train, t_train = trans[:623579,:], t[:623579]
x_test, t_test = trans[623579:,:], t[623579:]
model = RandomForestRegressor()
kfold = 5
kfold_score = cross_val_score(model, x_train, t_train, cv=kfold, scoring='neg_mean_squared_error')
print(f'mse : {np.mean(-kfold_score)}')
print(f'rmse : {np.sqrt(np.mean(-kfold_score))}')

mse : 0.10027181302612749
rmse : 0.3166572484976895


In [11]:
model = RandomForestRegressor()
model.fit(x_train, t_train)
pred = model.predict(x_test)
mse = mean_squared_error(pred, t_test)
rmse = np.sqrt(mse)
print(f'mse : {mse}')
print(f'rmse : {rmse}')

mse : 0.17434759600481034
rmse : 0.41754951323742473


* ## 기본 feature + 경과 일

In [12]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error

feature = ['D_TYPE', 'GOODS_TYPE', 'AD1','ELAPSED_DAY', 'COUNT']
df_new = df.loc[:,feature]

x = df_new.drop('COUNT', axis=1)
t = df_new['COUNT'].values.ravel()

ct = ColumnTransformer([('label', OrdinalEncoder(), ['AD1', 'D_TYPE', 'GOODS_TYPE'])], remainder='passthrough')
trans = ct.fit_transform(x)

x_train, t_train = trans[:623579,:], t[:623579]
x_test, t_test = trans[623579:,:], t[623579:]
model = RandomForestRegressor()
kfold = 5
kfold_score = cross_val_score(model, x_train, t_train, cv=kfold, scoring='neg_mean_squared_error')
print(f'mse : {np.mean(-kfold_score)}')
print(f'rmse : {np.sqrt(np.mean(-kfold_score))}')

mse : 0.09893048582901409
rmse : 0.31453216978397314


In [13]:
model = RandomForestRegressor()
model.fit(x_train, t_train)
pred = model.predict(x_test)
mse = mean_squared_error(pred, t_test)
rmse = np.sqrt(mse)
print(f'mse : {mse}')
print(f'rmse : {rmse}')

mse : 0.17328714868054973
rmse : 0.4162777302241254


* ## 기본 feature + 경과 주

In [14]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_squared_error

feature = ['D_TYPE', 'GOODS_TYPE', 'AD1','ELAPSED_WEEK', 'COUNT']
df_new = df.loc[:,feature]

x = df_new.drop('COUNT', axis=1)
t = df_new['COUNT'].values.ravel()

ct = ColumnTransformer([('label', OrdinalEncoder(), ['AD1', 'D_TYPE', 'GOODS_TYPE'])], remainder='passthrough')
trans = ct.fit_transform(x)

x_train, t_train = trans[:623579,:], t[:623579]
x_test, t_test = trans[623579:,:], t[623579:]
model = RandomForestRegressor()
kfold = 5
kfold_score = cross_val_score(model, x_train, t_train, cv=kfold, scoring='neg_mean_squared_error')
print(f'mse : {np.mean(-kfold_score)}')
print(f'rmse : {np.sqrt(np.mean(-kfold_score))}')

mse : 0.0972554043879821
rmse : 0.3118579875327584


In [15]:
model = RandomForestRegressor()
model.fit(x_train, t_train)
pred = model.predict(x_test)
mse = mean_squared_error(pred, t_test)
rmse = np.sqrt(mse)
print(f'mse : {mse}')
print(f'rmse : {rmse}')

mse : 0.1786166629733196
rmse : 0.42263064604133904


* ## 기본 feature + 요일

In [16]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_squared_error

feature = ['D_TYPE', 'GOODS_TYPE', 'AD1','DoW', 'COUNT']
df_new = df.loc[:,feature]


x = df_new.drop('COUNT', axis=1)
t = df_new['COUNT'].values.ravel()

ct = ColumnTransformer([('label', OrdinalEncoder(), ['AD1', 'D_TYPE', 'GOODS_TYPE', 'DoW'])], remainder='passthrough')
trans = ct.fit_transform(x)

x_train, t_train = trans[:623579,:], t[:623579]
x_test, t_test = trans[623579:,:], t[623579:]
model = RandomForestRegressor()
kfold = 5
kfold_score = cross_val_score(model, x_train, t_train, cv=kfold, scoring='neg_mean_squared_error')
print(f'mse : {np.mean(-kfold_score)}')
print(f'rmse : {np.sqrt(np.mean(-kfold_score))}')

mse : 0.159660223431718
rmse : 0.3995750535653071


In [17]:
model = RandomForestRegressor()
model.fit(x_train, t_train)
pred = model.predict(x_test)
mse = mean_squared_error(pred, t_test)
rmse = np.sqrt(mse)
print(f'mse : {mse}')
print(f'rmse : {rmse}')

mse : 0.3789665188114296
rmse : 0.6156025656309675


* ## 기본 feature + 경과 주 + 경과 일

In [18]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_squared_error

feature = ['D_TYPE', 'GOODS_TYPE', 'AD1','ELAPSED_WEEK','ELAPSED_DAY', 'COUNT']
df_new = df.loc[:,feature]

x = df_new.drop('COUNT', axis=1)
t = df_new['COUNT'].values.ravel()

ct = ColumnTransformer([('label', OrdinalEncoder(), ['AD1', 'D_TYPE', 'GOODS_TYPE'])], remainder='passthrough')
trans = ct.fit_transform(x)

x_train, t_train = trans[:623579,:], t[:623579]
x_test, t_test = trans[623579:,:], t[623579:]
model = RandomForestRegressor()
kfold = 5
kfold_score = cross_val_score(model, x_train, t_train, cv=kfold, scoring='neg_mean_squared_error')
print(f'mse : {np.mean(-kfold_score)}')
print(f'rmse : {np.sqrt(np.mean(-kfold_score))}')

mse : 0.10070671169721117
rmse : 0.3173432080527503


In [19]:
model = RandomForestRegressor()
model.fit(x_train, t_train)
pred = model.predict(x_test)
mse = mean_squared_error(pred, t_test)
rmse = np.sqrt(mse)
print(f'mse : {mse}')
print(f'rmse : {rmse}')

mse : 0.1746507533068325
rmse : 0.4179123751539699


* ## 기본 feature + 경과 주 + 요일

In [20]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_squared_error

feature = ['D_TYPE', 'GOODS_TYPE', 'AD1','DoW', 'COUNT', 'ELAPSED_WEEK']
df_new = df.loc[:,feature]


x = df_new.drop('COUNT', axis=1)
t = df_new['COUNT'].values.ravel()

ct = ColumnTransformer([('label', OrdinalEncoder(), ['AD1', 'D_TYPE', 'GOODS_TYPE', 'DoW'])], remainder='passthrough')
trans = ct.fit_transform(x)

x_train, t_train = trans[:623579,:], t[:623579]
x_test, t_test = trans[623579:,:], t[623579:]
model = RandomForestRegressor()
kfold = 5
kfold_score = cross_val_score(model, x_train, t_train, cv=kfold, scoring='neg_mean_squared_error')
print(f'mse : {np.mean(-kfold_score)}')
print(f'rmse : {np.sqrt(np.mean(-kfold_score))}')

mse : 0.09379597001792109
rmse : 0.3062612773726399


In [21]:
model = RandomForestRegressor()
model.fit(x_train, t_train)
pred = model.predict(x_test)
mse = mean_squared_error(pred, t_test)
rmse = np.sqrt(mse)
print(f'mse : {mse}')
print(f'rmse : {rmse}')

mse : 0.18186515177404491
rmse : 0.426456506309899


* ## 기본 feature + 경과 주 + 달 + 일 + 요일

In [22]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_squared_error

feature = ['D_TYPE', 'GOODS_TYPE', 'AD1','DoW','MONTH', 'DAY','COUNT', 'ELAPSED_WEEK']
df_new = df.loc[:,feature]


x = df_new.drop('COUNT', axis=1)
t = df_new['COUNT'].values.ravel()

ct = ColumnTransformer([('label', OrdinalEncoder(), ['AD1', 'D_TYPE', 'GOODS_TYPE', 'DoW'])], remainder='passthrough')
trans = ct.fit_transform(x)

x_train, t_train = trans[:623579,:], t[:623579]
x_test, t_test = trans[623579:,:], t[623579:]
model = RandomForestRegressor()
kfold = 5
kfold_score = cross_val_score(model, x_train, t_train, cv=kfold, scoring='neg_mean_squared_error')
print(f'mse : {np.mean(-kfold_score)}')
print(f'rmse : {np.sqrt(np.mean(-kfold_score))}')

mse : 0.07762089275573388
rmse : 0.27860526333099644


In [23]:
model = RandomForestRegressor()
model.fit(x_train, t_train)
pred = model.predict(x_test)
mse = mean_squared_error(pred, t_test)
rmse = np.sqrt(mse)
print(f'mse : {mse}')
print(f'rmse : {rmse}')

mse : 0.1814375963954365
rmse : 0.4259549229618511


* ## 기본 feature + 경과 주 + 달 + 일

In [25]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_squared_error

feature = ['D_TYPE', 'GOODS_TYPE', 'AD1','MONTH', 'DAY','COUNT', 'ELAPSED_WEEK']
df_new = df.loc[:,feature]



x = df_new.drop('COUNT', axis=1)
t = df_new['COUNT'].values.ravel()

ct = ColumnTransformer([('label', OrdinalEncoder(), ['AD1', 'D_TYPE', 'GOODS_TYPE'])], remainder='passthrough')
trans = ct.fit_transform(x)

x_train, t_train = trans[:623579,:], t[:623579]
x_test, t_test = trans[623579:,:], t[623579:]
model = RandomForestRegressor()
kfold = 5
kfold_score = cross_val_score(model, x_train, t_train, cv=kfold, scoring='neg_mean_squared_error')
print(f'mse : {np.mean(-kfold_score)}')
print(f'rmse : {np.sqrt(np.mean(-kfold_score))}')

mse : 0.07947312337494737
rmse : 0.2819097787856026


In [26]:
model = RandomForestRegressor()
model.fit(x_train, t_train)
pred = model.predict(x_test)
mse = mean_squared_error(pred, t_test)
rmse = np.sqrt(mse)
print(f'mse : {mse}')
print(f'rmse : {rmse}')

mse : 0.18556942718273697
rmse : 0.4307777004241712


* ## 기본 feature + 경과일 + 경과주 + 달 + 일

In [30]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_squared_error

feature = ['D_TYPE', 'GOODS_TYPE', 'AD1','MONTH','DAY','ELAPSED_DAY' ,'COUNT', 'ELAPSED_WEEK']
df_new = df.loc[:,feature]

x = df_new.drop('COUNT', axis=1)
t = df_new['COUNT'].values.ravel()

ct = ColumnTransformer([('label', OrdinalEncoder(), ['AD1', 'D_TYPE', 'GOODS_TYPE'])], remainder='passthrough')
trans = ct.fit_transform(x)

x_train, t_train = trans[:623579,:], t[:623579]
x_test, t_test = trans[623579:,:], t[623579:]
model = RandomForestRegressor()
kfold = 5
kfold_score = cross_val_score(model, x_train, t_train, cv=kfold, scoring='neg_mean_squared_error')
print(f'mse : {np.mean(-kfold_score)}')
print(f'rmse : {np.sqrt(np.mean(-kfold_score))}')

mse : 0.07904446909451475
rmse : 0.281148482290968


In [29]:
model = RandomForestRegressor()
model.fit(x_train, t_train)
pred = model.predict(x_test)
mse = mean_squared_error(pred, t_test)
rmse = np.sqrt(mse)
print(f'mse : {mse}')
print(f'rmse : {rmse}')

mse : 0.18413113459322947
rmse : 0.42910503911423536


In [42]:
final_result = pd.DataFrame(columns = ['mse', 'rmse'])
final_result.loc['기본 feature', : ] = [0.3797, 0.6162]
final_result.loc['기본 feature + 경과 시간', : ] = [0.1743, 0.4175]
final_result.loc['기본 feature + 경과 일', : ] = [0.1732, 0.4162]
final_result.loc['기본 feature + 경과 주', : ] = [0.1786,  0.4226]
final_result.loc['기본 feature + 요일', : ] = [0.3789,  0.6156]
final_result.loc['기본 feature + 경과 주 + 경과 일', : ] = [0.1746,  0.4179]
final_result.loc['기본 feature + 경과 주 + 요일', : ] = [0.1818,   0.4264]
final_result.loc['기본 feature + 경과 주 + 달 + 일 + 요일', : ] = [0.1814,   0.4259]
final_result.loc['기본 feature + 경과 주 + 달 + 일', : ] = [0.1855,   0.4307]
final_result.loc['기본 feature + 경과 주 + 달 + 일', : ] = [0.1841,    0.4291]

final_result

Unnamed: 0,mse,rmse
기본 feature,0.3797,0.6162
기본 feature + 경과 시간,0.1743,0.4175
기본 feature + 경과 일,0.1732,0.4162
기본 feature + 경과 주,0.1786,0.4226
기본 feature + 요일,0.3789,0.6156
기본 feature + 경과 주 + 경과 일,0.1746,0.4179
기본 feature + 경과 주 + 요일,0.1818,0.4264
기본 feature + 경과 주 + 달 + 일 + 요일,0.1814,0.4259
기본 feature + 경과 주 + 달 + 일,0.1841,0.4291
