* ## 기본 그래픽 설정

In [2]:
import pandas as pd
import numpy as np
import sklearn
import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec
import seaborn as sns

def get_font_family():
    import platform
    system_name = platform.system()
    # colab 사용자는 system_name이 'Linux'로 확인
    if system_name == "Darwin" :
        font_family = "AppleGothic"
    elif system_name == "Windows":
        font_family = "Malgun Gothic"
    else:
        !apt-get update -qq
        !apt-get install fonts-nanum -qq  > /dev/null
        import matplotlib.font_manager as fm
        fontpath = '/usr/share/fonts/truetype/nanum/NanumBarunGothic.ttf'
        font = fm.FontProperties(fname=fontpath, size=9)
        fm._rebuild()
        font_family = "NanumBarunGothic"
    return font_family
get_font_family()
# 시각화를 위한 폰트설정
# 위에서 만든 함수를 통해 시스템 폰트를 불러와서 font_family 라는 변수에 할당
font_family = get_font_family()
# 폰트설정
plt.rc('font', family=font_family)
# 마이너스폰트 설정
plt.rc('axes', unicode_minus=False)
# 그래프 스타일 설정
plt.style.use('ggplot')
# 그래프에 retina display 적용
from IPython.display import set_matplotlib_formats
set_matplotlib_formats('retina')

* ## 랜덤 시드 설정 

In [3]:
np.random.seed(42)

* ## 데이터 불러오기

In [4]:
df = pd.read_csv('./data/2nd/실전db.csv')
df

Unnamed: 0,USER_ID,JOIN_DATE,D_TYPE,STORE_ID,GOODS_TYPE,DATE,COUNT,AD1
0,2858,2014-01-07,AA,1892,A,2020-01-01,1,GN
1,5647,2014-02-14,BB,182009,A,2020-01-01,1,J
2,33314,2014-11-20,BB,82431,A,2020-01-01,1,SC
3,37001,2014-12-04,BB,725,C,2020-01-01,1,MP
4,37819,2014-12-07,AA,220691,C,2020-01-01,1,JRR
...,...,...,...,...,...,...,...,...
879266,1830551,2020-12-31,BB,219886,B,2020-12-31,1,GN
879267,1830570,2020-12-31,BB,82433,B,2020-12-31,1,CY
879268,1830580,2020-12-31,AA,92020,B,2020-12-31,1,JRR
879269,1830589,2020-12-31,BB,92437,B,2020-12-31,1,J


* ## 1970-01-01 전처리

In [5]:
# 전처리
# JOIN_DATE가 1970-01-01인 경우 => 2013-12-16
df.loc[df['JOIN_DATE']=='1970-01-01', 'JOIN_DATE'] = '2013-12-16'

In [6]:
df['JOIN_DATE'].sort_values()

176320    2013-12-16
624928    2013-12-16
625539    2013-12-16
226658    2013-12-16
226415    2013-12-16
             ...    
879214    2020-12-31
879213    2020-12-31
879212    2020-12-31
879230    2020-12-31
879270    2020-12-31
Name: JOIN_DATE, Length: 879271, dtype: object

* ## 파생변수 생성

In [7]:
df['JOIN_DATE'] = pd.to_datetime(df['JOIN_DATE'])
df['DATE'] = pd.to_datetime(df['DATE'])
df['ELAPSED_DAY'] = (df['DATE']-df['JOIN_DATE']).dt.days
df['ELAPSED_TIME'] = df['ELAPSED_DAY']*24 *60
df['ELAPSED_WEEK'] = (df['ELAPSED_DAY'] -1)//7 + 1

df

Unnamed: 0,USER_ID,JOIN_DATE,D_TYPE,STORE_ID,GOODS_TYPE,DATE,COUNT,AD1,ELAPSED_DAY,ELAPSED_TIME,ELAPSED_WEEK
0,2858,2014-01-07,AA,1892,A,2020-01-01,1,GN,2185,3146400,313
1,5647,2014-02-14,BB,182009,A,2020-01-01,1,J,2147,3091680,307
2,33314,2014-11-20,BB,82431,A,2020-01-01,1,SC,1868,2689920,267
3,37001,2014-12-04,BB,725,C,2020-01-01,1,MP,1854,2669760,265
4,37819,2014-12-07,AA,220691,C,2020-01-01,1,JRR,1851,2665440,265
...,...,...,...,...,...,...,...,...,...,...,...
879266,1830551,2020-12-31,BB,219886,B,2020-12-31,1,GN,0,0,0
879267,1830570,2020-12-31,BB,82433,B,2020-12-31,1,CY,0,0,0
879268,1830580,2020-12-31,AA,92020,B,2020-12-31,1,JRR,0,0,0
879269,1830589,2020-12-31,BB,92437,B,2020-12-31,1,J,0,0,0


In [129]:
df['DoW'] = df['DATE'].dt.day_name()
df['MONTH'] = df['DATE'].dt.month
df['DAY'] = df['DATE'].dt.day


df

Unnamed: 0,USER_ID,JOIN_DATE,D_TYPE,STORE_ID,GOODS_TYPE,DATE,COUNT,AD1,ELAPSED_DAY,ELAPSED_TIME,ELAPSED_WEEK,DoW,MONTH,DAY
0,2858,2014-01-07,AA,1892,A,2020-01-01,1,GN,2185,3146400,313,Wednesday,1,1
1,5647,2014-02-14,BB,182009,A,2020-01-01,1,J,2147,3091680,307,Wednesday,1,1
2,33314,2014-11-20,BB,82431,A,2020-01-01,1,SC,1868,2689920,267,Wednesday,1,1
3,37001,2014-12-04,BB,725,C,2020-01-01,1,MP,1854,2669760,265,Wednesday,1,1
4,37819,2014-12-07,AA,220691,C,2020-01-01,1,JRR,1851,2665440,265,Wednesday,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
879266,1830551,2020-12-31,BB,219886,B,2020-12-31,1,GN,0,0,0,Thursday,12,31
879267,1830570,2020-12-31,BB,82433,B,2020-12-31,1,CY,0,0,0,Thursday,12,31
879268,1830580,2020-12-31,AA,92020,B,2020-12-31,1,JRR,0,0,0,Thursday,12,31
879269,1830589,2020-12-31,BB,92437,B,2020-12-31,1,J,0,0,0,Thursday,12,31


# Training

* ## 기본 feature

In [14]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error

feature = ['D_TYPE', 'GOODS_TYPE', 'AD1', 'COUNT']
df_new = df.loc[:,feature]

train_data = df_new.iloc[:623579]
test_data = df_new.iloc[623579:]

ct = ColumnTransformer([('label', OrdinalEncoder(), ['AD1', 'D_TYPE', 'GOODS_TYPE'])])

x_train, t_train = train_data.drop('COUNT', axis=1), train_data['COUNT']
x_test, t_test = test_data.drop('COUNT', axis=1), test_data['COUNT'] 

x_train_trans = ct.fit_transform(x_train)
x_test_trans = ct.fit_transform(x_test)

t_train = t_train.values.ravel()
t_test = t_test.values.ravel()

model = RandomForestRegressor()
kfold = 5
kfold_score = cross_val_score(model, x_train_trans, t_train, cv=kfold, scoring='neg_mean_squared_error')
print(f'mse : {np.mean(-kfold_score)}')
print(f'rmse : {np.sqrt(np.mean(-kfold_score))}')

mse : 0.15257267485542553
rmse : 0.3906055233293886


* ## 기본 feature + 경과 시간

In [120]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error

feature = ['D_TYPE', 'GOODS_TYPE', 'AD1','ELAPSED_TIME', 'COUNT']
df_new = df.loc[:,feature]

train_data = df_new.iloc[:623579]
test_data = df_new.iloc[623579:]

ct = ColumnTransformer([('label', OrdinalEncoder(), ['AD1', 'D_TYPE', 'GOODS_TYPE'])])

x_train, t_train = train_data.drop('COUNT', axis=1), train_data['COUNT']
x_test, t_test = test_data.drop('COUNT', axis=1), test_data['COUNT'] 

x_train_trans = ct.fit_transform(x_train)
x_test_trans = ct.fit_transform(x_test)

t_train = t_train.values.ravel()
t_test = t_test.values.ravel()

model = RandomForestRegressor()
kfold = 5
kfold_score = cross_val_score(model, x_train_trans, t_train, cv=kfold, scoring='neg_mean_squared_error')
print(f'mse : {np.mean(-kfold_score)}')
print(f'rmse : {np.sqrt(np.mean(-kfold_score))}')

mse : 0.1520019678798766
rmse : 0.38987429753688124


In [9]:
x_train_trans[25,0]
# 25 -> 40

40.0

In [10]:
x_train.iloc[25,:]

D_TYPE               AA
GOODS_TYPE            C
AD1                 GSN
ELAPSED_TIME    2223360
Name: 25, dtype: object

In [11]:
x_test_trans[7,0]
# 7->40

40.0

In [12]:
x_test.iloc[7,:]

D_TYPE               AA
GOODS_TYPE            A
AD1                  GN
ELAPSED_TIME    2630880
Name: 623586, dtype: object

In [13]:
model.fit(x_train_trans, t_train)
pred = model.predict(x_test_trans)
print(pred)
print('mse :',mean_squared_error(pred, t_test))
print('rmse :',np.sqrt(mean_squared_error(pred, t_test)))

[1.01643854 1.         1.01710165 ... 1.00780778 1.01002873 1.        ]
mse : 0.4361751328246167
rmse : 0.6604355629617599


In [49]:
x_train_trans.astype(int)

array([[37,  0,  0],
       [46,  1,  0],
       [60,  1,  0],
       ...,
       [ 3,  0,  1],
       [48,  1,  0],
       [60,  0,  0]])

* ## 기본 feature + 경과 일

In [118]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error

feature = ['D_TYPE', 'GOODS_TYPE', 'AD1','ELAPSED_DAY', 'COUNT']
df_new = df.loc[:,feature]

train_data = df_new.iloc[:623579]
test_data = df_new.iloc[623579:]

ct = ColumnTransformer([('label', OrdinalEncoder(), ['AD1', 'D_TYPE', 'GOODS_TYPE'])], remainder='passthrough')

x_train, t_train = train_data.drop('COUNT', axis=1), train_data['COUNT']
x_test, t_test = test_data.drop('COUNT', axis=1), test_data['COUNT'] 

x_train_trans = ct.fit_transform(x_train).astype(int)
x_test_trans = ct.fit_transform(x_test).astype(int)

t_train = t_train.values.ravel()
t_test = t_test.values.ravel()

model = RandomForestRegressor()
kfold = 5
kfold_score = cross_val_score(model, x_train_trans, t_train, cv=kfold, scoring='neg_mean_squared_error')
print(f'mse : {np.mean(-kfold_score)}')
print(f'rmse : {np.sqrt(np.mean(-kfold_score))}')

mse : 0.09996416428325908
rmse : 0.3161710996964445


* ## 기본 feature + 경과 주

In [119]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_squared_error

feature = ['D_TYPE', 'GOODS_TYPE', 'AD1','ELAPSED_WEEK', 'COUNT']
df_new = df.loc[:,feature]

train_data = df_new.iloc[:623579]
test_data = df_new.iloc[623579:]

ct = ColumnTransformer([('label', OrdinalEncoder(), ['AD1', 'D_TYPE', 'GOODS_TYPE'])], remainder='passthrough')

x_train, t_train = train_data.drop('COUNT', axis=1), train_data['COUNT']
x_test, t_test = test_data.drop('COUNT', axis=1), test_data['COUNT'] 

x_train_trans = ct.fit_transform(x_train)
x_test_trans = ct.fit_transform(x_test)

t_train = t_train.values.ravel()
t_test = t_test.values.ravel()

model = RandomForestRegressor()
kfold = 5
kfold_score = cross_val_score(model, x_train_trans, t_train, cv=kfold, scoring='neg_mean_squared_error')
print(f'mse : {np.mean(-kfold_score)}')
print(f'rmse : {np.sqrt(np.mean(-kfold_score))}')

mse : 0.09711903510574284
rmse : 0.3116392708015837


* ## 기본 feature + 요일

In [126]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_squared_error

feature = ['D_TYPE', 'GOODS_TYPE', 'AD1','DoW', 'COUNT']
df_new = df.loc[:,feature]

train_data = df_new.iloc[:623579]
test_data = df_new.iloc[623579:]

ct = ColumnTransformer([('label', OrdinalEncoder(), ['AD1', 'D_TYPE', 'GOODS_TYPE', 'DoW'])], remainder='passthrough')

x_train, t_train = train_data.drop('COUNT', axis=1), train_data['COUNT']
x_test, t_test = test_data.drop('COUNT', axis=1), test_data['COUNT'] 

x_train_trans = ct.fit_transform(x_train)
x_test_trans = ct.fit_transform(x_test)

t_train = t_train.values.ravel()
t_test = t_test.values.ravel()

model = RandomForestRegressor()
kfold = 5
kfold_score = cross_val_score(model, x_train_trans, t_train, cv=kfold, scoring='neg_mean_squared_error')
print(f'mse : {np.mean(-kfold_score)}')
print(f'rmse : {np.sqrt(np.mean(-kfold_score))}')

mse : 0.1589742144782981
rmse : 0.3987157063351005


* ## 기본 feature + 경과 주 + 경과 일

In [122]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_squared_error

feature = ['D_TYPE', 'GOODS_TYPE', 'AD1','ELAPSED_WEEK','ELAPSED_DAY', 'COUNT']
df_new = df.loc[:,feature]

train_data = df_new.iloc[:623579]
test_data = df_new.iloc[623579:]

ct = ColumnTransformer([('label', OrdinalEncoder(), ['AD1', 'D_TYPE', 'GOODS_TYPE'])], remainder='passthrough')

x_train, t_train = train_data.drop('COUNT', axis=1), train_data['COUNT']
x_test, t_test = test_data.drop('COUNT', axis=1), test_data['COUNT'] 

x_train_trans = ct.fit_transform(x_train)
x_test_trans = ct.fit_transform(x_test)

t_train = t_train.values.ravel()
t_test = t_test.values.ravel()

model = RandomForestRegressor()
kfold = 5
kfold_score = cross_val_score(model, x_train_trans, t_train, cv=kfold, scoring='neg_mean_squared_error')
print(f'mse : {np.mean(-kfold_score)}')
print(f'rmse : {np.sqrt(np.mean(-kfold_score))}')

mse : 0.10068684631434041
rmse : 0.3173119069848158


* ## 기본 feature + 경과 주 + 요일

In [128]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_squared_error

feature = ['D_TYPE', 'GOODS_TYPE', 'AD1','DoW', 'COUNT', 'ELAPSED_WEEK']
df_new = df.loc[:,feature]

train_data = df_new.iloc[:623579]
test_data = df_new.iloc[623579:]

ct = ColumnTransformer([('label', OrdinalEncoder(), ['AD1', 'D_TYPE', 'GOODS_TYPE', 'DoW'])], remainder='passthrough')

x_train, t_train = train_data.drop('COUNT', axis=1), train_data['COUNT']
x_test, t_test = test_data.drop('COUNT', axis=1), test_data['COUNT'] 

x_train_trans = ct.fit_transform(x_train)
x_test_trans = ct.fit_transform(x_test)

t_train = t_train.values.ravel()
t_test = t_test.values.ravel()

model = RandomForestRegressor()
kfold = 5
kfold_score = cross_val_score(model, x_train_trans, t_train, cv=kfold, scoring='neg_mean_squared_error')
print(f'mse : {np.mean(-kfold_score)}')
print(f'rmse : {np.sqrt(np.mean(-kfold_score))}')

mse : 0.09258063373448311
rmse : 0.3042706586815152


* ## 기본 feature + 경과 주 + 달 + 일 + 요일

In [130]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_squared_error

feature = ['D_TYPE', 'GOODS_TYPE', 'AD1','DoW','MONTH', 'DAY','COUNT', 'ELAPSED_WEEK']
df_new = df.loc[:,feature]

train_data = df_new.iloc[:623579]
test_data = df_new.iloc[623579:]

ct = ColumnTransformer([('label', OrdinalEncoder(), ['AD1', 'D_TYPE', 'GOODS_TYPE', 'DoW'])], remainder='passthrough')

x_train, t_train = train_data.drop('COUNT', axis=1), train_data['COUNT']
x_test, t_test = test_data.drop('COUNT', axis=1), test_data['COUNT'] 

x_train_trans = ct.fit_transform(x_train)
x_test_trans = ct.fit_transform(x_test)

t_train = t_train.values.ravel()
t_test = t_test.values.ravel()

model = RandomForestRegressor()
kfold = 5
kfold_score = cross_val_score(model, x_train_trans, t_train, cv=kfold, scoring='neg_mean_squared_error')
print(f'mse : {np.mean(-kfold_score)}')
print(f'rmse : {np.sqrt(np.mean(-kfold_score))}')

mse : 0.0773188646053536
rmse : 0.27806269905428455


* ## 기본 feature + 경과 주 + 달 + 일

In [134]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_squared_error

feature = ['D_TYPE', 'GOODS_TYPE', 'AD1','MONTH', 'DAY','COUNT', 'ELAPSED_WEEK']
df_new = df.loc[:,feature]

train_data = df_new.iloc[:623579]
test_data = df_new.iloc[623579:]

ct = ColumnTransformer([('label', OrdinalEncoder(), ['AD1', 'D_TYPE', 'GOODS_TYPE'])], remainder='passthrough')

x_train, t_train = train_data.drop('COUNT', axis=1), train_data['COUNT']
x_test, t_test = test_data.drop('COUNT', axis=1), test_data['COUNT'] 

x_train_trans = ct.fit_transform(x_train)
x_test_trans = ct.fit_transform(x_test)

t_train = t_train.values.ravel()
t_test = t_test.values.ravel()

model = RandomForestRegressor()
kfold = 5
kfold_score = cross_val_score(model, x_train_trans, t_train, cv=kfold, scoring='neg_mean_squared_error')
print(f'mse : {np.mean(-kfold_score)}')
print(f'rmse : {np.sqrt(np.mean(-kfold_score))}')

mse : 0.08116697034308935
rmse : 0.2848981753944545


* ## 기본 feature + 경과일 + 달 + 일

In [135]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_squared_error

feature = ['D_TYPE', 'GOODS_TYPE', 'AD1','MONTH','ELAPSED_DAY', 'DAY','COUNT', 'ELAPSED_WEEK']
df_new = df.loc[:,feature]

train_data = df_new.iloc[:623579]
test_data = df_new.iloc[623579:]

ct = ColumnTransformer([('label', OrdinalEncoder(), ['AD1', 'D_TYPE', 'GOODS_TYPE'])], remainder='passthrough')

x_train, t_train = train_data.drop('COUNT', axis=1), train_data['COUNT']
x_test, t_test = test_data.drop('COUNT', axis=1), test_data['COUNT'] 

x_train_trans = ct.fit_transform(x_train)
x_test_trans = ct.fit_transform(x_test)

t_train = t_train.values.ravel()
t_test = t_test.values.ravel()

model = RandomForestRegressor()
kfold = 5
kfold_score = cross_val_score(model, x_train_trans, t_train, cv=kfold, scoring='neg_mean_squared_error')
print(f'mse : {np.mean(-kfold_score)}')
print(f'rmse : {np.sqrt(np.mean(-kfold_score))}')

mse : 0.07791948631183183
rmse : 0.2791406210350472
