최종수정일시 : 2021-05-11 (모델링2)

#### - 기본 Feature : D_TYPE, GOODS_TYPE, AD1
#### - DATE 관련 추가 Feature : month(월), day(일), dayofweek(요일), holiday(일요일,국경일), weekend_indi(금,토)

In [87]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# 경고창 무시
import warnings
warnings.filterwarnings(action='ignore')

# matplotlib 한글 폰트 오류 해결
import platform
from matplotlib import font_manager, rc
plt.rcParams['axes.unicode_minus'] = False

if platform.system() == 'Darwin':
    rc('font', family='appleGothic')
elif platform.system() == 'Windows':
    path = "c:/Windows/Fonts/malgun.ttf"
    font_name = font_manager.FontProperties(fname=path).get_name()
    rc('font', family=font_name)
else:
    print('Unknown system.... sorry.....')
    
# 랜덤 시드 생성
np.random.seed(42)

### 데이터 불러오기 (미리 날짜별로 저장)

In [106]:
train = pd.read_csv('train.csv',encoding='ansi')
test = pd.read_csv('test.csv',encoding='ansi')

### 레이블(x)과 타겟(y)분리

In [131]:
train_X = train.drop('COUNT',axis=1)
train_y = train[['COUNT']]

test_X = test.drop('COUNT',axis=1)
test_y = test[['COUNT']]

### 필요없는 컬럼 제거

In [132]:
train_X.drop(['Unnamed: 0','USER_ID','JOIN_DATE','STORE_ID'],axis=1,inplace=True)
test_X.drop(['Unnamed: 0','USER_ID','JOIN_DATE','STORE_ID'],axis=1,inplace=True)

### 레이블인코딩 (문자형 -> 숫자형)

In [133]:
# Feature Encoding
from sklearn import preprocessing

le = preprocessing.LabelEncoder()

train_X['D_TYPE'] = le.fit_transform(train_X['D_TYPE'])
train_X['GOODS_TYPE'] = le.fit_transform(train_X['GOODS_TYPE'])
train_X['AD1'] = le.fit_transform(train_X['AD1'])

test_X['D_TYPE'] = le.fit_transform(test_X['D_TYPE'])
test_X['GOODS_TYPE'] = le.fit_transform(test_X['GOODS_TYPE'])
test_X['AD1'] = le.fit_transform(test_X['AD1'])

### DATE 컬럼을 데이트타임 타입으로 변경

In [134]:
train_X['DATE']=pd.to_datetime(train_X['DATE'])
test_X['DATE']=pd.to_datetime(test_X['DATE'])

### 월,일 컬럼 생성

In [135]:
train_X['Month'] = [k for k in train_X['DATE'].dt.month]
test_X['Month'] = [k for k in test_X['DATE'].dt.month]

In [136]:
train_X['Day'] = [k for k in train_X['DATE'].dt.day]
test_X['Day'] = [k for k in test_X['DATE'].dt.day] 

### 요일 컬럼 생성 {월:0, 화:1, 수:2, 목:3, 금:4, 토:5, 일:6}

In [137]:
## 요일 컬럼 생성
train_X['dayofweek'] = [k for k in train_X['DATE'].dt.dayofweek]
test_X['dayofweek'] = [k for k in test_X['DATE'].dt.dayofweek] 

### holiday 컬럼 생성 (구매율이 낮았던 일요일과 국경일)

In [139]:
# 2020년 holiday 지정
import pandas as pd
from pandas.tseries.holiday import AbstractHolidayCalendar, Holiday

class Calendar(AbstractHolidayCalendar):
    rules = [
        Holiday('New Year', month=1, day=1),
        Holiday('KR_New Year1', month=1, day=24),
        Holiday('KR_New Year2', month=1, day=25),
        Holiday('KR_New Year3', month=1, day=26),
        Holiday('3.1', month=3, day=1),
        Holiday('Buddha Day', month=4, day=30),
        Holiday('Memorial Day', month=6, day=6),
        Holiday('Liberation Day', month=8, day=15),
        Holiday('KR_Thanksgiving1', month=9, day=30),
        Holiday('KR_Thanksgiving2', month=10, day=1),
        Holiday('KR_Thanksgiving3', month=10, day=2),
        Holiday('National Foundation Day', month=10, day=3),
        Holiday('Hangul Day', month=10, day=9),
        Holiday('Christmas', month=12, day=25)
    ]
dr = pd.date_range(start='2020-01-01', end='2020-12-31')
cal = Calendar()
holidays = cal.holidays(start=dr.min(), end=dr.max())
train_X['Holiday'] = train_X['DATE'].isin(holidays)
test_X['Holiday'] = test_X['DATE'].isin(holidays)

# 일요일도 holiday로
for i in range(len(train_X)):
    if train_X['Day'][i] == 6:
        train_X['Holiday'][i] = True
        
for i in range(len(test_X)):
    if test_X['Day'][i] == 6:
        test_X['Holiday'][i] = True

### Weekend_indi 컬럼 생성 (구매율이 높았던 금,토)

In [140]:
train_X['Weekend_indi'] = 0
train_X['EveryDay'] = train_X.loc[train_X['Day'].isin([4,5]), 'Weekend_indi'] =1

In [141]:
test_X['Weekend_indi'] = 0
test_X['EveryDay'] = test_X.loc[test_X['Day'].isin([4,5]), 'Weekend_indi'] =1

### 필요없는 컬럼 제거

In [142]:
train_X.drop(['DATE','EveryDay'],axis=1,inplace=True)
test_X.drop(['DATE','EveryDay'],axis=1,inplace=True)

In [147]:
display(train_X, test_X)

Unnamed: 0,D_TYPE,GOODS_TYPE,AD1,Month,Day,dayofweek,Holiday,Weekend_indi
0,0,0,37,1,1,2,True,0
1,1,0,46,1,1,2,True,0
2,1,0,60,1,1,2,True,0
3,1,2,49,1,1,2,True,0
4,0,2,48,1,1,2,True,0
...,...,...,...,...,...,...,...,...
623574,1,1,37,9,30,2,True,0
623575,0,1,49,9,30,2,True,0
623576,0,1,3,9,30,2,True,0
623577,1,0,48,9,30,2,True,0


Unnamed: 0,D_TYPE,GOODS_TYPE,AD1,Month,Day,dayofweek,Holiday,Weekend_indi
0,0,0,52,10,1,3,True,0
1,1,3,50,10,1,3,True,0
2,1,0,52,10,1,3,True,0
3,1,0,22,10,1,3,True,0
4,0,0,64,10,1,3,True,0
...,...,...,...,...,...,...,...,...
255687,1,1,40,12,31,3,False,0
255688,1,1,19,12,31,3,False,0
255689,0,1,52,12,31,3,False,0
255690,1,1,50,12,31,3,False,0


### 랜덤포레스트 모델링

In [143]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

rfr = RandomForestRegressor()
    
rfr.fit(train_X, train_y)
    
y_pred = rfr.predict(test_X)

MSE = mean_squared_error(test_y, y_pred)
print('MSE :', MSE)

MSE : 0.35405773587254896


### 예측값과 원래값 비교

In [148]:
comparison = pd.DataFrame({'prediction': y_pred,
                          'ground_truth' : test_y.values.ravel()}) 
comparison

Unnamed: 0,prediction,ground_truth
0,1.014409,1
1,1.040651,1
2,1.041330,1
3,1.000000,1
4,1.081166,1
...,...,...
255687,1.002202,1
255688,1.000000,1
255689,1.066062,1
255690,1.002669,1


In [149]:
comparison.to_csv('비교.csv',encoding='ansi')