### 김태영, 주차장 수요예측

In [1]:
## 환경 세팅 
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 
%matplotlib inline 

from IPython.display import set_matplotlib_formats
set_matplotlib_formats('retina')

pd.options.display.float_format = '{:.0f}'.format

def get_font_family():
    import platform
    system_name = platform.system()
    # colab 사용자는 system_name이 'Linux'로 확인
    if system_name == "Darwin" :
        font_family = "AppleGothic"
    elif system_name == "Windows":
        font_family = "Malgun Gothic"
    else:
        !apt-get update -qq
        !apt-get install fonts-nanum -qq  > /dev/null
        import matplotlib.font_manager as fm
        fontpath = '/usr/share/fonts/truetype/nanum/NanumBarunGothic.ttf'
        font = fm.FontProperties(fname=fontpath, size=9)
        fm._rebuild()
        font_family = "NanumBarunGothic"
    return font_family

get_font_family()
font_family = get_font_family()
plt.rc('font', family=font_family)
plt.rc('axes', unicode_minus=False)
plt.style.use('ggplot')

# seed 설정
np.random.seed(42)

In [2]:
#데이터 가져오기
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
traffic = pd.read_csv('traffic.csv')
test.head()

Unnamed: 0.1,Unnamed: 0,USER_ID,JOIN_DATE,D_TYPE,STORE_ID,GOODS_TYPE,DATE,COUNT,AD1
0,0,15521,2014-09-12,AA,182320,A,2020-10-01,1,JRR
1,1,17651,2014-09-19,BB,82434,D,2020-10-01,1,J
2,2,26388,2014-10-25,BB,182320,A,2020-10-01,1,JRR
3,3,81772,2015-05-02,BB,204438,A,2020-10-01,1,DJ
4,4,92258,2015-05-24,AA,220745,A,2020-10-01,1,SD


In [3]:
#교통량 데이터 DATE값 일치 시켜주기
traffic['DATE'] = pd.to_datetime(traffic['DATE']).apply(lambda x:x.strftime('%Y-%m-%d'))

In [4]:
#교통량 데이터 train과 test로 나눠주기
traffic_train = traffic.loc[traffic['DATE'].between('2020-01-01', '2020-09-30')]
traffic_test = traffic.loc[traffic['DATE'].between('2020-10-01', '2020-12-31')]

In [5]:
#merge
train = pd.merge(train, traffic_train, how='outer', on='DATE')
test = pd.merge(test, traffic_test, how='outer', on='DATE')

In [6]:
#단골손님 혹은 기업손님 일반손님과 분리하기
train['R_customer'] = np.where(train['COUNT']>=20, True, False)
test['R_customer'] = np.where(test['COUNT']>=20, True, False)

In [7]:
#COUNT값을 드랍하고 시작
train_X = train.drop('COUNT',axis=1)
train_y = train[['COUNT']]

test_X = test.drop('COUNT',axis=1)
test_y = test[['COUNT']]

In [8]:
#부절절하다고 판단되는 피쳐들 제외
train_X.drop(['Unnamed: 0','USER_ID','JOIN_DATE','STORE_ID'],axis=1,inplace=True)
test_X.drop(['Unnamed: 0','USER_ID','JOIN_DATE','STORE_ID'],axis=1,inplace=True)

In [9]:
# Feature Encoding
from sklearn import preprocessing

le = preprocessing.LabelEncoder()

train_X['D_TYPE'] = le.fit_transform(train_X['D_TYPE'])
train_X['GOODS_TYPE'] = le.fit_transform(train_X['GOODS_TYPE'])
train_X['AD1'] = le.fit_transform(train_X['AD1'])


test_X['D_TYPE'] = le.fit_transform(test_X['D_TYPE'])
test_X['GOODS_TYPE'] = le.fit_transform(test_X['GOODS_TYPE'])
test_X['AD1'] = le.fit_transform(test_X['AD1'])

In [10]:
# 년도, 월, 일 나눠주기위해 설정
train_X['DATE']=pd.to_datetime(train_X['DATE'])
test_X['DATE']=pd.to_datetime(test_X['DATE'])

In [11]:
## 요일 컬럼 생성
train_X['Day'] = [k for k in train_X['DATE'].dt.dayofweek]
test_X['Day'] = [k for k in test_X['DATE'].dt.dayofweek] 

In [12]:
## 년도 컬럼 생성
train_X['년도'] = train_X.DATE.dt.strftime('%Y')
test_X['년도'] = test_X.DATE.dt.strftime('%Y')

In [13]:
## 월 컬럼 생성
train_X['월'] = train_X.DATE.dt.strftime('%m')
test_X['월'] = test_X.DATE.dt.strftime('%m')

In [14]:
## 일 컬럼 생성
train_X['일'] = train_X.DATE.dt.strftime('%d')
test_X['일'] = test_X.DATE.dt.strftime('%d')

In [15]:
# 2020년 holiday 지정
import pandas as pd
from pandas.tseries.holiday import AbstractHolidayCalendar, Holiday

class Calendar(AbstractHolidayCalendar):
    rules = [
        Holiday('New Year', month=1, day=1),
        Holiday('KR_New Year1', month=1, day=24),
        Holiday('KR_New Year2', month=1, day=25),
        Holiday('KR_New Year3', month=1, day=26),
        Holiday('3.1', month=3, day=1),
        Holiday('Buddha Day', month=4, day=30),
        Holiday('Memorial Day', month=6, day=6),
        Holiday('Liberation Day', month=8, day=15),
        Holiday('KR_Thanksgiving1', month=9, day=30),
        Holiday('KR_Thanksgiving2', month=10, day=1),
        Holiday('KR_Thanksgiving3', month=10, day=2),
        Holiday('National Foundation Day', month=10, day=3),
        Holiday('Hangul Day', month=10, day=9),
        Holiday('Christmas', month=12, day=25)
    ]
dr = pd.date_range(start='2020-01-01', end='2020-12-31')
cal = Calendar()
holidays = cal.holidays(start=dr.min(), end=dr.max())
train_X['Holiday'] = train_X['DATE'].isin(holidays)
test_X['Holiday'] = test_X['DATE'].isin(holidays)

# 일요일도 holiday로
for i in range(len(train_X)):
    if train_X['Day'][i] == 6:
        train_X['Holiday'][i] = True
        
for i in range(len(test_X)):
    if test_X['Day'][i] == 6:
        test_X['Holiday'][i] = True

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_X['Holiday'][i] = True
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_X['Holiday'][i] = True


### 금,토요일 주말로 지정

In [16]:
# 금요일 토요일을 주말로 지정해주기위한 설정
train_X['Weekend_indi'] = 0
train_X['EveryDay'] = train_X.loc[train_X['Day'].isin([4,5]), 'Weekend_indi'] =1

In [17]:
test_X['Weekend_indi'] = 0
test_X['EveryDay'] = test_X.loc[test_X['Day'].isin([4,5]), 'Weekend_indi'] =1

In [18]:
# 불필요한 피쳐 드랍
train_X.drop('DATE',axis=1,inplace=True)
test_X.drop('DATE',axis=1,inplace=True)

In [19]:
# 불필요한 피쳐 드랍
train_X = train_X.drop(columns=['EveryDay'])
test_X = test_X.drop(columns=['EveryDay'])

In [20]:
train_X

Unnamed: 0,D_TYPE,GOODS_TYPE,AD1,합계,R_customer,Day,년도,월,일,Holiday,Weekend_indi
0,0,0,37,7347299,False,2,2020,01,01,True,0
1,1,0,46,7347299,False,2,2020,01,01,True,0
2,1,0,60,7347299,False,2,2020,01,01,True,0
3,1,2,49,7347299,False,2,2020,01,01,True,0
4,0,2,48,7347299,False,2,2020,01,01,True,0
...,...,...,...,...,...,...,...,...,...,...,...
623574,1,1,37,7129682,False,2,2020,09,30,True,0
623575,0,1,49,7129682,False,2,2020,09,30,True,0
623576,0,1,3,7129682,False,2,2020,09,30,True,0
623577,1,0,48,7129682,False,2,2020,09,30,True,0


In [21]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

rfr = RandomForestRegressor()
    
rfr.fit(train_X, train_y)
    
y_pred = rfr.predict(test_X)

MSE = mean_squared_error(test_y, y_pred)
print('MSE :', MSE)

  rfr.fit(train_X, train_y)


MSE : 0.09985309833875546
