### 라이브러리 선언

In [2]:
import numpy as np
import pandas as pd 

# 모델 라이브러리 선언
from sklearn import datasets, tree

# 모델 정확도 라이브러리 선언
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score

# CSV 파일을 읽어 DataFrame 변수에 저장하기
featuresData = pd.read_csv("../dataset./feature_regression_example.csv")
featuresData.head()

Unnamed: 0,REGIONID,PRODUCTGROUP,PRODUCT,ITEM,YEARWEEK,YEAR,WEEK,QTY,HOLIDAY,HCLUS,PROMOTION,PRO_PERCENT
0,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201501,2015,1,1225,Y,1,Y,0.209442
1,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201502,2015,2,968,N,4,Y,0.209442
2,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201503,2015,3,1209,N,4,Y,0.208155
3,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201504,2015,4,1810,Y,2,Y,0.208155
4,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201505,2015,5,1773,N,4,Y,0.208155


### 1. 타입 통합 / 특성 숫자컬럼추가

In [3]:
# 데이터 타입 표준화
featuresData["YEARWEEK"] = featuresData.YEARWEEK.astype(int)
featuresData["YEAR"] = featuresData.YEARWEEK.astype(int)
featuresData["WEEK"] = featuresData.YEARWEEK.astype(int)

# 특성 타입 추가
featuresData["HO_YN"] = np.where(featuresData.HOLIDAY == "Y", 1,0)
featuresData["PRO_YN"] = np.where(featuresData.PROMOTION == "Y", 1,0)
featuresData.head()

Unnamed: 0,REGIONID,PRODUCTGROUP,PRODUCT,ITEM,YEARWEEK,YEAR,WEEK,QTY,HOLIDAY,HCLUS,PROMOTION,PRO_PERCENT,HO_YN,PRO_YN
0,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201501,201501,201501,1225,Y,1,Y,0.209442,1,1
1,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201502,201502,201502,968,N,4,Y,0.209442,0,1
2,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201503,201503,201503,1209,N,4,Y,0.208155,0,1
3,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201504,201504,201504,1810,Y,2,Y,0.208155,1,1
4,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201505,201505,201505,1773,N,4,Y,0.208155,0,1


In [4]:
# 데이터 조회
featuresData = featuresData[ (featuresData.YEARWEEK >= 201501) & \
                             (featuresData.YEARWEEK <= 201652)]
featuresData.head()
print(featuresData.YEARWEEK.min())
print(featuresData.YEARWEEK.max())

201501
201652


### 2. 특성선정 / 데이터 분리

In [5]:
# 상관관계 확인
corrDf = featuresData.corr()
corrDf

Unnamed: 0,YEARWEEK,YEAR,WEEK,QTY,HCLUS,PRO_PERCENT,HO_YN,PRO_YN
YEARWEEK,1.0,1.0,1.0,0.112267,-0.071586,0.404889,0.049867,0.205916
YEAR,1.0,1.0,1.0,0.112267,-0.071586,0.404889,0.049867,0.205916
WEEK,1.0,1.0,1.0,0.112267,-0.071586,0.404889,0.049867,0.205916
QTY,0.112267,0.112267,0.112267,1.0,-0.53723,0.700195,0.505932,0.612451
HCLUS,-0.071586,-0.071586,-0.071586,-0.53723,1.0,-0.545619,-0.974601,-0.374072
PRO_PERCENT,0.404889,0.404889,0.404889,0.700195,-0.545619,1.0,0.487062,0.898554
HO_YN,0.049867,0.049867,0.049867,0.505932,-0.974601,0.487062,1.0,0.365148
PRO_YN,0.205916,0.205916,0.205916,0.612451,-0.374072,0.898554,0.365148,1.0


In [8]:
# 인덱스컬럼 초기화
corrResult = corrDf.reset_index()

# 상관관계 계수 비율 선정
featuresStd = 0.5

# Feature 선택
features = corrResult[ (corrResult.QTY != 1) & 
            (abs(corrResult.QTY) > featuresStd)]["index"].tolist()
features

label = ['QTY']
label

['QTY']

In [9]:
# 데이터 분리
yearweekStd = 201630

trainingData_features = featuresData[featuresData.YEARWEEK <= yearweekStd][features]
trainingData_label = featuresData[featuresData.YEARWEEK <= yearweekStd][label]
testData_features = featuresData[featuresData.YEARWEEK > yearweekStd][features]
testData_label = featuresData[featuresData.YEARWEEK > yearweekStd]

### 3. 모델 적용

In [10]:
# Define Model
model_method = tree.DecisionTreeClassifier()

# Learning (Feature & Label)
model = model_method.fit(trainingData_features, trainingData_label)

### 4. 예측

In [12]:
# 예측
predict = model.predict(testData_features)
predict

array([ 973,  973,   16,  968,  968,  968,  973,  973,  973,  973, 1172,
        973,   16, 2033, 2033, 2033, 2033, 2033,  973,  973,  973,  968],
      dtype=int64)

### 5. 데이터 정리

In [14]:
### 데이터프레임으로 변환
predictData = pd.DataFrame(predict)

### 컬럼명 주입
predictData.columns = ["PREDICT"]
predictData.head()

Unnamed: 0,PREDICT
0,973
1,973
2,16
3,968
4,968


In [18]:
testData = featuresData[featuresData.YEARWEEK > yearweekStd]

In [19]:
### 테스트데이터 정답지 인덱스 초기화
testDataFinal = testData.reset_index(drop=True)
testDataFinal.head(2)

### 테스트데이터 + 예측값 붙이기 (index 끼리)
finalResult = pd.concat([testDataFinal, predictData], axis = 1)
finalResult.head(2)

Unnamed: 0,REGIONID,PRODUCTGROUP,PRODUCT,ITEM,YEARWEEK,YEAR,WEEK,QTY,HOLIDAY,HCLUS,PROMOTION,PRO_PERCENT,HO_YN,PRO_YN,PREDICT
0,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201631,201631,201631,1522,N,4,Y,0.280258,0,1,973
1,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201632,201632,201632,2100,N,4,Y,0.280258,0,1,973


In [23]:
### MAE
mean_absolute_error(finalResult['QTY'], finalResult['PREDICT'])

### RMSE (MSE)
mean_squared_error(finalResult['QTY'], finalResult['PREDICT'])

### R2
r2_score(finalResult['QTY'], finalResult['PREDICT'])   

-0.060431446102753084