# Decision Tree Example

### 라이브러리 선언

In [1]:
import numpy as np
import pandas as pd 

# 모델 라이브러리 선언
from sklearn import datasets, tree

# 모델 정확도 라이브러리 선언
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score

# 시각화 라이브러리 선언
import matplotlib.pyplot as plt

### 1. 분석데이터 로딩

In [2]:
#CSV 파일을 읽어 DataFrame 변수에 저장하기
featuresData = pd.read_csv("../dataset./feature_regression_example.csv")

In [10]:
featuresData.dtypes

REGIONID         object
PRODUCTGROUP     object
PRODUCT          object
ITEM             object
YEARWEEK          int64
YEAR              int64
WEEK              int64
QTY               int64
HOLIDAY          object
HCLUS             int64
PROMOTION        object
PRO_PERCENT     float64
dtype: object

### 2. 데이터 셋 분리

In [20]:
predictStd = 201630

In [21]:
trainingData = featuresData[
    featuresData.YEARWEEK <= predictStd]
print(len(trainingData))
trainingData.head()

83


Unnamed: 0,REGIONID,PRODUCTGROUP,PRODUCT,ITEM,YEARWEEK,YEAR,WEEK,QTY,HOLIDAY,HCLUS,PROMOTION,PRO_PERCENT
0,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201501,2015,1,1225,Y,1,Y,0.209442
1,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201502,2015,2,968,N,4,Y,0.209442
2,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201503,2015,3,1209,N,4,Y,0.208155
3,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201504,2015,4,1810,Y,2,Y,0.208155
4,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201505,2015,5,1773,N,4,Y,0.208155


In [23]:
testData = featuresData[
    (featuresData.YEARWEEK > predictStd) &
    (featuresData.YEARWEEK <= 201701)]
print(len(testData))
testData.head()

22


Unnamed: 0,REGIONID,PRODUCTGROUP,PRODUCT,ITEM,YEARWEEK,YEAR,WEEK,QTY,HOLIDAY,HCLUS,PROMOTION,PRO_PERCENT
83,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201631,2016,31,1522,N,4,Y,0.280258
84,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201632,2016,32,2100,N,4,Y,0.280258
85,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201633,2016,33,43,N,4,N,0.0
86,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201634,2016,34,1700,Y,1,Y,0.308584
87,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201635,2016,35,1514,Y,1,Y,0.308584


In [16]:
testData_feature = testData[['WEEK','PRO_PERCENT','HOLIDAYCODE']]

In [17]:
testData_label = testData[['QTY']]

In [18]:
# from sklearn import tree
#clf = tree.DecisionTreeClassifier(criterion='entropy', max_depth=3, random_state=0)

### 3. 모델선언 및 학습

In [25]:
trainingData_feature = trainingData[['WEEK','PRO_PERCENT']]

In [26]:
trainingData_label = trainingData[['QTY']]

In [29]:
# Define Model
model_method = tree.DecisionTreeClassifier()

In [30]:
# Learning (Feature & Label)
model = model_method.fit(trainingData_feature, trainingData_label)

### 4. 예측

In [32]:
testData_feature = testData[['WEEK','PRO_PERCENT']]

In [33]:
predict = model.predict(testData_feature)

In [35]:
predict

array([2240, 2240,  147, 1318, 1318, 1318, 1318, 1318, 2240, 2240, 2240,
       2240,  230, 1454, 1454, 2033, 2033, 3691, 1758, 1758, 1758, 1758],
      dtype=int64)

### 5. 데이터 정리

In [39]:
### 데이터프레임으로 변환
predictData = pd.DataFrame(predict)

In [41]:
### 컬럼명 주입
predictData.columns = ["PREDICT"]
predictData.head()

Unnamed: 0,PREDICT
0,2240
1,2240
2,147
3,1318
4,1318


In [46]:
### 테스트데이터 인덱스 초기화
testDataFinal = testData.reset_index(drop=True)
testDataFinal.head(2)

Unnamed: 0,REGIONID,PRODUCTGROUP,PRODUCT,ITEM,YEARWEEK,YEAR,WEEK,QTY,HOLIDAY,HCLUS,PROMOTION,PRO_PERCENT
0,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201631,2016,31,1522,N,4,Y,0.280258
1,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201632,2016,32,2100,N,4,Y,0.280258


In [51]:
### 컬럼 붙이기 (index 끼리)
finalResult = pd.concat([testDataFinal, predictData], axis = 1)
finalResult.head(2)

Unnamed: 0,REGIONID,PRODUCTGROUP,PRODUCT,ITEM,YEARWEEK,YEAR,WEEK,QTY,HOLIDAY,HCLUS,PROMOTION,PRO_PERCENT,PREDICT
0,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201631,2016,31,1522,N,4,Y,0.280258,2240
1,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201632,2016,32,2100,N,4,Y,0.280258,2240


### 6. 정확도 측정

In [53]:
mean_absolute_error(finalResult['QTY'], finalResult['PREDICT'])  

362.09090909090907

In [54]:
mean_squared_error(finalResult['QTY'], finalResult['PREDICT'])  

261032.63636363635

In [55]:
r2_score(finalResult['QTY'], finalResult['PREDICT'])  

0.4409292918937219

In [43]:
finalDf.to_csv("./decisiontree_result.csv")