# Decision Tree Example

In [108]:
import numpy as np
import pandas as pd 

# 모델 라이브러리 선언
from sklearn import datasets, tree
from sklearn import datasets, linear_model

# 모델 정확도 라이브러리 선언
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score

# 시각화 라이브러리 선언
import matplotlib.pyplot as plt

# 마크다운 테스트 메세지

### 1. 분석데이터 로딩

In [61]:
# Read the CSV file into a DataFrame.
# The directory is spelled "dataset" (no trailing dot): "dataset./" only
# resolves on Windows, which strips trailing dots from path components.
featuresData = pd.read_csv("../dataset/feature_regression_example.csv")

In [62]:
# Correlation matrix of the numeric columns only. The frame also holds
# string columns (REGIONID, HOLIDAY, PROMOTION, ...), and pandas >= 2.0 no
# longer drops those silently inside corr().
featuresData.select_dtypes(include="number").corr()

Unnamed: 0,YEARWEEK,YEAR,WEEK,QTY,HCLUS,PRO_PERCENT
YEARWEEK,1.0,0.961051,0.213022,0.037392,-0.030681,0.30032
YEAR,0.961051,1.0,-0.065302,-0.048803,0.067443,0.208435
WEEK,0.213022,-0.065302,1.0,0.307541,-0.349205,0.347462
QTY,0.037392,-0.048803,0.307541,1.0,-0.54492,0.712772
HCLUS,-0.030681,0.067443,-0.349205,-0.54492,1.0,-0.552991
PRO_PERCENT,0.30032,0.208435,0.347462,0.712772,-0.552991,1.0


In [63]:
featuresData.head(1)

Unnamed: 0,REGIONID,PRODUCTGROUP,PRODUCT,ITEM,YEARWEEK,YEAR,WEEK,QTY,HOLIDAY,HCLUS,PROMOTION,PRO_PERCENT
0,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201501,2015,1,1225,Y,1,Y,0.209442


### 2. 데이터 형 변환

In [64]:
featuresData.dtypes

REGIONID         object
PRODUCTGROUP     object
PRODUCT          object
ITEM             object
YEARWEEK          int64
YEAR              int64
WEEK              int64
QTY               int64
HOLIDAY          object
HCLUS             int64
PROMOTION        object
PRO_PERCENT     float64
dtype: object

In [65]:
# Type conversion: cast each numeric modelling column to float64.
for col in ('WEEK', 'QTY', 'PRO_PERCENT'):
    featuresData[col] = featuresData[col].astype('float64')
featuresData.dtypes

REGIONID         object
PRODUCTGROUP     object
PRODUCT          object
ITEM             object
YEARWEEK          int64
YEAR              int64
WEEK            float64
QTY             float64
HOLIDAY          object
HCLUS             int64
PROMOTION        object
PRO_PERCENT     float64
dtype: object

### 3. 문자데이터 코드변환(Vector연산)

In [66]:
#case when 
############################
def codeConversion(df):
    """Encode a Y/N flag as an integer.

    Despite its name, ``df`` is a single cell value (the function is applied
    element-wise to a column), not a DataFrame.

    Returns 1 when the value equals "Y", otherwise 0.
    """
    return 1 if df == "Y" else 0

In [67]:
# Derive a 0/1 promotion flag from the Y/N PROMOTION column.
featuresData['PROMOTIONCODE'] = featuresData['PROMOTION'].map(codeConversion)
featuresData.head()

Unnamed: 0,REGIONID,PRODUCTGROUP,PRODUCT,ITEM,YEARWEEK,YEAR,WEEK,QTY,HOLIDAY,HCLUS,PROMOTION,PRO_PERCENT,PROMOTIONCODE
0,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201501,2015,1.0,1225.0,Y,1,Y,0.209442,1
1,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201502,2015,2.0,968.0,N,4,Y,0.209442,1
2,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201503,2015,3.0,1209.0,N,4,Y,0.208155,1
3,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201504,2015,4.0,1810.0,Y,2,Y,0.208155,1
4,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201505,2015,5.0,1773.0,N,4,Y,0.208155,1


In [68]:
# Derive a 0/1 holiday flag from the Y/N HOLIDAY column.
featuresData['HOLIDAYCODE'] = featuresData['HOLIDAY'].map(codeConversion)
featuresData.head()

Unnamed: 0,REGIONID,PRODUCTGROUP,PRODUCT,ITEM,YEARWEEK,YEAR,WEEK,QTY,HOLIDAY,HCLUS,PROMOTION,PRO_PERCENT,PROMOTIONCODE,HOLIDAYCODE
0,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201501,2015,1.0,1225.0,Y,1,Y,0.209442,1,1
1,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201502,2015,2.0,968.0,N,4,Y,0.209442,1,0
2,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201503,2015,3.0,1209.0,N,4,Y,0.208155,1,0
3,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201504,2015,4.0,1810.0,Y,2,Y,0.208155,1,1
4,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201505,2015,5.0,1773.0,N,4,Y,0.208155,1,0


In [69]:
featuresData.head()

Unnamed: 0,REGIONID,PRODUCTGROUP,PRODUCT,ITEM,YEARWEEK,YEAR,WEEK,QTY,HOLIDAY,HCLUS,PROMOTION,PRO_PERCENT,PROMOTIONCODE,HOLIDAYCODE
0,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201501,2015,1.0,1225.0,Y,1,Y,0.209442,1,1
1,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201502,2015,2.0,968.0,N,4,Y,0.209442,1,0
2,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201503,2015,3.0,1209.0,N,4,Y,0.208155,1,0
3,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201504,2015,4.0,1810.0,Y,2,Y,0.208155,1,1
4,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201505,2015,5.0,1773.0,N,4,Y,0.208155,1,0


In [70]:
# Correlation matrix including the newly added 0/1 code columns. Only the
# numeric columns are selected, because the frame still carries string
# columns and pandas >= 2.0 no longer drops those silently inside corr().
featuresData.select_dtypes(include="number").corr()

Unnamed: 0,YEARWEEK,YEAR,WEEK,QTY,HCLUS,PRO_PERCENT,PROMOTIONCODE,HOLIDAYCODE
YEARWEEK,1.0,0.961051,0.213022,0.037392,-0.030681,0.30032,0.108551,0.009395
YEAR,0.961051,1.0,-0.065302,-0.048803,0.067443,0.208435,0.085606,-0.070803
WEEK,0.213022,-0.065302,1.0,0.307541,-0.349205,0.347462,0.089293,0.284231
QTY,0.037392,-0.048803,0.307541,1.0,-0.54492,0.712772,0.630081,0.514813
HCLUS,-0.030681,0.067443,-0.349205,-0.54492,1.0,-0.552991,-0.386926,-0.974902
PRO_PERCENT,0.30032,0.208435,0.347462,0.712772,-0.552991,1.0,0.903477,0.496585
PROMOTIONCODE,0.108551,0.085606,0.089293,0.630081,-0.386926,0.903477,1.0,0.378861
HOLIDAYCODE,0.009395,-0.070803,0.284231,0.514813,-0.974902,0.496585,0.378861,1.0


### 4. 데이터 셋 분리

In [71]:
# Cut-off YEARWEEK for the split: rows up to and including this week are
# used for training, later weeks for testing.
predictStd = 201630

In [72]:
# Training rows: every week up to and including the cut-off (SQL-style WHERE filter).
trainingData = featuresData[featuresData['YEARWEEK'] <= predictStd]
trainingData.head()

Unnamed: 0,REGIONID,PRODUCTGROUP,PRODUCT,ITEM,YEARWEEK,YEAR,WEEK,QTY,HOLIDAY,HCLUS,PROMOTION,PRO_PERCENT,PROMOTIONCODE,HOLIDAYCODE
0,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201501,2015,1.0,1225.0,Y,1,Y,0.209442,1,1
1,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201502,2015,2.0,968.0,N,4,Y,0.209442,1,0
2,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201503,2015,3.0,1209.0,N,4,Y,0.208155,1,0
3,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201504,2015,4.0,1810.0,Y,2,Y,0.208155,1,1
4,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201505,2015,5.0,1773.0,N,4,Y,0.208155,1,0


In [73]:
# Test rows: every week strictly after the cut-off (SQL-style WHERE filter).
testData = featuresData[featuresData['YEARWEEK'] > predictStd]
testData.head()

Unnamed: 0,REGIONID,PRODUCTGROUP,PRODUCT,ITEM,YEARWEEK,YEAR,WEEK,QTY,HOLIDAY,HCLUS,PROMOTION,PRO_PERCENT,PROMOTIONCODE,HOLIDAYCODE
83,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201631,2016,31.0,1522.0,N,4,Y,0.280258,1,0
84,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201632,2016,32.0,2100.0,N,4,Y,0.280258,1,0
85,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201633,2016,33.0,43.0,N,4,N,0.0,0,0
86,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201634,2016,34.0,1700.0,Y,1,Y,0.308584,1,1
87,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201635,2016,35.0,1514.0,Y,1,Y,0.308584,1,1


In [74]:
# Feature matrix for training: week number, promotion ratio and holiday flag.
trainingData_feature = trainingData.loc[:, ['WEEK', 'PRO_PERCENT', 'HOLIDAYCODE']]

In [75]:
# Training target, kept as a one-column DataFrame.
trainingData_label = trainingData.loc[:, ['QTY']]
trainingData_label.head()

Unnamed: 0,QTY
0,1225.0
1,968.0
2,1209.0
3,1810.0
4,1773.0


In [76]:
# Feature matrix for testing: same columns, same order as the training features.
testData_feature = testData.loc[:, ['WEEK', 'PRO_PERCENT', 'HOLIDAYCODE']]
testData_feature.head()

Unnamed: 0,WEEK,PRO_PERCENT,HOLIDAYCODE
83,31.0,0.280258,0
84,32.0,0.280258,0
85,33.0,0.0,0
86,34.0,0.308584,1
87,35.0,0.308584,1


In [77]:
# Test target, kept as a one-column DataFrame.
testData_label = testData.loc[:, ['QTY']]

In [78]:
# from sklearn import tree
#clf = tree.DecisionTreeClassifier(criterion='entropy', max_depth=3, random_state=0)

### 5. 모델선언 및 예측

In [79]:
#model_method= tree.DecisionTreeClassifier(criterion='gini', max_depth=5, random_state=0)

In [80]:
# from sklearn import tree
#model_method = tree.DecisionTreeClassifier(criterion='entropy', max_depth=3, random_state=0)

In [169]:
# from sklearn import tree
#model_method = tree.DecisionTreeClassifier()
# Active estimator for this run: linear regression. The decision-tree
# variant above is commented out.
model_method = linear_model.LinearRegression()

In [148]:
### Extract Coefficient 머신러닝!!

In [170]:
# Train on the training split, then predict for the held-out feature rows.
model = model_method.fit(X=trainingData_feature, y=trainingData_label)
predict = model.predict(X=testData_feature)
predict

In [152]:
testData_label.head()

Unnamed: 0,QTY
83,1522.0
84,2100.0
85,43.0
86,1700.0
87,1514.0


### 6. 데이터 정리

In [172]:
type(predict)

numpy.ndarray

In [173]:
predictData = pd.DataFrame(predict)

In [155]:
predictData.head(5)

Unnamed: 0,0
0,2240.0
1,2240.0
2,147.0
3,1318.0
4,1318.0


In [174]:
predictData.columns = ["PREDICT"]

In [175]:
testData.head(5)

Unnamed: 0,REGIONID,PRODUCTGROUP,PRODUCT,ITEM,YEARWEEK,YEAR,WEEK,QTY,HOLIDAY,HCLUS,PROMOTION,PRO_PERCENT,PROMOTIONCODE,HOLIDAYCODE
0,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201631,2016,31.0,1522.0,N,4,Y,0.280258,1,0
1,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201632,2016,32.0,2100.0,N,4,Y,0.280258,1,0
2,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201633,2016,33.0,43.0,N,4,N,0.0,0,0
3,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201634,2016,34.0,1700.0,Y,1,Y,0.308584,1,1
4,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201635,2016,35.0,1514.0,Y,1,Y,0.308584,1,1


In [176]:
# Renumber the test rows from 0 so they line up with the prediction frame
# when the two are concatenated column-wise. Rebinding (instead of
# inplace=True) avoids mutating a frame that was sliced out of featuresData.
testData = testData.reset_index(drop=True)

In [121]:
testData.head(10)

Unnamed: 0,REGIONID,PRODUCTGROUP,PRODUCT,ITEM,YEARWEEK,YEAR,WEEK,QTY,HOLIDAY,HCLUS,PROMOTION,PRO_PERCENT,PROMOTIONCODE,HOLIDAYCODE
0,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201631,2016,31.0,1522.0,N,4,Y,0.280258,1,0
1,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201632,2016,32.0,2100.0,N,4,Y,0.280258,1,0
2,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201633,2016,33.0,43.0,N,4,N,0.0,0,0
3,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201634,2016,34.0,1700.0,Y,1,Y,0.308584,1,1
4,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201635,2016,35.0,1514.0,Y,1,Y,0.308584,1,1
5,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201636,2016,36.0,1501.0,Y,1,Y,0.308584,1,1
6,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201637,2016,37.0,1491.0,N,4,Y,0.308584,1,0
7,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201638,2016,38.0,806.0,N,4,Y,0.308584,1,0
8,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201639,2016,39.0,2111.0,N,4,Y,0.280258,1,0
9,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201640,2016,40.0,2400.0,N,4,Y,0.280258,1,0


In [177]:
# Give the predictions a fresh 0-based index so they align row-for-row with
# the test rows in the column-wise concat. Rebinding is used instead of
# inplace=True so the cell does not mutate an existing object.
predictData = predictData.reset_index(drop=True)

In [160]:
#testData_feature.reset_index(drop=True, inplace=True)

In [178]:
# Column-bind the predictions onto the test rows. pd.concat aligns on the
# index, so this relies on both frames carrying the same row labels.
finalDf = pd.concat(objs=[testData, predictData], axis="columns")

In [162]:
finalDf.head()

Unnamed: 0,REGIONID,PRODUCTGROUP,PRODUCT,ITEM,YEARWEEK,YEAR,WEEK,QTY,HOLIDAY,HCLUS,PROMOTION,PRO_PERCENT,PROMOTIONCODE,HOLIDAYCODE,PREDICT
0,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201631,2016,31.0,1522.0,N,4,Y,0.280258,1,0,2240.0
1,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201632,2016,32.0,2100.0,N,4,Y,0.280258,1,0,2240.0
2,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201633,2016,33.0,43.0,N,4,N,0.0,0,0,147.0
3,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201634,2016,34.0,1700.0,Y,1,Y,0.308584,1,1,1318.0
4,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201635,2016,35.0,1514.0,Y,1,Y,0.308584,1,1,1318.0


In [179]:
finalDf.to_csv("./dt_reg_result_20181022.csv")

In [127]:
pwd

'C:\\Users\\cj\\Python_CJ_ST_COPY\\Session01 - Why Python for Data Analysis'

### 7. 정확도 측정

In [180]:
# Mean absolute error of the predictions on the test rows. Stored in a
# variable (previously only hinted at in a comment) so the comparison cell
# at the end of the notebook can read it.
rlt_linear_reg_mae = mean_absolute_error(finalDf['QTY'], finalDf['PREDICT'])
rlt_linear_reg_mae

455.62189421546094

In [181]:
# Mean squared error of the predictions on the test rows, stored under the
# name the original trailing comment indicated instead of being discarded.
rlt_linear_reg_mse = mean_squared_error(finalDf['QTY'], finalDf['PREDICT'])
rlt_linear_reg_mse

376307.3344786445

In [182]:
# R^2 score of the predictions on the test rows, stored under the name the
# original trailing comment indicated instead of being discarded.
rlt_linear_reg_r2 = r2_score(finalDf['QTY'], finalDf['PREDICT'])
rlt_linear_reg_r2

0.38670554032608817

In [183]:
finalDf.to_csv("./regression_result.csv")

In [166]:
# NOTE(review): rslt_dctree_MAE is not assigned in any cell visible in this
# notebook; presumably it was left in the kernel by an earlier decision-tree
# run. Unless it is defined elsewhere, this cell raises NameError on a fresh
# kernel — TODO define it from a decision-tree fit or remove the cell.
rslt_dctree_MAE

376.2083333333333

In [167]:
rlt_linear_reg_mae

455.62189421546094