### Decision Tree

In [10]:
import pandas as pd
import numpy as np

#모델 라이브러리 선언
from sklearn import datasets,tree

#모델 정확도 라이브러리 선언
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score

### 데이터 전처리

In [131]:
featureData = pd.read_csv('./data/feature_regression_example.csv')

In [40]:
featureData.dtypes

REGIONID         object
PRODUCTGROUP     object
PRODUCT          object
ITEM             object
YEARWEEK          int64
YEAR              int64
WEEK              int64
QTY               int64
HOLIDAY          object
HCLUS             int64
PROMOTION        object
PRO_PERCENT     float64
dtype: object

In [132]:
featureData["PRO_YN"] = np.where(featureData.PROMOTION == 'Y', 1,0)
featureData["HO_YN"] = np.where(featureData.HOLIDAY == 'Y', 1,0)

featureData.head()

Unnamed: 0,REGIONID,PRODUCTGROUP,PRODUCT,ITEM,YEARWEEK,YEAR,WEEK,QTY,HOLIDAY,HCLUS,PROMOTION,PRO_PERCENT,PRO_YN,HO_YN
0,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201501,2015,1,1225,Y,1,Y,0.209442,1,1
1,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201502,2015,2,968,N,4,Y,0.209442,1,0
2,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201503,2015,3,1209,N,4,Y,0.208155,1,0
3,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201504,2015,4,1810,Y,2,Y,0.208155,1,1
4,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201505,2015,5,1773,N,4,Y,0.208155,1,0


##### get_dummies -> 열해당 열 데이터의 문자열을 숫자로 바꿔준다

In [38]:
featureData = pd.concat([featureData,pd.get_dummies(prefix="PROMO",data=featureData.PROMOTION)],axis=1)
featureData = pd.concat([featureData,pd.get_dummies(prefix="HOLYMO",data=featureData.HOLIDAY)],axis=1)

featureData.head()

Unnamed: 0,REGIONID,PRODUCTGROUP,PRODUCT,ITEM,YEARWEEK,YEAR,WEEK,QTY,HOLIDAY,HCLUS,PROMOTION,PRO_PERCENT,PROMO_N,PROMO_Y,HOLYMO_N,HOLYMO_Y
0,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201501,2015,1,1225,Y,1,Y,0.209442,0,1,0,1
1,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201502,2015,2,968,N,4,Y,0.209442,0,1,1,0
2,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201503,2015,3,1209,N,4,Y,0.208155,0,1,1,0
3,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201504,2015,4,1810,Y,2,Y,0.208155,0,1,0,1
4,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201505,2015,5,1773,N,4,Y,0.208155,0,1,1,0


###### labelEncoder 

In [48]:
from sklearn.preprocessing import LabelEncoder
le_pro = LabelEncoder()
le_holy = LabelEncoder()
featureData['PRO_ECO'] = le_pro.fit_transform(featureData.PROMOTION)
featureData["HOLY_ENCO"] = le_holy.fit_transform(featureData.HOLIDAY)

featureData.head()

Unnamed: 0,REGIONID,PRODUCTGROUP,PRODUCT,ITEM,YEARWEEK,YEAR,WEEK,QTY,HOLIDAY,HCLUS,PROMOTION,PRO_PERCENT,PRO_ECO,HOLY_ENCO
0,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201501,2015,1,1225,Y,1,Y,0.209442,1,1
1,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201502,2015,2,968,N,4,Y,0.209442,1,0
2,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201503,2015,3,1209,N,4,Y,0.208155,1,0
3,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201504,2015,4,1810,Y,2,Y,0.208155,1,1
4,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201505,2015,5,1773,N,4,Y,0.208155,1,0


##### 복구방법

In [49]:
le_pro.inverse_transform(featureData["HOLY_ENCO"])

array(['Y', 'N', 'N', 'Y', 'N', 'N', 'N', 'Y', 'N', 'N', 'N', 'N', 'N',
       'N', 'N', 'N', 'N', 'N', 'N', 'Y', 'Y', 'Y', 'N', 'N', 'Y', 'Y',
       'Y', 'Y', 'Y', 'N', 'N', 'N', 'N', 'N', 'Y', 'Y', 'Y', 'N', 'N',
       'N', 'N', 'Y', 'N', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'N', 'Y',
       'Y', 'N', 'N', 'Y', 'N', 'N', 'N', 'Y', 'N', 'N', 'N', 'N', 'N',
       'N', 'N', 'N', 'N', 'N', 'N', 'N', 'Y', 'Y', 'Y', 'N', 'N', 'Y',
       'Y', 'Y', 'Y', 'Y', 'N', 'N', 'N', 'N', 'Y', 'Y', 'Y', 'N', 'N',
       'N', 'N', 'Y', 'N', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'N', 'N',
       'Y', 'N', 'N'], dtype=object)

###### dictionary mapping 

In [54]:
binarymap = {"Y":1,"N":0}
featureData["PRO_DICT"] = featureData.PROMOTION.map(binarymap)
featureData.head()

Unnamed: 0,REGIONID,PRODUCTGROUP,PRODUCT,ITEM,YEARWEEK,YEAR,WEEK,QTY,HOLIDAY,HCLUS,PROMOTION,PRO_PERCENT,PRO_DICT
0,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201501,2015,1,1225,Y,1,Y,0.209442,1
1,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201502,2015,2,968,N,4,Y,0.209442,1
2,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201503,2015,3,1209,N,4,Y,0.208155,1
3,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201504,2015,4,1810,Y,2,Y,0.208155,1
4,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201505,2015,5,1773,N,4,Y,0.208155,1


#### 특성 선정 / 데이터분리

In [133]:
featureData = featureData[(featureData.YEARWEEK >= 201501 & (featureData.YEARWEEK <= 201652))]

In [134]:
corrDf = featureData.corr()

corrDf

Unnamed: 0,YEARWEEK,YEAR,WEEK,QTY,HCLUS,PRO_PERCENT,PRO_YN,HO_YN
YEARWEEK,1.0,0.961051,0.213022,0.037392,-0.030681,0.30032,0.108551,0.009395
YEAR,0.961051,1.0,-0.065302,-0.048803,0.067443,0.208435,0.085606,-0.070803
WEEK,0.213022,-0.065302,1.0,0.307541,-0.349205,0.347462,0.089293,0.284231
QTY,0.037392,-0.048803,0.307541,1.0,-0.54492,0.712772,0.630081,0.514813
HCLUS,-0.030681,0.067443,-0.349205,-0.54492,1.0,-0.552991,-0.386926,-0.974902
PRO_PERCENT,0.30032,0.208435,0.347462,0.712772,-0.552991,1.0,0.903477,0.496585
PRO_YN,0.108551,0.085606,0.089293,0.630081,-0.386926,0.903477,1.0,0.378861
HO_YN,0.009395,-0.070803,0.284231,0.514813,-0.974902,0.496585,0.378861,1.0


In [172]:
featuewsStd = 0.5
features = list(corrDf[abs(corrDf.QTY) > featuewsStd].index)

label = ['QTY']

featureData = featureData.sort_index()

alldataNum = 0.8
testIndex = round(len(featureData.YEARWEEK) * alldataNum)
testIndex

86

In [178]:
trainingData = featureData.iloc[0:testIndex,:]
testData = featureData.iloc[testIndex:,:]
trainingData_features = trainingData[features]
trainingData_label = trainingData[label]
testData_features = testData[features]
testData_label = testData[label]

### 모델 적용 

In [138]:
model_method = tree.DecisionTreeRegressor(random_state=1)
model = model_method.fit(trainingData_features,trainingData_label)
model

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=1, splitter='best')

### 예측 

In [139]:
predict = model.predict(testData_features)
predict

predictData = pd.DataFrame(predict)
predictData.columns = ["PREDICT"]

predictData

testData = testData.reset_index(drop=True)
resultData = pd.concat([testData,predictData],axis=1)

array([1685., 1522., 1504., 1504.,  753., 2100., 2505., 2069., 1773.,
        147., 1616., 1586., 1801., 2671., 2505., 2069., 1773., 1187.,
        337.,   16.,   16.])

In [145]:
resultData

Unnamed: 0,REGIONID,PRODUCTGROUP,PRODUCT,ITEM,YEARWEEK,YEAR,WEEK,QTY,HOLIDAY,HCLUS,PROMOTION,PRO_PERCENT,PRO_YN,HO_YN,PREDICT
0,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201634,2016,34,1700,Y,1,Y,0.308584,1,1,1685.0
1,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201635,2016,35,1514,Y,1,Y,0.308584,1,1,1522.0
2,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201636,2016,36,1501,Y,1,Y,0.308584,1,1,1504.0
3,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201637,2016,37,1491,N,4,Y,0.308584,1,0,1504.0
4,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201638,2016,38,806,N,4,Y,0.308584,1,0,753.0
5,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201639,2016,39,2111,N,4,Y,0.280258,1,0,2100.0
6,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201640,2016,40,2400,N,4,Y,0.280258,1,0,2505.0
7,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201641,2016,41,2010,Y,2,Y,0.280258,1,1,2069.0
8,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201642,2016,42,1900,N,4,Y,0.280258,1,0,1773.0
9,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201643,2016,43,141,N,4,N,0.0,0,0,147.0


### Ramdom forests

In [161]:
from sklearn.ensemble import RandomForestClassifier

model_method_random = RandomForestClassifier(criterion="entropy",n_estimators=10,n_jobs=2,random_state=1)

model_random = model_method_random.fit(trainingData_features,trainingData_label)

model_random

predictR = model_random.predict(testData_features)

predictRData = pd.DataFrame(predictR)
predictRData.columns = ["PREDICT"]

resultRData = pd.concat([testData,predictRData],axis=1)
resultRData

  """


Unnamed: 0,REGIONID,PRODUCTGROUP,PRODUCT,ITEM,YEARWEEK,YEAR,WEEK,QTY,HOLIDAY,HCLUS,PROMOTION,PRO_PERCENT,PRO_YN,HO_YN,PREDICT
0,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201634,2016,34,1700,Y,1,Y,0.308584,1,1,1685
1,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201635,2016,35,1514,Y,1,Y,0.308584,1,1,1586
2,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201636,2016,36,1501,Y,1,Y,0.308584,1,1,1586
3,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201637,2016,37,1491,N,4,Y,0.308584,1,0,1504
4,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201638,2016,38,806,N,4,Y,0.308584,1,0,753
5,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201639,2016,39,2111,N,4,Y,0.280258,1,0,2100
6,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201640,2016,40,2400,N,4,Y,0.280258,1,0,2240
7,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201641,2016,41,2010,Y,2,Y,0.280258,1,1,2143
8,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201642,2016,42,1900,N,4,Y,0.280258,1,0,2100
9,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201643,2016,43,141,N,4,N,0.0,0,0,147


### Linear Regression 

In [185]:
from sklearn import datasets, linear_model
from sklearn.linear_model import LinearRegression

model_method_Linear = LinearRegression()

model_Linear = model_method_Linear.fit(trainingData_features,trainingData_label)

predictL = model_Linear.predict(testData_features)

predictLData = pd.DataFrame(predictL)
predictLData.columns = ["PREDICT"]

resultLData = pd.concat([testData,predictLData],axis=1)
resultLData

Unnamed: 0,REGIONID,PRODUCTGROUP,PRODUCT,ITEM,YEARWEEK,YEAR,WEEK,QTY,HOLIDAY,HCLUS,PROMOTION,PRO_PERCENT,PRO_YN,HO_YN,PREDICT
0,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201634,2016,34,1700,Y,1,Y,0.308584,1,1,1700.0
1,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201635,2016,35,1514,Y,1,Y,0.308584,1,1,1514.0
2,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201636,2016,36,1501,Y,1,Y,0.308584,1,1,1501.0
3,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201637,2016,37,1491,N,4,Y,0.308584,1,0,1491.0
4,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201638,2016,38,806,N,4,Y,0.308584,1,0,806.0
5,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201639,2016,39,2111,N,4,Y,0.280258,1,0,2111.0
6,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201640,2016,40,2400,N,4,Y,0.280258,1,0,2400.0
7,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201641,2016,41,2010,Y,2,Y,0.280258,1,1,2010.0
8,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201642,2016,42,1900,N,4,Y,0.280258,1,0,1900.0
9,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201643,2016,43,141,N,4,N,0.0,0,0,141.0


In [187]:
from sklearn import datasets, linear_model
from sklearn.linear_model import LinearRegression

model_method_Linear = LinearRegression()

model_L = LinearRegression().fit(trainingData_features,trainingData_label)

model_L

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [None]:
r_sq = model_L.score