# 라이브러리 선언

In [1]:
# 스프레드시트의 행과열이 있는 데이터 조작 라이브러리
import pandas as pd
import numpy as np

# 머신러닝 모델 라이브러리
from sklearn import tree, ensemble, svm

# 시각화 라이브러리
import matplotlib.pyplot as plt

# 머신러닝 전처리 라이브러리
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# 데이터 불러오기

In [2]:
# Location of the example feature table (CSV hosted on GitHub).
featureGitUrl = "https://raw.githubusercontent.com/hyokwan/python-lecture/master/dataset/feature_regression_example.csv"
# Read the CSV into a DataFrame and preview its first two rows.
featuresData = pd.read_csv(filepath_or_buffer=featureGitUrl)
featuresData.head(n=2)

Unnamed: 0,REGIONID,PRODUCTGROUP,PRODUCT,ITEM,YEARWEEK,YEAR,WEEK,QTY,HOLIDAY,HCLUS,PROMOTION,PRO_PERCENT
0,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201501,2015,1,1225,Y,1,Y,0.209442
1,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201502,2015,2,968,N,4,Y,0.209442


# 1. 타입 통합/ 특성숫자컬럼 추가

In [3]:
# Distinct values found in the PROMOTION column (first occurrence of each is kept).
featuresData["PROMOTION"].drop_duplicates()

0    Y
9    N
Name: PROMOTION, dtype: object

In [4]:
### 바꿔야 하는 대상이 1,2 -> np.where !!! 전처리 함수를 만든다
def ynVectorFuc(inValue):
    """Encode a Y/N flag as an integer.

    Returns 1 when ``inValue`` equals "Y" and 0 for any other value.
    """
    return 1 if inValue == "Y" else 0
### When 3-5 distinct values must be converted -> build a dictionary and apply it with map()
ynMap = {"Y": 1, "N": 0}
# The mapped Series is not stored; this line only demonstrates the call.
featuresData["PROMOTION"].map(ynMap)
### When more than 5 distinct values must be converted:
### LabelEncoder sorts the column's category values in ascending order and
### replaces each value with its index in that ordering.
ynLabel = LabelEncoder()
featuresData["LE_PROMOTION"] = ynLabel.fit_transform(featuresData["PROMOTION"])

In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
featuresData.describe()

Unnamed: 0,YEARWEEK,YEAR,WEEK,QTY,HCLUS,PRO_PERCENT,LE_PROMOTION
count,105.0,105.0,105.0,105.0,105.0,105.0,105.0
mean,201576.27619,2015.495238,26.752381,1210.238095,2.742857,0.20578,0.761905
std,52.254278,0.502375,15.229514,820.097819,1.587382,0.128636,0.42796
min,201501.0,2015.0,1.0,15.0,0.0,0.0,0.0
25%,201527.0,2015.0,14.0,542.0,1.0,0.208155,1.0
50%,201553.0,2015.0,27.0,1139.0,4.0,0.209442,1.0
75%,201626.0,2016.0,40.0,1753.0,4.0,0.280258,1.0
max,201652.0,2016.0,53.0,4035.0,4.0,0.421888,1.0


In [6]:
### 테이블정의서 동일하게 타입을 정의해야 한다
### 특성 숫자 컬럼 추가

In [7]:
### np.where, function, loc
### 추가로 Label Encoder, Dictionary map 함수 써서 사용하는 방법!

In [8]:
# featuresData["LE_PROMOTION"] = np.where( featuresData.PROMOTION=="Y",1,0)
# featuresData["LE_HOLIDAY"] = np.where( featuresData.HOLIDAY=="Y",1,0)

In [9]:
# Y/N -> 1/0 lookup table shared by both flag columns.
ynDict = {"Y": 1, "N": 0}
# Add integer-encoded copies of the PROMOTION and HOLIDAY flags.
featuresData["LE_PROMOTION"] = featuresData["PROMOTION"].map(ynDict)
featuresData["LE_HOLIDAY"] = featuresData["HOLIDAY"].map(ynDict)

In [10]:
## Label encoder
ynLabel = LabelEncoder()
featuresData["LE_PROMOTION"] = ynLabel.fit_transform(featuresData["PROMOTION"])
# Inspect how the categories were labelled (category values are sorted ascending, then indexed).
ynLabel.classes_
# Decode the integer labels back into their original categories in a temporary
# column, then drop that column again.
featuresData["LE_INV_PROMOTION"] = ynLabel.inverse_transform(featuresData["LE_PROMOTION"])
featuresData = featuresData.drop("LE_INV_PROMOTION", axis=1)

# 2. 특성 선정 및 데이터 분리

In [11]:
### 특성 선정

In [12]:
# Display the current feature table.
featuresData

Unnamed: 0,REGIONID,PRODUCTGROUP,PRODUCT,ITEM,YEARWEEK,YEAR,WEEK,QTY,HOLIDAY,HCLUS,PROMOTION,PRO_PERCENT,LE_PROMOTION,LE_HOLIDAY
0,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201501,2015,1,1225,Y,1,Y,0.209442,1,1
1,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201502,2015,2,968,N,4,Y,0.209442,1,0
2,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201503,2015,3,1209,N,4,Y,0.208155,1,0
3,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201504,2015,4,1810,Y,2,Y,0.208155,1,1
4,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201505,2015,5,1773,N,4,Y,0.208155,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201648,2016,48,2412,Y,0,Y,0.421888,1,1
101,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201649,2016,49,1955,N,4,Y,0.421888,1,0
102,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201650,2016,50,1800,N,4,Y,0.352361,1,0
103,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201651,2016,51,1173,N,4,Y,0.352361,1,0


In [13]:
# Pearson correlation between every pair of numeric columns.
corrDf = featuresData.corr(method="pearson", numeric_only=True)
# Minimum absolute correlation with the label for a column to be kept as a feature.
stdCorr = 0.3
label = ["QTY"]
# Absolute correlation of each numeric column with the label column.
labelCorr = corrDf[label[0]].abs()
# Keep the columns at or above the threshold. The label itself is excluded by
# name rather than by testing `corr != 1`, which depends on exact floating-point
# equality and would also drop any other column that is perfectly correlated.
features = list(
    labelCorr[(labelCorr >= stdCorr) & ~labelCorr.index.isin(label)].index
)
print( f"문제지 {features}, 정답지: {label} " )

문제지 ['HCLUS', 'PRO_PERCENT', 'LE_PROMOTION', 'LE_HOLIDAY'], 정답지: ['QTY'] 


In [14]:
### 데이터 분리 (쉽게 접근하는 방법)

In [15]:
# Hold out 20% of the rows as test data; the fixed seed makes the split repeatable.
trainData, testData = train_test_split(featuresData, test_size=0.2, random_state=1)
## Within the training and test data, separate the feature columns from the label column.
trainingDataFeatures, trainingDataLabel = (
    trainData.loc[:, features],
    trainData.loc[:, label],
)
testDataFeatures, testDataLabel = (
    testData.loc[:, features],
    testData.loc[:, label],
)

In [16]:
# Print the (rows, columns) shape of each split, one per line.
print(
    *(
        split.shape
        for split in (
            trainingDataFeatures,
            trainingDataLabel,
            testDataFeatures,
            testDataLabel,
        )
    ),
    sep="\n",
)

(84, 4)
(84, 1)
(21, 4)
(21, 1)


# 3. 모델 적용

In [17]:
# Decision-tree regressor with a fixed seed so training is repeatable.
dtModel = tree.DecisionTreeRegressor(random_state=1)
# Fit on the training features and label, keeping the returned fitted estimator.
fittedDtModel = dtModel.fit(X=trainingDataFeatures, y=trainingDataLabel)

# 4. 예측

In [18]:
# Predict the label for every row of the test features.
dtPredictValue = fittedDtModel.predict(X=testDataFeatures)
# Number of predictions produced.
len(dtPredictValue)

21

In [19]:
# The trained bundle: show the feature columns the model was fitted on.
features
# fittedDtModel

['HCLUS', 'PRO_PERCENT', 'LE_PROMOTION', 'LE_HOLIDAY']

In [20]:
# featuresData.describe()

In [21]:
#### Build a sample input for prediction and check the result
# HCLUS takes values 0-4 -- 0: large-scale holiday, 4: small-scale holiday
inHCLUS = 2
# Promotion information
inPropercent = 0.0
inPromotion = "N"
inHoliday = "N"
# The same preprocessing that was applied for training must be applied before predicting
inPromotion = ynDict[ inPromotion ]
inHoliday = ynDict[ inHoliday ]
# Name the columns after the training features so the input carries the same
# feature names the model was fitted with. An unnamed frame makes scikit-learn
# warn about missing feature names. The values are listed in the order
# HCLUS, PRO_PERCENT, LE_PROMOTION, LE_HOLIDAY; if `features` ever has a
# different length, the DataFrame constructor raises instead of predicting
# on misaligned data.
testDf = pd.DataFrame(
    [[inHCLUS, inPropercent, inPromotion, inHoliday]],
    columns=features,
)
testDf
predictValue = fittedDtModel.predict( testDf )[0]
predictValue



272.8235294117647

In [22]:
import pickle

# Original (pre-encoding) column names matching the model's feature columns.
featuresOrg = ["HCLUS", "PRO_PERCENT", "PROMOTION", "HOLIDAY"]
# Everything a prediction service needs: the fitted model, the encoded and
# original feature column names, the label, and the per-feature preprocessing
# lookup (None where no conversion is applied).
modelDict = {
    "model": fittedDtModel,
    "feature": features,
    "featureOrg": featuresOrg,
    "label": label,
    "preprocessing": [None, None, ynDict, ynDict],
}
modelDict

# Persist the bundle to disk.
with open("coreDtModel.pickle", "wb") as fw:
    pickle.dump(modelDict, fw)

# 5. 데이터 정리

In [23]:
# Place the predictions next to the true labels, then preview two rows.
testDataLabel.loc[:, "PREDICT_DT"] = dtPredictValue
testDataLabel.head(n=2)

Unnamed: 0,QTY,PREDICT_DT
65,969,977.555556
35,2069,1448.4


# 6. 정확도 검증

In [24]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error

In [25]:
# Mean absolute error between the true QTY values and the decision-tree predictions.
maeDt = mean_absolute_error(
    y_true=testDataLabel["QTY"],
    y_pred=testDataLabel["PREDICT_DT"],
)

In [26]:
# Root mean squared error: square root of the mean squared error.
rmseDt = np.sqrt(
    mean_squared_error(
        y_true=testDataLabel["QTY"],
        y_pred=testDataLabel["PREDICT_DT"],
    )
)

In [27]:
# Mean absolute percentage error, returned as a fraction (1.0 corresponds to 100%).
mean_absolute_percentage_error(
    y_true=testDataLabel["QTY"],
    y_pred=testDataLabel["PREDICT_DT"],
)

1.3378781904352093

In [28]:
# One-row table collecting the decision-tree error metrics.
accuracyMatrix = pd.DataFrame({"MAE_DT": [maeDt], "RMSE_DT": [rmseDt]})
accuracyMatrix

Unnamed: 0,MAE_DT,RMSE_DT
0,491.388889,571.356909
