In [1]:
import pandas as pd
import numpy as np

# 모델 라이브러리 선언
from sklearn import datasets, tree
from sklearn.linear_model import LinearRegression

# 모델 정확도 라이브러리 선언
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score

In [2]:
originData = pd.read_csv("../../lee/Python_ST_EX/dataset/kopo_decision_tree_all_new.csv")
originData.head()

Unnamed: 0,REGIONID,PRODUCTGROUP,PRODUCT,ITEM,YEARWEEK,YEAR,WEEK,QTY,HOLIDAY,HCLUS,PROMOTION,PRO_PERCENT
0,A01,PG01,P01,ITEM001,201538,2015,38,1,N,4,N,0.0
1,A01,PG01,P01,ITEM001,201548,2015,48,1,Y,0,N,0.0
2,A01,PG01,P01,ITEM001,201549,2015,49,2,Y,0,N,0.0
3,A01,PG01,P01,ITEM002,201526,2015,26,1,Y,1,N,0.0
4,A01,PG01,P01,ITEM002,201532,2015,32,1,N,4,N,0.0


In [3]:
originData.dtypes

REGIONID         object
PRODUCTGROUP     object
PRODUCT          object
ITEM             object
YEARWEEK          int64
YEAR              int64
WEEK              int64
QTY               int64
HOLIDAY          object
HCLUS             int64
PROMOTION        object
PRO_PERCENT     float64
dtype: object

In [4]:
originData["HO_YN"] = np.where(originData.HOLIDAY == 'Y', 1, 0)
originData["PRO_YN"] = np.where(originData.PROMOTION == 'Y', 1, 0)

In [5]:
originData.head()

Unnamed: 0,REGIONID,PRODUCTGROUP,PRODUCT,ITEM,YEARWEEK,YEAR,WEEK,QTY,HOLIDAY,HCLUS,PROMOTION,PRO_PERCENT,HO_YN,PRO_YN
0,A01,PG01,P01,ITEM001,201538,2015,38,1,N,4,N,0.0,0,0
1,A01,PG01,P01,ITEM001,201548,2015,48,1,Y,0,N,0.0,1,0
2,A01,PG01,P01,ITEM001,201549,2015,49,2,Y,0,N,0.0,1,0
3,A01,PG01,P01,ITEM002,201526,2015,26,1,Y,1,N,0.0,1,0
4,A01,PG01,P01,ITEM002,201532,2015,32,1,N,4,N,0.0,0,0


In [6]:
corrDf = originData.corr()
corrDf
#그룹의 특성이 섞여져서 유의미한 상관관계 지수를 얻을 수 없음.

Unnamed: 0,YEARWEEK,YEAR,WEEK,QTY,HCLUS,PRO_PERCENT,HO_YN,PRO_YN
YEARWEEK,1.0,0.984528,0.005567,-0.021432,0.01604,0.161164,0.001566,0.11804
YEAR,0.984528,1.0,-0.169746,-0.039309,0.068345,0.122548,-0.041092,0.097886
WEEK,0.005567,-0.169746,1.0,0.103788,-0.299818,0.207041,0.243307,0.10525
QTY,-0.021432,-0.039309,0.103788,1.0,-0.184016,0.237682,0.157853,0.095984
HCLUS,0.01604,0.068345,-0.299818,-0.184016,1.0,-0.289396,-0.977417,-0.143294
PRO_PERCENT,0.161164,0.122548,0.207041,0.237682,-0.289396,1.0,0.254015,0.812344
HO_YN,0.001566,-0.041092,0.243307,0.157853,-0.977417,0.254015,1.0,0.136497
PRO_YN,0.11804,0.097886,0.10525,0.095984,-0.143294,0.812344,0.136497,1.0


In [7]:
originData.YEARWEEK.min()

201501

In [8]:
originData.YEARWEEK.max()

201741

### 그룹화하기

In [9]:
groupKey = ["REGIONID", "PRODUCTGROUP", "PRODUCT","ITEM"]

In [23]:
groupData = originData.groupby(groupKey)

In [11]:
len(groupData)

491

In [12]:
len(originData[groupKey].drop_duplicates())

491

In [13]:
onegroup = groupData.get_group(list(groupData.groups)[8])
len(onegroup)

89

## 예측 함수 만들기

In [14]:
def mlFcast(groupData, indexOfSelectGroup):
    onegroup = groupData.get_group(list(groupData.groups)[indexOfSelectGroup])
    onegroupLength = len(onegroup)
    if(onegroupLength >= 10):
        eachgroup = onegroup.reset_index()

        ### 1.그룹별 상관관계를 산출
        corrDf = eachgroup.corr()

        ### 2. 데이터를 분리한다 (features/label)
        features = list(corrDf[(abs(corrDf.QTY > 0.5)) & \
                      (corrDf.QTY != 1 )].index)
        label = ["QTY"]

        sortedData = onegroup.sort_values("YEARWEEK").reset_index(drop=True)
        standardIndex = round(len(sortedData)* 0.8)

        #년도로 오름차순 정렬했을시 80% 구간
        #학습할 특성
        trainingData_features =sortedData.iloc[0:standardIndex-1,:][features]
        #학습할 답지
        trainingData_label =sortedData.iloc[0:standardIndex-1,:][label]

        #년도로 오름차순 정렬했을시 20% 구간
        #검증할 특성
        testData_features = sortedData.iloc[standardIndex:,:][features]
        #검증할 답지
        testData_label = sortedData.iloc[standardIndex:,:]


        ### 3. 모델 선언 및 학습
        # Define Model
        model_method = tree.DecisionTreeRegressor(random_state=1)
        # Learning (Feature & Label)
        model = model_method.fit(trainingData_features, trainingData_label)

        ### 4. 예측
        # 예측
        predict = model.predict(testData_features)
        predictData = pd.DataFrame(predict)

        ### 컬럼명 주입
        predictData.columns = ["PREDICT_QTY"]
        testData_label = testData_label.reset_index(drop = True)
        predictQTY = pd.concat([testData_label,predictData], axis =1)
        predictQTY = predictQTY.reset_index(drop = True)

        return predictQTY
    else :
        result = "데이터 표본이 너무적습니다. 데이터 추가 수집 요망"
        return result

In [24]:
result = mlFcast(groupData,15)
result

Unnamed: 0,REGIONID,PRODUCTGROUP,PRODUCT,ITEM,YEARWEEK,YEAR,WEEK,QTY,HOLIDAY,HCLUS,PROMOTION,PRO_PERCENT,HO_YN,PRO_YN,PREDICT_QTY
0,A01,PG01,P01,ITEM016,201714,2017,14,68,N,4,Y,0.181967,0,1,55.074074
1,A01,PG01,P01,ITEM016,201715,2017,15,43,N,4,Y,0.181967,0,1,55.074074
2,A01,PG01,P01,ITEM016,201716,2017,16,20,N,4,Y,0.181967,0,1,55.074074
3,A01,PG01,P01,ITEM016,201717,2017,17,19,N,4,Y,0.181967,0,1,55.074074
4,A01,PG01,P01,ITEM016,201718,2017,18,28,N,4,Y,0.181967,0,1,55.074074
5,A01,PG01,P01,ITEM016,201719,2017,19,15,N,4,N,0.0,0,0,29.030303
6,A01,PG01,P01,ITEM016,201720,2017,20,37,Y,1,Y,0.181967,1,1,77.5
7,A01,PG01,P01,ITEM016,201721,2017,21,46,Y,1,Y,0.181967,1,1,77.5
8,A01,PG01,P01,ITEM016,201722,2017,22,56,Y,1,Y,0.181967,1,1,77.5
9,A01,PG01,P01,ITEM016,201723,2017,23,32,N,4,Y,0.181967,0,1,55.074074


In [16]:
import math

In [17]:
### MAE
mean_absolute_error(result['QTY'], result['PREDICT_QTY'])

28.032347282347285

In [18]:
### RMSE (MSE)
math.sqrt(mean_squared_error(result['QTY'], result['PREDICT_QTY']))

39.40691386055075

In [19]:
### R2
r2_score(result['QTY'], result['PREDICT_QTY'])

-0.1441650355943782