In [1]:
import math

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Model library
from sklearn.linear_model import LinearRegression

# Model accuracy (metric) libraries
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score

In [2]:
# Load the example dataset from CSV and preview its first five rows.
rawData = pd.read_csv(
    filepath_or_buffer="../lee/Python_ST_EX/dataset/feature_regression_example.csv"
)
rawData.head(n=5)

Unnamed: 0,REGIONID,PRODUCTGROUP,PRODUCT,ITEM,YEARWEEK,YEAR,WEEK,QTY,HOLIDAY,HCLUS,PROMOTION,PRO_PERCENT
0,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201501,2015,1,1225,Y,1,Y,0.209442
1,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201502,2015,2,968,N,4,Y,0.209442
2,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201503,2015,3,1209,N,4,Y,0.208155
3,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201504,2015,4,1810,Y,2,Y,0.208155
4,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201505,2015,5,1773,N,4,Y,0.208155


In [3]:
def encodeYnFlag(column):
    """Convert a "Y"/"N" column into an integer 1/0 column.

    Parameters
    ----------
    column : pandas.Series
        Values expected to be exactly "Y" or "N".

    Returns
    -------
    pandas.Series
        Integer Series: 1 where the input is "Y", 0 where it is "N".

    Raises
    ------
    ValueError
        If any value is neither "Y" nor "N" (missing values included).
    """
    flags = column.map({"Y": 1, "N": 0})
    # Anything outside the mapping comes back as NaN; fail loudly here instead
    # of storing a placeholder that only breaks in a later cell.
    invalid = flags.isna()
    if invalid.any():
        unexpected = sorted(column[invalid].astype(str).unique())
        raise ValueError(
            "Expected only 'Y'/'N' in column %r, found: %s" % (column.name, unexpected)
        )
    return flags.astype(int)


# Numeric 1/0 versions of the holiday and promotion flags.
rawData["HO_YN"] = encodeYnFlag(rawData["HOLIDAY"])
rawData["PRO_YN"] = encodeYnFlag(rawData["PROMOTION"])
rawData.head()


Unnamed: 0,REGIONID,PRODUCTGROUP,PRODUCT,ITEM,YEARWEEK,YEAR,WEEK,QTY,HOLIDAY,HCLUS,PROMOTION,PRO_PERCENT,HO_YN,PRO_YN
0,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201501,2015,1,1225,Y,1,Y,0.209442,1,1
1,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201502,2015,2,968,N,4,Y,0.209442,0,1
2,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201503,2015,3,1209,N,4,Y,0.208155,0,1
3,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201504,2015,4,1810,Y,2,Y,0.208155,1,1
4,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201505,2015,5,1773,N,4,Y,0.208155,0,1


In [4]:
# Cast both flag columns to int so they can be used in numeric calculations.
for flagColumn in ("HO_YN", "PRO_YN"):
    rawData[flagColumn] = rawData[flagColumn].astype(int)

In [5]:
# Keep only the weeks from 201501 through 201652 (both ends included).
inPeriod = rawData.YEARWEEK.between(201501, 201652)
featuresData = rawData[inPeriod]
featuresData.head()

Unnamed: 0,REGIONID,PRODUCTGROUP,PRODUCT,ITEM,YEARWEEK,YEAR,WEEK,QTY,HOLIDAY,HCLUS,PROMOTION,PRO_PERCENT,HO_YN,PRO_YN
0,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201501,2015,1,1225,Y,1,Y,0.209442,1,1
1,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201502,2015,2,968,N,4,Y,0.209442,0,1
2,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201503,2015,3,1209,N,4,Y,0.208155,0,1
3,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201504,2015,4,1810,Y,2,Y,0.208155,1,1
4,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201505,2015,5,1773,N,4,Y,0.208155,0,1


In [6]:
# Inspect the dtype of every column (the HO_YN / PRO_YN flags should be integers here).
featuresData.dtypes

REGIONID         object
PRODUCTGROUP     object
PRODUCT          object
ITEM             object
YEARWEEK          int64
YEAR              int64
WEEK              int64
QTY               int64
HOLIDAY          object
HCLUS             int64
PROMOTION        object
PRO_PERCENT     float64
HO_YN             int32
PRO_YN            int32
dtype: object

In [7]:
# Check the pairwise correlations between the numeric columns.
# The numeric columns are selected explicitly because the frame also holds
# text columns (REGIONID, HOLIDAY, ...); recent pandas versions raise an error
# when corr() is called on a frame that contains non-numeric columns.
corrDf = featuresData.select_dtypes(include="number").corr()
corrDf

Unnamed: 0,YEARWEEK,YEAR,WEEK,QTY,HCLUS,PRO_PERCENT,HO_YN,PRO_YN
YEARWEEK,1.0,0.956598,0.275593,0.112267,-0.071586,0.404889,0.049867,0.205916
YEAR,0.956598,1.0,-0.016493,0.028931,0.028593,0.321193,-0.031106,0.195931
WEEK,0.275593,-0.016493,1.0,0.289766,-0.339943,0.329705,0.27371,0.060206
QTY,0.112267,0.028931,0.289766,1.0,-0.53723,0.700195,0.505932,0.612451
HCLUS,-0.071586,0.028593,-0.339943,-0.53723,1.0,-0.545619,-0.974601,-0.374072
PRO_PERCENT,0.404889,0.321193,0.329705,0.700195,-0.545619,1.0,0.487062,0.898554
HO_YN,0.049867,-0.031106,0.27371,0.505932,-0.974601,0.487062,1.0,0.365148
PRO_YN,0.205916,0.195931,0.060206,0.612451,-0.374072,0.898554,0.365148,1.0


In [8]:
# Turn the index (the column names) into a regular column called "index".
corrResult = corrDf.reset_index(drop=False)
corrResult

Unnamed: 0,index,YEARWEEK,YEAR,WEEK,QTY,HCLUS,PRO_PERCENT,HO_YN,PRO_YN
0,YEARWEEK,1.0,0.956598,0.275593,0.112267,-0.071586,0.404889,0.049867,0.205916
1,YEAR,0.956598,1.0,-0.016493,0.028931,0.028593,0.321193,-0.031106,0.195931
2,WEEK,0.275593,-0.016493,1.0,0.289766,-0.339943,0.329705,0.27371,0.060206
3,QTY,0.112267,0.028931,0.289766,1.0,-0.53723,0.700195,0.505932,0.612451
4,HCLUS,-0.071586,0.028593,-0.339943,-0.53723,1.0,-0.545619,-0.974601,-0.374072
5,PRO_PERCENT,0.404889,0.321193,0.329705,0.700195,-0.545619,1.0,0.487062,0.898554
6,HO_YN,0.049867,-0.031106,0.27371,0.505932,-0.974601,0.487062,1.0,0.365148
7,PRO_YN,0.205916,0.195931,0.060206,0.612451,-0.374072,0.898554,0.365148,1.0


In [9]:
# Minimum absolute correlation with QTY for a column to be used as a feature.
featuresStd = 0.5

# Feature selection: keep the columns that are strongly correlated with QTY.
# The QTY row itself is excluded by name rather than by testing whether its
# correlation equals exactly 1, which depends on exact floating-point equality.
isLabelRow = corrResult["index"] == "QTY"
isStronglyCorrelated = corrResult["QTY"].abs() > featuresStd
features = corrResult.loc[~isLabelRow & isStronglyCorrelated, "index"].tolist()
features

['HCLUS', 'PRO_PERCENT', 'HO_YN', 'PRO_YN']

In [10]:
# Target (label) column for the model; kept as a list so that selecting with it
# returns a DataFrame.
label = ['QTY']
label

['QTY']

In [11]:
# Split the data chronologically at a cut-off week.
yearweekStd = 201630

# features = model inputs, label = the answers the model should reproduce.
# Weeks up to and including the cut-off are used for training, later weeks for testing.
isTrainingWeek = featuresData.YEARWEEK <= yearweekStd
isTestWeek = featuresData.YEARWEEK > yearweekStd

# Training inputs and training answers.
trainingData_features = featuresData.loc[isTrainingWeek, features]
trainingData_label = featuresData.loc[isTrainingWeek, label]

# Test inputs.
testData_features = featuresData.loc[isTestWeek, features]
# Test rows keep every column (no [label] selection) so that the actual QTY can
# be viewed together with the other columns later.
testData_label = featuresData.loc[isTestWeek]

In [12]:
# Define the model: a linear regression estimator.
model_method = LinearRegression()

# Train it. fit() is the learning step and needs both the training features
# and the matching training labels.
model_method.fit(trainingData_features, trainingData_label)

# fit() returns the fitted estimator itself, so `model` is that same object.
model = model_method
model

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [13]:
# Inspect the feature rows that will be passed to the model for prediction.
testData_features

Unnamed: 0,HCLUS,PRO_PERCENT,HO_YN,PRO_YN
83,4,0.280258,0,1
84,4,0.280258,0,1
85,4,0.0,0,0
86,1,0.308584,1,1
87,1,0.308584,1,1
88,1,0.308584,1,1
89,4,0.308584,0,1
90,4,0.308584,0,1
91,4,0.280258,0,1
92,4,0.280258,0,1


In [14]:
# Predict QTY for the test-period rows; the result is a NumPy array of predictions.
predict = model.predict(testData_features)
predict

array([[1411.29171929],
       [1411.29171929],
       [ 300.43091536],
       [2004.96200456],
       [2004.96200456],
       [2004.96200456],
       [1586.12154967],
       [1586.12154967],
       [1411.29171929],
       [1411.29171929],
       [1799.54021119],
       [1411.29171929],
       [ 300.43091536],
       [2734.87329525],
       [2734.87329525],
       [2734.87329525],
       [2734.87329525],
       [2734.87329525],
       [2285.44087736],
       [1856.31310796],
       [1856.31310796],
       [2275.15356285]])

In [15]:
# The number of rows in testData_features equals the length of the predict array.
len(predict)

22

In [16]:
# Renumber the test rows from 0 (discarding the old index) so they line up with
# the predictions when the two are concatenated column-wise.
testData = testData_label.reset_index(drop = True)
testData

Unnamed: 0,REGIONID,PRODUCTGROUP,PRODUCT,ITEM,YEARWEEK,YEAR,WEEK,QTY,HOLIDAY,HCLUS,PROMOTION,PRO_PERCENT,HO_YN,PRO_YN
0,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201631,2016,31,1522,N,4,Y,0.280258,0,1
1,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201632,2016,32,2100,N,4,Y,0.280258,0,1
2,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201633,2016,33,43,N,4,N,0.0,0,0
3,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201634,2016,34,1700,Y,1,Y,0.308584,1,1
4,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201635,2016,35,1514,Y,1,Y,0.308584,1,1
5,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201636,2016,36,1501,Y,1,Y,0.308584,1,1
6,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201637,2016,37,1491,N,4,Y,0.308584,0,1
7,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201638,2016,38,806,N,4,Y,0.308584,0,1
8,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201639,2016,39,2111,N,4,Y,0.280258,0,1
9,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201640,2016,40,2400,N,4,Y,0.280258,0,1


In [17]:
### Convert the prediction array to a DataFrame
# The single column is named "PREDICT" at construction time and the index is
# renumbered from 0.
predictData = pd.DataFrame(predict, columns=["PREDICT"]).reset_index(drop=True)

In [18]:
# Place the predictions next to the test rows (column-wise concatenation,
# aligned on the shared 0-based index).
resultData = pd.concat([testData, predictData], axis="columns")
resultData

Unnamed: 0,REGIONID,PRODUCTGROUP,PRODUCT,ITEM,YEARWEEK,YEAR,WEEK,QTY,HOLIDAY,HCLUS,PROMOTION,PRO_PERCENT,HO_YN,PRO_YN,PREDICT
0,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201631,2016,31,1522,N,4,Y,0.280258,0,1,1411.291719
1,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201632,2016,32,2100,N,4,Y,0.280258,0,1,1411.291719
2,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201633,2016,33,43,N,4,N,0.0,0,0,300.430915
3,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201634,2016,34,1700,Y,1,Y,0.308584,1,1,2004.962005
4,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201635,2016,35,1514,Y,1,Y,0.308584,1,1,2004.962005
5,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201636,2016,36,1501,Y,1,Y,0.308584,1,1,2004.962005
6,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201637,2016,37,1491,N,4,Y,0.308584,0,1,1586.12155
7,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201638,2016,38,806,N,4,Y,0.308584,0,1,1586.12155
8,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201639,2016,39,2111,N,4,Y,0.280258,0,1,1411.291719
9,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201640,2016,40,2400,N,4,Y,0.280258,0,1,1411.291719


In [19]:
### MAE (mean absolute error) between the actual and the predicted QTY
mean_absolute_error(y_true=resultData["QTY"], y_pred=resultData["PREDICT"])

566.617239476098

In [20]:
### RMSE: square root of the MSE (mean squared error) between the actual and
### the predicted QTY. `math` comes from the notebook's imports cell.
math.sqrt(mean_squared_error(resultData["QTY"], resultData["PREDICT"]))

722.6052639386677

In [21]:
### R2 (coefficient of determination) of the predictions against the actual QTY
# A negative score means the predictions fit worse than always predicting the mean.
r2_score(y_true=resultData["QTY"], y_pred=resultData["PREDICT"])

-0.11834080332946595

In [None]:
# Fit a second linear regression with the first model's predictions as the
# input and plot the fitted line over the scatter of the points.
# NOTE(review): the original cell left `y_data` unassigned (a syntax error).
# The actual QTY of the test rows is assumed to be the intended target — confirm.
x_data = predict
y_data = resultData["QTY"].values
print(x_data.shape)
print(y_data.shape)

estimator = LinearRegression()
estimator.fit(x_data, y_data)

print(estimator.coef_)       # coefficient = slope = weight
print(estimator.intercept_)  # intercept = y-intercept = bias

# Font setting so that the Korean title can be rendered.
mpl.rc('font', family='Malgun Gothic')

fig, ax = plt.subplots()
ax.scatter(x_data[:, 0], y_data, c='red')

# Draw the fitted line across the x-range the scatter plot ended up with.
# predict() is used instead of multiplying by coef_ by hand so the line always
# comes out as a flat array of y values.
xmin, xmax = ax.get_xlim()
x_plot = np.linspace(xmin, xmax)
ax.plot(x_plot, estimator.predict(x_plot.reshape(-1, 1)), color='blue')

ax.set_title('선형 회귀 모델')
ax.set_xlabel('x')
ax.set_ylabel('y')
plt.show()