In [1]:
import pandas as pd
import numpy as np

# 모델 라이브러리 선언
from sklearn import datasets, tree
from sklearn.linear_model import LinearRegression

# 모델 정확도 라이브러리 선언
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score

In [2]:
carsData = pd.read_csv("https://raw.githubusercontent.com/hyokwan/python-lecture/master/dataset/cars.csv")

In [3]:
carsData.head()

Unnamed: 0,age,gender,miles,debt,income,sales
0,28,0,23,0,4099,620
1,26,0,27,0,2677,1792
2,30,1,58,41576,6215,27754
3,26,1,25,43172,7626,28256
4,20,1,17,6979,8071,4438


In [4]:
carsData.dtypes

age       int64
gender    int64
miles     int64
debt      int64
income    int64
sales     int64
dtype: object

### 상관관계 지수를 통해 컬럼뽑아내는 과정은 생략

In [5]:
# corrDf = carsData.corr()
# corrDf

Unnamed: 0,age,gender,miles,debt,income,sales
age,1.0,-0.000702,0.232399,0.218896,0.239644,0.352609
gender,-0.000702,1.0,-0.031355,-0.033181,-0.034317,-0.03635
miles,0.232399,-0.031355,1.0,0.544791,0.422141,0.636676
debt,0.218896,-0.033181,0.544791,1.0,0.49179,0.835541
income,0.239644,-0.034317,0.422141,0.49179,1.0,0.674685
sales,0.352609,-0.03635,0.636676,0.835541,0.674685,1.0


In [6]:
# 상관관계 계수 비율 선정
# hyper parameter라는 개념으로 반복문을 돌려 각각의 지수에 따른 정확도를 구한후 가장 정확도가 높은것을 parameter로선정
# featuresStd = 0.5

# features = list(corrDf[abs(corrDf.sales) > featuresStd].index)
# features

['miles', 'debt', 'income', 'sales']

In [22]:
features = ["age","gender","miles","debt","income"]

In [23]:
label = ["sales"]
label

['sales']

In [24]:
## Test/Training 자동 분리
from sklearn.model_selection import train_test_split

In [25]:
### feature 데이터, label 데이터 분리
featuresData = carsData[features]
labelData = carsData[label]

In [26]:
feature_train, feature_test, label_train, label_test = \
 train_test_split(featuresData, labelData, test_size = 0.3)

### Decision Tree 적용

In [27]:
# Define Model
model_method = tree.DecisionTreeRegressor(random_state=1)

In [28]:
# Learning (Feature & Label)
model = model_method.fit(feature_train, label_train)

In [29]:
# 예측
predict = model.predict(feature_test)
predict

array([ 6014.,  3485.,  7601., 10941.,  4785.,  3724.,  3640., 11274.,
        5909., 28511.,  1825.,  4609., 22527., 14363., 18128., 22527.,
       28511., 18128.,  3168., 20341.,  1577.,  6317.,  7346.,  9913.,
        8971., 29560.,   943.,  5580.,  2112.,  2993.,  3168., 13699.,
        1608.,  8990.,  1728.,  8082., 29886.,  3310., 25577., 18005.,
       17930., 17253.,  3485.,  3039., 29652.,  4785.,  5888., 14913.,
        4368.,  5256.,  1349., 18254., 19369., 24498., 22429.,  6112.,
       10915., 12771., 29770., 20050., 17397.,  3844., 18640.,  4456.,
        4531.,  5256., 28511.,  4240.,  5580.,  5665.,  8751.,   790.,
        3928.,  3039.,  2116., 27854., 28942., 16875., 16558., 10941.,
       27599.,  1358.,  2140., 18005., 25979., 22306.,  4241.,   549.,
        1803., 16754.,  1577.,  3328.,  3328.,  3286.,  3240.,  1393.,
       11708.,  1825.,  1728.,  6536.,  2488.,  1577., 28228., 22997.,
       11564.,  4093., 29560.,  2665.,  4182.,  2856., 18128.,  1393.,
      

In [30]:
predictData = pd.DataFrame(predict)

In [31]:
### 컬럼명 주입
predictData.columns = ["PREDICT_DECISIONTREE"]
predictData.head()

Unnamed: 0,PREDICT_DECISIONTREE
0,6014.0
1,3485.0
2,7601.0
3,10941.0
4,4785.0


In [32]:
label_test = label_test.reset_index(drop = True)
label_test.head()

Unnamed: 0,sales
0,9508
1,3021
2,10401
3,7596
4,7239


In [33]:
decisiontreePredict = pd.concat([label_test,predictData], axis =1)
decisiontreePredict.head()

Unnamed: 0,sales,PREDICT_DECISIONTREE
0,9508,6014.0
1,3021,3485.0
2,10401,7601.0
3,7596,10941.0
4,7239,4785.0


In [34]:
import math

In [35]:
### MAE
mean_absolute_error(decisiontreePredict['sales'], decisiontreePredict['PREDICT_DECISIONTREE'])

2117.9273356401386

In [36]:
### RMSE
math.sqrt(mean_squared_error(decisiontreePredict['sales'], decisiontreePredict['PREDICT_DECISIONTREE']))

3341.043091268992

In [37]:
### R2
r2_score(decisiontreePredict['sales'], decisiontreePredict['PREDICT_DECISIONTREE'])

0.8657612320194938