In [1]:
%matplotlib inline
from pathlib import Path

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, Ridge, LassoCV, BayesianRidge
import statsmodels.formula.api as sm
import matplotlib.pylab as plt

from dmba import regressionSummary, exhaustive_search
from dmba import backward_elimination, forward_selection, stepwise_selection
from dmba import adjusted_r2_score, AIC_score, BIC_score

In [41]:
# Load the Toyota Corolla data set from a CSV file in the working directory.
toyota = pd.read_csv(Path('ToyotaCorolla.csv'))

In [42]:
# Fuel_Type is a string (object) column, so it is one-hot encoded before modelling.
# drop_first=True drops the first category level; the remaining levels become the
# Fuel_Type_Diesel and Fuel_Type_Petrol columns.
# The guard keeps this cell safe to re-run: once encoded, the Fuel_Type column no
# longer exists and a second get_dummies(columns=["Fuel_Type"]) would raise KeyError.
if "Fuel_Type" in toyota.columns:
    # get_dummies returns a new frame, so no explicit copy is needed beforehand.
    toyota = pd.get_dummies(toyota, columns=["Fuel_Type"], prefix_sep='_', drop_first=True)

In [43]:
# Inspect the column names; as the cell's last expression, the Index is displayed.
toyota.columns

Index(['Id', 'Model', 'Price', 'Age_08_04', 'Mfg_Month', 'Mfg_Year', 'KM',
       'HP', 'Met_Color', 'Color', 'Automatic', 'CC', 'Doors', 'Cylinders',
       'Gears', 'Quarterly_Tax', 'Weight', 'Mfr_Guarantee', 'BOVAG_Guarantee',
       'Guarantee_Period', 'ABS', 'Airbag_1', 'Airbag_2', 'Airco',
       'Automatic_airco', 'Boardcomputer', 'CD_Player', 'Central_Lock',
       'Powered_Windows', 'Power_Steering', 'Radio', 'Mistlamps',
       'Sport_Model', 'Backseat_Divider', 'Metallic_Rim', 'Radio_cassette',
       'Parking_Assistant', 'Tow_Bar', 'Fuel_Type_Diesel', 'Fuel_Type_Petrol'],
      dtype='object')

In [44]:
# Name of the target column to be predicted.
outcome = "Price"

# Candidate predictor columns (the Fuel_Type_* entries are dummy columns).
predictors = [
    'Age_08_04', 'KM', 'Fuel_Type_Diesel', 'Fuel_Type_Petrol',
    'HP', 'Automatic', 'Doors', 'Quarterly_Tax',
    'Mfr_Guarantee', 'Guarantee_Period', 'Airco', 'Automatic_airco',
    'CD_Player', 'Powered_Windows', 'Sport_Model', 'Tow_Bar',
]

# Target vector and design matrix. get_dummies is applied to the selected
# predictors with drop_first=True, so any categorical column among them is encoded.
y = toyota[outcome]
X = pd.get_dummies(toyota[predictors], drop_first=True)

In [37]:
# Simple 50/50 train/validation split with a fixed seed for reproducibility.
# The validation names are spelled `valid_*`, matching the later split cells.
train_X, valid_X, train_y, valid_y = train_test_split(X, y, train_size=0.5, random_state=1)

In [38]:
## Partition the rows: 50% training, 30% validation, 20% test
trainData = toyota.sample(frac=0.5, random_state=1)
# Rows not drawn for training (the other half of the data).
remainder = toyota.drop(trainData.index)
# 60% of the remaining half -> 30% of all rows.
validData = remainder.sample(frac=0.6, random_state=1)
# Whatever is left after removing the validation rows -> 20% of all rows.
testData = remainder.drop(validData.index)

In [40]:
# Fit a linear regression on the current training partition.
car_lm = LinearRegression().fit(train_X, train_y)

print("Intercept", car_lm.intercept_)
# Pair every predictor name with its fitted coefficient and print the table.
coef_table = pd.DataFrame({'Preditior': X.columns, 'coefficient': car_lm.coef_})
print(coef_table)

Intercept 8998.528552931206
           Preditior  coefficient
0          Age_08_04  -112.139772
1                 KM    -0.019437
2   Fuel_Type_Diesel  2163.735433
3   Fuel_Type_Petrol  1968.284558
4                 HP    39.474311
5          Automatic   583.265499
6              Doors   214.445095
7      Quarterly_Tax    17.192451
8      Mfr_Guarantee   129.110109
9   Guarantee_Period    77.305623
10             Airco    45.831357
11   Automatic_airco  2956.041165
12         CD_Player   276.496513
13   Powered_Windows   521.606032
14       Sport_Model   517.807321
15           Tow_Bar  -267.478660


In [None]:
## (a) Which three or four car attributes are the most important for predicting price?

In [47]:
# Split the data train:valid:test = 6:2:2 in two steps.
# Step 1: hold out 20% of all rows as the test set (train:test = 8:2).
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.2, random_state=1)
# Step 2: split the remaining 80% again, train:valid = 3:1 (0.75:0.25),
# which leaves 60% of all rows for training and 20% for validation.
train_X, valid_X, train_y, valid_y = train_test_split(
    train_X, train_y, test_size=0.25, random_state=1
)

# Fit a linear regression on the 60% training partition.
model = LinearRegression().fit(train_X, train_y)

print("intercept : ", model.intercept_)
# One row per predictor with its fitted coefficient.
coefficients = pd.DataFrame({"Predictors": X.columns, "Coefficient": model.coef_})
print(coefficients)

intercept :  8285.39834941601
          Predictors  Coefficient
0          Age_08_04  -111.220280
1                 KM    -0.016265
2   Fuel_Type_Diesel  2313.076450
3   Fuel_Type_Petrol  3145.575719
4                 HP    32.069859
5          Automatic   536.593712
6              Doors   130.936305
7      Quarterly_Tax    21.689746
8      Mfr_Guarantee   216.361361
9   Guarantee_Period    91.314854
10             Airco    62.256902
11   Automatic_airco  3184.704838
12         CD_Player   247.676882
13   Powered_Windows   492.718042
14       Sport_Model   349.905657
15           Tow_Bar  -180.733479


In [48]:
## Use an exhaustive search to find the most important variables
def train_model(variables):
    """Fit and return a LinearRegression on the given training columns.

    `variables` selects the columns of the notebook-level `train_X`; the target
    is the notebook-level `train_y`.
    """
    return LinearRegression().fit(train_X[variables], train_y)

def score_model(model, variables):
    """Return the negated adjusted R2 of `model` on the training data.

    Predictions are made on the `variables` columns of the notebook-level
    `train_X` and compared against `train_y`.
    """
    fitted = model.predict(train_X[variables])
    adj_r2 = adjusted_r2_score(train_y, fitted, model)
    # The score is optimized to be as low as possible, so flip the sign:
    # a higher adjusted R2 must yield a lower score.
    return -adj_r2

# Run the exhaustive search over every predictor column, using train_model to fit
# each candidate and score_model to rank it, then tabulate the returned results.
allVariables = train_X.columns
results = exhaustive_search(allVariables, train_model, score_model)

data = []
for result in results:
    # Loop-local names on purpose: rebinding the notebook-level `model` here would
    # make later cells silently use the last search result instead of the model
    # that was fitted explicitly earlier.
    subset_model = result['model']
    subset_vars = result['variables']
    AIC = AIC_score(train_y, subset_model.predict(train_X[subset_vars]), subset_model)

    # score_model returns the negated adjusted R2, so undo the sign for display.
    d = {'n': result['n'], 'r2adj': -result['score'], 'AIC': AIC}
    # One boolean column per predictor: True when it belongs to this subset.
    d.update({var: var in subset_vars for var in allVariables})
    data.append(d)

# Widen the display only while printing; option_context restores the previous
# setting afterwards (reset_option would force the library default instead).
with pd.option_context('display.width', 100):
    print(pd.DataFrame(data, columns=('n', 'r2adj', 'AIC') + tuple(sorted(allVariables))))

     n     r2adj           AIC  Age_08_04  Airco  Automatic  Automatic_airco  CD_Player  Doors  \
0    1  0.761855  15312.342479       True  False      False            False      False  False   
1    2  0.817745  15083.046574       True  False      False             True      False  False   
2    3  0.841174  14965.571840       True  False      False             True      False  False   
3    4  0.850247  14915.917776       True  False      False             True      False  False   
4    5  0.868412  14805.576124       True  False      False             True      False  False   
5    6  0.873589  14772.010618       True  False      False             True      False  False   
6    7  0.876484  14753.049493       True  False      False             True      False  False   
7    8  0.880403  14726.283911       True  False      False             True      False  False   
8    9  0.883718  14703.069300       True  False      False             True      False  False   
9   10  0.885014  14

첫 번째 시행 결과를 보면 Age_08_04 변수의 영향이 가장 크고, 그다음은 Automatic_airco, 세 번째는 HP이다. 따라서 이 세 변수가 가격에 가장 큰 영향을 미치는 변수라고 볼 수 있다.

In [50]:
## (b) Evaluate the performance of the price-prediction model using the metrics you consider useful.

In [51]:
# Predict the target for the validation predictors using the current `model`.
pred_y = model.predict(valid_X)

In [53]:
# Report each metric for the validation predictions; every scorer is called with
# the same (actual, predicted, model) arguments.
for label, scorer in (
    ('adjusted r2 : ', adjusted_r2_score),
    ('AIC : ', AIC_score),
    ('BIC : ', BIC_score),
):
    print(label, scorer(valid_y, pred_y, model))

adjusted r2 :  0.8624440914674278
AIC :  4961.532350281473
BIC :  5027.4030301651455
