In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl 
import math
import scipy
import statsmodels.api as sm
import statsmodels.formula.api as smf
from scipy.stats.stats import pearsonr

import sys
sys.path.append("../")

%matplotlib inline
%load_ext autoreload
%autoreload 2

In [19]:
from sklearn.datasets import load_boston
from sklearn import linear_model
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import r2_score

# Load the Boston housing data into a DataFrame: the feature columns first,
# then the response appended as the last column under the name 'target'.
# NOTE(review): `load_boston` was deprecated in scikit-learn 1.0 and removed in
# 1.2, so this cell needs an older scikit-learn (or another data source) to run.
boston = load_boston()
dataset = pd.DataFrame(boston.data, columns=boston.feature_names)
dataset['target'] = boston.target

# A bare `dataset` expression used to sit here; only the last expression of a
# cell is displayed, so it had no effect and was removed.

observations = len(dataset)
variables = dataset.columns[:-1]  # every column except the trailing 'target'
X = dataset.iloc[:, :-1]          # predictors, kept as a DataFrame
y = dataset['target'].values      # response as a NumPy array

In [11]:
# Ordinary least squares with an intercept term.
# The `normalize` keyword is intentionally not passed: False was its default,
# and the keyword was deprecated in scikit-learn 1.0 and removed in 1.2, where
# passing it raises a TypeError. Omitting it gives the same model everywhere.
linear_regression = linear_model.LinearRegression(fit_intercept=True)

In [12]:
# Pipeline that standardizes every feature before fitting the regression, so
# the fitted coefficients are on a common scale and can be compared.
#
# The pipeline is given its own LinearRegression instance instead of reusing
# `linear_regression`. Sharing one estimator object means any later `.fit()`
# on either the pipeline or the bare estimator overwrites the coefficients seen
# through the other, making printed results depend on cell execution order.
standardization = StandardScaler()
stand_coef_linear_reg = make_pipeline(
    standardization,
    linear_model.LinearRegression(fit_intercept=True),
)

In [14]:
# Fit the plain regression directly on X (no scaling step); the fitted
# estimator is returned, so its repr is shown as the cell output.
linear_regression.fit(X, y)

LinearRegression()

In [15]:
# Rank the features by the absolute size of their coefficients from the
# regression fitted on unscaled data, largest first.
magnitudes = [abs(c) for c in linear_regression.coef_]
ranked = sorted(zip(magnitudes, dataset.columns[:-1]), reverse=True)
for magnitude, feature in ranked:
    print("%6.3f %s" % (magnitude, feature))

17.767 NOX
 3.810 RM
 2.687 CHAS
 1.476 DIS
 0.953 PTRATIO
 0.525 LSTAT
 0.306 RAD
 0.108 CRIM
 0.046 ZN
 0.021 INDUS
 0.012 TAX
 0.009 B
 0.001 AGE


In [16]:
# Fit the scaler + regression pipeline: the regression step is trained on the
# StandardScaler output rather than on X directly.
stand_coef_linear_reg.fit(X, y)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('linearregression', LinearRegression())])

In [17]:
# Rank the features by absolute coefficient again, this time reading the
# coefficients from the regression step inside the fitted pipeline.
_, scaled_regressor = stand_coef_linear_reg.steps[1]  # (step name, estimator)
magnitudes = [abs(c) for c in scaled_regressor.coef_]
ranked = sorted(zip(magnitudes, dataset.columns[:-1]), reverse=True)
for magnitude, feature in ranked:
    print("%6.3f %s" % (magnitude, feature))

 3.744 LSTAT
 3.104 DIS
 2.674 RM
 2.662 RAD
 2.077 TAX
 2.061 PTRATIO
 2.057 NOX
 1.082 ZN
 0.928 CRIM
 0.849 B
 0.682 CHAS
 0.141 INDUS
 0.019 AGE


### $R^2$으로 모델 비교

In [20]:
def r2_est(X, y, estimator=None):
    """Return the in-sample R^2 of a regression fitted on (X, y).

    The model is fitted and scored on the same data, so this is a training
    R^2, not an out-of-sample estimate.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Predictor matrix (DataFrame or ndarray).
    y : array-like of shape (n_samples,)
        Response values.
    estimator : scikit-learn regressor, optional
        Model to evaluate. Defaults to the notebook-level `linear_regression`.

    Returns
    -------
    float
        R^2 of the fitted model's predictions on X against y.
    """
    # Local import: `clone` is only needed here and is not in the import cell.
    from sklearn.base import clone

    # Fit an unfitted copy so the caller's estimator is never refit in place.
    # Refitting the shared `linear_regression` on a column subset would leave
    # its coef_ with a different length than the feature list used elsewhere.
    model = clone(linear_regression if estimator is None else estimator)
    return r2_score(y, model.fit(X, y).predict(X))
print('Baseline R2: %0.3f' % r2_est(X, y))

Baseline R2: 0.741


In [23]:
# Drop-one-feature importance: for each column, refit without it and record
# how much the in-sample R^2 falls relative to the model using every column.
baseline_r2 = r2_est(X, y)  # same for every feature, so computed once up front
n_features = X.shape[1]

r2_impact = []
for j in range(n_features):
    # Positional indices of every column except the j-th.
    selection = [i for i in range(n_features) if i != j]
    r2_drop = baseline_r2 - r2_est(X.values[:, selection], y)
    r2_impact.append((r2_drop, dataset.columns[j]))

# Largest loss in R^2 first, i.e. the feature the model misses most.
for imp, varname in sorted(r2_impact, reverse=True):
    print('%6.3f %s' % (imp, varname))

 0.056 LSTAT
 0.044 RM
 0.029 DIS
 0.028 PTRATIO
 0.011 NOX
 0.011 RAD
 0.006 B
 0.006 ZN
 0.006 CRIM
 0.006 TAX
 0.005 CHAS
 0.000 INDUS
 0.000 AGE
