In [1]:
import pandas as pd
import numpy as np
import warnings
# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides warnings raised by the libraries used in later
# cells; consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

# 맷플롯립 설정
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
# Default font sizes for axis labels and tick labels on every figure.
plt.rcParams.update({
    'axes.labelsize': 14,
    'xtick.labelsize': 12,
    'ytick.labelsize': 12,
})

# Korean text output.
# The font family used to be set twice (NanumGothic, then Malgun Gothic), so the
# first setting was always overwritten. Give matplotlib one ordered list instead:
# Malgun Gothic is still preferred, and NanumGothic is used as a fallback when
# Malgun Gothic is not installed.
plt.rc('font', family=['Malgun Gothic', 'NanumGothic'])
# Draw minus signs with the ASCII hyphen instead of the Unicode minus glyph
# (presumably because the Korean fonts above may lack that glyph — TODO confirm).
plt.rcParams['axes.unicode_minus'] = False

In [2]:
# Load the already pre-processed feature matrix from a whitespace-delimited text file.
housing_prepared = np.loadtxt('datasets/housing/housing_prepared.txt', dtype=float)
# read_csv always returns a 2-D DataFrame. Squeeze a single-column frame down to a
# 1-D Series so the estimators below receive a 1-D target rather than a column
# vector (the resulting conversion warning was hidden by the global warnings
# filter). A file with several columns is left unchanged by squeeze.
housing_labels = pd.read_csv("datasets/housing/housing_labels.csv").squeeze("columns")

# 모델 훈련

## LinearRegression

In [3]:
from sklearn.linear_model import LinearRegression

# Fit an ordinary least-squares model on the prepared training data.
# fit() returns the estimator itself, so construction and fitting can be chained;
# the bare name on the last line keeps the estimator repr as the cell output.
lin_reg = LinearRegression().fit(housing_prepared, housing_labels)
lin_reg

### RMSE (MSE의 제곱근)

In [4]:
from sklearn.metrics import mean_squared_error

# RMSE of the linear model measured on the same data it was fitted on
# (a training error, not a generalisation estimate).
housing_predictions = lin_reg.predict(housing_prepared)
lin_mse = mean_squared_error(y_true=housing_labels, y_pred=housing_predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

68644.88938931034

### MAE

In [5]:
from sklearn.metrics import mean_absolute_error

# Mean absolute error of the linear model's predictions on the training data.
lin_mae = mean_absolute_error(y_true=housing_labels, y_pred=housing_predictions)
lin_mae

49519.3445373062

## DecisionTreeRegressor

In [6]:
from sklearn.tree import DecisionTreeRegressor

# Fit a decision tree on the prepared training data; the seed is fixed so
# repeated runs build the same tree.
tree_reg = DecisionTreeRegressor(random_state=42).fit(housing_prepared, housing_labels)
tree_reg

In [7]:
# RMSE of the decision tree on the data it was trained on. The recorded result
# is 0.0, i.e. the tree reproduces the training labels exactly, so this number
# says nothing about unseen data.
housing_predictions = tree_reg.predict(housing_prepared)
tree_mse = mean_squared_error(y_true=housing_labels, y_pred=housing_predictions)
tree_rmse = np.sqrt(tree_mse)
tree_rmse

0.0

# 교차검증을 통한 평가

## DecisionTreeRegressor

In [8]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation of the decision tree. The scorer returns the
# *negative* MSE per fold (higher is better), so negate before the square root.
scores = cross_val_score(
    tree_reg,
    housing_prepared,
    housing_labels,
    scoring="neg_mean_squared_error",
    cv=10,
)
tree_rmse_scores = np.sqrt(np.negative(scores))

In [9]:
def display_scores(scores):
    """Print a set of scores followed by their mean and standard deviation.

    Args:
        scores: an object providing ``mean()`` and ``std()`` methods
            (the notebook passes NumPy arrays of per-fold RMSE values).

    Returns:
        None. Three lines are written to standard output.
    """
    print("점수:", scores)
    # Each statistic is computed right before it is printed, in this order.
    for label, stat_name in (("평균:", "mean"), ("표준편차:", "std")):
        print(label, getattr(scores, stat_name)())

# Summarise the per-fold RMSE values of the decision tree.
display_scores(tree_rmse_scores)

점수: [72845.67443449 70103.03601583 69431.52687513 72671.66704237
 68906.7738104  79131.58707634 70344.17934657 73035.60861677
 70060.07148353 71273.59955054]
평균: 71780.37242519669
표준편차: 2818.0180214479633


## LinearRegression

In [10]:
# 10-fold cross-validation of the linear model; the scorer reports negative MSE,
# so flip the sign before taking the square root to obtain per-fold RMSE.
lin_scores = cross_val_score(
    lin_reg,
    housing_prepared,
    housing_labels,
    scoring="neg_mean_squared_error",
    cv=10,
)
lin_rmse_scores = np.sqrt(np.negative(lin_scores))
display_scores(lin_rmse_scores)

점수: [71762.76302789 64122.05022719 67760.87417175 68635.19126601
 66987.90744008 72527.15803886 74093.20819009 68802.33757141
 66485.9695096  70146.08293189]
평균: 69132.35423747802
표준편차: 2881.3825001317587


## RandomForestRegressor

In [11]:
from sklearn.ensemble import RandomForestRegressor

# Fit a small random forest (10 trees) with a fixed seed for repeatable results.
forest_reg = RandomForestRegressor(n_estimators=10, random_state=42).fit(
    housing_prepared, housing_labels
)
forest_reg

In [12]:
# RMSE of the random forest on its own training data (optimistic by construction;
# compare with the cross-validated figure computed in the next cell).
housing_predictions = forest_reg.predict(housing_prepared)
forest_mse = mean_squared_error(y_true=housing_labels, y_pred=housing_predictions)
forest_rmse = np.sqrt(forest_mse)
forest_rmse

22366.609160370783

In [13]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation of the random forest; negative MSE per fold is
# converted to RMSE before being summarised.
forest_scores = cross_val_score(
    forest_reg,
    housing_prepared,
    housing_labels,
    scoring="neg_mean_squared_error",
    cv=10,
)
forest_rmse_scores = np.sqrt(np.negative(forest_scores))
display_scores(forest_rmse_scores)

점수: [53538.55469313 50612.72636093 48883.37258017 53884.28790649
 50847.38317638 54857.41426501 56030.79800034 51960.32206005
 51487.92311347 55700.72774435]
평균: 52780.35099003227
표준편차: 2261.989537187567


In [14]:
# Descriptive statistics of the linear model's per-fold RMSE.
# Reuse lin_rmse_scores from the earlier cross-validation cell instead of running
# the identical 10-fold CV a second time; this also stops `scores` (the decision
# tree's results) from being silently overwritten.
# Note: describe() reports the sample standard deviation (ddof=1), so its `std`
# differs from the population value printed by display_scores.
pd.Series(lin_rmse_scores).describe()

count       10.000000
mean     69132.354237
std       3037.243837
min      64122.050227
25%      67181.149123
50%      68718.764419
75%      71358.593004
max      74093.208190
dtype: float64

## SVR

In [15]:
from sklearn.svm import SVR

# Fit a linear-kernel support vector regressor and measure its RMSE on the
# training data.
svm_reg = SVR(kernel="linear").fit(housing_prepared, housing_labels)
housing_predictions = svm_reg.predict(housing_prepared)
svm_mse = mean_squared_error(y_true=housing_labels, y_pred=housing_predictions)
svm_rmse = np.sqrt(svm_mse)
svm_rmse

111095.06688520181

# 모델 세부 튜닝

## 그리드 탐색

In [16]:
from sklearn.model_selection import GridSearchCV

# Two sub-grids are searched:
#   1. 3 x 4 = 12 combinations of n_estimators and max_features
#      (bootstrap is not set here, so the estimator's own default applies);
#   2. 2 x 3 = 6 combinations with bootstrap switched off.
param_grid = [
    {
        'n_estimators': [3, 10, 30],
        'max_features': [2, 4, 6, 8],
    },
    {
        'bootstrap': [False],
        'n_estimators': [3, 10],
        'max_features': [2, 3, 4],
    },
]

forest_reg = RandomForestRegressor(random_state=42)
# With five folds this amounts to (12 + 6) * 5 = 90 training runs.
grid_search = GridSearchCV(
    forest_reg,
    param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    return_train_score=True,  # keep train scores so cv_results_ exposes them too
    n_jobs=-1,                # use every available CPU core
)
grid_search.fit(housing_prepared, housing_labels)

In [17]:
# Hyperparameter combination with the best mean cross-validated score.
grid_search.best_params_

{'max_features': 8, 'n_estimators': 30}

In [18]:
# The estimator built with the best hyperparameter combination found by the search.
grid_search.best_estimator_

In [19]:
# Print the cross-validated RMSE next to each candidate's hyperparameters.
# mean_test_score holds negative MSE values, hence the sign flip.
cvres = grid_search.cv_results_
for neg_mse, candidate in zip(cvres["mean_test_score"], cvres["params"]):
    rmse = np.sqrt(-neg_mse)
    print(rmse, candidate)

63916.575761905464 {'max_features': 2, 'n_estimators': 3}
54936.41801736748 {'max_features': 2, 'n_estimators': 10}
52917.16736317768 {'max_features': 2, 'n_estimators': 30}
60172.20212674653 {'max_features': 4, 'n_estimators': 3}
52570.515034035256 {'max_features': 4, 'n_estimators': 10}
50244.652386918846 {'max_features': 4, 'n_estimators': 30}
58082.56227380028 {'max_features': 6, 'n_estimators': 3}
51522.85913933991 {'max_features': 6, 'n_estimators': 10}
49970.30111162011 {'max_features': 6, 'n_estimators': 30}
58863.984741627086 {'max_features': 8, 'n_estimators': 3}
52327.82463997598 {'max_features': 8, 'n_estimators': 10}
49898.3288291329 {'max_features': 8, 'n_estimators': 30}
62379.30604451122 {'bootstrap': False, 'max_features': 2, 'n_estimators': 3}
54463.167358450715 {'bootstrap': False, 'max_features': 2, 'n_estimators': 10}
59894.3616401629 {'bootstrap': False, 'max_features': 3, 'n_estimators': 3}
52729.78518142424 {'bootstrap': False, 'max_features': 3, 'n_estimators':

In [20]:
# Show the full cross-validation results (timings, per-split and mean scores,
# ranks) as a table, one row per hyperparameter candidate.
pd.DataFrame(grid_search.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_features,param_n_estimators,param_bootstrap,params,split0_test_score,split1_test_score,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
0,0.111631,0.004031,0.007234,0.001189,2,3,,"{'max_features': 2, 'n_estimators': 3}",-4131778000.0,-3727541000.0,...,-4085329000.0,186621000.0,18,-1154993000.0,-1089537000.0,-1154284000.0,-1119194000.0,-1090311000.0,-1121664000.0,28969560.0
1,0.377681,0.008253,0.015445,0.002839,2,10,,"{'max_features': 2, 'n_estimators': 10}",-2973630000.0,-2807277000.0,...,-3018010000.0,117314000.0,11,-598379000.0,-590830000.0,-611878600.0,-572315600.0,-590845800.0,-592849800.0,12825110.0
2,1.100389,0.020411,0.056879,0.005814,2,30,,"{'max_features': 2, 'n_estimators': 30}",-2806410000.0,-2672936000.0,...,-2800227000.0,80280300.0,9,-441396000.0,-432651400.0,-453808800.0,-431289500.0,-431732700.0,-438175700.0,8651154.0
3,0.184915,0.007735,0.008026,0.000867,4,3,,"{'max_features': 4, 'n_estimators': 3}",-3590167000.0,-3500472000.0,...,-3620694000.0,127110700.0,16,-978142600.0,-982964500.0,-1004610000.0,-1016225000.0,-1005646000.0,-997517800.0,14515830.0
4,0.551928,0.022027,0.021461,0.001581,4,10,,"{'max_features': 4, 'n_estimators': 10}",-2765209000.0,-2615408000.0,...,-2763659000.0,116517200.0,7,-506590200.0,-525712200.0,-508092500.0,-518084000.0,-527939000.0,-517283600.0,8764054.0
5,1.736577,0.062862,0.050885,0.004489,4,30,,"{'max_features': 4, 'n_estimators': 30}",-2532510000.0,-2439557000.0,...,-2524525000.0,84474570.0,3,-377833500.0,-390019500.0,-388195500.0,-383667100.0,-389446600.0,-385832400.0,4579274.0
6,0.248764,0.014671,0.008911,0.002056,6,3,,"{'max_features': 6, 'n_estimators': 3}",-3361951000.0,-3305042000.0,...,-3373584000.0,137457200.0,13,-905298400.0,-957091900.0,-899449300.0,-894436100.0,-915293500.0,-914313800.0,22486780.0
7,0.734047,0.03609,0.019111,0.004799,6,10,,"{'max_features': 6, 'n_estimators': 10}",-2626441000.0,-2663824000.0,...,-2654605000.0,67194700.0,5,-500253200.0,-514718400.0,-501353800.0,-495415100.0,-515306700.0,-505409400.0,8093635.0
8,2.307468,0.083373,0.055755,0.004942,6,30,,"{'max_features': 6, 'n_estimators': 30}",-2454199000.0,-2444403000.0,...,-2497031000.0,70851970.0,2,-377617500.0,-387217900.0,-388185900.0,-376277000.0,-385719500.0,-383003600.0,5024957.0
9,0.329387,0.029554,0.007666,0.001131,8,3,,"{'max_features': 8, 'n_estimators': 3}",-3602004000.0,-3210567000.0,...,-3464969000.0,138538500.0,14,-948604100.0,-916071300.0,-903009000.0,-907806300.0,-945089600.0,-924116100.0,19056680.0


## 랜덤 탐색

In [21]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

# Integer distributions to sample hyperparameters from. scipy's randint excludes
# the upper bound, so n_estimators is drawn from 1..199 and max_features from 1..7.
param_distribs = dict(
    n_estimators=randint(low=1, high=200),
    max_features=randint(low=1, high=8),
)

forest_reg = RandomForestRegressor(random_state=42)
# Evaluate 10 randomly drawn candidates with 5-fold cross-validation; the seed
# makes the drawn candidates repeatable.
rnd_search = RandomizedSearchCV(
    forest_reg,
    param_distributions=param_distribs,
    n_iter=10,
    cv=5,
    scoring='neg_mean_squared_error',
    random_state=42,
    n_jobs=-1,
)
rnd_search.fit(housing_prepared, housing_labels)

In [22]:
# Print the cross-validated RMSE next to each sampled hyperparameter set.
# mean_test_score holds negative MSE values, hence the sign flip.
cvres = rnd_search.cv_results_
for neg_mse, candidate in zip(cvres["mean_test_score"], cvres["params"]):
    rmse = np.sqrt(-neg_mse)
    print(rmse, candidate)

49131.83457958955 {'max_features': 7, 'n_estimators': 180}
51450.84815656001 {'max_features': 5, 'n_estimators': 15}
50686.3351464383 {'max_features': 3, 'n_estimators': 72}
50797.95310722897 {'max_features': 5, 'n_estimators': 21}
49183.94670013987 {'max_features': 7, 'n_estimators': 122}
50650.96405423256 {'max_features': 3, 'n_estimators': 75}
50509.5936967293 {'max_features': 3, 'n_estimators': 88}
49532.46366377122 {'max_features': 5, 'n_estimators': 100}
50300.919499325304 {'max_features': 3, 'n_estimators': 150}
65178.44470135641 {'max_features': 5, 'n_estimators': 2}


## 최상의 모델과 오차 분석

In [23]:
# Per-feature importance scores of the best model found by the grid search
# (one value per column of the prepared feature matrix).
feature_importances = grid_search.best_estimator_.feature_importances_
feature_importances

array([6.95556703e-02, 6.03855087e-02, 4.21345961e-02, 1.52923840e-02,
       1.54474318e-02, 1.58998048e-02, 1.49720520e-02, 3.79203496e-01,
       5.48287342e-02, 1.07034780e-01, 4.80223643e-02, 6.82078731e-03,
       1.65774711e-01, 7.83480660e-05, 1.52165360e-03, 3.02767843e-03])

In [61]:
# Pair every importance score with an attribute name and list them in
# descending order of importance.
# NOTE(review): `full_pipeline` and `num_attribs` are not defined anywhere in the
# visible notebook (the prepared data is loaded from files), and the recorded
# output of this cell is an AttributeError about a missing `transformers_`
# attribute. Presumably the pipeline must be built and fitted first — confirm
# where it comes from before relying on this cell.
extra_attribs = ["rooms_per_hhold", "pop_per_hhold", "bedrooms_per_room"]
# Earlier alternative lookup, kept for reference:
# cat_encoder = cat_pipeline.named_steps["cat_encoder"]
# NOTE(review): verify that "cat_encoder" is really the transformer name
# registered in full_pipeline.
cat_encoder = full_pipeline.named_transformers_["cat_encoder"]
# categories_[0] is the category list of the first encoded column.
cat_one_hot_attribs = list(cat_encoder.categories_[0])
attributes = num_attribs + extra_attribs + cat_one_hot_attribs
sorted(zip(feature_importances, attributes), reverse=True)

AttributeError: 'ColumnTransformer' object has no attribute 'transformers_'

## 테스트 세트로 시스템 평가하기

In [39]:
# Load the previously saved train and test splits from CSV.
# (Only strat_test_set is used by the cells visible below.)
strat_train_set = pd.read_csv("datasets/housing/strat_train_set.csv")
strat_test_set = pd.read_csv("datasets/housing/strat_test_set.csv")

In [None]:
# Evaluate the best grid-search model on the held-out test set.
final_model = grid_search.best_estimator_

# Split the test set into the target column and the remaining feature columns.
y_test = strat_test_set["median_house_value"].copy()
X_test = strat_test_set.drop(columns="median_house_value")

# Only transform (never fit) on test data.
# NOTE(review): `full_pipeline` is not defined in the visible notebook — it must
# be a fitted preprocessing pipeline provided from elsewhere.
X_test_prepared = full_pipeline.transform(X_test)
final_predictions = final_model.predict(X_test_prepared)

final_mse = mean_squared_error(y_true=y_test, y_pred=final_predictions)
final_rmse = np.sqrt(final_mse)

In [None]:
# RMSE of the final model on the test set.
final_rmse

In [None]:
from scipy import stats

In [None]:
# 95% confidence interval for the test RMSE: build a Student-t interval around
# the mean squared error, then take the square root of both bounds.
confidence = 0.95
squared_errors = (final_predictions - y_test) ** 2
m = len(squared_errors)
mean = squared_errors.mean()

mse_interval = stats.t.interval(
    confidence,
    m - 1,                             # degrees of freedom
    loc=mean,                          # centre of the interval
    scale=stats.sem(squared_errors),   # standard error of the mean
)
np.sqrt(mse_interval)

In [None]:
# Same interval computed by hand with the Student-t quantile:
# margin = t * (sample std / sqrt(m)), applied on both sides of the mean MSE.
tscore = stats.t.ppf((1 + confidence) / 2, df=m - 1)
tmargin = tscore * squared_errors.std(ddof=1) / np.sqrt(m)
tuple(np.sqrt(mean + sign * tmargin) for sign in (-1, 1))

In [None]:
# Variant using the standard normal quantile instead of the Student-t quantile:
# margin = z * (sample std / sqrt(m)), applied on both sides of the mean MSE.
zscore = stats.norm.ppf((1 + confidence) / 2)
zmargin = zscore * squared_errors.std(ddof=1) / np.sqrt(m)
tuple(np.sqrt(mean + sign * zmargin) for sign in (-1, 1))

# 추가 내용

## 전처리와 예측을 포함한 파이프라인

In [None]:
# Pipeline is not imported anywhere else in the visible notebook, so without this
# import the cell fails with a NameError.
from sklearn.pipeline import Pipeline

# Chain preprocessing and a linear regressor into one estimator, so raw inputs
# can be passed straight to fit() / predict().
# NOTE(review): `full_pipeline`, `housing` and `some_data` are not defined in the
# visible notebook either — they must be supplied before this cell can run.
full_pipeline_with_predictor = Pipeline([
        ("preparation", full_pipeline),
        ("linear", LinearRegression())
    ])

full_pipeline_with_predictor.fit(housing, housing_labels)
full_pipeline_with_predictor.predict(some_data)

## joblib을 사용한 모델 저장

In [None]:
# Alias the fitted pipeline under the name used by the persistence example below.
my_model = full_pipeline_with_predictor

In [None]:
import joblib
# from sklearn.externals import joblib
# Persist the model to disk in the current working directory ...
joblib.dump(my_model, "my_model.pkl") # DIFF
# ... and later restore it from the same file.
# NOTE(review): joblib files are pickle-based; only load files from a trusted source.
my_model_loaded = joblib.load("my_model.pkl") # DIFF

## RandomizedSearchCV를 위한 SciPy 분포 함수

In [None]:
from scipy.stats import geom, expon
# Draw 10,000 samples from a geometric (p=0.5) and an exponential (scale=1)
# distribution with a fixed seed, then show one 50-bin histogram per sample set.
geom_distrib = geom(0.5).rvs(10000, random_state=42)
expon_distrib = expon(scale=1).rvs(10000, random_state=42)
for samples in (geom_distrib, expon_distrib):
    plt.hist(samples, bins=50)
    plt.show()