In [6]:
import sys
assert sys.version_info >=(3,5)

import sklearn
assert sklearn.__version__ >="0.20"

import numpy as np
import os

import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

PROJECT_ROOT_DIR="."
CHAPTER_ID="end_to_end_project"

IMAGES_PATH=os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
os.makedirs(IMAGES_PATH, exist_ok=True)

#plt를 저장함
def save_fig(fig_id, tight_layout=True, fig_extension="png",resolution=300):
    path=os.path.join(IMAGES_PATH, fig_id+"."+fig_extension)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)
    
import warnings
warnings.filterwarnings(action="ignore", message="^internal gelsd")

import os
import tarfile
import urllib

DOWNLOAD_ROOT="http://jth2405.dothome.co.kr/"
HOUSING_PATH=os.path.join("datasets","housing")
HOUSING_URL= DOWNLOAD_ROOT+"datasets/housing/housing.tgz"

def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    os.makedirs(housing_path, exist_ok=True)
    tgz_path=os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url,tgz_path)
    housing_tgz=tarfile.open(tgz_path)
    housing_tgz.extractall(path=housing_path)
    housing_tgz.close()
    
import pandas as pd

def load_housing_data(housing_path=HOUSING_PATH):
    csv_path=os.path.join(housing_path, "housing.csv")
    return pd.read_csv(csv_path)

In [None]:
fetch_housing_data()

In [7]:
housing=load_housing_data()

In [8]:
from sklearn.model_selection import train_test_split
train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)

print("\n\ntest_set.head():\n",test_set.head())



test_set.head():
        longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
20046    -119.01     36.06                25.0       1505.0             NaN   
3024     -119.46     35.14                30.0       2943.0             NaN   
15663    -122.44     37.80                52.0       3830.0             NaN   
20484    -118.72     34.28                17.0       3051.0             NaN   
9814     -121.93     36.62                34.0       2351.0             NaN   

       population  households  median_income  median_house_value  \
20046      1392.0       359.0         1.6812             47700.0   
3024       1565.0       584.0         2.5313             45800.0   
15663      1310.0       963.0         3.4801            500001.0   
20484      1705.0       495.0         5.7376            218600.0   
9814       1063.0       428.0         3.7250            278000.0   

      ocean_proximity  
20046          INLAND  
3024           INLAND  
15663        NEAR BAY  


In [9]:
housing["income_cat"] = np.ceil(housing["median_income"] / 1.5)
housing["income_cat"].where(housing["income_cat"] < 5, 5.0, inplace=True) 

from sklearn.model_selection import StratifiedShuffleSplit 

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(housing, housing["income_cat"]):
    strat_train_set = housing.loc[train_index] 
    strat_test_set = housing.loc[test_index] 
    
housing = strat_train_set.copy()

In [10]:
print("\n\n\n##################################################\n")
print("제 03강 실습과제 2017250045 정태환\n")

housing = strat_train_set.drop("median_house_value", axis=1)# drop labels for training set
housing_labels = strat_train_set["median_house_value"].copy()

sample_incomplete_rows = housing[housing.isnull().any(axis=1)].head()
sample_incomplete_rows




##################################################

제 03강 실습과제 2017250045 정태환



Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity,income_cat
1606,-122.08,37.88,26.0,2947.0,,825.0,626.0,2.933,NEAR BAY,2.0
10915,-117.87,33.73,45.0,2264.0,,1970.0,499.0,3.4193,<1H OCEAN,3.0
19150,-122.7,38.35,14.0,2313.0,,954.0,397.0,3.7813,<1H OCEAN,3.0
4186,-118.23,34.13,48.0,1308.0,,835.0,294.0,4.2891,<1H OCEAN,3.0
16885,-122.4,37.58,26.0,3281.0,,1145.0,480.0,6.358,NEAR OCEAN,5.0


In [82]:
median = housing["total_bedrooms"].median()
sample_incomplete_rows["total_bedrooms"].fillna(median, inplace=True) # option 3
print("제 03강 실습과제 2017250045 정태환\n")
sample_incomplete_rows

제 03강 실습과제 2017250045 정태환



Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity,income_cat
1606,-122.08,37.88,26.0,2947.0,433.0,825.0,626.0,2.933,NEAR BAY,2.0
10915,-117.87,33.73,45.0,2264.0,433.0,1970.0,499.0,3.4193,<1H OCEAN,3.0
19150,-122.7,38.35,14.0,2313.0,433.0,954.0,397.0,3.7813,<1H OCEAN,3.0
4186,-118.23,34.13,48.0,1308.0,433.0,835.0,294.0,4.2891,<1H OCEAN,3.0
16885,-122.4,37.58,26.0,3281.0,433.0,1145.0,480.0,6.358,NEAR OCEAN,5.0


In [83]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy="median")

#ocean_procimity속성 제거
housing_num = housing.drop("ocean_proximity", axis=1)
imputer.fit(housing_num)
imputer.statistics_
print("제 03강 실습과제 2017250045 정태환\n")
housing_num.median().values

제 03강 실습과제 2017250045 정태환



array([-118.51   ,   34.26   ,   29.     , 2119.     ,  433.     ,
       1164.     ,  408.     ,    3.54155,    3.     ])

In [84]:
X = imputer.transform(housing_num) 
housing_tr = pd.DataFrame(X, columns=housing_num.columns,
                          index = list(housing.index.values)) 
print("제 03강 실습과제 2017250045 정태환\n")
housing_tr.loc[sample_incomplete_rows.index.values]

제 03강 실습과제 2017250045 정태환



Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,income_cat
1606,-122.08,37.88,26.0,2947.0,433.0,825.0,626.0,2.933,2.0
10915,-117.87,33.73,45.0,2264.0,433.0,1970.0,499.0,3.4193,3.0
19150,-122.7,38.35,14.0,2313.0,433.0,954.0,397.0,3.7813,3.0
4186,-118.23,34.13,48.0,1308.0,433.0,835.0,294.0,4.2891,3.0
16885,-122.4,37.58,26.0,3281.0,433.0,1145.0,480.0,6.358,5.0


In [14]:
housing_cat=housing[["ocean_proximity"]]
housing_cat.head(10)

Unnamed: 0,ocean_proximity
12655,INLAND
15502,NEAR OCEAN
2908,INLAND
14053,NEAR OCEAN
20496,<1H OCEAN
1481,NEAR BAY
18125,<1H OCEAN
5830,<1H OCEAN
17989,<1H OCEAN
4861,<1H OCEAN


In [86]:
from sklearn.preprocessing import OrdinalEncoder
ordinal_encoder=OrdinalEncoder()
housing_cat_encoded=ordinal_encoder.fit_transform(housing_cat)
print("제 03강 실습과제 2017250045 정태환\n")
housing_cat_encoded[:10]

제 03강 실습과제 2017250045 정태환



array([[1.],
       [4.],
       [1.],
       [4.],
       [0.],
       [3.],
       [0.],
       [0.],
       [0.],
       [0.]])

In [87]:
print("제 03강 실습과제 2017250045 정태환\n")
ordinal_encoder.categories_

제 03강 실습과제 2017250045 정태환



[array(['<1H OCEAN', 'INLAND', 'ISLAND', 'NEAR BAY', 'NEAR OCEAN'],
       dtype=object)]

In [88]:
from sklearn.preprocessing import OneHotEncoder
print("제 03강 실습과제 2017250045 정태환\n")
cat_encoder = OneHotEncoder() 
housing_cat_1hot = cat_encoder.fit_transform(housing_cat)
housing_cat_1hot

제 03강 실습과제 2017250045 정태환



<16512x5 sparse matrix of type '<class 'numpy.float64'>'
	with 16512 stored elements in Compressed Sparse Row format>

In [18]:
housing_cat_1hot.toarray()

array([[0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 1.],
       [0., 1., 0., 0., 0.],
       ...,
       [1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0.]])

In [19]:
cat_encoder.categories_

[array(['<1H OCEAN', 'INLAND', 'ISLAND', 'NEAR BAY', 'NEAR OCEAN'],
       dtype=object)]

In [20]:
from sklearn.base import BaseEstimator, TransformerMixin

rooms_ix, bedrooms_ix, population_ix, household_ix = 3, 4, 5, 6 

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    def __init__(self, add_bedrooms_per_room = True):
        self.add_bedrooms_per_room = add_bedrooms_per_room 
    def fit(self, X, y=None):
        return self 
    def transform(self, X, y=None):
        rooms_per_household = X[:, rooms_ix] / X[:, household_ix]
        population_per_household = X[:, population_ix] / X[:, household_ix]
        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
            return np.c_[X, rooms_per_household, population_per_household,
                         bedrooms_per_room]
        else: 
            return np.c_[X, rooms_per_household, population_per_household]
        
attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=False)   
housing_extra_attribs = attr_adder.transform(housing.values)


In [89]:
housing_extra_attribs = pd.DataFrame(housing_extra_attribs,
        columns=list(housing.columns)+ ["rooms_per_household", "population_per_household"])
print("제 03강 실습과제 2017250045 정태환\n")
housing_extra_attribs.head()



제 03강 실습과제 2017250045 정태환



Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity,income_cat,rooms_per_household,population_per_household
0,-121.46,38.52,29.0,3873.0,797.0,2237.0,706.0,2.1736,INLAND,2.0,5.485836,3.168555
1,-117.23,33.09,7.0,5320.0,855.0,2015.0,768.0,6.3373,NEAR OCEAN,5.0,6.927083,2.623698
2,-119.04,35.37,44.0,1618.0,310.0,667.0,300.0,2.875,INLAND,2.0,5.393333,2.223333
3,-117.13,32.75,24.0,1877.0,519.0,898.0,483.0,2.2264,NEAR OCEAN,2.0,3.886128,1.859213
4,-118.7,34.28,27.0,3536.0,646.0,1837.0,580.0,4.4964,<1H OCEAN,3.0,6.096552,3.167241


In [22]:
housing_cat = housing[["ocean_proximity"]] 
housing_cat.head(10)

Unnamed: 0,ocean_proximity
12655,INLAND
15502,NEAR OCEAN
2908,INLAND
14053,NEAR OCEAN
20496,<1H OCEAN
1481,NEAR BAY
18125,<1H OCEAN
5830,<1H OCEAN
17989,<1H OCEAN
4861,<1H OCEAN


In [90]:
from sklearn.pipeline import Pipeline 
from sklearn.preprocessing import StandardScaler 
print("제 03강 실습과제 2017250045 정태환\n")
num_pipeline = Pipeline([ 
    ('imputer', SimpleImputer(strategy="median")),
    ('attribs_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler()),
    ])

housing_num_tr=num_pipeline.fit_transform(housing_num)
housing_num_tr

제 03강 실습과제 2017250045 정태환



array([[-0.94135046,  1.34743822,  0.02756357, ...,  0.01739526,
         0.00622264, -0.12112176],
       [ 1.17178212, -1.19243966, -1.72201763, ...,  0.56925554,
        -0.04081077, -0.81086696],
       [ 0.26758118, -0.1259716 ,  1.22045984, ..., -0.01802432,
        -0.07537122, -0.33827252],
       ...,
       [-1.5707942 ,  1.31001828,  1.53856552, ..., -0.5092404 ,
        -0.03743619,  0.32286937],
       [-1.56080303,  1.2492109 , -1.1653327 , ...,  0.32814891,
        -0.05915604, -0.45702273],
       [-1.28105026,  2.02567448, -0.13148926, ...,  0.01407228,
         0.00657083, -0.12169672]])

In [92]:
from sklearn.compose import ColumnTransformer 

num_attribs = list(housing_num) 
cat_attribs = ["ocean_proximity"] 

full_pipeline = ColumnTransformer([ 
    ("num", num_pipeline, num_attribs),
    ("cat", OneHotEncoder(), cat_attribs),
]) 

print("제 03강 실습과제 2017250045 정태환\n")
housing_prepared = full_pipeline.fit_transform(housing) 
housing_prepared

제 03강 실습과제 2017250045 정태환



array([[-0.94135046,  1.34743822,  0.02756357, ...,  0.        ,
         0.        ,  0.        ],
       [ 1.17178212, -1.19243966, -1.72201763, ...,  0.        ,
         0.        ,  1.        ],
       [ 0.26758118, -0.1259716 ,  1.22045984, ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [-1.5707942 ,  1.31001828,  1.53856552, ...,  0.        ,
         0.        ,  0.        ],
       [-1.56080303,  1.2492109 , -1.1653327 , ...,  0.        ,
         0.        ,  0.        ],
       [-1.28105026,  2.02567448, -0.13148926, ...,  0.        ,
         0.        ,  0.        ]])

In [36]:
from sklearn.linear_model import LinearRegression

lin_reg=LinearRegression()
lin_reg.fit(housing_prepared, housing_labels)

LinearRegression()

In [93]:
some_data = housing.iloc[:5]
some_labels = housing_labels.iloc[:5]
some_data_prepared = full_pipeline.transform(some_data)
print("제 03강 실습과제 2017250045 정태환\nPredictions:", lin_reg.predict(some_data_prepared)) 
print("Labels:", list(some_labels))

제 03강 실습과제 2017250045 정태환
Predictions: [ 83811.08312039 314388.27147935 145641.85902864 183704.64921202
 238729.66725745]
Labels: [72100.0, 279600.0, 82700.0, 112500.0, 238300.0]


In [40]:
from sklearn.metrics import mean_squared_error 
housing_predictions = lin_reg.predict(housing_prepared)
lin_mse = mean_squared_error(housing_labels, housing_predictions)
lin_rmse = np.sqrt(lin_mse) 
lin_rmse

68376.51254853733

In [94]:
from sklearn.metrics import mean_absolute_error
lin_mae = mean_absolute_error(housing_labels, housing_predictions) 
print("제 03강 실습과제 2017250045 정태환\n")
lin_mae

제 03강 실습과제 2017250045 정태환



12102.725989583334

In [95]:
from sklearn.tree import DecisionTreeRegressor

tree_reg = DecisionTreeRegressor()
tree_reg.fit(housing_prepared, housing_labels)

print("제 03강 실습과제 2017250045 정태환\n")
housing_predictions = tree_reg.predict(housing_prepared)
tree_mse = mean_squared_error(housing_labels, housing_predictions) 
tree_rmse = np.sqrt(tree_mse) 
tree_rmse

제 03강 실습과제 2017250045 정태환



0.0

In [96]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(tree_reg, housing_prepared, housing_labels,
                         scoring="neg_mean_squared_error", cv=10) 
tree_rmse_scores = np.sqrt(-scores)

def display_scores(scores):
    print("\nScores:\n", scores)
    print("\nMean:\n", scores.mean())
    print("\nStandard deviation:\n", scores.std())

print("제 03강 실습과제 2017250045 정태환\n")
display_scores(tree_rmse_scores)

제 03강 실습과제 2017250045 정태환


Scores:
 [72992.28422217 69956.97933148 66792.58634274 71380.50032958
 69687.9053465  76262.84201452 72217.59785132 73746.22366318
 68788.62248983 71043.48132603]

Mean:
 71286.90229173577

Standard deviation:
 2557.6656930586637


In [97]:
lin_scores = cross_val_score(lin_reg, housing_prepared, housing_labels,
                             scoring="neg_mean_squared_error", cv=10)
lin_rmse_scores = np.sqrt(-lin_scores) 
print("제 03강 실습과제 2017250045 정태환\n")
display_scores(lin_rmse_scores)


제 03강 실습과제 2017250045 정태환


Scores:
 [71523.78333874 64044.46774989 67454.97869698 68514.10137273
 66303.62531226 72166.63405138 74464.08841381 68570.11804395
 66063.64175868 69870.86192291]

Mean:
 68897.63006613274

Standard deviation:
 3002.7461275348655


In [102]:
from sklearn.ensemble import RandomForestRegressor 
forest_reg = RandomForestRegressor()
forest_reg.fit(housing_prepared, housing_labels) 

forest_scores = cross_val_score(forest_reg, housing_prepared, housing_labels,
                                scoring="neg_mean_squared_error", cv=10)
 
forest_rmse_scores = np.sqrt(-forest_scores)
print("제 03강 실습과제 2017250045 정태환\n")
display_scores(forest_rmse_scores)

제 03강 실습과제 2017250045 정태환


Scores:
 [51267.50697953 49298.6916879  46653.69989169 52182.57902534
 47387.21561895 51387.55434546 52637.05913161 49684.95976704
 48229.27252511 53397.80645778]

Mean:
 50212.63454304069

Standard deviation:
 2193.022877899876


In [105]:
from sklearn.model_selection import GridSearchCV 
param_grid = [ 
    {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
    {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},
]
forest_reg = RandomForestRegressor(random_state=42)
grid_search = GridSearchCV(forest_reg, param_grid, cv=5, 
                           scoring='neg_mean_squared_error')
grid_search.fit(housing_prepared, housing_labels)

print("2017250045 정태환\ngrid_search.best_params_ :", grid_search.best_params_) 

2017250045 정태환
grid_search.best_params_ : {'max_features': 6, 'n_estimators': 30}


In [106]:
grid_search.best_estimator_

RandomForestRegressor(max_features=6, n_estimators=30, random_state=42)

In [110]:
from sklearn.model_selection import RandomizedSearchCV 
from scipy.stats import randint 
param_distribs = { 'n_estimators': randint(low=1, high=200),
                  'max_features': randint(low=1, high=8),
} 

forest_reg = RandomForestRegressor(random_state=42) 
rnd_search = RandomizedSearchCV(forest_reg, param_distributions=param_distribs,
                                n_iter=10, cv=5, scoring='neg_mean_squared_error',random_state=42) 

print("2017250045 정태환\n")
rnd_search.fit(housing_prepared, housing_labels)
cvres = rnd_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)

2017250045 정태환

49799.635737761106 {'max_features': 7, 'n_estimators': 180}
52293.114093913726 {'max_features': 5, 'n_estimators': 15}
51327.353255586764 {'max_features': 3, 'n_estimators': 72}
51528.97117998048 {'max_features': 5, 'n_estimators': 21}
49958.42482333546 {'max_features': 7, 'n_estimators': 122}
51270.531241462595 {'max_features': 3, 'n_estimators': 75}
51172.437672640175 {'max_features': 3, 'n_estimators': 88}
50255.14987044715 {'max_features': 5, 'n_estimators': 100}
50894.38729795359 {'max_features': 3, 'n_estimators': 150}
65022.070435017646 {'max_features': 5, 'n_estimators': 2}


In [112]:
feature_importances = grid_search.best_estimator_.feature_importances_ 
print("2017250045 정태환\n")
print("\nfeature_importances: \n", feature_importances)

2017250045 정태환


feature_importances: 
 [6.76445809e-02 6.24744883e-02 4.48349664e-02 1.82692423e-02
 1.78441912e-02 1.96046284e-02 1.81345873e-02 2.43728155e-01
 1.64552239e-01 5.15312129e-02 1.00601503e-01 5.56925429e-02
 1.39077386e-02 1.13027019e-01 8.79969201e-05 2.68765993e-03
 5.37724723e-03]


In [111]:
extra_attribs = ["rooms_per_hhold", "pop_per_hhold", "bedrooms_per_room"]

cat_encoder = full_pipeline.named_transformers_["cat"] 
cat_one_hot_attribs = list(cat_encoder.categories_[0])
attributes = num_attribs + extra_attribs + cat_one_hot_attribs
print("2017250045 정태환\n")
print("\nsorted(zip(feature_importances, attributes), reverse=True) : \n", 
      sorted(zip(feature_importances, attributes), reverse=True))

2017250045 정태환


sorted(zip(feature_importances, attributes), reverse=True) : 
 [(0.31835471150192546, 'median_income'), (0.14262787170081898, 'INLAND'), (0.1196237044148624, 'income_cat'), (0.10724333043362204, 'pop_per_hhold'), (0.06538779056897238, 'longitude'), (0.05754931656343965, 'latitude'), (0.04355226840372541, 'housing_median_age'), (0.03529887374571845, 'bedrooms_per_room'), (0.03448950971627275, 'rooms_per_hhold'), (0.01622085016573279, 'population'), (0.015272077451460732, 'households'), (0.015204571326993975, 'total_bedrooms'), (0.015173430030007646, 'total_rooms'), (0.0075459731392879285, '<1H OCEAN'), (0.0039949237642856955, 'NEAR OCEAN'), (0.002374953161691069, 'NEAR BAY'), (8.584391118242743e-05, 'ISLAND')]


In [113]:
final_model = grid_search.best_estimator_
X_test = strat_test_set.drop("median_house_value", axis=1) 
y_test = strat_test_set["median_house_value"].copy()

X_test_prepared = full_pipeline.transform(X_test) 

final_predictions = final_model.predict(X_test_prepared) 

final_mse = mean_squared_error(y_test, final_predictions)

final_rmse = np.sqrt(final_mse) 
print("2017250045 정태환\n")
print("y_test: \n", y_test[:10])
print("final_predictions: \n", final_predictions[:10])
print("\nFinal RMSE:\n", final_rmse)

2017250045 정태환

y_test: 
 5241     500001.0
17352    162500.0
3505     204600.0
7777     159700.0
14155    184000.0
7057     151900.0
33       104900.0
17049    500001.0
18164    367400.0
10444    346500.0
Name: median_house_value, dtype: float64
final_predictions: 
 [488480.76666667 230096.7        217300.         164356.66666667
 258603.36666667 160763.33333333 117356.66666667 415110.13333333
 254880.         243366.7       ]

Final RMSE:
 48557.33618531538
