In [1]:
import os
import tarfile
from six.moves import urllib
import pandas as pd
import numpy as np

DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml/master/"
HOUSING_PATH = os.path.join("datasets", "housing")
HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"
def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    if not os.path.isdir(housing_path):
        os.makedirs(housing_path)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    housing_tgz = tarfile.open(tgz_path)
    housing_tgz.extractall(path=housing_path)
    housing_tgz.close()
    
def load_housing_data(housing_path=HOUSING_PATH):
    csv_path = os.path.join(housing_path, "housing.csv")
    return pd.read_csv(csv_path)
fetch_housing_data()
housing = load_housing_data()

In [2]:
housing.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [3]:
from sklearn.model_selection import train_test_split

In [4]:
y = housing['median_house_value']
X = housing.drop('median_house_value', axis=1)
#X['total_bedrooms'].fillna((X['total_bedrooms'].mean()), inplace=True) 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

In [5]:
X_train.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
5748,-118.27,34.16,15.0,5036.0,1299.0,3164.0,1175.0,2.9148,<1H OCEAN
5592,-118.26,33.8,41.0,2004.0,481.0,1658.0,456.0,3.1779,<1H OCEAN
2110,-119.75,36.75,49.0,2331.0,460.0,1290.0,477.0,2.5111,INLAND
13901,-116.56,34.06,15.0,6928.0,1529.0,2568.0,1075.0,2.5405,INLAND
18425,-121.83,37.26,7.0,3609.0,751.0,1739.0,682.0,4.5033,<1H OCEAN


In [6]:
X_train.dtypes

longitude             float64
latitude              float64
housing_median_age    float64
total_rooms           float64
total_bedrooms        float64
population            float64
households            float64
median_income         float64
ocean_proximity        object
dtype: object

In [7]:
X_train.shape

(14448, 9)

In [8]:
train_objs_num = len(X_train)
dataset = pd.concat(objs=[X_train, X_test], axis=0)
dataset = pd.get_dummies(data=dataset, columns=["ocean_proximity"])
X_train = dataset[:train_objs_num]
X_test = dataset[train_objs_num:]

In [9]:
X_train['total_bedrooms'].isnull().sum()

143

In [10]:
# Going to use a random forest model to impute the missing total bedrooms value

In [11]:
total_bedrooms = X_train[pd.notnull(X_train['total_bedrooms'])]
total_bedrooms.shape

(14305, 13)

In [12]:
y_total_bedrooms = total_bedrooms['total_bedrooms']
X_total_bedrooms = total_bedrooms.drop('total_bedrooms', axis=1)

In [13]:
from sklearn.ensemble import RandomForestRegressor
clf = RandomForestRegressor(random_state=0)
clf.fit(X_total_bedrooms, y_total_bedrooms)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=0, verbose=0, warm_start=False)

In [14]:
for index, row in X_train.iterrows():
    if pd.isnull(row['total_bedrooms']):
        X_train.set_value(index,'total_bedrooms', clf.predict(row.drop('total_bedrooms').values.reshape(1, -1)))
        
X_train['total_bedrooms'].isnull().sum()

0

In [15]:
X_test['total_bedrooms'].isnull().sum()

64

In [16]:
for index, row in X_test.iterrows():
    if pd.isnull(row['total_bedrooms']):
        X_test.set_value(index,'total_bedrooms', clf.predict(row.drop('total_bedrooms').values.reshape(1, -1)))
        
X_test['total_bedrooms'].isnull().sum()

0

In [17]:
from sklearn.linear_model import LinearRegression
linear = LinearRegression()
linear.fit(X_train, y_train)
linear.score(X_test, y_test)

0.63842973933662472

In [18]:
from sklearn.neighbors import KNeighborsRegressor
kn = KNeighborsRegressor()
kn.fit(X_train, y_train)
kn.score(X_test, y_test)

0.24213483873922781

In [19]:
from sklearn.ensemble import AdaBoostRegressor
abr = AdaBoostRegressor()
abr.fit(X_train, y_train)
abr.score(X_test, y_test)

0.40052435957757904

In [20]:
from sklearn.ensemble import GradientBoostingRegressor
gbr = GradientBoostingRegressor()
gbr.fit(X_train, y_train)
gbr.score(X_test, y_test)

0.76736469069396485

In [21]:
from sklearn.ensemble import RandomForestRegressor
rfr = RandomForestRegressor()
rfr.fit(X_train, y_train)
rfr.score(X_test, y_test)

0.79902376259544505

In [22]:
from sklearn.ensemble import BaggingRegressor
br = BaggingRegressor()
br.fit(X_train, y_train)
br.score(X_test, y_test)

0.79977597375867482

In [23]:
from sklearn.linear_model import HuberRegressor
hr = HuberRegressor()
hr.fit(X_train, y_train)
hr.score(X_train, y_train)

0.5929330217791039

In [24]:
# from sklearn.svm import SVC
# svc = SVC()
# svc.fit(X_train, y_train)
# svc.score(X_test, y_test)

In [None]:
# The initial top 3 models are GradientBoostingRegressor, RandomForestRegressor, and BaggingRegressor

In [None]:
# GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV

param_grid = {"alpha": [.0001, .001, .01, .1, .99],
             "max_depth": [5, 10, 15]}

grid_search = GridSearchCV(GradientBoostingRegressor(max_leaf_nodes=None, n_estimators=100), param_grid, n_jobs=-1)
grid_search.fit(X_train, y_train)
grid_search.score(X_test, y_test)

In [None]:
grid_search.best_params_

In [None]:
# {'alpha': 0.99, 'max_depth': 10} got me a score of .82
# {'alpha': 0.001, 'max_depth': 10, 'n_estimators': 100} score of .82

In [None]:
# RandomForestRegressor

param_grid = {"alpha": [.0001, .001, .01, .1, .99],
             "max_depth": [5, 10, 15]}

grid_search = GridSearchCV(GradientBoostingRegressor(max_leaf_nodes=None, n_estimators=100), param_grid, n_jobs=-1)
grid_search.fit(X_train, y_train)
grid_search.score(X_test, y_test)

In [None]:
grid_search.best_params_