In [5]:
import os
import tarfile
from six.moves import urllib
import pandas as pd
import numpy as np
from math import log

DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml/master/"
HOUSING_PATH = os.path.join("datasets", "housing")
HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"
def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    if not os.path.isdir(housing_path):
        os.makedirs(housing_path)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    housing_tgz = tarfile.open(tgz_path)
    housing_tgz.extractall(path=housing_path)
    housing_tgz.close()
    
def load_housing_data(housing_path=HOUSING_PATH):
    csv_path = os.path.join(housing_path, "housing.csv")
    return pd.read_csv(csv_path)
fetch_housing_data()
housing = load_housing_data()

In [6]:
housing.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [7]:
from sklearn.model_selection import train_test_split

In [8]:
# y = np.log(y[:])
# log(y[:])
# for row in y:
#     y[row] = log(y).astype(float)

# y = y.astype(float)
    

In [9]:
y = housing['median_house_value']
y = np.log(y[:])
X = housing.drop('median_house_value', axis=1)
#X['total_bedrooms'].fillna((X['total_bedrooms'].mean()), inplace=True) 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

In [10]:
X_train.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
5748,-118.27,34.16,15.0,5036.0,1299.0,3164.0,1175.0,2.9148,<1H OCEAN
5592,-118.26,33.8,41.0,2004.0,481.0,1658.0,456.0,3.1779,<1H OCEAN
2110,-119.75,36.75,49.0,2331.0,460.0,1290.0,477.0,2.5111,INLAND
13901,-116.56,34.06,15.0,6928.0,1529.0,2568.0,1075.0,2.5405,INLAND
18425,-121.83,37.26,7.0,3609.0,751.0,1739.0,682.0,4.5033,<1H OCEAN


In [11]:
# Feature engineer people per household, improved my score by .5
combine = [X_train, X_test]
for dataset in combine:
    dataset['ppl_per_household'] = (dataset['population']/dataset['households'])
    
X_train.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity,ppl_per_household
5748,-118.27,34.16,15.0,5036.0,1299.0,3164.0,1175.0,2.9148,<1H OCEAN,2.692766
5592,-118.26,33.8,41.0,2004.0,481.0,1658.0,456.0,3.1779,<1H OCEAN,3.635965
2110,-119.75,36.75,49.0,2331.0,460.0,1290.0,477.0,2.5111,INLAND,2.704403
13901,-116.56,34.06,15.0,6928.0,1529.0,2568.0,1075.0,2.5405,INLAND,2.388837
18425,-121.83,37.26,7.0,3609.0,751.0,1739.0,682.0,4.5033,<1H OCEAN,2.549853


In [12]:
X_train.dtypes

longitude             float64
latitude              float64
housing_median_age    float64
total_rooms           float64
total_bedrooms        float64
population            float64
households            float64
median_income         float64
ocean_proximity        object
ppl_per_household     float64
dtype: object

In [13]:
X_train.shape

(14448, 10)

In [14]:
train_objs_num = len(X_train)
dataset = pd.concat(objs=[X_train, X_test], axis=0)
dataset = pd.get_dummies(data=dataset, columns=["ocean_proximity"])
X_train = dataset[:train_objs_num]
X_test = dataset[train_objs_num:]

In [15]:
X_train['total_bedrooms'].isnull().sum()

143

In [16]:
# Going to use a random forest model to impute the missing total bedrooms value

In [17]:
total_bedrooms = X_train[pd.notnull(X_train['total_bedrooms'])]
total_bedrooms.shape

(14305, 14)

In [18]:
y_total_bedrooms = total_bedrooms['total_bedrooms']
X_total_bedrooms = total_bedrooms.drop('total_bedrooms', axis=1)

In [21]:
from sklearn.ensemble import RandomForestRegressor
clf = RandomForestRegressor(random_state=0)
clf.fit(X_total_bedrooms, y_total_bedrooms)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=0, verbose=0, warm_start=False)

In [22]:
for index, row in X_train.iterrows():
    if pd.isnull(row['total_bedrooms']):
        X_train.set_value(index,'total_bedrooms', clf.predict(row.drop('total_bedrooms').values.reshape(1, -1)))
        
X_train['total_bedrooms'].isnull().sum()

0

In [23]:
X_test['total_bedrooms'].isnull().sum()

64

In [24]:
for index, row in X_test.iterrows():
    if pd.isnull(row['total_bedrooms']):
        X_test.set_value(index,'total_bedrooms', clf.predict(row.drop('total_bedrooms').values.reshape(1, -1)))
        
X_test['total_bedrooms'].isnull().sum()

0

In [25]:
# Feature engineer people per household, improved my score by .5
combine = [X_train, X_test]
for dataset in combine:
    dataset['br_per_household'] = (dataset['total_bedrooms']/dataset['households'])
    dataset['rooms_per_household'] = (dataset['total_rooms']/dataset['households'])
    
# X_train = X_train.drop(['ocean_proximity_ISLAND', 'ocean_proximity_NEAR BAY', 'ocean_proximity_NEAR OCEAN', 'ocean_proximity_<1H OCEAN'], axis=1)
# X_test = X_test.drop(['ocean_proximity_ISLAND', 'ocean_proximity_NEAR BAY', 'ocean_proximity_NEAR OCEAN', 'ocean_proximity_<1H OCEAN'], axis=1)
X_train.head()


Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ppl_per_household,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN,br_per_household,rooms_per_household
5748,-118.27,34.16,15.0,5036.0,1299.0,3164.0,1175.0,2.9148,2.692766,1,0,0,0,0,1.105532,4.285957
5592,-118.26,33.8,41.0,2004.0,481.0,1658.0,456.0,3.1779,3.635965,1,0,0,0,0,1.054825,4.394737
2110,-119.75,36.75,49.0,2331.0,460.0,1290.0,477.0,2.5111,2.704403,0,1,0,0,0,0.964361,4.886792
13901,-116.56,34.06,15.0,6928.0,1529.0,2568.0,1075.0,2.5405,2.388837,0,1,0,0,0,1.422326,6.444651
18425,-121.83,37.26,7.0,3609.0,751.0,1739.0,682.0,4.5033,2.549853,1,0,0,0,0,1.101173,5.291789


In [26]:
# Create income brackets
# This hurt my score
          
# for index, row in X_train.iterrows():
#     if(row['median_income'] <= 3):
#         X_train.set_value(index,'income_bracket', 1)
#     elif(row['median_income'] > 5 and row['median_income'] <= 6):
#         X_train.set_value(index,'income_bracket', 2)
#     elif(row['median_income'] > 5 and row['median_income'] <= 9):
#         X_train.set_value(index,'income_bracket', 3)
#     elif(row['median_income'] > 5 and row['median_income'] <= 12):
#         X_train.set_value(index,'income_bracket', 4)
#     else:
#         X_train.set_value(index,'income_bracket', 5)

# for index, row in X_test.iterrows():
#     if(row['median_income'] <= 3):
#         X_test.set_value(index,'income_bracket', 1)
#     elif(row['median_income'] > 5 and row['median_income'] <= 6):
#         X_test.set_value(index,'income_bracket', 2)
#     elif(row['median_income'] > 5 and row['median_income'] <= 9):
#         X_test.set_value(index,'income_bracket', 3)
#     elif(row['median_income'] > 5 and row['median_income'] <= 12):
#         X_test.set_value(index,'income_bracket', 4)
#     else:
#         X_test.set_value(index,'income_bracket', 5)
        
# X_test.head()

In [27]:
# X_train = X_train.drop('median_income', axis=1)
# X_test = X_test.drop('median_income', axis=1)

In [28]:
from sklearn.linear_model import LinearRegression #.63842973933662484
linear = LinearRegression()
linear.fit(X_train, y_train)
linear.score(X_test, y_test)

0.65597417627999777

In [29]:
from sklearn.neighbors import KNeighborsRegressor #0.24213483873922781
kn = KNeighborsRegressor()
kn.fit(X_train, y_train)
kn.score(X_test, y_test)

0.16620231275698327

In [30]:
from sklearn.ensemble import AdaBoostRegressor #0.37685610836554639
abr = AdaBoostRegressor()
abr.fit(X_train, y_train)
abr.score(X_test, y_test)

0.61172580876969729

In [31]:
from sklearn.ensemble import GradientBoostingRegressor #0.76735969543767302
gbr = GradientBoostingRegressor()
gbr.fit(X_train, y_train)
gbr.score(X_test, y_test)

0.79666700808842683

In [32]:
from sklearn.ensemble import RandomForestRegressor #0.79285589164147563
rfr = RandomForestRegressor()
rfr.fit(X_train, y_train)
rfr.score(X_test, y_test)

0.81015907596121894

In [33]:
from sklearn.ensemble import BaggingRegressor #0.79144004831467507
br = BaggingRegressor()
br.fit(X_train, y_train)
br.score(X_test, y_test)

0.81148839713027099

In [34]:
from sklearn.linear_model import HuberRegressor #0.5929330217791039
hr = HuberRegressor()
hr.fit(X_train, y_train)
hr.score(X_train, y_train)

0.5844223308085067

In [35]:
# from sklearn.svm import SVC
# svc = SVC()
# svc.fit(X_train, y_train)
# svc.score(X_test, y_test)

In [36]:
# The initial top 3 models are GradientBoostingRegressor, RandomForestRegressor, and BaggingRegressor

In [37]:
# GradientBoostingRegressor 0.82203915589692222
from sklearn.model_selection import GridSearchCV

# param_grid = {"n_estimators": [100, 110, 125],
#              "max_depth": [5, 10, 15]}

# grid_search = GridSearchCV(GradientBoostingRegressor(alpha=.99), param_grid, n_jobs=-1)
# grid_search.fit(X_train, y_train)
# grid_search.score(X_test, y_test)

In [38]:
# grid_search.best_params_

In [39]:
# {'alpha': 0.99, 'max_depth': 10, n_estimators=100} got me a score of 0.82126867966327466

In [40]:
# RandomForestRegressor 0.79994094212026812

# param_grid = {"max_depth": [100, 110, 115]}

# grid_search = GridSearchCV(RandomForestRegressor(max_leaf_nodes=None, n_estimators=10), param_grid, n_jobs=-1)
# grid_search.fit(X_train, y_train)
# grid_search.score(X_test, y_test)

In [41]:
# grid_search.best_params_

In [42]:
# BaggingRegressor 0.81569243911264921

# param_grid = {"n_estimators": [110, 115]}

# grid_search = GridSearchCV(BaggingRegressor(), param_grid, n_jobs=-1)
# grid_search.fit(X_train, y_train)
# grid_search.score(X_test, y_test)

In [43]:
# grid_search.best_params_

In [44]:
# Doing a grid search it looks like GradientBoostingRegressor and BaggingRegressor were our two best models

# Lets do some more searching 0.82769850351711172

# param_grid = {"min_samples_split": [6,8,10],
#              "min_samples_leaf": [4,6,8]}

# grid_search = GridSearchCV(GradientBoostingRegressor(alpha=.99, max_depth=10, n_estimators=100), param_grid, n_jobs=-1)
# grid_search.fit(X_train, y_train)
# grid_search.score(X_test, y_test)

In [45]:
# grid_search.best_params_

In [None]:
# {'min_samples_leaf': 8, 'min_samples_split': 6} 0.82769850351711172

In [None]:
# BaggingRegressor 0.81624051089996907

param_grid = {}

grid_search = GridSearchCV(BaggingRegressor(n_estimators=110, bootstrap_features=True), param_grid, n_jobs=-1)
grid_search.fit(X_train, y_train)
grid_search.score(X_test, y_test)

In [None]:
grid_search.best_params_

In [None]:
# {'bootstrap': True, 'bootstrap_features': True} 0.81833486378237474

In [None]:
# GradientBoostingRegressor is the best model with a score of 0.83154332457718538
gbr = GradientBoostingRegressor(alpha=.99, max_depth=10, n_estimators=100, min_samples_leaf=8, min_samples_split=6)
gbr.fit(X_train, y_train)
gbr.score(X_test, y_test)

In [None]:
from sklearn.model_selection import cross_val_score
print(cross_val_score(GradientBoostingRegressor(alpha=.99, max_depth=10, n_estimators=100, min_samples_leaf=8, min_samples_split=6),X_train, y_train))

In [None]:
feature_importance_df = pd.DataFrame({"importances": gbr.feature_importances_, "feature": X_train.columns.values})

In [None]:
feature_importance_df.sort_values(by=['importances'])