In [1]:
import os
import tarfile
import pandas as pd
from pandas.plotting import scatter_matrix
from six.moves import urllib
import numpy as np
from sklearn.model_selection import train_test_split #package for splitting up the test and training data
from sklearn.model_selection import StratifiedShuffleSplit #for stratified sampling
import matplotlib.pyplot as plt 
from sklearn.preprocessing import Imputer
# from sklearn.preprocessing import LabelEncoder
# from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelBinarizer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.externals import joblib
from sklearn.model_selection import GridSearchCV
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml/master/"
HOUSING_PATH = "datasets/housing"
HOUSING_URL = DOWNLOAD_ROOT + HOUSING_PATH + "/housing.tgz"


# loads the housing data from github
def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    if not os.path.isdir(housing_path): #checks to see if dir exists
        os.makedirs(housing_path)
    tgz_path = os.path.join(housing_path, "housing.tgz") #path for zip file
    urllib.request.urlretrieve(housing_url, tgz_path) #requests zip file and stores it in location
    housing_tgz = tarfile.open(tgz_path) #opens the file
    housing_tgz.extractall(path=housing_path) #extracts the file
    housing_tgz.close() #closes file
# reads csv file for data
def load_housing_data(housing_path=HOUSING_PATH):
    csv_path = os.path.join(housing_path, "housing.csv") #creates link to path of file
    return pd.read_csv(csv_path) #returns file that was previously downloaded

housing = load_housing_data() 
#median income is important to determine average house price
#median income data is reduced in order to stratify the data
housing["income_cat"] = np.ceil(housing["median_income"] / 1.5) # divides values by 1.5 and rounds up
housing["income_cat"].where(housing["income_cat"] < 5, 5.0, inplace=True) #sets all values that are greater than 5 to 5

#splits data into test and training sets using stratified method
split = StratifiedShuffleSplit(n_splits=1, test_size=.2, random_state=42)
for train_index, test_index in split.split(housing, housing["income_cat"]):
    strat_train_set = housing.loc[train_index]
    strat_test_set = housing.loc[test_index]

#drops the income_cat used for indexing and sorting
for set in (strat_train_set, strat_test_set):
    set.drop(["income_cat"], axis=1, inplace=True)

housing = strat_train_set.copy()
# housing["rooms_per_household"] = housing["total_rooms"]/housing["households"]#number of rooms per households
# housing["bedrooms_per_room"] = housing["total_bedrooms"]/housing["total_rooms"] #number of bedrooms per household
# housing["population_per_household"] = housing["population"]/housing["households"]#average population for each household

# corr_matrix = housing.corr()#gets standard coorelation cooefficent for housing data
# corr_matrix["median_house_value"].sort_values(ascending=False)#how much each variable coorelates with median_house_val

housing = strat_train_set.drop("median_house_value", axis=1) #drops the median_house_value for training
housing_labels = strat_train_set["median_house_value"].copy()#value that the predictors try to predict(labels)

# imputer = Imputer(strategy="median") #new importer object that is meant to find median values in order to replace null vals
housing_num = housing.drop("ocean_proximity",  axis=1) #imputer doesn't work with non-numerical vals
# imputer.fit(housing_num) #apply the imputer onto the dataset
# X = imputer.transform(housing_num) #transforms the housing_num set
# housing_tr = pd.DataFrame(X, columns=housing_num.columns) #transforms numpy array into pandas dataframe

# encoder = LabelBinarizer(sparse_output=True) #label encoder and onehot encoder all in one, that outputs sparse matrix
# housing_cat_1hot = encoder.fit_transform(housing_cat) #applies the transform

rooms_ix, bedrooms_ix, population_ix, household_ix = 3, 4, 5, 6 #index of rows
class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    def __init__(self, add_bedrooms_per_room = True): # no *args or **kargs
        self.add_bedrooms_per_room = add_bedrooms_per_room
    def fit(self, X, y=None):
        return self  # nothing else to do
    def transform(self, X, y=None):
        rooms_per_household = X[:, rooms_ix] / X[:, household_ix]
        population_per_household = X[:, population_ix] / X[:, household_ix]
        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
            return np.c_[X, rooms_per_household, population_per_household,
                         bedrooms_per_room]
        else:
            return np.c_[X, rooms_per_household, population_per_household]

class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.attribute_names].values
        
# attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=False)
# housing_extra_attribs = attr_adder.transform(housing.values)


###DATA PREPERATION BLOCK ###
num_attribs = list(housing_num) 
cat_attribs = ["ocean_proximity"]

#pipelines for different numbers
num_pipeline = Pipeline([
    ('selector', DataFrameSelector(num_attribs)), #selects number values
    ('imputer', Imputer(strategy="median")), #replaces null vals with computed median
    ('attribs_adder', CombinedAttributesAdder()), #adds new values generated from old e.g. bedrooms_per_house
    ('std_scaler', StandardScaler()), #scales data to fit 
])
#cat pipeline
cat_pipeline = Pipeline([
    ('selector', DataFrameSelector(cat_attribs)), #selects data with text vals
    ('label_binalizer', LabelBinarizer()), #onehot encoder for data
])
#runs number pipline and cat pipeline simoultaniously and concactinates the resulting values
full_pipeline = FeatureUnion(transformer_list=[
    ("num_pipeline", num_pipeline),
    ("cat_pipeline", cat_pipeline),
])
housing_prepared = full_pipeline.fit_transform(housing)


###HYPERPARAMETER TWEAKING###


###PREDICTION BLOCK###

def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard Deviation", scores.std())

# forest_reg = RandomForestRegressor() #type of model
# forest_reg.fit(housing_prepared, housing_labels) #fits value to model
# scores = cross_val_score(forest_reg, housing_prepared, housing_labels,
#                         scoring="neg_mean_squared_error", cv=10) #runs predictions on 10 datasets
# rmse_scores = np.sqrt(-scores)
# display_scores(rmse_scores)

#dump current model to be loaded later
# joblib.dump(my_model, "my_model.pkl")


param_grid = [
    # try 12 (3×4) combinations of hyperparameters
    {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
    # then try 6 (2×3) combinations with bootstrap set as False
    {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},
  ]

forest_reg = RandomForestRegressor(random_state=42)
# train across 5 folds, that's a total of (12+6)*5=90 rounds of training 
grid_search = GridSearchCV(forest_reg, param_grid, cv=5,
                           scoring='neg_mean_squared_error')
grid_search.fit(housing_prepared, housing_labels)

final_model = grid_search.best_estimator_
X_test = strat_test_set.drop("median_house_value", axis=1)
y_test = strat_test_set["median_house_value"].copy()

X_test_prepared = full_pipeline.transform(X_test)

final_predictions = final_model.predict(X_test_prepared)

final_mse = mean_squared_error(y_test, final_predictions)
final_rmse = np.sqrt(final_mse)
final_rmse

TypeError: fit_transform() takes 2 positional arguments but 3 were given

{'max_features': 8, 'n_estimators': 30}