In [11]:
import os
import tarfile
import pandas as pd
from pandas.plotting import scatter_matrix
from six.moves import urllib
import numpy as np
from sklearn.model_selection import train_test_split #package for splitting up the test and training data
from sklearn.model_selection import StratifiedShuffleSplit #for stratified sampling
import matplotlib.pyplot as plt 
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml/master/"
HOUSING_PATH = "datasets/housing"
HOUSING_URL = DOWNLOAD_ROOT + HOUSING_PATH + "/housing.tgz"


def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    """Download the housing archive and unpack it into ``housing_path``.

    Args:
        housing_url: URL of the ``housing.tgz`` archive to download.
        housing_path: Directory that receives both the downloaded archive
            and its extracted contents; it is created if missing.
    """
    # exist_ok avoids the race between a separate isdir() check and the
    # directory creation.
    os.makedirs(housing_path, exist_ok=True)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    # The context manager closes the archive even when extraction raises.
    with tarfile.open(tgz_path) as housing_tgz:
        # NOTE(review): extractall() writes whatever member paths the archive
        # contains; only point housing_url at a source you trust.
        housing_tgz.extractall(path=housing_path)
def load_housing_data(housing_path=HOUSING_PATH):
    """Read ``housing.csv`` from ``housing_path`` and return it as a DataFrame."""
    return pd.read_csv(os.path.join(housing_path, "housing.csv"))

housing = load_housing_data()

# Bucket median income into strata for the split: divide by 1.5, round up,
# and collapse every category of 5 or above into category 5.
income_cat = np.ceil(housing["median_income"] / 1.5)
# The capped series is assigned back explicitly. Calling
# where(..., inplace=True) on a column selection is chained assignment, which
# pandas warns about and which leaves the frame unchanged under copy-on-write.
housing["income_cat"] = income_cat.where(income_cat < 5, 5.0)

# Stratified 80/20 train/test split on the income category. n_splits=1 yields
# exactly one (train, test) pair, so it is taken directly with next().
split = StratifiedShuffleSplit(n_splits=1, test_size=.2, random_state=42)
train_index, test_index = next(split.split(housing, housing["income_cat"]))

# income_cat was only needed for stratification, so it is dropped from both
# sets. Reassigning (instead of an in-place drop inside a loop) avoids mutating
# slices of `housing` and no longer shadows the builtin `set`.
strat_train_set = housing.loc[train_index].drop("income_cat", axis=1)
strat_test_set = housing.loc[test_index].drop("income_cat", axis=1)

# Work on a copy so exploration does not modify the training set.
housing = strat_train_set.copy()
# housing.plot(kind="scatter", x="longitude", y="latitude", alpha=.1,
#             s=housing["population"]/100,
#             label="population",
#             c="median_house_value", 
#             cmap=plt.get_cmap("jet"), colorbar=True) #scatterplot of lon and lat vals of the houses,
# plt.legend()

# attributes = ["median_house_value", "median_income", "total_rooms", "housing_median_age"] #main attr for comparing
# scatter_matrix(housing[attributes], figsize=(12, 8)) #scatterplot of main attr
# housing.plot(kind="scatter", x="median_income", y="median_house_value", alpha=.1) #see correlation between median_income and house val

# Derived ratio features for exploring correlations.
housing["rooms_per_household"] = housing["total_rooms"] / housing["households"]  # rooms per household
housing["bedrooms_per_room"] = housing["total_bedrooms"] / housing["total_rooms"]  # bedrooms per room
housing["population_per_household"] = housing["population"] / housing["households"]  # people per household

# Standard correlation coefficients between the numeric columns. The frame
# still holds the text column "ocean_proximity", so the computation is
# restricted to numeric columns explicitly; recent pandas versions raise on
# non-numeric data instead of silently skipping it.
corr_matrix = housing.select_dtypes(include=[np.number]).corr()
# How strongly each attribute correlates with the median house value.
corr_matrix["median_house_value"].sort_values(ascending=False)

# Separate the predictors from the labels (median_house_value).
housing = strat_train_set.drop("median_house_value", axis=1)
housing_labels = strat_train_set["median_house_value"].copy()

# Missing values are filled with each column's median. The alternative of
# dropping incomplete districts would be
# housing.dropna(subset=["total_bedrooms"]).
# NOTE(review): newer scikit-learn releases replace Imputer with
# sklearn.impute.SimpleImputer; migrating also requires changing the import
# at the top of the file -- confirm against the installed version.
imputer = Imputer(strategy="median")
housing_num = housing.drop("ocean_proximity", axis=1)  # the imputer only handles numeric columns
imputer.fit(housing_num)  # learn the per-column medians
X = imputer.transform(housing_num)  # plain NumPy array with the gaps filled
# Rebuild a DataFrame from the array. The original row labels are passed
# through because the training set keeps its shuffled index from the split;
# without them housing_tr would get a fresh 0..n-1 index and no longer align
# with housing or housing_labels.
housing_tr = pd.DataFrame(X, columns=housing_num.columns, index=housing_num.index)

# The only text attribute in the predictors.
housing_cat = housing["ocean_proximity"]
# Step 1: map every category string to an integer code.
housing_cat_encoded = LabelEncoder().fit_transform(housing_cat)
# Step 2: turn the integer codes into one-hot vectors. The encoder expects a
# 2-D input, so the codes are passed as a single column.
encoder = OneHotEncoder()
housing_cat_1hot = encoder.fit_transform(housing_cat_encoded[:, np.newaxis])



<16512x5 sparse matrix of type '<class 'numpy.float64'>'
	with 16512 stored elements in Compressed Sparse Row format>