In [69]:
import os
import tarfile
import pandas as pd
from six.moves import urllib
import numpy as np
from sklearn.model_selection import train_test_split #package for splitting up the test and training data
from sklearn.model_selection import StratifiedShuffleSplit #for stratified sampling
 

# Remote repository root, the dataset's path relative to that root, and the
# full URL of the compressed archive assembled from the two.
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml/master/"
HOUSING_PATH = "datasets/housing"
HOUSING_URL = "".join([DOWNLOAD_ROOT, HOUSING_PATH, "/housing.tgz"])


def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    """Download the housing archive and unpack it into ``housing_path``.

    Args:
        housing_url: URL of the ``housing.tgz`` archive to retrieve.
        housing_path: Local directory that receives both the downloaded
            archive and its extracted contents; created if it is missing.

    Returns:
        None. The function works purely through filesystem side effects.
    """
    if not os.path.isdir(housing_path):  # create the target directory on first use
        os.makedirs(housing_path)
    tgz_path = os.path.join(housing_path, "housing.tgz")  # where the archive is saved
    urllib.request.urlretrieve(housing_url, tgz_path)  # download the archive to disk
    # The context manager closes the archive even when extraction fails
    # part-way, so the file handle is never leaked.
    # NOTE(review): extractall() trusts the member paths stored in the archive;
    # only point housing_url at a source you trust.
    with tarfile.open(tgz_path) as housing_tgz:
        housing_tgz.extractall(path=housing_path)
def load_housing_data(housing_path=HOUSING_PATH):
    """Read ``housing.csv`` from ``housing_path`` with pandas.

    Args:
        housing_path: Directory that contains ``housing.csv``.

    Returns:
        Whatever ``pd.read_csv`` returns for that file.
    """
    return pd.read_csv(os.path.join(housing_path, "housing.csv"))

housing = load_housing_data()

# Build an income category column to stratify on: divide median income by 1.5
# and round up, which collapses the continuous values into discrete buckets.
housing["income_cat"] = np.ceil(housing["median_income"] / 1.5)
# Keep categories below 5 and replace everything else with 5.0. The result is
# assigned back to the column rather than using where(..., inplace=True) on the
# selected column: that chained in-place form is not guaranteed to update the
# parent DataFrame (it does not under pandas Copy-on-Write).
housing["income_cat"] = housing["income_cat"].where(housing["income_cat"] < 5, 5.0)

# Single stratified shuffle split (20% test) keyed on the income category, with
# a fixed random_state so the split is reproducible.
split = StratifiedShuffleSplit(n_splits=1, test_size=.2, random_state=42)
for train_index, test_index in split.split(housing, housing["income_cat"]):
    # split() yields positional row indices, so select by position (.iloc)
    # instead of by label (.loc); the two only coincide for a default index.
    strat_train_set = housing.iloc[train_index]
    strat_test_set = housing.iloc[test_index]


Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity,income_cat
17606,-121.89,37.29,38.0,1568.0,351.0,710.0,339.0,2.7042,286600.0,<1H OCEAN,2.0
18632,-121.93,37.05,14.0,679.0,108.0,306.0,113.0,6.4214,340600.0,<1H OCEAN,5.0
14650,-117.20,32.77,31.0,1952.0,471.0,936.0,462.0,2.8621,196900.0,NEAR OCEAN,2.0
3230,-119.61,36.31,25.0,1847.0,371.0,1460.0,353.0,1.8839,46300.0,INLAND,2.0
3555,-118.59,34.23,17.0,6592.0,1525.0,4459.0,1463.0,3.0347,254500.0,<1H OCEAN,3.0
19480,-120.97,37.66,24.0,2930.0,588.0,1448.0,570.0,3.5395,127900.0,INLAND,3.0
8879,-118.50,34.04,52.0,2233.0,317.0,769.0,277.0,8.3839,500001.0,<1H OCEAN,5.0
13685,-117.24,34.15,26.0,2041.0,293.0,936.0,375.0,6.0000,140200.0,INLAND,4.0
4937,-118.26,33.99,47.0,1865.0,465.0,1916.0,438.0,1.8242,95000.0,<1H OCEAN,2.0
4861,-118.28,34.02,29.0,515.0,229.0,2690.0,217.0,0.4999,500001.0,<1H OCEAN,1.0
