# Fetch The Housing Data
    Fetch Housing Data from the web

In [None]:
import os
import tarfile
from six.moves import urllib

# Base URL under which the dataset files are hosted.
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml/master/"
# Relative dataset location: appended to DOWNLOAD_ROOT to form the URL and
# also used as the default local directory.
HOUSING_PATH = "datasets/housing"
# Full URL of the compressed housing archive.
HOUSING_URL = "".join((DOWNLOAD_ROOT, HOUSING_PATH, "/housing.tgz"))

def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    """Download the housing archive and extract it into ``housing_path``.

    Args:
        housing_url: URL of the ``housing.tgz`` archive to download.
        housing_path: Local directory that receives the downloaded archive
            and its extracted contents. It is created if it does not exist.
    """
    if not os.path.isdir(housing_path):
        os.makedirs(housing_path)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    # The context manager closes the archive even if extraction raises.
    with tarfile.open(tgz_path) as housing_tgz:
        # NOTE: extractall() trusts the member paths stored in the archive;
        # only point housing_url at a source you trust.
        housing_tgz.extractall(path=housing_path)


In [None]:
# Download and extract the dataset using the default URL and target directory.
fetch_housing_data()

# Load Housing Data
    Load CSV of housing data

In [None]:
import pandas as pd

def load_housing_data(housing_path=HOUSING_PATH):
    """Read ``housing.csv`` from ``housing_path`` and return it as a DataFrame.

    Args:
        housing_path: Directory that contains ``housing.csv``.

    Returns:
        The result of ``pandas.read_csv`` on that file.
    """
    return pd.read_csv(os.path.join(housing_path, 'housing.csv'))

In [None]:
# Load the CSV into a DataFrame and preview its first rows.
housing = load_housing_data()
housing.head()

In [None]:
# Count how many rows fall into each ocean_proximity category.
housing["ocean_proximity"].value_counts()

# Plotting The Different Attributes

In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
import matplotlib.pyplot as plt
# Draw a 50-bin histogram for each numerical attribute.
housing.hist(bins=50, figsize=(20,15))
plt.show()

# Train Test Splitting

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set; the fixed random_state makes the
# split reproducible across runs.
train_set, test_set = train_test_split(
    housing,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Explore a copy of the training set so that train_set itself stays unmodified.
housing = train_set.copy()

In [None]:
# Geographic scatter plot; the low alpha lets overlapping points show up as
# darker regions.
housing.plot(kind="scatter", x="longitude", y="latitude", alpha=0.1)

In [None]:
# Geographic scatter plot: marker size follows the population column and
# marker colour follows median_house_value.
housing.plot(
    kind="scatter",
    x="longitude",
    y="latitude",
    alpha=0.4,
    s=housing["population"] / 100,
    label="population",
    c="median_house_value",
    cmap=plt.get_cmap("jet"),
    colorbar=True,
)
plt.legend()

In [None]:
# Correlate numeric columns only: the text column ocean_proximity cannot be
# correlated, and recent pandas versions raise on it instead of skipping it.
corr_matrix = housing.select_dtypes(include=["number"]).corr()
# Rank every attribute by its correlation with the median house value.
corr_matrix["median_house_value"].sort_values(ascending=False)

# Scatter matrix
Plots every numerical attribute against every other numerical attribute

In [None]:
from pandas.plotting import scatter_matrix

# Plot only a selected subset of the attributes against each other.
attributes = [
    "median_house_value",
    "median_income",
    "total_rooms",
    "housing_median_age",
]
scatter_matrix(housing[attributes], figsize=(12, 8))

Median income clearly has the strongest correlation with the median house value, so let's plot that relationship.

In [None]:
# Scatter plot of median income against median house value.
housing.plot(kind="scatter", x="median_income", y="median_house_value", alpha=0.1)

# Adjust Attributes
For example, the total number of rooms in a district is not very useful unless you also take the number of households into account, which gives the 'rooms per household'. The same goes for the number of bedrooms, which should be compared with the total number of rooms. Population per household is also a good attribute to use.

In [None]:
# Derive per-household and per-room ratios from the raw totals.
housing["rooms_per_household"] = housing["total_rooms"].div(housing["households"])
housing["bedrooms_per_room"] = housing["total_bedrooms"].div(housing["total_rooms"])
housing["population_per_household"] = housing["population"].div(housing["households"])

Now we can check the correlation matrix with the new values

In [None]:
# Correlate numeric columns only: the text column ocean_proximity cannot be
# correlated, and recent pandas versions raise on it instead of skipping it.
corr_matrix = housing.select_dtypes(include=["number"]).corr()
# Rank every attribute, including the derived ratios, by its correlation
# with the median house value.
corr_matrix['median_house_value'].sort_values(ascending=False)

# Data Cleaning
Some values are missing in certain districts, so we calculate the median of each attribute and use it to fill in the missing data. sklearn does this with the Imputer class. The median can only be computed on numerical values, so we need a copy of the dataset without the text attribute ocean_proximity.