# ENGSCI205 - Machine Intelligence

This notebook assumes that you have downloaded the Python script https://api.bitbucket.org/2.0/repositories/dsengapp/dslab/src/term122x/dslab.py into the same folder as this notebook.

In [2]:
from dslab import distribution

# Get the data (MI03)

## II.1 List the data you need and how much you need.

### California House Pricing (MI03)

Adapted from Geron (2017):

https://github.com/ageron/handson-ml/blob/master/02_end_to_end_machine_learning_project.ipynb

## II.2 Find and document where you can get that data 

In [3]:
# Central notebook configuration: where the housing archive lives remotely.
config = {
    'data_repository': "https://raw.githubusercontent.com/ageron/handson-ml/master",
    'data_path': "/datasets/housing/housing.tgz",
}
# The full download URL is the repository root followed by the archive path.
config['download_url'] = "".join((config['data_repository'], config['data_path']))
config

{'data_repository': 'https://raw.githubusercontent.com/ageron/handson-ml/master',
 'data_path': '/datasets/housing/housing.tgz',
 'download_url': 'https://raw.githubusercontent.com/ageron/handson-ml/master/datasets/housing/housing.tgz'}

## II.7 Get the data

In [4]:
import os
from six.moves import urllib

config['reference_path'] = "../../data/reference/datasets/housing"

def fetch_housing_data(housing_url=config['download_url'],
                       housing_path=config['reference_path']):
    """Download the housing archive into the local reference folder.

    Parameters
    ----------
    housing_url : str
        URL of the archive to download.
    housing_path : str
        Local folder that receives the download; created if missing.

    Returns
    -------
    str
        Path of the downloaded file, ``<housing_path>/housing.tgz``.

    Notes
    -----
    The defaults are read from ``config`` when this cell is executed, so the
    cell must be re-run after changing ``config``. Every call downloads the
    file again and overwrites any existing copy.
    """
    # exist_ok=True replaces the separate isdir() check and also covers the
    # case where the folder appears between the check and the creation.
    os.makedirs(housing_path, exist_ok=True)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    return tgz_path

In [5]:
config['local_reference'] = fetch_housing_data()
config['local_reference']

'../../data/reference/datasets/housing\\housing.tgz'

## II.8 Convert the data to a format you can easily manipulate

In [None]:
import tarfile
def convert_housing_data(tgz_path=config['local_reference'],
                         housing_path=config['reference_path']):
    """Unpack the downloaded housing archive.

    Parameters
    ----------
    tgz_path : str
        Path of the tar archive to unpack.
    housing_path : str
        Folder into which all archive members are extracted.

    Notes
    -----
    The defaults are read from ``config`` when this cell is executed.
    """
    # The context manager closes the archive even if extraction fails.
    with tarfile.open(tgz_path) as housing_tgz:
        # NOTE(review): extractall() trusts the member paths stored in the
        # archive. The file comes from a remote server, so consider passing
        # filter="data" on Python versions that support extraction filters.
        housing_tgz.extractall(path=housing_path)

In [None]:
convert_housing_data()
reference_files = os.listdir(config['reference_path'])
reference_files

In [None]:
import fnmatch
csv_files = fnmatch.filter(reference_files, '*.csv')
csv_files

In [None]:
config['data_filename'] = csv_files[0]
config['data_filename']

## II.9 Check the size and type of data (time series, sample, geographical, etc.).

In [None]:
import pandas as pd
def load_housing_data(housing_path=config['reference_path'],
                      filename=config['data_filename']):
    """Read one CSV file from the housing folder into a DataFrame.

    Parameters
    ----------
    housing_path : str
        Folder that contains the CSV file.
    filename : str
        Name of the CSV file inside ``housing_path``.

    Returns
    -------
    pandas.DataFrame
        The parsed file contents, using the ``pd.read_csv`` defaults.
    """
    return pd.read_csv(os.path.join(housing_path, filename))

In [None]:
housing = load_housing_data()
housing.info()

In [None]:
import numpy as np
# Share of rows whose median house value is at or above 500000.
(housing['median_house_value'] >= 500000).mean()

In [None]:
housing['ocean_proximity'].value_counts()

In [None]:
housing.describe()

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
_ = housing.hist(bins=50, figsize=(20,15))

## II.11 Sample a test set, put it aside, and never look at it

In [None]:
import numpy as np
# Bucket the target into strata of width 100000 (1.0, 2.0, ...).
housing["value_cat"] = np.ceil(housing["median_house_value"] / 100000)
# Merge every bucket from 5 upward into 5.0. The result is assigned back
# explicitly: calling .where(..., inplace=True) on housing["value_cat"] is a
# chained in-place update, which warns on recent pandas versions and leaves
# `housing` unchanged when copy-on-write is active.
housing["value_cat"] = housing["value_cat"].where(housing["value_cat"] < 5, 5.0)

In [None]:
_ = sns.histplot(housing["value_cat"],kde=False)

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit

In [None]:
# One stratified 80/20 split on value_cat, reproducible via the fixed seed.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
# n_splits=1, so the generator yields exactly one (train, test) pair.
train_index, test_index = next(split.split(housing, housing["value_cat"]))
# split() returns positional row numbers, so select with .iloc. Using .loc
# only gives the same rows while the index is the default 0..n-1 range.
strat_train_set = housing.iloc[train_index]
strat_test_set = housing.iloc[test_index]

In [None]:
def check_sample(df):
    """Return the share of rows of ``df`` that fall into each ``value_cat``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``value_cat`` column.

    Returns
    -------
    pandas.Series
        Indexed by ``value_cat`` value; each entry is that value's count
        divided by ``len(df)``.
    """
    return df["value_cat"].value_counts() / len(df)

In [None]:
# Compare the value_cat proportions of the full data and both samples side by
# side. The frames are passed explicitly rather than looked up by name through
# globals(), so a renamed variable fails visibly instead of at lookup time.
pd.DataFrame({
    "housing": check_sample(housing),
    "strat_train_set": check_sample(strat_train_set),
    "strat_test_set": check_sample(strat_test_set),
}).sort_index()

Remove the auxiliary `value_cat` column from both samples; it was only needed for stratification.

In [None]:
# Rebind each sample to a copy without the helper column instead of dropping
# in place. errors="ignore" keeps the cell re-runnable: a second execution is
# a no-op rather than a KeyError.
strat_train_set = strat_train_set.drop(columns=["value_cat"], errors="ignore")
strat_test_set = strat_test_set.drop(columns=["value_cat"], errors="ignore")

In [None]:
def save_housing_data(project_path="..",
                      train_set=strat_train_set,
                      test_set=strat_test_set):
    """Write the train and test samples as CSV files under ``<project_path>/data``.

    Parameters
    ----------
    project_path : str
        Project root; the files go into its ``data`` sub-folder, which is
        created if missing.
    train_set, test_set : pandas.DataFrame
        Frames to save as ``train_set.csv`` and ``test_set.csv``.

    Returns
    -------
    list of str
        Paths of the written files, train set first.

    Notes
    -----
    The default frames are bound when this cell is executed. Re-run the cell
    after re-creating ``strat_train_set`` / ``strat_test_set``, or pass the
    frames explicitly.
    """
    housing_path = os.path.join(project_path, 'data')
    os.makedirs(housing_path, exist_ok=True)

    # Explicit name -> frame mapping instead of fetching the parameters by
    # name through locals().
    data_sets = {"train_set": train_set, "test_set": test_set}

    filepaths = []
    for name, data_set in data_sets.items():
        csv_path = os.path.join(housing_path, "{}.csv".format(name))
        data_set.to_csv(csv_path)
        filepaths.append(csv_path)
    return filepaths

In [None]:
save_housing_data()

In [None]:
housing.head()

# III. Explore the data
## III.1 Create a Jupyter notebook

In [None]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

import dslab

%matplotlib inline
%load_ext autoreload
%autoreload 2


## III.2 Study each attribute

In [None]:
housing = pd.read_csv('../data/train_set.csv', index_col=0)
housing.info()

In [None]:
housing.head()

In [None]:
housing.ocean_proximity.value_counts().plot(kind='bar')

In [None]:
_ = distribution(housing.longitude)
# savefig does not create missing folders, so make sure ../figs exists first.
os.makedirs('../figs', exist_ok=True)
plt.savefig('../figs/CaH_longitude.pdf', bbox_inches='tight')

In [None]:
_ = distribution(housing.latitude)
# savefig does not create missing folders, so make sure ../figs exists first.
os.makedirs('../figs', exist_ok=True)
plt.savefig('../figs/CaH_latitude.pdf', bbox_inches='tight')

In [None]:
_ = distribution(housing.housing_median_age)

In [None]:
_ = distribution(housing.total_rooms)

In [None]:
_ = distribution(housing.total_bedrooms)

In [None]:
_ = distribution(housing.population)

In [None]:
_ = distribution(housing.households)

In [None]:
_ = distribution(housing.median_income)

In [None]:
_ = distribution(housing.median_house_value)

## III.3 For supervised learning tasks, identify the target attribute(s)

In [None]:
_ = distribution(housing.median_house_value)

## III.4 Visualize the data

In [None]:
housing.plot(kind="scatter", x="longitude", y="latitude", alpha=0.1)

In [None]:
# Scatter plot of the rows by longitude/latitude:
#   - marker size is the row's population divided by 100,
#   - marker colour encodes median_house_value on the "jet" colour map,
#   - alpha=0.4 keeps overlapping markers distinguishable.
housing.plot(kind="scatter", x="longitude", y="latitude", alpha=0.4,
             s=housing["population"]/100, label="population",
             c="median_house_value", cmap=plt.get_cmap("jet"), 
             colorbar=True)
# Show the legend entry created by label="population".
plt.legend()

## III.5 Study the correlations between attributes.

In [None]:
# compute the standard correlation coefficient (also called Pearson's r).
# Only numeric columns are used: housing still contains the categorical
# ocean_proximity column, and DataFrame.corr() raises on non-numeric columns
# in pandas >= 2.0 instead of silently dropping them.
corr_matrix = housing.select_dtypes(include="number").corr()

In [None]:
# look at how much each attribute correlates with the median house value
corr_matrix["median_house_value"].sort_values(ascending=False)

In [None]:
sns.pairplot(housing[["median_house_value", "median_income", 
                      "total_rooms", "housing_median_age","ocean_proximity"]],
            hue="ocean_proximity",
            plot_kws= { "alpha":0.4})

In [None]:
# Temporary boolean flag: True for rows whose median_house_value equals the
# column maximum. It is used only as the hue of the pair plot below and is
# deleted again in the next cell.
housing["max_value"] = housing["median_house_value"] == housing["median_house_value"].max()
# Rectangular grid of scatter plots: x_vars along the columns, y_vars along
# the rows, coloured by the max_value flag.
sns.pairplot(housing,
             x_vars=["median_house_value", "median_income", "total_rooms", "housing_median_age"],
             y_vars=["median_house_value", "households", "total_bedrooms", "population"],
            hue="max_value", plot_kws= { "alpha":0.4})

In [None]:
del housing["max_value"]

## III.6 Engineer features

In [None]:
housing["rooms_per_household"] = housing["total_rooms"]/housing["households"]
housing["bedrooms_per_room"] = housing["total_bedrooms"]/housing["total_rooms"]
housing["population_per_household"]=housing["population"]/housing["households"]

In [None]:
# Recompute the correlations now that the ratio features exist. Only numeric
# columns are used, because DataFrame.corr() raises on the categorical
# ocean_proximity column in pandas >= 2.0.
corr_matrix = housing.select_dtypes(include="number").corr()
corr_matrix["median_house_value"].sort_values(ascending=False)

In [None]:
_ = distribution(housing["population_per_household"])

In [None]:
housing["log_population_per_household"] = np.log(housing["population_per_household"])

In [None]:
_ = distribution(housing["log_population_per_household"])

In [None]:
# Recompute the correlations including the log-transformed feature. Only
# numeric columns are used, because DataFrame.corr() raises on the
# categorical ocean_proximity column in pandas >= 2.0.
corr_matrix = housing.select_dtypes(include="number").corr()
corr_matrix["median_house_value"].sort_values(ascending=False)