# Import Libraries 

In [None]:
import os
import tarfile
import urllib.request
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Import Data

In [None]:
# Location of the housing archive and the local directory it is unpacked into.
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml2/master/"
HOUSING_PATH = os.path.join("datasets", "housing")
HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"


def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    """Download the housing archive and unpack it into ``housing_path``.

    Args:
        housing_url: URL of the ``housing.tgz`` archive to download.
        housing_path: Directory that receives both the downloaded archive
            (saved as ``housing.tgz``) and its extracted contents. It is
            created, including missing parents, if it does not exist.
    """
    # exist_ok avoids the check-then-create race of isdir() + makedirs().
    os.makedirs(housing_path, exist_ok=True)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    # The context manager closes the archive even if extraction raises.
    with tarfile.open(tgz_path) as housing_tgz:
        if hasattr(tarfile, "data_filter"):
            # The archive comes from the network: where the interpreter
            # supports extraction filters, refuse members that would escape
            # the destination directory.
            housing_tgz.extractall(path=housing_path, filter="data")
        else:
            housing_tgz.extractall(path=housing_path)

In [None]:
# Download and unpack the housing archive using the default URL and path.
fetch_housing_data()

In [None]:
def load_housing_data(housing_path=HOUSING_PATH):
    """Read ``housing.csv`` from ``housing_path`` into a pandas DataFrame.

    Args:
        housing_path: Directory containing ``housing.csv``.

    Returns:
        The DataFrame produced by ``pd.read_csv`` for that file.
    """
    return pd.read_csv(os.path.join(housing_path, "housing.csv"))

# Exploratory Data Analysis

## Quantitative Exploration

In [None]:
# Read housing.csv into a DataFrame and preview its first rows.
housing = load_housing_data()
housing.head()

In [None]:
# Summarize the DataFrame's columns and their dtypes.
housing.info()

In [None]:
# Count how many rows fall into each ocean_proximity category.
housing["ocean_proximity"].value_counts()

Since *ocean_proximity* showed up as the only column with the object Dtype, we checked whether it is a categorical variable. It is, with five distinct levels.

In [None]:
# Summary statistics for the DataFrame's columns.
housing.describe()

This is similar to *summary()* in R.

## Graphical Exploration

In [None]:
%matplotlib inline # This is required in Jupyter
housing.hist(bins = 50, figsize = (20, 15))
plt.show() # This is optional in Jupyter

In [None]:
# Plot pairwise relationships between the columns of housing.
sns.pairplot(housing)