In [6]:
import os
import tarfile
from six.moves import urllib
import pandas as pd
import pprint
import matplotlib.pyplot as plt
import subprocess
import numpy as np

# Local project root plus the remote location of the housing archive.
BASE_DIR = '/Users/james/inote/homl_c2'
DOWNLOAD_ROOT = 'https://raw.githubusercontent.com/ageron/handson-ml/master/'
HOUSING_PATH = "datasets/housing"
# Full URL of the tarball: <root><relative dataset path>/housing.tgz
HOUSING_URL = '{}{}/housing.tgz'.format(DOWNLOAD_ROOT, HOUSING_PATH)
# Local directory where the archive is stored and unpacked.
datasets_path = '/'.join((BASE_DIR, HOUSING_PATH))


def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    """Download the housing tarball and unpack it into ``datasets_path``.

    Args:
        housing_url: URL of the ``housing.tgz`` archive to download.
        housing_path: Kept for backward compatibility only; it is not used.
            The archive is always stored and extracted under the
            module-level ``datasets_path``.

    Returns:
        None. The archive and its extracted contents are written to disk.
    """
    if not os.path.isdir(datasets_path):
        os.makedirs(datasets_path)
    tgz_path = os.path.join(datasets_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    # Use a context manager so the archive is closed even when extraction
    # raises part-way through.
    # NOTE(review): extractall() trusts the member paths stored in the
    # archive; only point housing_url at a source you trust.
    with tarfile.open(tgz_path) as housing_tgz:
        housing_tgz.extractall(path=datasets_path)


# download data
# The fetch call below is left commented out, so no download happens on this
# run. NOTE: the message that follows is printed unconditionally, whether or
# not a download actually ran.
#fetch_housing_data()
print('# download data success!')

def load_housing_data(housing_path=datasets_path):
    """Load ``housing.csv`` found in *housing_path* as a pandas DataFrame."""
    return pd.read_csv(os.path.join(housing_path, 'housing.csv'))

# head()

print('\n# head()\n')
housing = load_housing_data()
pprint.pprint(housing.head())  # look at the first 5 rows.


# info()

print('\n# info()\n')
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; passing its result to pprint only added a stray "None" line.
housing.info()  # get a quick description of the data.

# value_counts()

print('\n# ocean_proximity value_counts()\n')

pprint.pprint(housing['ocean_proximity'].value_counts())

# describe()

print('\n# describe()\n')

pprint.pprint(housing.describe())


# hist()

print('\n# hist()\n')

# One histogram per numeric column, saved to p1.png and then shown in the
# terminal through the external `catimg` command.
housing.hist(bins=50, figsize=(20, 15))
plt.savefig('p1')
subprocess.call(['catimg', '-f', 'p1.png'])


#split data

# from testset import split_train_test_by_id
# print("\nSplit data.\n")
# housing_with_id = housing.reset_index()  # adds an 'index' column
# train_set, test_set = split_train_test_by_id(housing_with_id, 0.2, "index")
# print(len(train_set), "train + ", len(test_set), "test")

# split data using scikit-learn
# from sklearn.model_selection import train_test_split
# train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)
# print(len(train_set), "train + ", len(test_set), "test")


# income_cat
# Bucket median_income into categories (ceil(income / 1.5)) and merge every
# category from 5 upwards into 5.0. The result is assigned back explicitly
# instead of using Series.where(..., inplace=True) on a column selection,
# which is chained in-place assignment and is not guaranteed to update the
# DataFrame.
income_cat = np.ceil(housing['median_income'] / 1.5)
housing['income_cat'] = income_cat.where(income_cat < 5, 5.0)

from sklearn.model_selection import StratifiedShuffleSplit

# A single stratified 80/20 split on income_cat; n_splits=1 means the
# generator yields exactly one (train, test) pair, so take it with next().
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_index, test_index = next(split.split(housing, housing['income_cat']))
strat_train_set = housing.loc[train_index]
strat_test_set = housing.loc[test_index]

# Proportion of each income category in the full data set.
pprint.pprint(housing['income_cat'].value_counts() / len(housing))


# remove income_cat
# drop() returns new frames, so nothing is modified in place on the frames
# selected out of `housing`.
strat_train_set = strat_train_set.drop(['income_cat'], axis=1)
strat_test_set = strat_test_set.drop(['income_cat'], axis=1)

# Discover and visualize the data to gain insights.
# Work on a copy of the training set so the split itself is left alone.
housing = strat_train_set.copy()

# Plain geographic scatter plot, saved as p2.png and shown via `catimg`.
housing.plot(x="longitude", y="latitude", kind="scatter", alpha=0.1)
plt.savefig('p2')
subprocess.call(['catimg', '-f', 'p2.png'])

# Second scatter plot: marker size follows population / 100 and marker
# colour follows median_house_value on the "jet" colormap.
scatter_options = dict(
    x="longitude",
    y="latitude",
    alpha=0.4,
    s=housing['population'] / 100,
    label="population",
    c="median_house_value",
    cmap=plt.get_cmap("jet"),
    colorbar=True,
)
housing.plot(kind="scatter", **scatter_options)
plt.legend()



# download data success!

# head()

   longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
0    -122.23     37.88                41.0        880.0           129.0   
1    -122.22     37.86                21.0       7099.0          1106.0   
2    -122.24     37.85                52.0       1467.0           190.0   
3    -122.25     37.85                52.0       1274.0           235.0   
4    -122.25     37.85                52.0       1627.0           280.0   

   population  households  median_income  median_house_value ocean_proximity  
0       322.0       126.0         8.3252            452600.0        NEAR BAY  
1      2401.0      1138.0         8.3014            358500.0        NEAR BAY  
2       496.0       177.0         7.2574            352100.0        NEAR BAY  
3       558.0       219.0         5.6431            341300.0        NEAR BAY  
4       565.0       259.0         3.8462            342200.0        NEAR BAY  

# info()

<class 'pandas.core.frame.Da

<matplotlib.legend.Legend at 0x11799c2e8>