<a href="https://colab.research.google.com/github/khlose/homl_ageron/blob/master/homl_cp2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
'''
DATA LOADING AND SPLITTING PRACTICE
'''


import os
import tarfile
from six.moves import urllib
import pandas as pd

DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml/master/"
HOUSING_PATH = os.path.join("datasets","housing")
HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"

def fetch_housing_data(housing_url=HOUSING_URL,housing_path=HOUSING_PATH):
  if not os.path.isdir(housing_path):
    os.makedirs(housing_path)
  
  tgz_path = os.path.join(housing_path,"housing.tgz")
  
  urllib.request.urlretrieve(housing_url,tgz_path)
  housing_tgz = tarfile.open(tgz_path)
  housing_tgz.extractall(path=housing_path)
  housing_tgz.close()
  
  
  

def load_housing_data(housing_path=HOUSING_PATH):
  csv_path = os.path.join(housing_path,"housing.csv")
  return pd.read_csv(csv_path)




fetch_housing_data()
housing = load_housing_data()


#housing.head()

#housing["ocean_proximity"].value_counts()
#housing.describe()

import matplotlib.pyplot as plt
#housing.hist(bins=50,figsize=(20,15))


import numpy as np
from zlib import crc32

def test_set_check(identifier, test_ratio):
  return crc32(np.int64(identifier)) & 0xffffffff < test_ratio * 2**32

def split_train_test(data,test_ratio,id_column):
  
  ids = data[id_column]
  in_test_set = ids.apply(lambda id_:test_set_check(id_,test_ratio))
  
  return data.loc[~in_test_set], data.loc[in_test_set]





#housing_with_id = housing.reset_index()
#housing_with_id["id"] = housing["longitude"] * 1000 + housing["latitude"]
#train,test = split_train_test(housing_with_id,0.2,"index")

from sklearn.model_selection import train_test_split

train_set,test_set = train_test_split(housing,test_size=0.2,random_state=42)



housing["income_cat"] = np.ceil(housing["median_income"]/1.5)
housing["income_cat"].where(housing["income_cat"] < 5, 5.0,inplace=True)

from sklearn.model_selection import StratifiedShuffleSplit

split = StratifiedShuffleSplit(n_splits=1, test_size = 0.2, random_state=42)

for train_index,test_index in split.split(housing,housing["income_cat"]):
  strat_train_data = housing.loc[train_index]
  strat_test_data = housing.loc[test_index]

  
print(strat_train_data.count())  


#basically loop twice, first loop is start_train_data loop
#and second is strat_test_data loop
for set_ in (strat_train_data,strat_test_data):
  #print(set_.count())
  
  set_.drop("income_cat",axis=1,inplace=True)

In [0]:
'''
Data extraction practice
'''

#separating numerical and non-numerical data

def separate_numerical(data):
  cloned = data.copy()
  
  object_list = list(data.select_dtypes(include=['object']).columns)
  numerical = cloned.drop(columns=object_list,axis=1)
  
  nonnumer = cloned[object_list]
  
  return numerical,nonnumer


In [0]:

'''
VISUALIZATION PRACTICE
'''

housing = strat_train_data.copy()

corr_matrix = housing.corr()

#corr_matrix["median_house_value"].sort_values()

from pandas.plotting import scatter_matrix

#attributes = ["median_house_value","median_income","total_rooms","housing_median_age"]

#scatter_matrix(housing[attributes])

#housing.plot(kind="scatter",x="median_income",y="median_house_value",alpha=0.1)

In [0]:
'''
CORRELATION MATRIX PRACTICE
'''

housing["room_per_household"] = housing["total_rooms"] / housing["households"]

housing["bedrooms_per_room"] = housing["total_bedrooms"]/housing["total_rooms"]

housing["populations_per_household"] = housing["population"]/housing["households"]

#corr_matrix = housing.corr()
#corr_matrix["median_house_value"].sort_values()

In [0]:

'''
Imputer and Encoder practices
'''
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

def imputeData(data):
  
  ret_data = data.copy()
  
  object_list = list(data.select_dtypes(include=['object']).columns)
  ret_data.drop(columns=object_list,axis=1,inplace=True)

  imputer = SimpleImputer(strategy="median")
  imputer.fit(ret_data)
  
  X = imputer.transform(ret_data)

  ret_data = pd.DataFrame(X,columns=ret_data.columns)
  
  #attach back the column with object
  
  wo_object =ret_data.copy()
  
  for column in object_list:
    ret_data[column] = data[column]
    
  #return dataframe without object column and a frame WITH object column
  return ret_data



from sklearn.preprocessing import OneHotEncoder


#this will one-hot encode Panda Dataframe
def encodeProx(data):
  
  ret_data = data.copy()
  cat_encoder = OneHotEncoder(categories='auto')
  #fill missing categorical value with just missing
  
  
  object_list = list(ret_data.select_dtypes(include=['object']).columns)
  '''for col in object_list:
    ret_data[col].fillna('Missing',inplace=True)  
    '''
  #print(pd.isnull(ret_data).sum())
  onehot = pd.get_dummies(ret_data,dummy_na=True)
  #print(ret_data["ocean_proximity"].values)
 
  return onehot

In [0]:
'''
Transformer class inheritance practice

'''
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder



rooms_ix,bedrooms_ix,population_ix,households_ix = 3,4,5,6

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
  
  def __init__(self, add_bedrooms_per_room = True):
    self.add_bedrooms_per_room = add_bedrooms_per_room
    
  def fit(self,X,y=None):
    return self
  
  def transform(self,X,y=None):
    rooms_per_household = X[:,rooms_ix] / X[:,households_ix]
    population_per_household = X[:,population_ix]/X[:,households_ix]
    if(self.add_bedrooms_per_room):
      bedrooms_per_room  = X[:,bedrooms_ix] / X[:,rooms_ix]
      return np.c_[X,rooms_per_household,population_per_household,bedrooms_per_room]
    else:
      return np.c_[X,rooms_per_household,population_per_household]
    
    
class DataFrameSelector(BaseEstimator,TransformerMixin):
  def __init__(self,attribute_names):
    self.attribute_names = attribute_names
  def fit(self,X,y=None):
    return self
  def transform(self,X):
    return X[self.attribute_names].values
    

class CategoricalOnehotEncoder(BaseEstimator, TransformerMixin):
  
  def __init__(self,encoding="onehot"):
    self.encoding = encoding
  def fit(self,X,y=None):
    return self
  def transform(self,X,y=None):
    x_cat_enc,x_cat_arr = X.factorize()
    encoder = OneHotEncoder()
    x_1hot = encoder.fit_transform(x_cat_enc.reshape(-1,1))
    return x_1hot
    
attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=False)
housing_extra_attribs = attr_adder.transform(housing.values)

    
    

In [23]:
'''
Pipelining practice
'''

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

#pipeline
#estimator at all but last must be transfomer
#ie. they must have fit_transform

#this is numerical pipeline => input data must be numerical data
#this pipeline does not have dataframe selector, we make use of the
#separate_numerical function feed in numerical data
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    ('attribs_augment',CombinedAttributesAdder()),
    #feature scaling transformer using stdev
    ('std_scaler',StandardScaler())
])

housing_num,housing_non_num = separate_numerical(housing)
housing_num_tr = num_pipeline.fit_transform(housing_num)


cat_pipeline = Pipeline([
    #we dont need to use selector, another function has done it
    #('selector',DataFrameSelector),
    ('encoder', CategoricalOnehotEncoder(encoding='onehot'))
])

housing_cat_tr = cat_pipeline.fit_transform(housing_non_num)




AttributeError: ignored

In [0]:
'''
Scrap page
'''


#reset
housing = strat_train_data.copy()

housing_pred = strat_train_data.drop("median_house_value",axis=1)
housing_label = strat_train_data["median_house_value"].copy()



housing_tr_reg = imputeData(housing_pred)

onehotted = encodeProx(housing_tr_reg)
