In [None]:
# 입력 특성 전처리

In [None]:
# Features API

In [1]:
import os
import tarfile
import urllib.request


# Location of the housing archive and the local directory it is unpacked into.
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/rickiepark/handson-ml2/master/"
HOUSING_PATH = os.path.join("datasets", "housing")
HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"

def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    """Download the housing archive and unpack it into ``housing_path``.

    Args:
        housing_url: URL of the gzipped tarball to download.
        housing_path: Directory that receives ``housing.tgz`` and the
            extracted files; it is created if it does not exist.
    """
    os.makedirs(housing_path, exist_ok=True)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    # The context manager guarantees the archive handle is closed even when
    # extraction raises.
    with tarfile.open(tgz_path) as housing_tgz:
        # NOTE(review): extractall() writes whatever member paths the archive
        # contains, so only point this at a trusted source.
        housing_tgz.extractall(path=housing_path)

# Download and extract the dataset with the default URL and target directory.
# The download is unconditional, so every run of this cell hits the network.
fetch_housing_data()

In [2]:
import pandas as pd

def load_housing_data(housing_path=HOUSING_PATH):
    """Read ``housing.csv`` from ``housing_path`` and return it as a DataFrame."""
    return pd.read_csv(os.path.join(housing_path, "housing.csv"))

In [3]:
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load the California housing dataset through scikit-learn.
# NOTE: `housing` is rebound to a pandas DataFrame by a later cell
# (load_housing_data()), so this object is only used for the split below.
housing = fetch_california_housing()

# First hold out a test set, then split the remainder into training and
# validation sets. Targets are reshaped into a single column.
X_train_full, X_test, y_train_full, y_test = train_test_split(
    housing.data, housing.target.reshape(-1, 1), random_state=42)
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train_full, y_train_full, random_state=42)

# Fit the scaler on the training split only and keep the per-feature
# statistics for the normalizer used by the feature columns further down.
scaler = StandardScaler().fit(X_train)
X_mean, X_std = scaler.mean_, scaler.scale_

In [4]:
# Rebind `housing` to the DataFrame read from housing.csv; this replaces the
# scikit-learn dataset object that an earlier cell stored under the same name.
housing = load_housing_data()
# Preview the first rows.
housing.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [None]:
# numeric_column 할당

In [6]:
import tensorflow as tf
import numpy as np

# Numeric feature column keyed on the "housing_median_age" input feature.
housing_median_age = tf.feature_column.numeric_column("housing_median_age")

In [7]:
# Show the column's repr to inspect its configuration.
housing_median_age

NumericColumn(key='housing_median_age', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None)

In [None]:
# normalizer_fn : 정규화 함수를 지정할 수 있다

In [8]:
# Training-set mean and standard deviation of column index 1 of the feature
# matrix, which holds the median age.
age_mean, age_std = X_mean[1], X_std[1]
# Redefine the column with a normalizer that standardizes the raw value.
# The lambda reads age_mean/age_std when it is called, so rebinding those
# names later changes the normalization.
housing_median_age = tf.feature_column.numeric_column(
    "housing_median_age", normalizer_fn=lambda x: (x - age_mean) / age_std)

In [None]:
# 가끔은 수치 특성을 bucketize하여 범주화 하는게 효율을 높일 수 있다.
# 예) median_income열을 5개의 버킷으로 구분
# 1.5 ($15,000)이하, ~3.0, ~4.5, ~6, 그리고 6이상
# boundaries인자 사용

In [9]:
# Numeric source column for the "median_income" feature.
median_income = tf.feature_column.numeric_column("median_income")
# Discretize income with four boundaries, which gives five buckets.
bucketized_income = tf.feature_column.bucketized_column(
    median_income, boundaries=[1.5, 3., 4.5, 6.])
# Show the resulting column definition.
bucketized_income

BucketizedColumn(source_column=NumericColumn(key='median_income', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), boundaries=(1.5, 3.0, 4.5, 6.0))

In [None]:
# 범주형(categorical) 특성

In [None]:
# 범주를 모두 나열 가능하면 -> categorical_column_with_vocabulary_list

In [10]:
# Full list of categories expected in the "ocean_proximity" feature.
ocean_prox_vocab = ['<1H OCEAN', 'INLAND', 'ISLAND', 'NEAR BAY', 'NEAR OCEAN']
# Categorical column that maps each string to its position in the list.
ocean_proximity = tf.feature_column.categorical_column_with_vocabulary_list(
    "ocean_proximity", ocean_prox_vocab)

# Show the resulting column definition.
ocean_proximity

VocabularyListCategoricalColumn(key='ocean_proximity', vocabulary_list=('<1H OCEAN', 'INLAND', 'ISLAND', 'NEAR BAY', 'NEAR OCEAN'), dtype=tf.string, default_value=-1, num_oov_buckets=0)

In [None]:
# 범주가 너무 많으면 -> categorical_column_with_hash_bucket

In [12]:
# Just an example, it's not used later on
# Hash-bucket categorical column for a "city" feature: values are hashed into
# 1000 buckets, so no vocabulary list is needed.
city_hash = tf.feature_column.categorical_column_with_hash_bucket(
    "city", hash_bucket_size=1000)
# Show the resulting column definition.
city_hash

HashedCategoricalColumn(key='city', hash_bucket_size=1000, dtype=tf.string)

In [None]:
# 교차 범주 특성

In [13]:
# Bucketize the age column. The boundaries are in standardized units because
# housing_median_age was redefined above with a normalizer_fn.
bucketized_age = tf.feature_column.bucketized_column(
    housing_median_age, boundaries=[-1., -0.5, 0., 0.5, 1.]) # age was scaled
# Cross the age buckets with ocean proximity; each combination is hashed
# into one of 100 buckets.
age_and_ocean_proximity = tf.feature_column.crossed_column(
    [bucketized_age, ocean_proximity], hash_bucket_size=100)

In [None]:
# Another example: cross bucketized latitude and longitude into one
# location feature.
latitude = tf.feature_column.numeric_column("latitude")
longitude = tf.feature_column.numeric_column("longitude")

# 19 evenly spaced boundaries per axis, i.e. 20 buckets each.
bucketized_latitude = tf.feature_column.bucketized_column(
    latitude, boundaries=list(np.linspace(32., 42., 20 - 1)))
bucketized_longitude = tf.feature_column.bucketized_column(
    longitude, boundaries=list(np.linspace(-125., -114., 20 - 1)))
# Hash each (latitude bucket, longitude bucket) pair into 1000 buckets.
location = tf.feature_column.crossed_column(
    [bucketized_latitude, bucketized_longitude], hash_bucket_size=1000)

In [None]:
# 범주형 데이터를 원-핫 벡터로 바꾸기(중요)
# 원-핫 벡터

In [14]:
# Wrap the categorical column in an indicator column to get a one-hot
# representation of ocean_proximity.
ocean_proximity_one_hot = tf.feature_column.indicator_column(ocean_proximity)

In [27]:
# Vocabulary for the manual lookup table. The order differs from
# ocean_prox_vocab, so the indices here are not interchangeable with it.
vocab = ["<1H OCEAN", "INLAND", "NEAR OCEAN", "NEAR BAY", "ISLAND"]
# One int64 index per vocabulary entry: 0 .. len(vocab) - 1.
indices = tf.range(len(vocab), dtype=tf.int64)
table_init = tf.lookup.KeyValueTensorInitializer(vocab, indices)
# Number of extra out-of-vocabulary buckets for strings not in `vocab`.
num_oov_buckets = 2
table = tf.lookup.StaticVocabularyTable(table_init, num_oov_buckets)

In [28]:
# Sample batch of category strings; "DESERT" is not in `vocab`.
categories = tf.constant(["NEAR BAY", "DESERT", "INLAND", "ISLAND"])
# Translate each string into its integer index through the lookup table.
cat_indices = table.lookup(categories)
cat_indices

<tf.Tensor: shape=(4,), dtype=int64, numpy=array([3, 5, 1, 4])>

In [18]:
# One-hot encode the indices; the depth covers every vocabulary entry plus
# the out-of-vocabulary buckets.
cat_one_hot = tf.one_hot(cat_indices, depth=len(vocab) + num_oov_buckets)
cat_one_hot

<tf.Tensor: shape=(4, 7), dtype=float32, numpy=
array([[0., 0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0.],
       [0., 1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0.]], dtype=float32)>

In [None]:
# 임베딩
# 단어 임베딩 (대세)

In [22]:
# 각 범주의 임베딩을 담은 임베딩 행렬을 만들어 랜덤하게 초기화하자
# 이 행렬은 범주와 oov버킷마다 하나의 행이 있고 차원(여기서는 2차원)마다 열을 가짐
# 어휘 사전 크기에 따라 10~300차원도 가짐

In [20]:
# Same vocabulary as the lookup table defined earlier (restated here).
vocab = ["<1H OCEAN", "INLAND", "NEAR OCEAN", "NEAR BAY", "ISLAND"]
# Number of dimensions in each embedding vector.
embedding_dim = 2
# Random initial values: one row per vocabulary entry and per OOV bucket,
# one column per embedding dimension. No seed is set, so the values differ
# from run to run.
embed_init = tf.random.uniform([len(vocab) + num_oov_buckets, embedding_dim])
embedding_matrix = tf.Variable(embed_init)
embedding_matrix

<tf.Variable 'Variable:0' shape=(7, 2) dtype=float32, numpy=
array([[0.18662012, 0.64141905],
       [0.7665025 , 0.28692627],
       [0.08777869, 0.7601507 ],
       [0.78664875, 0.19007576],
       [0.78246367, 0.6718564 ],
       [0.84175956, 0.10111177],
       [0.38719273, 0.71831226]], dtype=float32)>

In [25]:
# Rebuild the sample batch and look up its indices again for the embedding
# examples that follow.
categories = tf.constant( ["NEAR BAY", "DESERT", "INLAND", "ISLAND"] )
cat_indices = table.lookup(categories)
cat_indices

<tf.Tensor: shape=(4,), dtype=int64, numpy=array([3, 5, 1, 4])>

In [24]:
# Select the row of embedding_matrix that corresponds to each index.
tf.nn.embedding_lookup(embedding_matrix, cat_indices)

<tf.Tensor: shape=(4, 2), dtype=float32, numpy=
array([[0.78664875, 0.19007576],
       [0.84175956, 0.10111177],
       [0.7665025 , 0.28692627],
       [0.78246367, 0.6718564 ]], dtype=float32)>

In [30]:
from tensorflow import keras

# Keras Embedding layer with one row per vocabulary entry plus one per
# out-of-vocabulary bucket; it creates its own weights, separate from
# embedding_matrix.
embedding = keras.layers.Embedding( input_dim=len(vocab)+num_oov_buckets, output_dim=embedding_dim )
embedding(cat_indices) # cat_indices is [3, 5, 1, 4] for the categories looked up above

<tf.Tensor: shape=(4, 2), dtype=float32, numpy=
array([[-1.5007865e-02, -4.5961868e-02],
       [ 1.5132453e-02, -4.7256351e-02],
       [ 3.6887992e-02, -1.4316320e-02],
       [ 5.8211386e-05,  1.2161516e-02]], dtype=float32)>

In [31]:
# 케라스 모델 만들기

# Regular input: 8 numerical features per instance.
regular_inputs = keras.layers.Input(shape=[8])

# Categorical input: one string-valued categorical feature per instance.
# NOTE: this rebinds `categories` (and `cat_indices` below) from the eager
# tensors used in earlier cells to symbolic Keras tensors.
categories = keras.layers.Input(shape=[], dtype=tf.string)

# A Lambda layer looks up each category's integer index in the lookup table.
cat_indices = keras.layers.Lambda(lambda cats: table.lookup(cats))(categories)

# Look up the embeddings for these indices. The layer size is derived from
# the vocabulary and OOV bucket count (5 + 2 = 7 rows here) and from
# embedding_dim, so it stays in sync with the lookup table.
cat_embed = keras.layers.Embedding(
    input_dim=len(vocab) + num_oov_buckets,
    output_dim=embedding_dim)(cat_indices)

# Concatenate the embeddings with the regular inputs to form the encoded
# inputs, which are ready to be fed to a neural network.
encoded_inputs = keras.layers.concatenate([regular_inputs, cat_embed])

# Any kind of network could follow here; a single dense output layer is used.
outputs = keras.layers.Dense(1)(encoded_inputs)

# Create the Keras model with two inputs and one output.
model = keras.models.Model(inputs=[regular_inputs, categories], outputs=[outputs])

In [32]:
# Print the layer-by-layer summary of the model.
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_2 (InputLayer)           [(None,)]            0           []                               
                                                                                                  
 lambda (Lambda)                (None,)              0           ['input_2[0][0]']                
                                                                                                  
 input_1 (InputLayer)           [(None, 8)]          0           []                               
                                                                                                  
 embedding_1 (Embedding)        (None, 2)            14          ['lambda[0][0]']                 
                                                                                              