In [7]:
from utils import *

import os
import tarfile
import urllib
import urllib.request

import pandas as pd

DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml2/master/"
HOUSING_PATH = os.path.join("datasets", "housing")
HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"

%matplotlib inline
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


This notebook works through another part of the book's preprocessing material that I found unclear. Let's go through it step by step.

## get modified housing set

In [8]:
def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    """Download the housing archive and unpack it into ``housing_path``.

    Creates ``housing_path`` if it does not exist, saves the archive there as
    ``housing.tgz`` (replacing any earlier download) and extracts its contents
    into the same directory.

    Args:
        housing_url: URL of the ``.tgz`` archive to download.
        housing_path: Directory that receives both the archive and the
            extracted files.

    Raises:
        urllib.error.URLError: May be raised when the download fails.
        tarfile.TarError: May be raised when the downloaded file cannot be
            read as an archive or a member is rejected by the filter.
    """
    os.makedirs(housing_path, exist_ok=True)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    # The context manager closes the archive even when extraction raises.
    with tarfile.open(tgz_path) as housing_tgz:
        # The archive comes from the network: where the interpreter supports
        # extraction filters, refuse members that would escape housing_path.
        if hasattr(tarfile, "data_filter"):
            housing_tgz.extractall(path=housing_path, filter="data")
        else:
            housing_tgz.extractall(path=housing_path)
    
def load_housing_data(housing_path=HOUSING_PATH):
    """Read ``housing.csv`` from ``housing_path`` and return it as a DataFrame."""
    housing_csv = os.path.join(housing_path, "housing.csv")
    housing_frame = pd.read_csv(housing_csv)
    return housing_frame

In [6]:
# Download housing.tgz from HOUSING_URL and unpack it under HOUSING_PATH.
fetch_housing_data()

In [9]:
# Read housing.csv from HOUSING_PATH into a DataFrame.
housing = load_housing_data()

In [10]:
# Peek at the first rows to see which columns were loaded.
housing.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [11]:
# (number of rows, number of columns) of the loaded frame.
housing.shape

(20640, 10)

## one-hot encoding (book example)

In [12]:
# Vocabulary of ocean_proximity categories; a category's position in this list
# is the id that the lookup table built from it assigns.
ocean_prox_vocab = [
    '<1H OCEAN',
    'INLAND',
    'ISLAND',
    'NEAR BAY',
    'NEAR OCEAN',
]

In [14]:
# Number of extra ids reserved for strings that are not in the vocabulary.
num_oov_buckets = 2
# One int64 id per vocabulary entry: 0 .. len(ocean_prox_vocab) - 1.
indices = tf.range(len(ocean_prox_vocab), dtype=tf.int64)
table_init = tf.lookup.KeyValueTensorInitializer(
    keys=ocean_prox_vocab,
    values=indices,
)
table = tf.lookup.StaticVocabularyTable(
    initializer=table_init,
    num_oov_buckets=num_oov_buckets,
)

In [15]:
# "DESERT" is not in ocean_prox_vocab, so it exercises the out-of-vocabulary path.
categories = tf.constant([
    "NEAR BAY",
    "DESERT",
    "INLAND",
    "INLAND",
])

In [16]:
# Map each category string to its integer id; strings that are not in the
# vocabulary are assigned to one of the OOV buckets.
cat_indices = table.lookup(categories)

In [17]:
# Show the integer ids produced by the lookup.
cat_indices

<tf.Tensor: id=19, shape=(4,), dtype=int64, numpy=array([3, 5, 1, 1])>

In [19]:
# One column per possible id: the vocabulary entries plus the OOV buckets.
cat_one_hot = tf.one_hot(
    indices=cat_indices,
    depth=len(ocean_prox_vocab) + num_oov_buckets,
)

In [20]:
# Show the one-hot rows, one per input category.
cat_one_hot

<tf.Tensor: id=23, shape=(4, 7), dtype=float32, numpy=
array([[0., 0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0.],
       [0., 1., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0.]], dtype=float32)>

## embedding (book example)

### manually

In [22]:
# Category vocabulary for the embedding example (same entries as ocean_prox_vocab).
vocab = [
    '<1H OCEAN',
    'INLAND',
    'ISLAND',
    'NEAR BAY',
    'NEAR OCEAN',
]

In [23]:
# Length of the vector that represents each category.
embedding_dim = 2
# One randomly initialised row per id (vocabulary entries plus OOV buckets);
# no seed is passed here.
embed_init = tf.random.uniform(
    shape=[len(vocab) + num_oov_buckets, embedding_dim],
)
embedding_matrix = tf.Variable(initial_value=embed_init)

In [24]:
# Expected: (len(vocab) + num_oov_buckets, embedding_dim).
embedding_matrix.shape

TensorShape([7, 2])

In [25]:
# Show the randomly initialised embedding values.
embedding_matrix

<tf.Variable 'Variable:0' shape=(7, 2) dtype=float32, numpy=
array([[0.2380935 , 0.39202857],
       [0.9300972 , 0.38149107],
       [0.9662479 , 0.54803514],
       [0.33006072, 0.41778934],
       [0.5889914 , 0.09456277],
       [0.5487037 , 0.27534354],
       [0.64018726, 0.4839449 ]], dtype=float32)>

In [26]:
# Sample batch for the embedding lookup; "DESERT" is not in the vocabulary.
categories = tf.constant([
    "NEAR BAY",
    "DESERT",
    "INLAND",
    "INLAND",
])

In [27]:
# Convert the category strings to integer ids with the lookup table built earlier.
cat_indices = table.lookup(categories)

In [28]:
# Show the ids that will index into the embedding matrix.
cat_indices

<tf.Tensor: id=44, shape=(4,), dtype=int64, numpy=array([3, 5, 1, 1])>

In [29]:
# Select the embedding_matrix row for each id in cat_indices.
tf.nn.embedding_lookup(params=embedding_matrix, ids=cat_indices)

<tf.Tensor: id=45, shape=(4, 2), dtype=float32, numpy=
array([[0.33006072, 0.41778934],
       [0.5487037 , 0.27534354],
       [0.9300972 , 0.38149107],
       [0.9300972 , 0.38149107]], dtype=float32)>

### with a layer

In [37]:
# Same dimensions as embedding_matrix: one row per vocabulary entry or OOV
# bucket, embedding_dim values per row.
embedding = keras.layers.Embedding(
    input_dim=len(vocab) + num_oov_buckets,
    output_dim=embedding_dim,
)

In [38]:
# Show the layer object's repr.
embedding

<tensorflow.python.keras.layers.embeddings.Embedding at 0x13b516ba8>

In [39]:
# Inspect the weights before the layer has been called; the recorded output is
# an empty list.
embedding.weights

[]

In [40]:
# Call the layer on the ids: returns one embedding vector per id. The recorded
# outputs show the layer's weights empty before this call and populated after it.
embedding(cat_indices)

<tf.Tensor: id=74, shape=(4, 2), dtype=float32, numpy=
array([[ 0.00359845, -0.02396472],
       [-0.04327668, -0.04477734],
       [-0.03309637,  0.01221321],
       [-0.03309637,  0.01221321]], dtype=float32)>

In [41]:
# Inspect the weights again after the call; the recorded output shows a single
# embeddings variable with one row per id.
embedding.weights

[<tf.Variable 'embedding_1/embeddings:0' shape=(7, 2) dtype=float32, numpy=
 array([[ 0.02656266, -0.02890195],
        [-0.03309637,  0.01221321],
        [ 0.03812787, -0.04499642],
        [ 0.00359845, -0.02396472],
        [ 0.02715447,  0.03177998],
        [-0.04327668, -0.04477734],
        [-0.04363113, -0.01460475]], dtype=float32)>]