<a href="https://colab.research.google.com/github/leegw1211/begin-ml/blob/main/tf_tutorial_structured_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf

from tensorflow import feature_column
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split

In [2]:
import pathlib
# Source archive of the PetFinder-mini dataset and the CSV path read after extraction.
# NOTE(review): csv_file assumes the archive is unpacked under ./datasets — confirm
# against the get_file cache layout of the installed TensorFlow version.
dataset_url = 'http://storage.googleapis.com/download.tensorflow.org/data/petfinder-mini.zip'
csv_file = 'datasets/petfinder-mini/petfinder-mini.csv'

# Download and extract the archive relative to the current directory, then load the CSV.
tf.keras.utils.get_file(
    fname='petfinder_mini.zip',
    origin=dataset_url,
    extract=True,
    cache_dir='.',
)
dataframe = pd.read_csv(csv_file)

Downloading data from http://storage.googleapis.com/download.tensorflow.org/data/petfinder-mini.zip


In [3]:
# Preview the first rows of the raw table.
dataframe.head()

Unnamed: 0,Type,Age,Breed1,Gender,Color1,Color2,MaturitySize,FurLength,Vaccinated,Sterilized,Health,Fee,Description,PhotoAmt,AdoptionSpeed
0,Cat,3,Tabby,Male,Black,White,Small,Short,No,No,Healthy,100,Nibble is a 3+ month old ball of cuteness. He ...,1,2
1,Cat,1,Domestic Medium Hair,Male,Black,Brown,Medium,Medium,Not Sure,Not Sure,Healthy,0,I just found it alone yesterday near my apartm...,2,0
2,Dog,1,Mixed Breed,Male,Brown,White,Medium,Medium,Yes,No,Healthy,0,Their pregnant mother was dumped by her irresp...,7,3
3,Dog,4,Mixed Breed,Female,Black,Brown,Medium,Short,Yes,No,Healthy,150,"Good guard dog, very alert, active, obedience ...",8,2
4,Dog,1,Mixed Breed,Male,Black,No Color,Medium,Short,No,No,Healthy,0,This handsome yet cute boy is up for adoption....,3,2


In [4]:
# Binary label: AdoptionSpeed == 4 becomes 0, every other value becomes 1.
# The original AdoptionSpeed column and the Description column are dropped afterwards.
dataframe = (
    dataframe
    .assign(target=np.where(dataframe['AdoptionSpeed'] == 4, 0, 1))
    .drop(columns=['AdoptionSpeed', 'Description'])
)

In [5]:
# Fixed seed so the train/validation/test partition (and every number computed
# from it further down) is reproducible across kernel restarts.
SPLIT_SEED = 42

# Hold out 20% of all rows for testing, then 20% of the remainder for validation.
train, test = train_test_split(dataframe, test_size=0.2, random_state=SPLIT_SEED)
train, val = train_test_split(train, test_size=0.2, random_state=SPLIT_SEED)
print(len(train), 'train examples')
print(len(val), 'validation examples')
print(len(test), 'test examples')

7383 train examples
1846 validation examples
2308 test examples


In [6]:
# Helper that converts a pandas DataFrame into a tf.data.Dataset the model can consume.
def df_to_dataset(dataframe, shuffle=True, batch_size=32):
  """Build a batched dataset of (features, label) pairs from a DataFrame.

  Args:
    dataframe: table that contains a 'target' column. A copy is used, so the
      caller's frame is not modified.
    shuffle: when true, shuffle with a buffer as large as the number of rows.
    batch_size: number of rows per batch.

  Returns:
    A batched tf.data.Dataset whose elements are
    (dict mapping column name -> column values, labels).
  """
  features = dataframe.copy()
  labels = features.pop('target')
  ds = tf.data.Dataset.from_tensor_slices((dict(features), labels))
  if not shuffle:
    return ds.batch(batch_size)
  return ds.shuffle(buffer_size=len(features)).batch(batch_size)

In [7]:
# Small batches used by the demonstration cells that follow.
# All three splits use df_to_dataset's default shuffle=True here.
batch_size = 20
train_ds, val_ds, test_ds = (
    df_to_dataset(split, batch_size=batch_size) for split in (train, val, test)
)

In [8]:
# The dataset is made of features and labels.
# It is iterated one batch at a time.
# The feature batch is a dictionary with column names as keys and that column's values as values.

[(train_features, label_batch)] = train_ds.take(1)
print(list(train_features.keys()))
print(train_features['Age']) # values of one dataframe column; the length equals the batch size
print(label_batch)

['Type', 'Age', 'Breed1', 'Gender', 'Color1', 'Color2', 'MaturitySize', 'FurLength', 'Vaccinated', 'Sterilized', 'Health', 'Fee', 'PhotoAmt']
tf.Tensor([ 1  4 12  2 17 60  1 24 24  3  5 12  2 42  3 36  1 36  0 24], shape=(20,), dtype=int64)
tf.Tensor([1 1 1 1 0 1 1 0 0 1 1 1 0 1 1 0 1 1 1 1], shape=(20,), dtype=int64)


In [9]:
# Representing the given data in a form the model can use well is important.
# This is called feature engineering.
# Keras preprocessing layers can be used to do it.

# A tf.keras.layers.Normalization layer standardizes the distribution of numeric data.
def get_normalization_layer(name, dataset):
  """Return a Normalization layer adapted to one feature of `dataset`.

  Args:
    name: key of the feature inside each element's feature dictionary.
    dataset: dataset of (features dict, label) elements.

  Returns:
    A layers.Normalization(axis=None) instance already adapted on the feature.
  """
  # Keep only the requested feature; the label is discarded.
  column_values = dataset.map(lambda features, _label: features[name])

  layer = layers.Normalization(axis=None)
  layer.adapt(column_values)
  return layer

In [10]:
# Show the raw 'PhotoAmt' values of the sample batch, followed by a blank line.
photo_count_col = train_features['PhotoAmt']
print(photo_count_col, end='\n\n')

# Adapt a normalization layer on the training set and apply it to the same batch.
layer = get_normalization_layer(name='PhotoAmt', dataset=train_ds)
layer(photo_count_col)

tf.Tensor([ 2  5  5 12  1  1  1  1  1  3  9  2  6  4  4  6  5  3  8  8], shape=(20,), dtype=int64)



<tf.Tensor: shape=(20,), dtype=float32, numpy=
array([-0.511307  ,  0.45787957,  0.45787957,  2.7193148 , -0.8343691 ,
       -0.8343691 , -0.8343691 , -0.8343691 , -0.8343691 , -0.18824479,
        1.7501281 , -0.511307  ,  0.7809417 ,  0.13481739,  0.13481739,
        0.7809417 ,  0.45787957, -0.18824479,  1.427066  ,  1.427066  ],
      dtype=float32)>

In [11]:
# A tf.keras.layers.StringLookup layer maps vocabulary values to integer indices.
# Similarly, a tf.keras.layers.IntegerLookup layer maps integer-valued categorical data to integer indices.
# When the vocabulary is built with the adapt method, index 0 is the unknown token and the following indices are assigned in order of frequency.

# A tf.keras.layers.CategoryEncoding layer takes integer data as input.
# With output_mode="one_hot" each integer value is mapped to a one-hot encoding.
# With multi_hot it returns an array holding 1 for integers present in the input and 0 for absent ones.
# count is similar to multi_hot but stores the number of occurrences.

def get_category_encoding_layer(name, dataset, dtype, max_tokens=None):
  """Return a callable that one-hot encodes one categorical feature.

  Args:
    name: key of the feature inside each element's feature dictionary.
    dataset: dataset of (features dict, label) elements used to build the vocabulary.
    dtype: 'string' selects a StringLookup; any other value selects an IntegerLookup.
    max_tokens: forwarded to the lookup layer's max_tokens argument.

  Returns:
    A function that applies the adapted lookup followed by one-hot encoding.
  """
  lookup_cls = layers.StringLookup if dtype == 'string' else layers.IntegerLookup
  index = lookup_cls(max_tokens=max_tokens)

  # Build the vocabulary from the requested feature only; the label is discarded.
  index.adapt(dataset.map(lambda features, _label: features[name]))

  # The one-hot width equals the size of the adapted vocabulary.
  encoder = layers.CategoryEncoding(num_tokens=index.vocabulary_size(), output_mode='one_hot')

  def encode(feature):
    return encoder(index(feature))

  return encode

In [12]:
# One-hot encode the 'Type' column of the sample batch with a lookup adapted on the training set.
test_type_col = train_features['Type']
test_type_layer = get_category_encoding_layer('Type', train_ds, 'string')
test_type_layer(test_type_col)

<tf.Tensor: shape=(20, 3), dtype=float32, numpy=
array([[0., 1., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 0., 1.]], dtype=float32)>

In [13]:
# If the batch size is too large, training can get stuck in local minima.
# If the batch size is too small, training can head in odd directions.
# One paper reportedly recommends a batch size between 32 and 128.
# NOTE(review): the value used below (256) lies outside that quoted range.

batch_size = 256
train_ds = df_to_dataset(train, batch_size=batch_size)
# Validation and test data keep their row order (shuffle=False).
val_ds, test_ds = (
    df_to_dataset(split, shuffle=False, batch_size=batch_size)
    for split in (val, test)
)

In [14]:
all_inputs = []  # list that stores every input layer
encoded_features = []  # list that stores the processed features

# Numerical features.
for header in ['PhotoAmt', 'Fee']:  # iterate over the numeric features
  # tf.keras.Input creates a symbolic tensor (= placeholder).
  numeric_col = tf.keras.Input(shape=(1,), name=header)

  # Get a normalization layer for this feature and apply it
  normalization_layer = get_normalization_layer(header, train_ds)
  encoded_numeric_col = normalization_layer(numeric_col)

  # Add the input layer and the processed feature to the lists
  all_inputs.append(numeric_col)
  encoded_features.append(encoded_numeric_col)

In [15]:
# 'Age' is treated as an integer category: lookup with max_tokens=5, then one-hot encoding.
age_col = tf.keras.Input(shape=(1,), name='Age', dtype='int64')

encoding_layer = get_category_encoding_layer('Age', train_ds, 'int64', max_tokens=5)
encoded_age_col = encoding_layer(age_col)

# NOTE: these appends extend the shared lists, so re-running this cell alone adds a duplicate entry.
all_inputs.append(age_col)
encoded_features.append(encoded_age_col)

In [16]:
# String-valued columns that are one-hot encoded through a lookup with max_tokens=5.
categorical_cols = [
    'Type', 'Color1', 'Color2', 'Gender', 'MaturitySize',
    'FurLength', 'Vaccinated', 'Sterilized', 'Health', 'Breed1',
]

for header in categorical_cols:
  categorical_col = tf.keras.Input(shape=(1,), name=header, dtype='string')
  encoding_layer = get_category_encoding_layer(header, train_ds, 'string', max_tokens=5)
  encoded_categorical_col = encoding_layer(categorical_col)

  # Record the input and its encoded counterpart in the shared lists.
  all_inputs.append(categorical_col)
  encoded_features.append(encoded_categorical_col)

In [17]:
# Inspect the encoded feature tensors collected so far.
encoded_features

[<KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'normalization_1')>,
 <KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'normalization_2')>,
 <KerasTensor: shape=(None, 5) dtype=float32 (created by layer 'category_encoding_1')>,
 <KerasTensor: shape=(None, 3) dtype=float32 (created by layer 'category_encoding_2')>,
 <KerasTensor: shape=(None, 5) dtype=float32 (created by layer 'category_encoding_3')>,
 <KerasTensor: shape=(None, 5) dtype=float32 (created by layer 'category_encoding_4')>,
 <KerasTensor: shape=(None, 3) dtype=float32 (created by layer 'category_encoding_5')>,
 <KerasTensor: shape=(None, 4) dtype=float32 (created by layer 'category_encoding_6')>,
 <KerasTensor: shape=(None, 4) dtype=float32 (created by layer 'category_encoding_7')>,
 <KerasTensor: shape=(None, 4) dtype=float32 (created by layer 'category_encoding_8')>,
 <KerasTensor: shape=(None, 4) dtype=float32 (created by layer 'category_encoding_9')>,
 <KerasTensor: shape=(None, 4) dtype=flo

In [18]:
# Inspect the input tensors collected so far.
all_inputs

[<KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'PhotoAmt')>,
 <KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'Fee')>,
 <KerasTensor: shape=(None, 1) dtype=int64 (created by layer 'Age')>,
 <KerasTensor: shape=(None, 1) dtype=string (created by layer 'Type')>,
 <KerasTensor: shape=(None, 1) dtype=string (created by layer 'Color1')>,
 <KerasTensor: shape=(None, 1) dtype=string (created by layer 'Color2')>,
 <KerasTensor: shape=(None, 1) dtype=string (created by layer 'Gender')>,
 <KerasTensor: shape=(None, 1) dtype=string (created by layer 'MaturitySize')>,
 <KerasTensor: shape=(None, 1) dtype=string (created by layer 'FurLength')>,
 <KerasTensor: shape=(None, 1) dtype=string (created by layer 'Vaccinated')>,
 <KerasTensor: shape=(None, 1) dtype=string (created by layer 'Sterilized')>,
 <KerasTensor: shape=(None, 1) dtype=string (created by layer 'Health')>,
 <KerasTensor: shape=(None, 1) dtype=string (created by layer 'Breed1')>]

In [19]:
# Join every encoded feature into one vector, then apply a small dense network.
all_features = layers.concatenate(encoded_features)
x = layers.Dense(32, activation="relu")(all_features)
x = layers.Dropout(0.5)(x)
# Single output unit without activation: the model emits a raw score.
output = layers.Dense(1)(x)

model = tf.keras.Model(inputs=all_inputs, outputs=output)

In [20]:
# The loss is configured with from_logits=True, i.e. the model output is a raw logit.
# The string metric "accuracy" resolves to binary accuracy with its default 0.5
# threshold, which would be applied to the logits themselves. Thresholding at 0.0
# (sigmoid(0) == 0.5) makes the reported accuracy use the intended decision rule.
# The metric keeps the name "accuracy" so the training log keys are unchanged.
model.compile(optimizer='adam',
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              metrics=[tf.keras.metrics.BinaryAccuracy(threshold=0.0, name="accuracy")])

In [21]:
# Train for 10 epochs on train_ds, passing val_ds as the validation data.
model.fit(train_ds, epochs=10, validation_data=val_ds)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7e4b411f6da0>

In [22]:
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
def get_clf_eval(y_test, y_pred=None):
    """Print the confusion matrix, accuracy, precision, recall and F1 score.

    Args:
        y_test: ground-truth binary labels (0/1).
        y_pred: predicted binary labels (0/1), aligned with y_test. It is
            required; the None default is kept only so the signature stays
            compatible with existing callers.

    Raises:
        ValueError: if y_pred is not provided.
    """
    if y_pred is None:
        raise ValueError("y_pred is required: pass the predicted labels")

    # Rows/columns are ordered positive class (1) first, then negative class (0).
    confusion = confusion_matrix(y_test, y_pred, labels=[1, 0])
    accuracy = accuracy_score(y_test, y_pred)
    # The three scores below all use the default binary averaging, where a
    # `labels` argument has no effect, so none is passed.
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    F1 = f1_score(y_test, y_pred)

    print("오차행렬:\n", confusion)
    print("\n정확도: {:.4f}".format(accuracy))
    print("정밀도: {:.4f}".format(precision))
    print("재현율: {:.4f}".format(recall))
    print("F1: {:.4f}".format(F1))

In [27]:
# Predict on the validation set and turn the raw scores into class labels:
# a score of 0 or more is class 1, anything below is class 0.
pred = model.predict(val_ds)
pred = np.where(pred >= 0, 1, 0)

# Gather the true labels batch by batch, in the order the dataset yields them.
val_label = np.array([y for _, y_batch in val_ds for y in y_batch.numpy()])



In [31]:
# Print the confusion matrix and classification metrics for the validation predictions.
get_clf_eval(val_label, pred)

오차행렬:
 [[1296   54]
 [ 402   94]]

정확도: 0.7530
정밀도: 0.7633
재현율: 0.9600
F1: 0.8504
