In [1]:
# Mount Google Drive into the Colab runtime at /content/drive.
# NOTE(review): none of the cells visible in this notebook read from the
# mount -- confirm it is still needed before keeping this cell.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## 3.3.1 수치형 피처를 정규화하기

In [1]:
import pandas as pd

# heart.csv sample table served from the TensorFlow download bucket.
file_url = "https://storage.googleapis.com/download.tensorflow.org/data/heart.csv"

# Read the remote CSV straight into a DataFrame, then print column names,
# dtypes and non-null counts as a quick schema check.
heart_df = pd.read_csv(filepath_or_buffer=file_url)
heart_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       303 non-null    int64  
 1   sex       303 non-null    int64  
 2   cp        303 non-null    int64  
 3   trestbps  303 non-null    int64  
 4   chol      303 non-null    int64  
 5   fbs       303 non-null    int64  
 6   restecg   303 non-null    int64  
 7   thalach   303 non-null    int64  
 8   exang     303 non-null    int64  
 9   oldpeak   303 non-null    float64
 10  slope     303 non-null    int64  
 11  ca        303 non-null    int64  
 12  thal      303 non-null    object 
 13  target    303 non-null    int64  
dtypes: float64(1), int64(12), object(1)
memory usage: 33.3+ KB


In [2]:
# Import from the stable `tensorflow.keras.layers` namespace; the old
# `layers.experimental.preprocessing` path is deprecated and absent from
# newer Keras releases.
from tensorflow.keras.layers import Normalization

# Numeric columns to standardise (reused by the display cells below).
cols = ['age','trestbps','chol','thalach','oldpeak','slope']
df = heart_df[cols].to_numpy()

# adapt() learns the statistics from the data; calling the layer afterwards
# applies them, so each column ends up roughly zero-mean / unit-variance.
layer = Normalization()
layer.adapt(df)
layer_df = layer(df)

# Sanity check over the whole normalised array (expect ~0 and ~1).
print("features mean: %.1f" % (layer_df.numpy().mean()))
print("features std: %.1f" % (layer_df.numpy().std()))

features mean: -0.0
features std: 1.0


In [3]:
# Raw (un-normalised) values of the selected numeric columns, for comparison
# with the normalised table shown in the next cell.
heart_df.loc[:, cols]

Unnamed: 0,age,trestbps,chol,thalach,oldpeak,slope
0,63,145,233,150,2.3,3
1,67,160,286,108,1.5,2
2,67,120,229,129,2.6,2
3,37,130,250,187,3.5,3
4,41,130,204,172,1.4,1
...,...,...,...,...,...,...
298,52,118,186,190,0.0,2
299,43,132,341,136,3.0,2
300,65,135,254,127,2.8,2
301,48,130,256,150,0.0,1


In [4]:
# Wrap the normalised tensor in a DataFrame so it renders with column names.
normalized_values = layer_df.numpy()
pd.DataFrame(normalized_values, columns=cols)

Unnamed: 0,age,trestbps,chol,thalach,oldpeak,slope
0,0.933839,0.745781,-0.260087,0.034807,1.068045,2.284959
1,1.378211,1.592329,0.757388,-1.780617,0.380229,0.663548
2,1.378211,-0.665132,-0.336877,-0.872905,1.325976,0.663548
3,-1.954575,-0.100767,0.066273,1.634109,2.099770,2.284959
4,-1.510204,-0.100767,-0.816818,0.985743,0.294252,-0.957863
...,...,...,...,...,...,...
298,-0.288182,-0.778005,-1.162375,1.763782,-0.909428,0.663548
299,-1.288018,0.012106,1.813258,-0.570334,1.669885,0.663548
300,1.156025,0.181416,0.143064,-0.959353,1.497931,0.663548
301,-0.732554,-0.100767,0.181459,0.034807,-0.909428,-0.957863


## 3.3.2 문자 범주형 피처를 인코딩하기

In [5]:
# Frequency of each distinct value in the string column `thal`.
thal_column = heart_df['thal']
thal_column.value_counts()

normal        168
reversible    115
fixed          18
1               1
2               1
Name: thal, dtype: int64

In [6]:
# First ten raw `thal` values.
heart_df['thal'].head(10)

0         fixed
1        normal
2    reversible
3        normal
4        normal
5        normal
6        normal
7        normal
8    reversible
9    reversible
Name: thal, dtype: object

In [7]:
# Stable import path (the `experimental.preprocessing` namespace is deprecated).
from tensorflow.keras.layers import CategoryEncoding, StringLookup

df = heart_df['thal'].to_numpy()

# Step 1: learn the string vocabulary and map every string to an integer index.
layer1 = StringLookup()
layer1.adapt(df)

# Step 2: one-hot encode the indices. The width is taken from the adapted
# vocabulary (which includes the out-of-vocabulary slot) instead of a
# hard-coded 6, so it stays correct if the data gains or loses categories.
layer2 = CategoryEncoding(num_tokens=layer1.vocabulary_size(), output_mode="one_hot")
layer2_df = layer2(layer1(df))

print( type(layer2_df) )
print(layer2_df[:10])

<class 'tensorflow.python.framework.ops.EagerTensor'>
tf.Tensor(
[[0. 0. 0. 1. 0. 0.]
 [0. 1. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0.]], shape=(10, 6), dtype=float32)


## 3.3.3 정수 범주형 피처를 인코딩하기


In [8]:
# Frequency of each distinct value in the integer column `fbs`.
fbs_column = heart_df['fbs']
fbs_column.value_counts()

0    258
1     45
Name: fbs, dtype: int64

In [9]:
# First ten raw `fbs` values.
heart_df['fbs'].head(10)

0    1
1    0
2    0
3    0
4    0
5    0
6    0
7    0
8    0
9    1
Name: fbs, dtype: int64

In [10]:
import tensorflow as tf
# Stable import path (the `experimental.preprocessing` namespace is deprecated).
from tensorflow.keras.layers import CategoryEncoding, IntegerLookup

df = heart_df['fbs'].to_numpy()

# Step 1: learn the set of integer values and map each to a dense index.
layer1 = IntegerLookup()
layer1.adapt(df)

# Step 2: one-hot encode the indices. The width comes from the adapted
# vocabulary (which includes the out-of-vocabulary slot) rather than a
# hard-coded 3. CategoryEncoding needs no adapt() call of its own here.
layer2 = CategoryEncoding(num_tokens=layer1.vocabulary_size(), output_mode="one_hot")
layer2_df = layer2(layer1(df))

print( type(layer2_df) )
print(layer2_df[:10])

<class 'tensorflow.python.framework.ops.EagerTensor'>
tf.Tensor(
[[0. 0. 1.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 0. 1.]], shape=(10, 3), dtype=float32)


## 3.3.4 수량형 피처를 인코딩하기

In [11]:
# Number of distinct ages. nunique() counts them directly instead of
# building the full frequency table just to count its rows.
heart_df['age'].nunique()

41

In [12]:
# First ten raw `age` values.
heart_df['age'].head(10)

0    63
1    67
2    67
3    37
4    41
5    56
6    62
7    57
8    63
9    53
Name: age, dtype: int64

In [13]:
import tensorflow as tf
# Stable import path (the `experimental.preprocessing` namespace is deprecated).
from tensorflow.keras.layers import CategoryEncoding, Hashing

# Number of hash buckets. The one-hot width below must equal this, so both
# layers read the same constant instead of repeating the literal.
NUM_BINS = 16

df = heart_df['age'].to_numpy()

# Step 1: hash every age into one of NUM_BINS buckets (no adapt() step).
# There are more distinct ages than buckets, so different ages can share a
# bucket -- in the recorded output, ages 63 and 41 both land in bucket 6.
layer1 = Hashing(num_bins=NUM_BINS)

# Step 2: one-hot encode the bucket index.
layer2 = CategoryEncoding(num_tokens=NUM_BINS, output_mode="one_hot")
layer2_df = layer2(layer1(df))

print( type(layer2_df) )
print(layer2_df[:10])

<class 'tensorflow.python.framework.ops.EagerTensor'>
tf.Tensor(
[[0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]], shape=(10, 16), dtype=float32)


## 3.3.5 이미지 데이터 증강 레이어

In [15]:
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense
# Stable import path (the `experimental.preprocessing` namespace is deprecated).
from tensorflow.keras.layers import RandomFlip, RandomRotation

# Augmentation pipeline for 256x256 RGB inputs: random flips along both axes
# followed by a random rotation (factor 0.3).
data_augmentation = Sequential(
  [
      RandomFlip("horizontal_and_vertical", input_shape=(256, 256, 3)),
      RandomRotation(0.3),
  ]
)

model = Sequential([
  # Data augmentation
  data_augmentation,

  # Convolutional feature extractor
  Conv2D(16, 3, padding='same', activation='relu'),
  MaxPooling2D(),

  # Fully connected head with a single sigmoid output unit
  Flatten(),
  Dense(128, activation='relu'),
  Dense(1, activation='sigmoid')
])

model.summary()



Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 sequential_2 (Sequential)   (None, 256, 256, 3)       0         
                                                                 
 conv2d_1 (Conv2D)           (None, 256, 256, 16)      448       
                                                                 
 max_pooling2d_1 (MaxPooling  (None, 128, 128, 16)     0         
 2D)                                                             
                                                                 
 flatten_1 (Flatten)         (None, 262144)            0         
                                                                 
 dense_2 (Dense)             (None, 128)               33554560  
                                                                 
 dense_3 (Dense)             (None, 1)                 129       
                                                      

## 3.3.6 토큰 인덱스로 자연어 인코딩하기

In [16]:
from tensorflow.keras.preprocessing import text_dataset_from_directory

# Load the data: IMDB movie review sentiment classification
!curl -O https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
!tar -xf aclImdb_v1.tar.gz

aclImdb_ds = text_dataset_from_directory(
    directory = "/content/aclImdb/train",
    labels="inferred",
    batch_size=32,
    seed=0
)

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 80.2M  100 80.2M    0     0  6361k      0  0:00:12  0:00:12 --:--:-- 13.7M
Found 75000 files belonging to 3 classes.


In [17]:
# Show the first review and its label from the first batch.
for text_batch, label_batch in aclImdb_ds.take(1):
    first_text = text_batch.numpy()[0]
    first_label = label_batch.numpy()[0]
    print(first_text)
    print(first_label)

b"'Playing for Time' starring Vanessa Redgrave first aired in 1980 and is based on a true story. A true story teaching the lessons of intolerance and the horrors of the Holocaust. In much the way Sharazod (the mythical wife of a King) would save her own life each night by telling her husband a story but never finishing the tale; in 'Playing for Time' the inmates of a death camp play music for the amusement of the guards thereby escaping extermination. But this is no myth it's a very real means of survival for desperate inmates. <br /><br />'Playing for Time' is a great movie but gets a little boring. I found myself playing a video game instead of paying close attention to the movie."
2


In [18]:
import tensorflow as tf
# Stable import path (the `experimental.preprocessing` namespace is deprecated).
from tensorflow.keras.layers import TextVectorization

# Learn the vocabulary from the review texts only; the labels are dropped
# before adapt() sees the dataset.
layer1 = TextVectorization()
layer1.adapt(aclImdb_ds.map(lambda text, label: text))

# Encode a short phrase into token indices using the learned vocabulary.
layer1_df = layer1(['Playing for Time'])

print( type(layer1_df) )
print( layer1_df )

<class 'tensorflow.python.framework.ops.EagerTensor'>
tf.Tensor([[382  17  62]], shape=(1, 3), dtype=int64)


In [19]:
import numpy as np

# Materialise the learned vocabulary as an array and show its first 20
# entries (position in the array is the token index).
vocab_tokens = layer1.get_vocabulary()
vocab = np.asarray(vocab_tokens)
vocab[:20]

array(['', '[UNK]', 'the', 'and', 'a', 'of', 'to', 'is', 'in', 'it',
       'this', 'i', 'that', 'br', 'was', 'as', 'with', 'for', 'movie',
       'but'], dtype='<U77')