In [9]:
import os 
import tensorflow as tf
import numpy as np

from tensorflow.keras import preprocessing

# Toy corpus: six short Korean sentences used by the cells below.
samples = [
    '너 오늘 이뻐 보인다',
    '나는 오늘 기분이 더러워',
    '끝내주는데, 좋은 일이 있나봐',
    '나 좋은 일이 생겼어',
    '아 오늘 진짜 짜증나',
    '환상적인데, 정말 좋은거 같아',
]

# One single-element label per sentence, in the same order as `samples`
# (presumably 1 = positive and 0 = negative sentiment -- confirm with the author).
label = [[flag] for flag in (1, 0, 1, 1, 0, 1)]

In [10]:
# Fit a Keras tokenizer on the sample sentences, then encode every sentence
# as a list of integer word ids.
tokenizer = preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(samples)

word_index = tokenizer.word_index  # word -> integer id mapping learned by the fit
sequences = tokenizer.texts_to_sequences(samples)

# Print each value under its heading: encoded text, word index, labels.
for heading, value in (
    ("수치화된 텍스트 데이터: \n", sequences),
    ("각 단어의 인덱스: \n", word_index),
    ("라벨: \n", label),
):
    print(heading, value)

수치화된 텍스트 데이터: 
 [[4, 1, 5, 6], [7, 1, 8, 9], [10, 2, 3, 11], [12, 2, 3, 13], [14, 1, 15, 16], [17, 18, 19, 20]]
각 단어의 인덱스: 
 {'오늘': 1, '좋은': 2, '일이': 3, '너': 4, '이뻐': 5, '보인다': 6, '나는': 7, '기분이': 8, '더러워': 9, '끝내주는데': 10, '있나봐': 11, '나': 12, '생겼어': 13, '아': 14, '진짜': 15, '짜증나': 16, '환상적인데': 17, '정말': 18, '좋은거': 19, '같아': 20}
라벨: 
 [[1], [0], [1], [1], [0], [1]]


In [16]:
# Build a dataset that yields one (sequence, label) pair per sample.
dataset = tf.data.Dataset.from_tensor_slices(tensors=(sequences, label))

# Walk the dataset and show every element.
for data in dataset:
    print(data)


(<tf.Tensor: shape=(4,), dtype=int32, numpy=array([4, 1, 5, 6], dtype=int32)>, <tf.Tensor: shape=(1,), dtype=int32, numpy=array([1], dtype=int32)>)
(<tf.Tensor: shape=(4,), dtype=int32, numpy=array([7, 1, 8, 9], dtype=int32)>, <tf.Tensor: shape=(1,), dtype=int32, numpy=array([0], dtype=int32)>)
(<tf.Tensor: shape=(4,), dtype=int32, numpy=array([10,  2,  3, 11], dtype=int32)>, <tf.Tensor: shape=(1,), dtype=int32, numpy=array([1], dtype=int32)>)
(<tf.Tensor: shape=(4,), dtype=int32, numpy=array([12,  2,  3, 13], dtype=int32)>, <tf.Tensor: shape=(1,), dtype=int32, numpy=array([1], dtype=int32)>)
(<tf.Tensor: shape=(4,), dtype=int32, numpy=array([14,  1, 15, 16], dtype=int32)>, <tf.Tensor: shape=(1,), dtype=int32, numpy=array([0], dtype=int32)>)
(<tf.Tensor: shape=(4,), dtype=int32, numpy=array([17, 18, 19, 20], dtype=int32)>, <tf.Tensor: shape=(1,), dtype=int32, numpy=array([1], dtype=int32)>)


2024-08-20 14:10:05.237153: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


In [18]:
# Group consecutive elements into batches, here two samples at a time.
BATCH_SIZE = 2

dataset = tf.data.Dataset.from_tensor_slices((sequences, label)).batch(BATCH_SIZE)

for data in dataset:
    print(data)

(<tf.Tensor: shape=(2, 4), dtype=int32, numpy=
array([[4, 1, 5, 6],
       [7, 1, 8, 9]], dtype=int32)>, <tf.Tensor: shape=(2, 1), dtype=int32, numpy=
array([[1],
       [0]], dtype=int32)>)
(<tf.Tensor: shape=(2, 4), dtype=int32, numpy=
array([[10,  2,  3, 11],
       [12,  2,  3, 13]], dtype=int32)>, <tf.Tensor: shape=(2, 1), dtype=int32, numpy=
array([[1],
       [1]], dtype=int32)>)
(<tf.Tensor: shape=(2, 4), dtype=int32, numpy=
array([[14,  1, 15, 16],
       [17, 18, 19, 20]], dtype=int32)>, <tf.Tensor: shape=(2, 1), dtype=int32, numpy=
array([[0],
       [1]], dtype=int32)>)


2024-08-20 14:11:43.559599: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


In [25]:
# Emit the elements in shuffled order. The shuffle size passed in is the
# number of samples; no seed is given, so the order can differ between runs.
dataset = tf.data.Dataset.from_tensor_slices((sequences, label)).shuffle(
    len(sequences)
)

for data in dataset:
    print(data)


(<tf.Tensor: shape=(4,), dtype=int32, numpy=array([12,  2,  3, 13], dtype=int32)>, <tf.Tensor: shape=(1,), dtype=int32, numpy=array([1], dtype=int32)>)
(<tf.Tensor: shape=(4,), dtype=int32, numpy=array([7, 1, 8, 9], dtype=int32)>, <tf.Tensor: shape=(1,), dtype=int32, numpy=array([0], dtype=int32)>)
(<tf.Tensor: shape=(4,), dtype=int32, numpy=array([4, 1, 5, 6], dtype=int32)>, <tf.Tensor: shape=(1,), dtype=int32, numpy=array([1], dtype=int32)>)
(<tf.Tensor: shape=(4,), dtype=int32, numpy=array([17, 18, 19, 20], dtype=int32)>, <tf.Tensor: shape=(1,), dtype=int32, numpy=array([1], dtype=int32)>)
(<tf.Tensor: shape=(4,), dtype=int32, numpy=array([10,  2,  3, 11], dtype=int32)>, <tf.Tensor: shape=(1,), dtype=int32, numpy=array([1], dtype=int32)>)
(<tf.Tensor: shape=(4,), dtype=int32, numpy=array([14,  1, 15, 16], dtype=int32)>, <tf.Tensor: shape=(1,), dtype=int32, numpy=array([0], dtype=int32)>)


2024-08-20 14:14:54.636504: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


In [29]:
# Iterate over the same data several times by repeating the dataset.

epochs = 2  # number of passes over the data

dataset = tf.data.Dataset.from_tensor_slices((sequences, label)).repeat(epochs)

for data in dataset:
    print(data)

(<tf.Tensor: shape=(4,), dtype=int32, numpy=array([4, 1, 5, 6], dtype=int32)>, <tf.Tensor: shape=(1,), dtype=int32, numpy=array([1], dtype=int32)>)
(<tf.Tensor: shape=(4,), dtype=int32, numpy=array([7, 1, 8, 9], dtype=int32)>, <tf.Tensor: shape=(1,), dtype=int32, numpy=array([0], dtype=int32)>)
(<tf.Tensor: shape=(4,), dtype=int32, numpy=array([10,  2,  3, 11], dtype=int32)>, <tf.Tensor: shape=(1,), dtype=int32, numpy=array([1], dtype=int32)>)
(<tf.Tensor: shape=(4,), dtype=int32, numpy=array([12,  2,  3, 13], dtype=int32)>, <tf.Tensor: shape=(1,), dtype=int32, numpy=array([1], dtype=int32)>)
(<tf.Tensor: shape=(4,), dtype=int32, numpy=array([14,  1, 15, 16], dtype=int32)>, <tf.Tensor: shape=(1,), dtype=int32, numpy=array([0], dtype=int32)>)
(<tf.Tensor: shape=(4,), dtype=int32, numpy=array([17, 18, 19, 20], dtype=int32)>, <tf.Tensor: shape=(1,), dtype=int32, numpy=array([1], dtype=int32)>)
(<tf.Tensor: shape=(4,), dtype=int32, numpy=array([4, 1, 5, 6], dtype=int32)>, <tf.Tensor: shape

2024-08-20 14:16:17.150995: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


In [33]:
# Depending on the model there may be two or more inputs rather than one.
# This mapping bundles everything except the label into a single input value.
# Mapping step

def mapping_fn(X, Y=None):
    """Wrap X in a dict under the key 'x' and return it together with Y.

    Args:
        X: feature value of one dataset element.
        Y: label of that element; None when no label is supplied.

    Returns:
        The tuple ``({'x': X}, Y)``.
    """
    features = {'x': X}
    return features, Y

# Apply mapping_fn to every (sequence, label) element of the dataset.
dataset = tf.data.Dataset.from_tensor_slices((sequences, label)).map(mapping_fn)

for data in dataset:
    print(data)


({'x': <tf.Tensor: shape=(4,), dtype=int32, numpy=array([4, 1, 5, 6], dtype=int32)>}, <tf.Tensor: shape=(1,), dtype=int32, numpy=array([1], dtype=int32)>)
({'x': <tf.Tensor: shape=(4,), dtype=int32, numpy=array([7, 1, 8, 9], dtype=int32)>}, <tf.Tensor: shape=(1,), dtype=int32, numpy=array([0], dtype=int32)>)
({'x': <tf.Tensor: shape=(4,), dtype=int32, numpy=array([10,  2,  3, 11], dtype=int32)>}, <tf.Tensor: shape=(1,), dtype=int32, numpy=array([1], dtype=int32)>)
({'x': <tf.Tensor: shape=(4,), dtype=int32, numpy=array([12,  2,  3, 13], dtype=int32)>}, <tf.Tensor: shape=(1,), dtype=int32, numpy=array([1], dtype=int32)>)
({'x': <tf.Tensor: shape=(4,), dtype=int32, numpy=array([14,  1, 15, 16], dtype=int32)>}, <tf.Tensor: shape=(1,), dtype=int32, numpy=array([0], dtype=int32)>)
({'x': <tf.Tensor: shape=(4,), dtype=int32, numpy=array([17, 18, 19, 20], dtype=int32)>}, <tf.Tensor: shape=(1,), dtype=int32, numpy=array([1], dtype=int32)>)


2024-08-20 14:20:35.810428: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


In [34]:
# Using batching, shuffling, repetition and mapping together in one pipeline.
BATCH_SIZE = 2  # samples per batch
EPOCH = 2       # number of passes over the data

def mapping_fn(X, Y=None):
    """Wrap X in a dict under the key 'x' and return it together with Y.

    Args:
        X: feature value of one dataset element.
        Y: label of that element; None when no label is supplied.

    Returns:
        The tuple ``({'x': X}, Y)``.
    """
    features = {'x': X}
    return features, Y

# Full pipeline, in order: map -> shuffle -> batch -> repeat.
dataset = (
    tf.data.Dataset.from_tensor_slices((sequences, label))
    .map(mapping_fn)
    .shuffle(len(sequences))
    .batch(BATCH_SIZE)
    .repeat(EPOCH)
)

for data in dataset:
    print(data)

({'x': <tf.Tensor: shape=(2, 4), dtype=int32, numpy=
array([[10,  2,  3, 11],
       [14,  1, 15, 16]], dtype=int32)>}, <tf.Tensor: shape=(2, 1), dtype=int32, numpy=
array([[1],
       [0]], dtype=int32)>)
({'x': <tf.Tensor: shape=(2, 4), dtype=int32, numpy=
array([[12,  2,  3, 13],
       [ 4,  1,  5,  6]], dtype=int32)>}, <tf.Tensor: shape=(2, 1), dtype=int32, numpy=
array([[1],
       [1]], dtype=int32)>)
({'x': <tf.Tensor: shape=(2, 4), dtype=int32, numpy=
array([[17, 18, 19, 20],
       [ 7,  1,  8,  9]], dtype=int32)>}, <tf.Tensor: shape=(2, 1), dtype=int32, numpy=
array([[1],
       [0]], dtype=int32)>)
({'x': <tf.Tensor: shape=(2, 4), dtype=int32, numpy=
array([[10,  2,  3, 11],
       [14,  1, 15, 16]], dtype=int32)>}, <tf.Tensor: shape=(2, 1), dtype=int32, numpy=
array([[1],
       [0]], dtype=int32)>)
({'x': <tf.Tensor: shape=(2, 4), dtype=int32, numpy=
array([[ 4,  1,  5,  6],
       [12,  2,  3, 13]], dtype=int32)>}, <tf.Tensor: shape=(2, 1), dtype=int32, numpy=
array([[1]

2024-08-20 14:23:43.818964: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
