### 문장 & 단어 예측
- 데이터셋
    * 문장의 단어를 N-gram 기반으로 구성해서 생성
        - 예) "오늘은 즐거운 피자 먹는 금요일 입니다."
        - '오늘은 즐거운', '오늘은 즐거운 피자', '오늘은 즐거운 피자 먹는',
        - '오늘은 즐거운 피자 먹는 금요일', '오늘은 즐거운 피자 먹는 금요일 입니다.'
    * N-gram 기반으로 생성된 각 시퀀스의 마지막 단어 ==> 레이블 (나머지 앞부분은 입력 데이터)

- 정확한 텍스트 데이터 준비가 중요
    * 맞춤법, 띄어쓰기가 잘 되어 있는 데이터 준비

#### 데이터 준비

In [13]:
# Raw lyrics text; it is split on newlines downstream, so each lyric line
# is treated as one sentence.
songData = '''Last Christmas I gave you my heart
But the very next day you gave it away
This year, to save me from tears
I'll give it to someone special
Last Christmas I gave you my heart
But the very next day you gave it away
This year, to save me from tears
I'll give it to someone special
Once bitten and twice shy
I keep my distance, but you still catch my eye
Tell me baby, do you recognize me?
Well, it's been a year, it doesn't surprise me
Happy Christmas, I wrapped it up and sent it
With a note saying "I love you", I meant it
Now I know what a fool I've been
But if you kissed me now, I know you'd fool me again
Last Christmas I gave you my heart
But the very next day you gave it away
This year, to save me from tears
I'll give it to someone special
Last Christmas I gave you my heart
But the very next day you gave it away
This year, to save me from tears
I'll give it to someone special'''

In [14]:
# Split the lyrics on newlines: one list entry per lyric line.
songDataList = songData.split('\n')

# Report how many lines were obtained.
print(f'갯수 : {len(songDataList)}')

갯수 : 24


#### 텍스트 데이터 기반 단어 사전 생성
- Keras Tokenizer() 사용 -> 구두점 삭제, 공백 기준으로 분할
- 단어 사전 생성

In [15]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

##### [1] tokenizer

In [17]:
# Build the word -> integer index from the lyric lines.
# With the default settings used here, words are lowercased and punctuation
# is stripped while apostrophes are kept (e.g. "i'll"), as the word_index
# output shows.
tk = Tokenizer()
tk.fit_on_texts(songDataList)

In [18]:
tk.word_index

{'you': 1,
 'it': 2,
 'i': 3,
 'me': 4,
 'gave': 5,
 'to': 6,
 'my': 7,
 'but': 8,
 'christmas': 9,
 'year': 10,
 'last': 11,
 'heart': 12,
 'the': 13,
 'very': 14,
 'next': 15,
 'day': 16,
 'away': 17,
 'this': 18,
 'save': 19,
 'from': 20,
 'tears': 21,
 "i'll": 22,
 'give': 23,
 'someone': 24,
 'special': 25,
 'a': 26,
 'and': 27,
 'been': 28,
 'now': 29,
 'know': 30,
 'fool': 31,
 'once': 32,
 'bitten': 33,
 'twice': 34,
 'shy': 35,
 'keep': 36,
 'distance': 37,
 'still': 38,
 'catch': 39,
 'eye': 40,
 'tell': 41,
 'baby': 42,
 'do': 43,
 'recognize': 44,
 'well': 45,
 "it's": 46,
 "doesn't": 47,
 'surprise': 48,
 'happy': 49,
 'wrapped': 50,
 'up': 51,
 'sent': 52,
 'with': 53,
 'note': 54,
 'saying': 55,
 'love': 56,
 'meant': 57,
 'what': 58,
 "i've": 59,
 'if': 60,
 'kissed': 61,
 "you'd": 62,
 'again': 63}

In [21]:
# Sentences --> integers: each word is replaced by its index in tk.word_index
seqData1 = tk.texts_to_sequences(songDataList)

In [29]:
# Encoded sentences --> N-gram style dataset: every prefix of a sentence
# with at least two tokens becomes its own training sequence.
seqData2 = []
for line in seqData1:
    print(line)
    # Prefix lengths run from 2 up to the full sentence length (inclusive).
    seqData2.extend(line[:end] for end in range(2, len(line) + 1))

[11, 9, 3, 5, 1, 7, 12]
[8, 13, 14, 15, 16, 1, 5, 2, 17]
[18, 10, 6, 19, 4, 20, 21]
[22, 23, 2, 6, 24, 25]
[11, 9, 3, 5, 1, 7, 12]
[8, 13, 14, 15, 16, 1, 5, 2, 17]
[18, 10, 6, 19, 4, 20, 21]
[22, 23, 2, 6, 24, 25]
[32, 33, 27, 34, 35]
[3, 36, 7, 37, 8, 1, 38, 39, 7, 40]
[41, 4, 42, 43, 1, 44, 4]
[45, 46, 28, 26, 10, 2, 47, 48, 4]
[49, 9, 3, 50, 2, 51, 27, 52, 2]
[53, 26, 54, 55, 3, 56, 1, 3, 57, 2]
[29, 3, 30, 58, 26, 31, 59, 28]
[8, 60, 1, 61, 4, 29, 3, 30, 62, 31, 4, 63]
[11, 9, 3, 5, 1, 7, 12]
[8, 13, 14, 15, 16, 1, 5, 2, 17]
[18, 10, 6, 19, 4, 20, 21]
[22, 23, 2, 6, 24, 25]
[11, 9, 3, 5, 1, 7, 12]
[8, 13, 14, 15, 16, 1, 5, 2, 17]
[18, 10, 6, 19, 4, 20, 21]
[22, 23, 2, 6, 24, 25]


In [30]:
print(seqData2)

[[11, 9], [11, 9, 3], [11, 9, 3, 5], [11, 9, 3, 5, 1], [11, 9, 3, 5, 1, 7], [11, 9, 3, 5, 1, 7, 12], [8, 13], [8, 13, 14], [8, 13, 14, 15], [8, 13, 14, 15, 16], [8, 13, 14, 15, 16, 1], [8, 13, 14, 15, 16, 1, 5], [8, 13, 14, 15, 16, 1, 5, 2], [8, 13, 14, 15, 16, 1, 5, 2, 17], [18, 10], [18, 10, 6], [18, 10, 6, 19], [18, 10, 6, 19, 4], [18, 10, 6, 19, 4, 20], [18, 10, 6, 19, 4, 20, 21], [22, 23], [22, 23, 2], [22, 23, 2, 6], [22, 23, 2, 6, 24], [22, 23, 2, 6, 24, 25], [11, 9], [11, 9, 3], [11, 9, 3, 5], [11, 9, 3, 5, 1], [11, 9, 3, 5, 1, 7], [11, 9, 3, 5, 1, 7, 12], [8, 13], [8, 13, 14], [8, 13, 14, 15], [8, 13, 14, 15, 16], [8, 13, 14, 15, 16, 1], [8, 13, 14, 15, 16, 1, 5], [8, 13, 14, 15, 16, 1, 5, 2], [8, 13, 14, 15, 16, 1, 5, 2, 17], [18, 10], [18, 10, 6], [18, 10, 6, 19], [18, 10, 6, 19, 4], [18, 10, 6, 19, 4, 20], [18, 10, 6, 19, 4, 20, 21], [22, 23], [22, 23, 2], [22, 23, 2, 6], [22, 23, 2, 6, 24], [22, 23, 2, 6, 24, 25], [32, 33], [32, 33, 27], [32, 33, 27, 34], [32, 33, 27, 34, 

In [31]:
print(len(seqData2))

162


In [32]:
seqData2

[[11, 9],
 [11, 9, 3],
 [11, 9, 3, 5],
 [11, 9, 3, 5, 1],
 [11, 9, 3, 5, 1, 7],
 [11, 9, 3, 5, 1, 7, 12],
 [8, 13],
 [8, 13, 14],
 [8, 13, 14, 15],
 [8, 13, 14, 15, 16],
 [8, 13, 14, 15, 16, 1],
 [8, 13, 14, 15, 16, 1, 5],
 [8, 13, 14, 15, 16, 1, 5, 2],
 [8, 13, 14, 15, 16, 1, 5, 2, 17],
 [18, 10],
 [18, 10, 6],
 [18, 10, 6, 19],
 [18, 10, 6, 19, 4],
 [18, 10, 6, 19, 4, 20],
 [18, 10, 6, 19, 4, 20, 21],
 [22, 23],
 [22, 23, 2],
 [22, 23, 2, 6],
 [22, 23, 2, 6, 24],
 [22, 23, 2, 6, 24, 25],
 [11, 9],
 [11, 9, 3],
 [11, 9, 3, 5],
 [11, 9, 3, 5, 1],
 [11, 9, 3, 5, 1, 7],
 [11, 9, 3, 5, 1, 7, 12],
 [8, 13],
 [8, 13, 14],
 [8, 13, 14, 15],
 [8, 13, 14, 15, 16],
 [8, 13, 14, 15, 16, 1],
 [8, 13, 14, 15, 16, 1, 5],
 [8, 13, 14, 15, 16, 1, 5, 2],
 [8, 13, 14, 15, 16, 1, 5, 2, 17],
 [18, 10],
 [18, 10, 6],
 [18, 10, 6, 19],
 [18, 10, 6, 19, 4],
 [18, 10, 6, 19, 4, 20],
 [18, 10, 6, 19, 4, 20, 21],
 [22, 23],
 [22, 23, 2],
 [22, 23, 2, 6],
 [22, 23, 2, 6, 24],
 [22, 23, 2, 6, 24, 25],
 [32, 33],

In [35]:
# Length of the longest N-gram sequence; used as the padded width below.
MAX_LENGTH = max(map(len, seqData2))

In [37]:
# Padding => fill the front of each sequence with 0s (the default, padding='pre')
# so that every row has MAX_LENGTH entries.
songData3 = pad_sequences(seqData2, maxlen=MAX_LENGTH)

In [39]:
songData3.shape, type(songData3)

((162, 12), numpy.ndarray)

##### [2] 학습용 데이터 준비

[2-1] 데이터와 라벨 분리

In [43]:
# Inputs: every column except the last one.
data = songData3[:, :-1]
# Labels: the last column, i.e. the final token of each N-gram sequence.
label = songData3[:, -1]

In [44]:
data.shape, label.shape

((162, 11), (162,))

[2-2] 라벨 인코딩

In [48]:
# Number of classes: word indices start at 1 and 0 is used for padding,
# so one extra slot is needed on top of the vocabulary size.
VOCA_NUM = len(tk.word_index)+1
VOCA_NUM

64

In [49]:
from tensorflow.keras.utils import to_categorical

In [52]:
# One-hot encode the integer labels over VOCA_NUM classes.
label_oh = to_categorical(label, num_classes=VOCA_NUM)

In [53]:
label_oh[:3]

array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],
      dtype=float32)

In [56]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Bidirectional

In [None]:
model = Sequential()

model.add(Embedding(VOCA_NUM, 8))
model.add(B)