# 텍스트 전처리
---
- 패키지 설치
    - NLTK: pip install nltk
    - Konply : pip install konply

## [1] 토큰화(Tokenization)
---
- 문장/문서를 의미르 지닌 작은 단위로 나누는 것
- 나누어진 단어를 토큰(Token)이라 함
- 종류
    - 문장 토큰화
    - 단어 토큰화

In [22]:
from nltk.tokenize import sent_tokenize, word_tokenize
import nltk

In [30]:
# nltk Corpus(말뭉치) 데이터셋 다운받기
nltk.download('all', quiet=True)

True

In [65]:
raw_text1='''
In Korea, more than half of residents speak Korean?
GitHub Actions makes it easy to automate? 
all your software workflows!
'''
raw_text2='''
GitHub Actions makes it easy to automate all your software workflows 
'''

In [66]:
result1 = word_tokenize(raw_text1)
print(result1)

['In', 'Korea', ',', 'more', 'than', 'half', 'of', 'residents', 'speak', 'Korean', '?', 'GitHub', 'Actions', 'makes', 'it', 'easy', 'to', 'automate', '?', 'all', 'your', 'software', 'workflows', '!']


In [67]:
result2 = word_tokenize(raw_text2)
print(result2)

['GitHub', 'Actions', 'makes', 'it', 'easy', 'to', 'automate', 'all', 'your', 'software', 'workflows']


In [68]:
text_raw = [raw_text1,raw_text2]

In [69]:
sent_result=sent_tokenize(raw_text1)

In [70]:
sent_result

['\nIn Korea, more than half of residents speak Korean?',
 'GitHub Actions makes it easy to automate?',
 'all your software workflows!']

In [71]:
len(sent_result)

3

## 여러 문장에 토큰 추출
---

In [72]:
# 문장 단위로 추출
for sent in text_raw:
    total_token=[]
    
    # 문장에서 추출한 토큰
    print(f'sent => {sent}')
    sentToken=word_tokenize(sent)
    print(sentToken,'-----',sep='\n')
    # 모든 문장의 토큰에 추가
    total_token.append(sentToken)
print('------')
print(total_token)

sent => 
In Korea, more than half of residents speak Korean?
GitHub Actions makes it easy to automate? 
all your software workflows!

['In', 'Korea', ',', 'more', 'than', 'half', 'of', 'residents', 'speak', 'Korean', '?', 'GitHub', 'Actions', 'makes', 'it', 'easy', 'to', 'automate', '?', 'all', 'your', 'software', 'workflows', '!']
-----
sent => 
GitHub Actions makes it easy to automate all your software workflows 

['GitHub', 'Actions', 'makes', 'it', 'easy', 'to', 'automate', 'all', 'your', 'software', 'workflows']
-----
------
[['GitHub', 'Actions', 'makes', 'it', 'easy', 'to', 'automate', 'all', 'your', 'software', 'workflows']]


## 한글
---

In [73]:
from konlpy.tag import Okt

# 형태소 분리 객체
okt = Okt()

In [74]:
result=okt.morphs("오늘은 월요일입니다")
print(result)

['오늘', '은', '월요일', '입니다']


In [75]:
# 형태소 분리 후 태깅(Tagging) => 품사
result2=okt.pos("오늘은 월요일입니다.", stem=True)
print(result2)

[('오늘', 'Noun'), ('은', 'Josa'), ('월요일', 'Noun'), ('이다', 'Adjective'), ('.', 'Punctuation')]


### [2] 정제 & 정규화
---
- 불영어 제거 => 노이즈 제거
- 텍스트의 동일화
    - 대문자 또는 소문자로 통일
    - 문장의 길이

### [2-1] 불용어(Stopword)

In [76]:
nltk.corpus.stopwords

<WordListCorpusReader in 'C:\\Users\\ss\\AppData\\Roaming\\nltk_data\\corpora\\stopwords'>

In [77]:
en_stopwords=nltk.corpus.stopwords.words('english')

In [78]:
len(en_stopwords)

179

In [79]:
en_stopwords[:10]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

### [2-2] 어간 및 표제어 처리
---

In [80]:
from nltk.stem import LancasterStemmer

In [81]:
lstem = LancasterStemmer()

In [82]:
lstem.stem('working'), lstem.stem('works'), lstem.stem('worked')

('work', 'work', 'work')

In [83]:
lstem.stem('amuse'), lstem.stem('amused')

('amus', 'amus')

In [84]:
lstem.stem('happy'), lstem.stem('happiness')

('happy', 'happy')

In [85]:
# 표제어(사전에 등록된 단어 추출)
from nltk.stem import WordNetLemmatizer

In [86]:
wlemma=WordNetLemmatizer()

In [87]:
wlemma.lemmatize('working','v'), wlemma.lemmatize('worked','v')

('work', 'work')

In [88]:
wlemma.lemmatize('amusing','v'), wlemma.lemmatize('amused','v')

('amuse', 'amuse')

### [3] 텍스트 벡터화
---
- 텍스트 => 수치화
- 희소벡터(OHE)BOW 방식 --> Count기반, TH-IDF 기반
- 밀집벡터 : Embedding 방식, Word2Vect

In [89]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [90]:
corpus=[raw_text1, raw_text2]
corpus

['\nIn Korea, more than half of residents speak Korean?\nGitHub Actions makes it easy to automate? \nall your software workflows!\n',
 '\nGitHub Actions makes it easy to automate all your software workflows \n']

In [91]:
ohe = CountVectorizer()

In [93]:
result = ohe.fit_transform(corpus)

In [94]:
print(result)

  (0, 6)	1
  (0, 8)	1
  (0, 11)	1
  (0, 16)	1
  (0, 5)	1
  (0, 12)	1
  (0, 13)	1
  (0, 15)	1
  (0, 9)	1
  (0, 4)	1
  (0, 0)	1
  (0, 10)	1
  (0, 7)	1
  (0, 3)	1
  (0, 17)	1
  (0, 2)	1
  (0, 1)	1
  (0, 19)	1
  (0, 14)	1
  (0, 18)	1
  (1, 4)	1
  (1, 0)	1
  (1, 10)	1
  (1, 7)	1
  (1, 3)	1
  (1, 17)	1
  (1, 2)	1
  (1, 1)	1
  (1, 19)	1
  (1, 14)	1
  (1, 18)	1


In [95]:
result=result.toarray()

In [97]:
print(result.shape, result)

(2, 20) [[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
 [1 1 1 1 1 0 0 1 0 0 1 0 0 0 1 0 0 1 1 1]]


In [98]:
# TF-IDF 기반
tfidf = TfidfVectorizer()

In [100]:
tf_corpus=tfidf.fit_transform(corpus)

In [101]:
type(tf_corpus)

scipy.sparse.csr.csr_matrix

In [102]:
tf_corpus=tf_corpus.toarray()

In [103]:
tf_corpus

array([[0.18641024, 0.18641024, 0.18641024, 0.18641024, 0.18641024,
        0.26199309, 0.26199309, 0.18641024, 0.26199309, 0.26199309,
        0.18641024, 0.26199309, 0.26199309, 0.26199309, 0.18641024,
        0.26199309, 0.26199309, 0.18641024, 0.18641024, 0.18641024],
       [0.30151134, 0.30151134, 0.30151134, 0.30151134, 0.30151134,
        0.        , 0.        , 0.30151134, 0.        , 0.        ,
        0.30151134, 0.        , 0.        , 0.        , 0.30151134,
        0.        , 0.        , 0.30151134, 0.30151134, 0.30151134]])

In [104]:
text='''
sent='Wiki is in Ward is original description: The simplest online database that could possibly work.\
Wiki is a piece of server software that allows users to freely create and edit Web page content using 
any Web browser. Wiki supports hyperlinks and has a simple text syntax for creating new pages and crosslinks 
between internal pages on the fly.\ Wiki is unusual among group communication mechanisms in that 
it allows the organization of contributions to be edited in addition to the content itself.Like many simple concepts, 
"open editing" has some profound and subtle effects on Wiki usage. Allowing everyday users to create and edit any page 
in a Web site is exciting in that it encourages democratic use of the Web and promotes content composition 
by nontechnical users.'
'''

In [105]:
sen_text = sent_tokenize(text)

In [106]:
sen_text

["\nsent='Wiki is in Ward is original description: The simplest online database that could possibly work.Wiki is a piece of server software that allows users to freely create and edit Web page content using \nany Web browser.",
 'Wiki supports hyperlinks and has a simple text syntax for creating new pages and crosslinks \nbetween internal pages on the fly.\\ Wiki is unusual among group communication mechanisms in that \nit allows the organization of contributions to be edited in addition to the content itself.Like many simple concepts, \n"open editing" has some profound and subtle effects on Wiki usage.',
 "Allowing everyday users to create and edit any page \nin a Web site is exciting in that it encourages democratic use of the Web and promotes content composition \nby nontechnical users.'"]

In [108]:
len(sen_text)

3

In [131]:
en_stopwords=nltk.corpus.stopwords.words('english')
en_stopwords

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [141]:
result=[]
for sent in sen_text:
    word = word_tokenize(sent)

    for w in word:
        if w not in en_stopwords:
            result.append(w)
print(result, len(result))

["sent='Wiki", 'Ward', 'original', 'description', ':', 'The', 'simplest', 'online', 'database', 'could', 'possibly', 'work.Wiki', 'piece', 'server', 'software', 'allows', 'users', 'freely', 'create', 'edit', 'Web', 'page', 'content', 'using', 'Web', 'browser', '.', 'Wiki', 'supports', 'hyperlinks', 'simple', 'text', 'syntax', 'creating', 'new', 'pages', 'crosslinks', 'internal', 'pages', 'fly.\\', 'Wiki', 'unusual', 'among', 'group', 'communication', 'mechanisms', 'allows', 'organization', 'contributions', 'edited', 'addition', 'content', 'itself.Like', 'many', 'simple', 'concepts', ',', "''", 'open', 'editing', "''", 'profound', 'subtle', 'effects', 'Wiki', 'usage', '.', 'Allowing', 'everyday', 'users', 'create', 'edit', 'page', 'Web', 'site', 'exciting', 'encourages', 'democratic', 'use', 'Web', 'promotes', 'content', 'composition', 'nontechnical', 'users', '.', "'"] 87


## Tokenizer 객체 생성
---

In [125]:
import tensorflow as th
from tensorflow.keras.preprocessing.text import text_to_word_sequence, Tokenizer

In [128]:
text='''
sent='Wiki is in Ward is original description: The simplest online database that could possibly work.\
Wiki is a piece of server software that allows users to freely create and edit Web page content using 
any Web browser. Wiki supports hyperlinks and has a simple text syntax for creating new pages and crosslinks 
between internal pages on the fly.\ Wiki is unusual among group communication mechanisms in that 
it allows the organization of contributions to be edited in addition to the content itself.Like many simple concepts, 
"open editing" has some profound and subtle effects on Wiki usage. Allowing everyday users to create and edit any page 
in a Web site is exciting in that it encourages democratic use of the Web and promotes content composition 
by nontechnical users.'
'''

In [132]:
# 토큰으로 나누기
tokens=text_to_word_sequence(text)
print(tokens)

['sent', "'wiki", 'is', 'in', 'ward', 'is', 'original', 'description', 'the', 'simplest', 'online', 'database', 'that', 'could', 'possibly', 'work', 'wiki', 'is', 'a', 'piece', 'of', 'server', 'software', 'that', 'allows', 'users', 'to', 'freely', 'create', 'and', 'edit', 'web', 'page', 'content', 'using', 'any', 'web', 'browser', 'wiki', 'supports', 'hyperlinks', 'and', 'has', 'a', 'simple', 'text', 'syntax', 'for', 'creating', 'new', 'pages', 'and', 'crosslinks', 'between', 'internal', 'pages', 'on', 'the', 'fly', 'wiki', 'is', 'unusual', 'among', 'group', 'communication', 'mechanisms', 'in', 'that', 'it', 'allows', 'the', 'organization', 'of', 'contributions', 'to', 'be', 'edited', 'in', 'addition', 'to', 'the', 'content', 'itself', 'like', 'many', 'simple', 'concepts', 'open', 'editing', 'has', 'some', 'profound', 'and', 'subtle', 'effects', 'on', 'wiki', 'usage', 'allowing', 'everyday', 'users', 'to', 'create', 'and', 'edit', 'any', 'page', 'in', 'a', 'web', 'site', 'is', 'excitin

In [133]:
myToken=Tokenizer()

In [134]:
myToken.fit_on_texts(tokens)

In [138]:
print(myToken.word_counts)

OrderedDict([('sent', 1), ("'wiki", 1), ('is', 5), ('in', 5), ('ward', 1), ('original', 1), ('description', 1), ('the', 5), ('simplest', 1), ('online', 1), ('database', 1), ('that', 4), ('could', 1), ('possibly', 1), ('work', 1), ('wiki', 4), ('a', 3), ('piece', 1), ('of', 3), ('server', 1), ('software', 1), ('allows', 2), ('users', 3), ('to', 4), ('freely', 1), ('create', 2), ('and', 6), ('edit', 2), ('web', 4), ('page', 2), ('content', 3), ('using', 1), ('any', 2), ('browser', 1), ('supports', 1), ('hyperlinks', 1), ('has', 2), ('simple', 2), ('text', 1), ('syntax', 1), ('for', 1), ('creating', 1), ('new', 1), ('pages', 2), ('crosslinks', 1), ('between', 1), ('internal', 1), ('on', 2), ('fly', 1), ('unusual', 1), ('among', 1), ('group', 1), ('communication', 1), ('mechanisms', 1), ('it', 2), ('organization', 1), ('contributions', 1), ('be', 1), ('edited', 1), ('addition', 1), ('itself', 1), ('like', 1), ('many', 1), ('concepts', 1), ('open', 1), ('editing', 1), ('some', 1), ('profoun

In [139]:
print(myToken.word_index)

{'and': 1, 'is': 2, 'in': 3, 'the': 4, 'that': 5, 'wiki': 6, 'to': 7, 'web': 8, 'a': 9, 'of': 10, 'users': 11, 'content': 12, 'allows': 13, 'create': 14, 'edit': 15, 'page': 16, 'any': 17, 'has': 18, 'simple': 19, 'pages': 20, 'on': 21, 'it': 22, 'sent': 23, "'wiki": 24, 'ward': 25, 'original': 26, 'description': 27, 'simplest': 28, 'online': 29, 'database': 30, 'could': 31, 'possibly': 32, 'work': 33, 'piece': 34, 'server': 35, 'software': 36, 'freely': 37, 'using': 38, 'browser': 39, 'supports': 40, 'hyperlinks': 41, 'text': 42, 'syntax': 43, 'for': 44, 'creating': 45, 'new': 46, 'crosslinks': 47, 'between': 48, 'internal': 49, 'fly': 50, 'unusual': 51, 'among': 52, 'group': 53, 'communication': 54, 'mechanisms': 55, 'organization': 56, 'contributions': 57, 'be': 58, 'edited': 59, 'addition': 60, 'itself': 61, 'like': 62, 'many': 63, 'concepts': 64, 'open': 65, 'editing': 66, 'some': 67, 'profound': 68, 'subtle': 69, 'effects': 70, 'usage': 71, 'allowing': 72, 'everyday': 73, 'site':

In [144]:
print(myToken.texts_to_sequences(tokens))

[[23], [24], [2], [3], [25], [2], [26], [27], [4], [28], [29], [30], [5], [31], [32], [33], [6], [2], [9], [34], [10], [35], [36], [5], [13], [11], [7], [37], [14], [1], [15], [8], [16], [12], [38], [17], [8], [39], [6], [40], [41], [1], [18], [9], [19], [42], [43], [44], [45], [46], [20], [1], [47], [48], [49], [20], [21], [4], [50], [6], [2], [51], [52], [53], [54], [55], [3], [5], [22], [13], [4], [56], [10], [57], [7], [58], [59], [3], [60], [7], [4], [12], [61], [62], [63], [19], [64], [65], [66], [18], [67], [68], [1], [69], [70], [21], [6], [71], [72], [73], [11], [7], [14], [1], [15], [17], [16], [3], [9], [8], [74], [2], [75], [3], [5], [22], [76], [77], [78], [10], [4], [8], [1], [79], [12], [80], [81], [82], [11], [83]]


In [170]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [162]:
sentences = [
    'I love my dog',
    'I love my cat',
    'You love my dog!',
    'Do you think my dog is amazing?'
]

In [163]:
tokenizer = Tokenizer(num_words = 100, oov_token="<oov>")
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index

In [164]:
sequences = tokenizer.texts_to_sequences(sentences)

In [None]:
padded = pa

In [165]:
test_sentences = [
    'i really love my dog',
    'my dog loves my friend'
]

In [166]:
print(sequences)

[[5, 3, 2, 4], [5, 3, 2, 7], [6, 3, 2, 4], [8, 6, 9, 2, 4, 10, 11]]


In [167]:
test_sequences = tokenizer.texts_to_sequences(test_sentences)
print(test_sequences)
print(word_index)

[[5, 1, 3, 2, 4], [2, 4, 1, 2, 1]]
{'<oov>': 1, 'my': 2, 'love': 3, 'dog': 4, 'i': 5, 'you': 6, 'cat': 7, 'do': 8, 'think': 9, 'is': 10, 'amazing': 11}


## One-Hot-Encoding 변환
---
- sklearn OneHotEncoder 객체 생성
- keras 함수

In [150]:
from tensorflow.keras.utils import to_categorical

In [168]:
for i in range(len(sequences)):
    sequences[i] = to_categorical(sequences[i])

In [169]:

sequences

[array([[0., 0., 0., 0., 0., 1.],
        [0., 0., 0., 1., 0., 0.],
        [0., 0., 1., 0., 0., 0.],
        [0., 0., 0., 0., 1., 0.]], dtype=float32),
 array([[0., 0., 0., 0., 0., 1., 0., 0.],
        [0., 0., 0., 1., 0., 0., 0., 0.],
        [0., 0., 1., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 1.]], dtype=float32),
 array([[0., 0., 0., 0., 0., 0., 1.],
        [0., 0., 0., 1., 0., 0., 0.],
        [0., 0., 1., 0., 0., 0., 0.],
        [0., 0., 0., 0., 1., 0., 0.]], dtype=float32),
 array([[0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.],
        [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.]], dtype=float32)]

In [250]:
import pandas as pd

In [303]:
en = pd.read_table('C:/Users/ss/Downloads/example.txt', header=None)

In [304]:
en

Unnamed: 0,0
0,The main Henry Ford Museum building houses som...
1,Henry Ford Academy is the first charter school...
2,Freshman meet inside the main museum building ...
3,The Henry Ford Learning Institute is using the...
4,The building received the international annual...
5,See also[edit]


In [305]:
en = en.to_numpy()

In [306]:
en_text = ''
for i in en:
    en_text = en_text + i[0]
en_text = [en_text]

In [307]:
tokenizer = Tokenizer(num_words = 1000, oov_token="<oov>")
tokenizer.fit_on_texts(en_text)
word_index = tokenizer.word_index

In [308]:
enSeq = tokenizer.texts_to_sequences(en_text)

In [309]:
len(word_index)

139

In [310]:
print(enSeq)

[[2, 16, 9, 4, 10, 13, 34, 35, 5, 2, 17, 11, 2, 9, 4, 36, 4, 18, 12, 2, 37, 19, 6, 3, 2, 38, 39, 14, 40, 41, 42, 20, 7, 43, 44, 45, 46, 8, 7, 47, 48, 49, 50, 2, 6, 12, 51, 20, 2, 4, 52, 53, 54, 55, 56, 15, 57, 58, 8, 2, 9, 4, 10, 8, 59, 21, 6, 22, 23, 12, 60, 3, 61, 62, 24, 2, 63, 5, 2, 9, 4, 10, 64, 12, 65, 66, 7, 67, 3, 2, 68, 8, 69, 70, 3, 71, 72, 73, 74, 75, 2, 16, 10, 13, 3, 76, 77, 17, 78, 79, 22, 25, 7, 80, 81, 13, 8, 82, 83, 24, 7, 84, 5, 2, 85, 26, 86, 87, 88, 89, 14, 27, 25, 5, 2, 10, 90, 7, 91, 5, 2, 28, 26, 29, 92, 2, 10, 93, 94, 3, 95, 23, 96, 7, 6, 97, 98, 99, 100, 14, 101, 102, 6, 103, 2, 104, 105, 5, 2, 28, 6, 106, 3, 107, 2, 9, 4, 108, 109, 12, 110, 2, 9, 4, 18, 111, 11, 112, 19, 29, 113, 2, 114, 115, 21, 3, 116, 8, 117, 6, 11, 118, 30, 3, 119, 120, 2, 13, 121, 2, 31, 122, 30, 32, 5, 2, 123, 5, 15, 33, 124, 31, 11, 125, 2, 126, 127, 128, 32, 11, 129, 130, 15, 33, 131, 132, 27, 133, 134, 8, 135, 136, 137, 138, 139]]


In [311]:
print(word_index)

{'<oov>': 1, 'the': 2, 'in': 3, 'ford': 4, 'of': 5, 'school': 6, 'a': 7, 'and': 8, 'henry': 9, 'museum': 10, 'for': 11, 'is': 12, 'building': 13, 'to': 14, 'educational': 15, 'main': 16, 'classrooms': 17, 'academy': 18, 'charter': 19, 'by': 20, 'high': 21, 'students': 22, 'it': 23, 'on': 24, 'use': 25, 'village': 26, 'include': 27, 'original': 28, 'schools': 29, 'design': 30, 'international': 31, 'award': 32, 'facilities': 33, 'houses': 34, 'some': 35, 'academyhenry': 36, 'first': 37, 'united': 38, 'states': 39, 'be': 40, 'developed': 41, 'jointly': 42, 'global': 43, 'corporation': 44, 'public': 45, 'education': 46, 'major': 47, 'nonprofit': 48, 'cultural': 49, 'institution': 50, 'sponsored': 51, 'motor': 52, 'company': 53, 'wayne': 54, 'county': 55, 'regional': 56, 'service': 57, 'agency': 58, 'admits': 59, 'located': 60, 'dearborn': 61, 'michigan': 62, 'campus': 63, 'enrollment': 64, 'taken': 65, 'from': 66, 'lottery': 67, 'area': 68, 'totaled': 69, '467': 70, '2010': 71, '1': 72, 'f

In [312]:
enSeq = to_categorical(enSeq)

In [314]:
enSeq

array([[[0., 0., 1., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 1., 0., 0.],
        [0., 0., 0., ..., 0., 1., 0.],
        [0., 0., 0., ..., 0., 0., 1.]]], dtype=float32)

In [315]:
test_text = ['Do you think my dog is amazing?']

In [316]:
tokenizer.texts_to_sequences(test_text)

[[1, 1, 1, 1, 1, 12, 1]]