In [4]:
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

In [5]:
# Sample corpus used by the tokenization and integer-encoding cells below.
# NOTE(review): the final sentence ("... to himself was arber went up a huge mountain.")
# looks truncated mid-word — presumably some text was lost; confirm against the intended corpus.
raw_text = "A barber is a person. a barber is good person. a barber is huge person. he Knew A Secret! The Secret He Kept is huge secret. Huge secret. His barber kept his word. a barber kept his word. His barber kept his secret. But keeping and keeping such a huge secret to himself was arber went up a huge mountain."

In [6]:
# 1. Sentence tokenization: split the raw text into a list of sentences.
sentences = sent_tokenize(raw_text)
print(sentences)

['A barber is a person.', 'a barber is good person.', 'a barber is huge person.', 'he Knew A Secret!', 'The Secret He Kept is huge secret.', 'Huge secret.', 'His barber kept his word.', 'a barber kept his word.', 'His barber kept his secret.', 'But keeping and keeping such a huge secret to himself was arber went up a huge mountain.']


In [36]:
# Clean every sentence and build a word -> frequency table in one pass.
# A token is kept when, after lowercasing, it is not an English stop word
# and is longer than two characters.
vocab = {}
preprocessed_sentences = []
stop_words = set(stopwords.words("english"))

for sentence in sentences:
    lowered_tokens = (token.lower() for token in word_tokenize(sentence))
    kept_tokens = [
        token
        for token in lowered_tokens
        if token not in stop_words and len(token) > 2
    ]
    # Count each surviving token; unseen tokens start from zero.
    for token in kept_tokens:
        vocab[token] = vocab.get(token, 0) + 1
    preprocessed_sentences.append(kept_tokens)


In [8]:
# Show the sentences after preprocessing.
print(preprocessed_sentences)

[['barber', 'person'], ['barber', 'good', 'person'], ['barber', 'huge', 'person'], ['knew', 'secret'], ['secret', 'kept', 'huge', 'secret'], ['huge', 'secret'], ['barber', 'kept', 'word'], ['barber', 'kept', 'word'], ['barber', 'kept', 'secret'], ['keeping', 'keeping', 'huge', 'secret', 'arber', 'went', 'huge', 'mountain']]


In [9]:
# Show the word frequency table.
print(vocab)

{'barber': 6, 'person': 3, 'good': 1, 'huge': 5, 'knew': 1, 'secret': 6, 'kept': 4, 'word': 2, 'keeping': 2, 'arber': 1, 'went': 1, 'mountain': 1}


In [10]:
# Look up how many times a single word occurred.
vocab["barber"]

6

In [11]:
# Rank (word, count) pairs from most to least frequent.
# The sort is stable, so words with equal counts keep their first-seen order.
vocab_sorted = list(vocab.items())
vocab_sorted.sort(key=lambda pair: pair[1], reverse=True)

print(vocab_sorted)

[('barber', 6), ('secret', 6), ('huge', 5), ('kept', 4), ('person', 3), ('word', 2), ('keeping', 2), ('good', 1), ('knew', 1), ('arber', 1), ('went', 1), ('mountain', 1)]


In [12]:
# Walk the frequency-sorted list and number the words 1, 2, 3, ... in that order
# (most frequent word gets 1). Words that occur only once are left out and do not
# consume an index.
word_to_index = {
    word: rank
    for rank, (word, _frequency) in enumerate(
        (pair for pair in vocab_sorted if pair[1] > 1), start=1
    )
}

print(word_to_index)
        

{'barber': 1, 'secret': 2, 'huge': 3, 'kept': 4, 'person': 5, 'word': 6, 'keeping': 7}


In [13]:
vocab_size = 5

# Collect the words whose index is greater than vocab_size ...
words_frequency = [
    word for word in word_to_index if word_to_index[word] > vocab_size
]

# ... and remove their entries from the mapping.
for dropped_word in words_frequency:
    word_to_index.pop(dropped_word)
print(word_to_index)

{'barber': 1, 'secret': 2, 'huge': 3, 'kept': 4, 'person': 5}


In [14]:
# Add an "OOV" (out-of-vocabulary) entry for words that are not in the vocabulary.
# Its index is one past the current number of entries.
# NOTE: the index depends on len(word_to_index), so re-running this cell after "OOV"
# has already been added shifts the OOV index up by one.
word_to_index["OOV"]  = len(word_to_index)+1

print(word_to_index)

{'barber': 1, 'secret': 2, 'huge': 3, 'kept': 4, 'person': 5, 'OOV': 6}


In [15]:
# Turn every preprocessed sentence into a list of integers: a word present in
# word_to_index becomes its index, any other word becomes the "OOV" index.
# (The spelling of `enocoded_sentences` is kept because a later cell prints it.)
enocoded_sentences = [
    [
        word_to_index[word] if word in word_to_index else word_to_index["OOV"]
        for word in sentence
    ]
    for sentence in preprocessed_sentences
]


In [16]:
#  Before encoding
print(preprocessed_sentences)

[['barber', 'person'], ['barber', 'good', 'person'], ['barber', 'huge', 'person'], ['knew', 'secret'], ['secret', 'kept', 'huge', 'secret'], ['huge', 'secret'], ['barber', 'kept', 'word'], ['barber', 'kept', 'word'], ['barber', 'kept', 'secret'], ['keeping', 'keeping', 'huge', 'secret', 'arber', 'went', 'huge', 'mountain']]


In [17]:
# After encoding
print(enocoded_sentences)

[[1, 5], [1, 6, 5], [1, 3, 5], [6, 2], [2, 4, 3, 2], [3, 2], [1, 4, 6], [1, 4, 6], [1, 4, 2], [6, 6, 3, 2, 6, 6, 3, 6]]


## Counter를 사용해서 더 편하게 정수 인코딩하기

In [18]:
from collections import Counter

# Show the preprocessed sentences again for reference.
print(preprocessed_sentences)

[['barber', 'person'], ['barber', 'good', 'person'], ['barber', 'huge', 'person'], ['knew', 'secret'], ['secret', 'kept', 'huge', 'secret'], ['huge', 'secret'], ['barber', 'kept', 'word'], ['barber', 'kept', 'word'], ['barber', 'kept', 'secret'], ['keeping', 'keeping', 'huge', 'secret', 'arber', 'went', 'huge', 'mountain']]


In [19]:
# Flatten the per-sentence token lists into one flat list of words.
# A nested comprehension visits each token once; sum(lists, []) would instead
# build a brand-new list for every sentence, which is quadratic in corpus size.
# (np.hstack(preprocessed_sentences) is another way to flatten.)
all_words_list = [word for sentence in preprocessed_sentences for word in sentence]
print(all_words_list)

['barber', 'person', 'barber', 'good', 'person', 'barber', 'huge', 'person', 'knew', 'secret', 'secret', 'kept', 'huge', 'secret', 'huge', 'secret', 'barber', 'kept', 'word', 'barber', 'kept', 'word', 'barber', 'kept', 'secret', 'keeping', 'keeping', 'huge', 'secret', 'arber', 'went', 'huge', 'mountain']


In [20]:
# Count word frequencies with Python's collections.Counter.
vocab = Counter(all_words_list)
print(vocab)

Counter({'barber': 6, 'secret': 6, 'huge': 5, 'kept': 4, 'person': 3, 'word': 2, 'keeping': 2, 'good': 1, 'knew': 1, 'arber': 1, 'went': 1, 'mountain': 1})


In [21]:
# Frequency of a single word.
print(vocab["secret"])

6


In [22]:
# Keep only the `size` most frequent (word, count) pairs.
# NOTE: this rebinds `vocab_size`, which an earlier cell used as an integer cutoff;
# from here on it holds a list of (word, count) tuples.
size = 5
vocab_size = vocab.most_common(size)

# Show the truncated list. Printing `vocab` here would only repeat the unchanged
# Counter and hide the result that was just computed.
print(vocab_size)

[('barber', 6), ('secret', 6), ('huge', 5), ('kept', 4), ('person', 3)]


In [23]:
# 많이 등장하는 순서대로 정수값 부여
word_to_index = {}
i = 0
for word in vocab :
    i = i + 1
    word_to_index[word] = i

print(word_to_index)

{'barber': 1, 'person': 2, 'good': 3, 'huge': 4, 'knew': 5, 'secret': 6, 'kept': 7, 'word': 8, 'keeping': 9, 'arber': 10, 'went': 11, 'mountain': 12}


### NLTK의 FreqDist 사용하기


In [24]:
from nltk import FreqDist
import numpy as np

In [25]:
# np.hstack removes the sentence boundaries; the result is passed to FreqDist for counting.
vocab = FreqDist(np.hstack(preprocessed_sentences))

In [26]:
# Display the frequency distribution.
vocab

FreqDist({'barber': 6, 'secret': 6, 'huge': 5, 'kept': 4, 'person': 3, 'word': 2, 'keeping': 2, 'good': 1, 'knew': 1, 'arber': 1, ...})

In [27]:
# Keep only the `size` most frequent (word, count) pairs.
# NOTE: this rebinds `vocab` from a FreqDist to a plain list, so the cell cannot be
# re-run on its own; the FreqDist has to be rebuilt first.
size=5
vocab = vocab.most_common(size)
vocab

[('barber', 6), ('secret', 6), ('huge', 5), ('kept', 4), ('person', 3)]

In [28]:
# Give each word in the ranked (word, count) list a 1-based index that follows its position.
word_to_index = {
    token: rank for rank, (token, _count) in enumerate(vocab, start=1)
}
print(word_to_index)

{'barber': 1, 'secret': 2, 'huge': 3, 'kept': 4, 'person': 5}


In [29]:
### 케라스의 텍스트 전처리

In [30]:
from tensorflow.keras.preprocessing.text import Tokenizer


In [31]:
# Hand-written token lists used as input for the Keras Tokenizer cells (one inner list per sentence).
preprocessed_sentences = [
    ['barber', 'person'],
    ['barber', 'good', 'person'],
    ['barber', 'huge', 'person'],
    ['knew', 'secret'],
    ['secret', 'kept', 'huge', 'secret'],
    ['huge', 'secret'],
    ['barber', 'kept', 'word'],
    ['barber', 'kept', 'word'],
    ['barber', 'kept', 'secret'],
    ['keeping', 'keeping', 'huge', 'secret', 'driving', 'barber', 'crazy'],
    ['barber', 'went', 'huge', 'mountain'],
]

In [32]:
tokenizer = Tokenizer()


# Given the corpus as input, fit_on_texts() builds the vocabulary based on word frequency.
tokenizer.fit_on_texts(preprocessed_sentences) 

In [33]:
# Indices are assigned in order of frequency.
print(tokenizer.word_index)

{'barber': 1, 'secret': 2, 'huge': 3, 'kept': 4, 'person': 5, 'word': 6, 'keeping': 7, 'good': 8, 'knew': 9, 'driving': 10, 'crazy': 11, 'went': 12, 'mountain': 13}


In [34]:
# Per-word counts.
print(tokenizer.word_counts)

OrderedDict([('barber', 8), ('person', 3), ('good', 1), ('huge', 5), ('knew', 1), ('secret', 6), ('kept', 4), ('word', 2), ('keeping', 2), ('driving', 1), ('crazy', 1), ('went', 1), ('mountain', 1)])


In [35]:
# Print the result of texts_to_sequences() for the same sentences.
print(tokenizer.texts_to_sequences(preprocessed_sentences))

[[1, 5], [1, 8, 5], [1, 3, 5], [9, 2], [2, 4, 3, 2], [3, 2], [1, 4, 6], [1, 4, 6], [1, 4, 2], [7, 7, 3, 2, 10, 1, 11], [1, 12, 3, 13]]
