# 단어 토큰화

In [1]:
from nltk.tokenize import word_tokenize

In [2]:
import nltk

In [3]:
# punkt: resource that lets tokenization take language-specific features
# such as periods and abbreviations into account
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
text = "Although it's not a happily-ever-after ending, it is very realistic."

# Word tokenization
tokenized_words = word_tokenize(text)

print(tokenized_words)

['Although', 'it', "'s", 'not', 'a', 'happily-ever-after', 'ending', ',', 'it', 'is', 'very', 'realistic', '.']


# 정제(Cleaning)

In [81]:
import os
import sys

# os.path.dirname('') is '', so abspath() resolves to the current working
# directory; the outer dirname() then yields its parent, which is appended to
# the import search path (presumably so the `data` import below resolves).
sys.path.append(os.path.dirname(os.path.abspath(os.path.dirname(''))))

In [86]:
from data import text

In [90]:
# Use the TEXT attribute of `text` (imported from `data`) as the corpus
corpus = text.TEXT

In [91]:
# Show the raw corpus before any tokenization or cleaning
print(corpus)

After reading the comments for this movie, I am not sure whether I should be angry, sad or sickened. Seeing comments typical of people who a)know absolutely nothing about the military or b)who base everything they think they know on movies like this or on CNN reports about Abu-Gharib makes me wonder about the state of intellectual stimulation in the world. At the time I type this the number of people in the US military: 1.4 million on Active Duty with another almost 900,000 in the Guard and Reserves for a total of roughly 2.3 million. The number of people indicted for abuses at at Abu-Gharib: Currently less than 20 That makes the total of people indicted .00083% of the total military. Even if you indict every single military member that ever stepped in to Abu-Gharib, you would not come close to making that a whole number.  The flaws in this movie would take YEARS to cover. I understand that it's supposed to be sarcastic, but in reality, the writer and director are trying to make commen

In [92]:
from collections import Counter

In [93]:
# Full list of word tokens in the corpus
tokenized_words = word_tokenize(corpus)

# Count how often each token occurs to build the vocabulary
vocab = Counter(tokenized_words)

# Tokens that occur 2 times or fewer (kept as a list so it can be inspected)
uncommon_words = [key for key, value in vocab.items() if value <= 2]

# Set copy for constant-time membership tests; scanning the list once per
# token would make the filter below quadratic in the corpus size
uncommon_word_set = set(uncommon_words)

# Keep only the tokens that occur more than 2 times
cleaned_by_freq = [word for word in tokenized_words if word not in uncommon_word_set]

In [94]:
# Show every token together with its occurrence count
print(vocab)

Counter({'the': 30, '.': 28, ',': 21, 'of': 15, 'and': 14, 'to': 13, 'a': 12, 'military': 12, 'in': 12, 'people': 9, 'on': 9, 'are': 9, 'for': 7, 'this': 7, 'that': 6, 'I': 5, 'The': 5, 'you': 5, 'not': 4, 'or': 4, 'about': 4, 'US': 4, 'at': 4, 'every': 4, 'it': 4, 'make': 4, 'was': 4, 'movie': 3, 'be': 3, 'who': 3, 'they': 3, 'Abu-Gharib': 3, 'makes': 3, 'number': 3, 'million': 3, 'with': 3, 'total': 3, 'would': 3, 'an': 3, 'there': 3, 'days': 3, 'hour': 3, 'minimum': 3, 'get': 3, 'comments': 2, ')': 2, 'know': 2, 'nothing': 2, 'base': 2, 'state': 2, 'world': 2, 'time': 2, ':': 2, '2.3': 2, 'indicted': 2, 'than': 2, 'That': 2, "'s": 2, 'but': 2, 'reality': 2, 'is': 2, 'first': 2, 'aid': 2, 'When': 2, 'their': 2, 'Within': 2, 'hours': 2, 'food': 2, 'months': 2, 'But': 2, 'website': 2, 'men': 2, 'women': 2, 'so': 2, 'personal': 2, 'gain': 2, 'under': 2, '40': 2, 'work': 2, 'week': 2, 'much': 2, 'ranks': 2, 'degrees': 2, 'After': 1, 'reading': 1, 'am': 1, 'sure': 1, 'whether': 1, 'should

In [98]:
# Number of distinct tokens that occur 2 times or fewer
print('빈도수가 2 이하인 단어 수:', len(uncommon_words))

빈도수가 2 이하인 단어 수: 234


In [97]:
# Number of tokens left after dropping those that occur 2 times or fewer
print('빈도수 3 이상인 토큰 수:', len(cleaned_by_freq))

빈도수 3 이상인 토큰 수: 306


In [99]:
# Drop tokens whose length is 2 or less
cleaned_by_freq_len = [word for word in cleaned_by_freq if len(word) > 2]

In [100]:
# Check the cleaning result: first 10 tokens before and after the length filter
print('정제 전:', cleaned_by_freq[:10])
print('정제 후:', cleaned_by_freq_len[:10])

정제 전: ['the', 'for', 'this', 'movie', ',', 'I', 'not', 'I', 'be', ',']
정제 후: ['the', 'for', 'this', 'movie', 'not', 'people', 'who', 'about', 'the', 'military']


## 정제 함수

In [101]:
from collections import Counter

# Frequency-based cleaning function
def clean_by_freq(tokenized_words, cut_off_count):
    """Remove tokens that occur ``cut_off_count`` times or fewer.

    Args:
        tokenized_words: Iterable of hashable tokens (e.g. a list of strings).
            One-shot iterators such as generators are accepted as well.
        cut_off_count: Frequency threshold; tokens whose occurrence count is
            less than or equal to this value are dropped.

    Returns:
        A new list holding the remaining tokens in their original order.
    """
    # Materialize first: the input is traversed twice (count, then filter),
    # and a one-shot iterator would already be exhausted on the second pass.
    tokens = list(tokenized_words)

    # Occurrence count per distinct token
    vocab = Counter(tokens)

    # A set gives constant-time membership tests in the filter below
    uncommon_words = {key for key, value in vocab.items() if value <= cut_off_count}

    return [word for word in tokens if word not in uncommon_words]

# Length-based cleaning function
def clean_by_len(tokenized_words, cut_off_length):
    """Remove tokens whose length is ``cut_off_length`` or less.

    Args:
        tokenized_words: Iterable of tokens that support ``len()``
            (e.g. a list of strings).
        cut_off_length: Length threshold; only tokens strictly longer than
            this value are kept.

    Returns:
        A new list holding the remaining tokens in their original order.
    """
    return [word for word in tokenized_words if len(word) > cut_off_length]