# 단어 토큰화

In [1]:
from nltk.tokenize import word_tokenize

In [2]:
import nltk

In [3]:
# punkt : 마침표나 약어와 같은 특별한 언어적 특성을 고려하여 토큰화를 할 수 있게 해주는 모듈
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
text = "Although it's not a happily-ever-after ending, it is very realistic."

# 단어 토큰화
tokenized_words = word_tokenize(text)

print(tokenized_words)

['Although', 'it', "'s", 'not', 'a', 'happily-ever-after', 'ending', ',', 'it', 'is', 'very', 'realistic', '.']


# 정제(Cleaning)

In [5]:
import os
import sys

sys.path.append(os.path.dirname(os.path.abspath(os.path.dirname(''))))

In [6]:
from data import text

In [7]:
corpus = text.TEXT

In [8]:
print(corpus)

After reading the comments for this movie, I am not sure whether I should be angry, sad or sickened. Seeing comments typical of people who a)know absolutely nothing about the military or b)who base everything they think they know on movies like this or on CNN reports about Abu-Gharib makes me wonder about the state of intellectual stimulation in the world. At the time I type this the number of people in the US military: 1.4 million on Active Duty with another almost 900,000 in the Guard and Reserves for a total of roughly 2.3 million. The number of people indicted for abuses at at Abu-Gharib: Currently less than 20 That makes the total of people indicted .00083% of the total military. Even if you indict every single military member that ever stepped in to Abu-Gharib, you would not come close to making that a whole number.  The flaws in this movie would take YEARS to cover. I understand that it's supposed to be sarcastic, but in reality, the writer and director are trying to make commen

In [9]:
from collections import Counter

In [10]:
# 전체 단어 토큰 리스트
tokenized_words = word_tokenize(corpus)

# 파이썬의 Counter 모듈을 통해 단어의 빈도수 카운트하여 단어 집합 생성
vocab = Counter(tokenized_words)

# 빈도수가 2 이하인 단어 리스트 추출
uncommon_words = [key for key, value in vocab.items() if value <= 2]

# 빈도수가 2 이하인 단어들만 제거한 결과를 따로 저장
cleaned_by_freq = [word for word in tokenized_words if word not in uncommon_words]

In [11]:
print(vocab)

Counter({'the': 30, '.': 28, ',': 21, 'of': 15, 'and': 14, 'to': 13, 'a': 12, 'military': 12, 'in': 12, 'people': 9, 'on': 9, 'are': 9, 'for': 7, 'this': 7, 'that': 6, 'I': 5, 'The': 5, 'you': 5, 'not': 4, 'or': 4, 'about': 4, 'US': 4, 'at': 4, 'every': 4, 'it': 4, 'make': 4, 'was': 4, 'movie': 3, 'be': 3, 'who': 3, 'they': 3, 'Abu-Gharib': 3, 'makes': 3, 'number': 3, 'million': 3, 'with': 3, 'total': 3, 'would': 3, 'an': 3, 'there': 3, 'days': 3, 'hour': 3, 'minimum': 3, 'get': 3, 'comments': 2, ')': 2, 'know': 2, 'nothing': 2, 'base': 2, 'state': 2, 'world': 2, 'time': 2, ':': 2, '2.3': 2, 'indicted': 2, 'than': 2, 'That': 2, "'s": 2, 'but': 2, 'reality': 2, 'is': 2, 'first': 2, 'aid': 2, 'When': 2, 'their': 2, 'Within': 2, 'hours': 2, 'food': 2, 'months': 2, 'But': 2, 'website': 2, 'men': 2, 'women': 2, 'so': 2, 'personal': 2, 'gain': 2, 'under': 2, '40': 2, 'work': 2, 'week': 2, 'much': 2, 'ranks': 2, 'degrees': 2, 'After': 1, 'reading': 1, 'am': 1, 'sure': 1, 'whether': 1, 'should

In [12]:
print('빈도수가 2 이하인 단어 수:', len(uncommon_words))

빈도수가 2 이하인 단어 수: 234


In [13]:
print('빈도수 3 이상인 토큰 수:', len(cleaned_by_freq))

빈도수 3 이상인 토큰 수: 306


In [14]:
# 길이가 2 이하인 단어 제거
cleaned_by_freq_len = []

for word in cleaned_by_freq:
    if len(word) > 2:
        cleaned_by_freq_len.append(word)

In [15]:
# 정제 결과 확인
print('정제 전:', cleaned_by_freq[:10])
print('정제 후:', cleaned_by_freq_len[:10])

정제 전: ['the', 'for', 'this', 'movie', ',', 'I', 'not', 'I', 'be', ',']
정제 후: ['the', 'for', 'this', 'movie', 'not', 'people', 'who', 'about', 'the', 'military']


## 정제 함수

In [27]:
from collections import Counter

# 등장 빈도 기준 정제 함수
def clean_by_freq(tokenized_words, cut_off_count):
    # 파이썬의 Counter 모듈을 통해 단어의 빈도수 카운트하여 단어 집합 생성
    vocab = Counter(tokenized_words)
    
    # 빈도수가 cut_off_count 이하인 단어 set 추출
    uncommon_words = {key for key, value in vocab.items() if value <= cut_off_count}
    
    # uncommon_words에 포함되지 않는 단어 리스트 생성
    cleaned_words = [word for word in tokenized_words if word not in uncommon_words]

    return cleaned_words

# 단어 길이 기준 정제 함수
def clean_by_len(tokenized_words, cut_off_length):
    # 길이가 cut_off_length 이하인 단어 제거
    cleaned_by_freq_len = []
    
    for word in tokenized_words:
        if len(word) > cut_off_length:
            cleaned_by_freq_len.append(word)

    return cleaned_by_freq_len

In [28]:
clean_by_freq = clean_by_freq(tokenized_words, 2)
cleaned_words = clean_by_len(clean_by_freq, 2)

## 불용어(stopwords)
    - Corpus에서 큰 의미가 없거나, 분석 목적에 벗어나는 단어들

In [29]:
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [30]:
stopwords_set = set(stopwords.words('english'))

print('불용어 개수 :', len(stopwords_set))
print(stopwords_set)

불용어 개수 : 179
{'he', 'above', 'then', 'which', 'those', 'she', 'their', 'until', "weren't", 'her', 'not', 'them', 'or', 'again', 'very', 'hasn', 'who', 'didn', 'mustn', 're', 'both', 'and', 'before', 'aren', "wouldn't", 'ours', 'being', 'at', 'having', "needn't", "hasn't", "isn't", 'wouldn', 'me', 'most', 'ma', 'mightn', 'while', 'your', 'too', 'other', 'themselves', 'than', 'was', 'because', "shan't", 'am', 'from', 'after', 'how', 'if', 'had', "it's", 'now', 'won', 'a', 'do', 'isn', 'why', 'these', 'during', 'between', 'on', 'only', 'of', "you're", 'haven', 'his', 'for', 'off', 'ain', 'be', 'such', "you'd", 'shan', 'what', 'm', "mustn't", 'no', 'there', 'that', 'all', 'd', 'just', 'its', 'over', 'in', 'needn', 'hadn', 'up', 'it', 'has', 'an', 've', "hadn't", 'ourselves', 'does', 'couldn', 'same', "doesn't", 'my', 'some', "won't", 'through', "shouldn't", 'is', 'don', 'yours', 'weren', 'were', 'below', 'down', 'herself', 'been', 'nor', 'but', 'myself', 'should', 'shouldn', 'to', 'once', 

In [31]:
stopwords_set.add('hello')
stopwords_set.remove('the')
stopwords_set.remove('me')

print('불용어 개수 :', len(stopwords_set))
print('불용어 출력 :',stopwords_set)

불용어 개수 : 178
불용어 출력 : {'he', 'above', 'then', 'which', 'those', 'she', 'their', 'until', "weren't", 'her', 'not', 'them', 'or', 'again', 'very', 'hasn', 'who', 'didn', 'mustn', 're', 'both', 'and', 'before', 'aren', "wouldn't", 'ours', 'being', 'at', 'having', "needn't", "hasn't", "isn't", 'wouldn', 'most', 'ma', 'mightn', 'while', 'your', 'too', 'other', 'themselves', 'than', 'was', 'because', "shan't", 'am', 'from', 'after', 'how', 'if', 'had', "it's", 'now', 'won', 'a', 'do', 'isn', 'why', 'these', 'during', 'between', 'on', 'only', 'of', "you're", 'haven', 'his', 'for', 'off', 'ain', 'be', 'such', "you'd", 'shan', 'what', 'm', "mustn't", 'no', 'there', 'that', 'all', 'd', 'just', 'its', 'over', 'in', 'needn', 'hadn', 'up', 'it', 'has', 'an', 've', "hadn't", 'ourselves', 'does', 'couldn', 'same', "doesn't", 'my', 'some', "won't", 'through', "shouldn't", 'is', 'don', 'yours', 'weren', 'were', 'below', 'down', 'herself', 'been', 'nor', 'but', 'myself', 'should', 'shouldn', 'to', 'once

In [32]:
my_stopwords_set = {'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves'}

print(my_stopwords_set)

{'myself', 'ourselves', 'ours', 'my', 'we', 'i', 'me', 'our'}


### 불용어 제거하기

In [33]:
stop_words_set = set(stopwords.words('english'))

# 불용어 제거
cleaned_words = []

for word in cleaned_by_freq_len:
    if word not in stop_words_set:
        cleaned_words.append(word)

In [34]:
# 불용어 제거 결과 확인
print('불용어 제거 전:', len(cleaned_by_freq_len))
print('불용어 제거 후:', len(cleaned_words))

불용어 제거 전: 169
불용어 제거 후: 67


### 불용어 처리 함수

In [35]:
# 불용어 제거 함수
def clean_by_stopwords(tokenized_words, stop_words_set):
    cleaned_words = []
    
    for word in tokenized_words:
        if word not in stop_words_set:
            cleaned_words.append(word)
            
    return cleaned_words

In [37]:
cleaned_words = clean_by_stopwords(tokenized_words, stop_words_set)

In [39]:
len(tokenized_words)

579

In [38]:
len(cleaned_words)

365

## 정규화

### 대소문자 통합

In [40]:
text = "What can I do for you? Do your homework now."

# 소문자로 변환
print(text.lower())

what can i do for you? do your homework now.


### 규칙 기반 정규화

In [41]:
# 동의어 사전
synonym_dict = {'US':'USA', 'U.S':'USA', 'Ummm':'Umm', 'Ummmm':'Umm' }
text = "She became a US citizen. Ummmm, I think, maybe and or."
normalized_words = []

# 단어 토큰화
tokenized_words = nltk.word_tokenize(text)

for word in tokenized_words:
    # 동의어 사전에 있는 단어라면, value에 해당하는 값으로 변환
    if word in synonym_dict.keys():
        word = synonym_dict[word]

    normalized_words.append(word)

In [43]:
print(normalized_words)

['She', 'became', 'a', 'USA', 'citizen', '.', 'Umm', ',', 'I', 'think', ',', 'maybe', 'and', 'or', '.']


## 어간 추출하기

### 포터 스테머 알고리즘

In [44]:
from nltk.stem import PorterStemmer

In [45]:
porter_stemmer = PorterStemmer()
text = "You are so lovely. I am loving you now."
porter_stemmed_words = []

# 단어 토큰화
tokenized_words = nltk.word_tokenize(text)

# 포터 스테머의 어간 추출
for word in tokenized_words:
    stem = porter_stemmer.stem(word)
    porter_stemmed_words.append(stem)

In [46]:
# 단어 토큰화
tokenized_words = nltk.word_tokenize(text)

In [48]:
# 포터 스테머의 어간 추출
for word in tokenized_words:
    stem = porter_stemmer.stem(word)
    porter_stemmed_words.append(stem)

In [49]:
print('어간 추출 전 :', tokenized_words)
print('포터 스테머의 어간 추출 후:', porter_stemmed_words)

어간 추출 전 : ['You', 'are', 'so', 'lovely', '.', 'I', 'am', 'loving', 'you', 'now', '.']
포터 스테머의 어간 추출 후: ['you', 'are', 'so', 'love', '.', 'i', 'am', 'love', 'you', 'now', '.', 'you', 'are', 'so', 'love', '.', 'i', 'am', 'love', 'you', 'now', '.']


In [51]:
from nltk.stem import PorterStemmer

# 포터 스테머 어간 추출 함수
def stemming_by_porter(tokenized_words):
    porter_stemmer = PorterStemmer()
    porter_stemmed_words = []

    for word in tokenized_words:
        stem = porter_stemmer.stem(word)
        porter_stemmed_words.append(stem)

    return porter_stemmed_words

### 랭커스터 스테머 알고리즘

In [52]:
from nltk.stem import LancasterStemmer

lancaster_stemmer = LancasterStemmer()
text = "You are so lovely. I am loving you now."
lancaster_stemmed_words = []

# 랭커스터 스테머의 어간 추출
for word in tokenized_words:
    stem = lancaster_stemmer.stem(word)
    lancaster_stemmed_words.append(stem)

# 실습

In [53]:
import pandas as pd

In [59]:
df = pd.read_csv("../data/imdb.tsv", delimiter="\\t", engine='python')

## 대소문자 통합

In [62]:
df['review'] = df['review'].str.lower()

## 단어 토큰화

In [63]:
df['word_tokens'] = df['review'].apply(word_tokenize)

In [65]:
%load_ext autoreload
%autoreload 2

In [72]:
import sys
sys.path.append("../function")

In [75]:
from preprocess import clean_by_freq
from preprocess import clean_by_len
from preprocess import clean_by_stopwords

In [76]:
stopwords_set = set(stopwords.words('english'))

df['cleaned_tokens'] = df['word_tokens'].apply(lambda x: clean_by_freq(x, 1))
df['cleaned_tokens'] = df['cleaned_tokens'].apply(lambda x: clean_by_len(x, 2))
df['cleaned_tokens'] = df['cleaned_tokens'].apply(lambda x: clean_by_stopwords(x, stopwords_set))

In [78]:
print(df['cleaned_tokens'][0])

['one', 'film', 'said', 'really', 'bad', 'movie', 'like', 'said', 'really', 'bad', 'movie', 'bad', 'one', 'film', 'like']


## 어간 추출

In [79]:
from preprocess import stemming_by_porter

In [80]:
df['stemmed_tokens'] = df['cleaned_tokens'].apply(stemming_by_porter)

In [81]:
print(df['stemmed_tokens'][0])

['one', 'film', 'said', 'realli', 'bad', 'movi', 'like', 'said', 'realli', 'bad', 'movi', 'bad', 'one', 'film', 'like']


In [87]:
df

Unnamed: 0,review,word_tokens,cleaned_tokens,stemmed_tokens
0,"""watching time chasers, it obvious that it was...","[``, watching, time, chasers, ,, it, obvious, ...","[one, film, said, really, bad, movie, like, sa...","[one, film, said, realli, bad, movi, like, sai..."
1,i saw this film about 20 years ago and remembe...,"[i, saw, this, film, about, 20, years, ago, an...","[film, film]","[film, film]"
2,"minor spoilers in new york, joan barnard (elvi...","[minor, spoilers, in, new, york, ,, joan, barn...","[new, york, joan, barnard, elvire, audrey, bar...","[new, york, joan, barnard, elvir, audrey, barn..."
3,i went to see this film with a great deal of e...,"[i, went, to, see, this, film, with, a, great,...","[went, film, film, went, jump, send, n't, jump...","[went, film, film, went, jump, send, n't, jump..."
4,"""yes, i agree with everyone on this site this ...","[``, yes, ,, i, agree, with, everyone, on, thi...","[site, movie, bad, even, movie, made, movie, s...","[site, movi, bad, even, movi, made, movi, spec..."
5,"""jennifer ehle was sparkling in \""""pride and p...","[``, jennifer, ehle, was, sparkling, in, \, ''...","[ehle, northam, wonderful, wonderful, ehle, no...","[ehl, northam, wonder, wonder, ehl, northam, l..."
6,amy poehler is a terrific comedian on saturday...,"[amy, poehler, is, a, terrific, comedian, on, ...","[role, movie, n't, author, book, author, autho...","[role, movi, n't, author, book, author, author..."
7,"""a plane carrying employees of a large biotech...","[``, a, plane, carrying, employees, of, a, lar...","[plane, ceo, search, rescue, mission, ceo, har...","[plane, ceo, search, rescu, mission, ceo, harl..."
8,"a well made, gritty science fiction movie, it ...","[a, well, made, ,, gritty, science, fiction, m...","[gritty, movie, sci-fi, good, suspense, movie,...","[gritti, movi, sci-fi, good, suspens, movi, sc..."
9,"""incredibly dumb and utterly predictable story...","[``, incredibly, dumb, and, utterly, predictab...","[girl, girl]","[girl, girl]"
