In [1]:
from collections import Counter

import nltk
import numpy as np
import pandas as pd

<br>
<br>

# 토큰화

In [2]:
from nltk.tokenize import sent_tokenize

# Split the text into sentence tokens
sentences = "Hello World. It's good to see you. Thanks for buying this book."
print(sent_tokenize(sentences))


['Hello World.', "It's good to see you.", 'Thanks for buying this book.']


In [3]:
# Sentences are split at the period: with no period after "Hello World",
# it stays part of the first sentence
sentences = "Hello World It's good to see you. Thanks for buying this book."
print(sent_tokenize(sentences))


["Hello World It's good to see you.", 'Thanks for buying this book.']


In [4]:
from nltk.tokenize import word_tokenize

# Split the text into word tokens
print(word_tokenize("Hello World."))


['Hello', 'World', '.']


In [5]:
# Tokens are built with the apostrophe taken into account
# (contractions such as "can't" and "They'll" come out as two tokens each)
print(word_tokenize("I can't do. They'll be there."))


['I', 'ca', "n't", 'do', '.', 'They', "'ll", 'be', 'there', '.']


In [6]:
from nltk.tokenize import WordPunctTokenizer

# Split off every punctuation mark as a token of its own
tokenizer = WordPunctTokenizer()
print(tokenizer.tokenize("I can't do. They'll be there."))


['I', 'can', "'", 't', 'do', '.', 'They', "'", 'll', 'be', 'there', '.']


In [7]:
from nltk.tokenize.regexp import RegexpTokenizer

# Build tokens with a regular expression
# [\w']+ : a run of one or more word characters or apostrophes
# Whitespace and periods are dropped because they never match the pattern
# The pattern is a raw string so the backslash in \w reaches the regex engine
# as written; in a plain string "\w" is an invalid escape sequence and
# triggers a warning on current Python versions.
tokenizer = RegexpTokenizer(r"[\w']+")
print(tokenizer.tokenize("I can't do. They'll be there."))


['I', "can't", 'do', "They'll", 'be', 'there']


In [8]:
# gaps=True : the pattern describes the separators between tokens rather
#             than the tokens themselves, so the text is split on it
# \s+ : a run of one or more whitespace characters
# Raw string: a plain "\s" is an invalid escape sequence and triggers a
# warning on current Python versions.
tokenizer = RegexpTokenizer(r"\s+", gaps=True)
print(tokenizer.tokenize("I can't do. They'll be there."))


['I', "can't", 'do.', "They'll", 'be', 'there.']


In [9]:
from nltk.tokenize.regexp import regexp_tokenize

# Function form of regex tokenization: the pattern is passed directly,
# without building a tokenizer object first.
# Raw string: a plain "\w" is an invalid escape sequence and triggers a
# warning on current Python versions.
print(regexp_tokenize("I can't do. They'll be there.", r"[\w']+"))


['I', "can't", 'do', "They'll", 'be', 'there']


<br>
<br>

# 대소문자 통일

In [10]:
# Normalize the sample sentence to lowercase and display it
sentence = "I live in South Korea.".lower()
sentence


'i live in south korea.'

In [11]:
# Convert to uppercase; `sentence` here is the value left by the preceding
# lowercase cell, so that cell must have been run first
sentence = sentence.upper()
sentence


'I LIVE IN SOUTH KOREA.'

<br>
<br>

# 특수기호 제거

In [12]:
import re

# Delete every occurrence of the listed characters (. , ; ! ? ^) and display the result
sentence = re.sub(r"[.,;!?^]", "", "I like pizza^^ What do you like?")
sentence


'I like pizza What do you like'

<br>
<br>

# 불용어 제거

In [13]:
# Fetch the NLTK stopwords corpus that the following cells read from
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Administrator\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [14]:
from nltk.corpus import stopwords

# Languages for which a stopword list is available
print(stopwords.fileids())


['arabic', 'azerbaijani', 'danish', 'dutch', 'english', 'finnish', 'french', 'german', 'greek', 'hungarian', 'indonesian', 'italian', 'kazakh', 'nepali', 'norwegian', 'portuguese', 'romanian', 'russian', 'slovene', 'spanish', 'swedish', 'tajik', 'turkish']


In [15]:
# 영어 불용어
english_stopwords = set(stopwords.words('english'))
english_stopwords


{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

In [16]:
# 프랑스어 불용어
french_stopwords = set(stopwords.words('french'))
french_stopwords


{'ai',
 'aie',
 'aient',
 'aies',
 'ait',
 'as',
 'au',
 'aura',
 'aurai',
 'auraient',
 'aurais',
 'aurait',
 'auras',
 'aurez',
 'auriez',
 'aurions',
 'aurons',
 'auront',
 'aux',
 'avaient',
 'avais',
 'avait',
 'avec',
 'avez',
 'aviez',
 'avions',
 'avons',
 'ayant',
 'ayante',
 'ayantes',
 'ayants',
 'ayez',
 'ayons',
 'c',
 'ce',
 'ces',
 'd',
 'dans',
 'de',
 'des',
 'du',
 'elle',
 'en',
 'es',
 'est',
 'et',
 'eu',
 'eue',
 'eues',
 'eurent',
 'eus',
 'eusse',
 'eussent',
 'eusses',
 'eussiez',
 'eussions',
 'eut',
 'eux',
 'eûmes',
 'eût',
 'eûtes',
 'furent',
 'fus',
 'fusse',
 'fussent',
 'fusses',
 'fussiez',
 'fussions',
 'fut',
 'fûmes',
 'fût',
 'fûtes',
 'il',
 'ils',
 'j',
 'je',
 'l',
 'la',
 'le',
 'les',
 'leur',
 'lui',
 'm',
 'ma',
 'mais',
 'me',
 'mes',
 'moi',
 'mon',
 'même',
 'n',
 'ne',
 'nos',
 'notre',
 'nous',
 'on',
 'ont',
 'ou',
 'par',
 'pas',
 'pour',
 'qu',
 'que',
 'qui',
 's',
 'sa',
 'se',
 'sera',
 'serai',
 'seraient',
 'serais',
 'serait',


In [17]:
# 아랍어 불용어
arabic_stopwords = set(stopwords.words('arabic'))
arabic_stopwords


{'آه',
 'آها',
 'آي',
 'أف',
 'أقل',
 'أكثر',
 'ألا',
 'أم',
 'أما',
 'أن',
 'أنا',
 'أنت',
 'أنتم',
 'أنتما',
 'أنتن',
 'أنى',
 'أو',
 'أولئك',
 'أولاء',
 'أوه',
 'أي',
 'أين',
 'أينما',
 'أيها',
 'إذ',
 'إذا',
 'إذما',
 'إذن',
 'إلا',
 'إلى',
 'إليك',
 'إليكم',
 'إليكما',
 'إليكن',
 'إما',
 'إن',
 'إنا',
 'إنما',
 'إنه',
 'إي',
 'إيه',
 'التي',
 'الذي',
 'الذين',
 'اللائي',
 'اللاتي',
 'اللتان',
 'اللتيا',
 'اللتين',
 'اللذان',
 'اللذين',
 'اللواتي',
 'بخ',
 'بس',
 'بعد',
 'بعض',
 'بك',
 'بكم',
 'بكما',
 'بكن',
 'بل',
 'بلى',
 'بما',
 'بماذا',
 'بمن',
 'بنا',
 'به',
 'بها',
 'بهم',
 'بهما',
 'بهن',
 'بي',
 'بيد',
 'بين',
 'تلك',
 'تلكم',
 'تلكما',
 'ته',
 'تي',
 'تين',
 'تينك',
 'ثم',
 'ثمة',
 'حاشا',
 'حبذا',
 'حتى',
 'حيث',
 'حيثما',
 'حين',
 'خلا',
 'دون',
 'ذا',
 'ذات',
 'ذاك',
 'ذان',
 'ذانك',
 'ذلك',
 'ذلكم',
 'ذلكما',
 'ذلكن',
 'ذه',
 'ذو',
 'ذوا',
 'ذواتا',
 'ذواتي',
 'ذي',
 'ذين',
 'ذينك',
 'ريث',
 'سوف',
 'سوى',
 'شتان',
 'عدا',
 'عسى',
 'عل',
 'على',
 'عليك',
 'عليه',
 'عم

In [18]:
# Preprocessing pipeline: lowercase -> strip special characters -> tokenize -> drop stopwords

# Lowercase the raw sentence
sentence = "I am a boy and you are a girl.".lower()
print(sentence)

# Strip special characters
sentence = re.sub(r"[.,;!?^]", "", sentence)
print(sentence)

# Tokenize into words
words = word_tokenize(sentence)
print(words)

# Keep only the tokens that are not in the English stopword set
words = [token for token in words if token not in english_stopwords]
print(words)


i am a boy and you are a girl.
i am a boy and you are a girl
['i', 'am', 'a', 'boy', 'and', 'you', 'are', 'a', 'girl']
['boy', 'girl']


<br>
<br>

# 토큰을 인덱스로 변환

In [19]:
def text_to_indexs(sentences, cutoff_for_rare_words=1):
    """Replace every token of a tokenized corpus with a frequency-ranked index.

    Tokens are counted across all sentences and ranked by descending count:
    the most frequent token gets index 0, the next one 1, and so on. Tokens
    with equal counts are ranked in the order they are first encountered.

    Args:
        sentences: A list of tokenized sentences, each a sequence of hashable
            tokens, e.g. ``[['hello', 'world'], ['hello']]``. A corpus with a
            single sentence or with no sentences at all is accepted.
        cutoff_for_rare_words: Kept for interface compatibility; the current
            implementation does not use it.

    Returns:
        A tuple ``(text_indexs, word_dict)`` where ``text_indexs`` mirrors the
        shape of ``sentences`` with each token replaced by its index, and
        ``word_dict`` maps each distinct token to its index.

    Raises:
        TypeError: If a token is not hashable.
    """
    # Flatten unconditionally so that a corpus holding exactly one sentence
    # is counted token by token, just like any larger corpus.
    token_counts = Counter(token for sentence in sentences for token in sentence)

    # most_common() orders by descending count and keeps first-seen order for
    # ties, which makes the assigned indices deterministic.
    word_dict = {
        token: index
        for index, (token, _count) in enumerate(token_counts.most_common())
    }

    # Translate each sentence token by token.
    text_indexs = [[word_dict[token] for token in sentence] for sentence in sentences]

    return (text_indexs, word_dict)

In [20]:
# Sample corpus: a list of already-tokenized sentences.
# 'Michael' and 'michael' differ in case, so they count as distinct tokens.
text = [['hello', 'world', 'Michael'],
         ['hello', 'universe'],
         ['michael', 'makes', 'a', 'good', 'cup', 'of', 'tea'],
         ['tea', 'is', 'nice'],
         ['michael', 'is', 'nice']]

# Convert the text tokens to indices
text_indexs, word_dict = text_to_indexs(text)

# Print the indices
print (text_indexs)


[[0, 5, 6], [0, 7], [1, 8, 9, 10, 11, 12, 2], [2, 3, 4], [1, 3, 4]]


In [21]:
# Token-to-index dictionary returned by text_to_indexs in the previous cell
word_dict


{'hello': 0,
 'michael': 1,
 'tea': 2,
 'is': 3,
 'nice': 4,
 'world': 5,
 'Michael': 6,
 'universe': 7,
 'makes': 8,
 'a': 9,
 'good': 10,
 'cup': 11,
 'of': 12}