## 텍스트 전처리(영어)

In [1]:
import nltk

# Data packages needed by the cells below: the Punkt tokenizer models (used by
# nltk.word_tokenize), WordNet (used by WordNetLemmatizer) and the stop-word lists.
# 'punkt_tab' is the tokenizer data that recent NLTK releases look up instead of
# the pickled 'punkt' models; fetching both keeps word_tokenize working on old
# and new NLTK versions.
# NOTE(review): confirm against the NLTK version installed in the course environment.
for resource in ('punkt', 'punkt_tab', 'wordnet', 'stopwords'):
    nltk.download(resource)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [2]:
text = "Text messaging, or texting, is the act of composing and sending brief, electronic messages between two or more mobile phones, or fixed or portable devices over a phone network."
print(text)

Text messaging, or texting, is the act of composing and sending brief, electronic messages between two or more mobile phones, or fixed or portable devices over a phone network.


* 정규화(대소문자 통합)

In [3]:
# Case normalization: lowercase the sentence so that e.g. "Text" and "text"
# end up as the same token.
text = str.lower(text)
print(text)

text messaging, or texting, is the act of composing and sending brief, electronic messages between two or more mobile phones, or fixed or portable devices over a phone network.


* 토큰화

In [5]:
# Tokenization: split the lower-cased sentence into word and punctuation tokens.
tokens = nltk.tokenize.word_tokenize(text)
print(tokens)

['text', 'messaging', ',', 'or', 'texting', ',', 'is', 'the', 'act', 'of', 'composing', 'and', 'sending', 'brief', ',', 'electronic', 'messages', 'between', 'two', 'or', 'more', 'mobile', 'phones', ',', 'or', 'fixed', 'or', 'portable', 'devices', 'over', 'a', 'phone', 'network', '.']


* 표제어 추출

In [6]:
# Lemmatization: map every token to its WordNet lemma (lemmatize is called
# without a part-of-speech argument), e.g. 'messages' -> 'message'.
lemmatizer = nltk.stem.WordNetLemmatizer()
tokens = list(map(lemmatizer.lemmatize, tokens))
print(tokens)

['text', 'messaging', ',', 'or', 'texting', ',', 'is', 'the', 'act', 'of', 'composing', 'and', 'sending', 'brief', ',', 'electronic', 'message', 'between', 'two', 'or', 'more', 'mobile', 'phone', ',', 'or', 'fixed', 'or', 'portable', 'device', 'over', 'a', 'phone', 'network', '.']


* 불용어 제거
  * is, and, are, don't

In [8]:
from nltk.corpus import stopwords

# NLTK's English stop-word list; reused by later cells.
stopwords_en = stopwords.words('english')
# Keep only the tokens that are not stop words.
tokens = list(filter(lambda tok: tok not in stopwords_en, tokens))
tokens

['text',
 'messaging',
 ',',
 'texting',
 ',',
 'act',
 'composing',
 'sending',
 'brief',
 ',',
 'electronic',
 'message',
 'two',
 'mobile',
 'phone',
 ',',
 'fixed',
 'portable',
 'device',
 'phone',
 'network',
 '.']

* 숫자, 구두점, 특수 문자 등 제거

In [9]:
import re

# Keep only tokens consisting entirely of lowercase letters, underscores and
# hyphens; punctuation marks and tokens containing digits are dropped.
tokens = list(filter(re.compile(r'[a-z_-]+').fullmatch, tokens))
print(tokens)

['text', 'messaging', 'texting', 'act', 'composing', 'sending', 'brief', 'electronic', 'message', 'two', 'mobile', 'phone', 'fixed', 'portable', 'device', 'phone', 'network']


In [10]:
# Second sample sentence, run through the same preprocessing steps as above.
txt = (
    "Big data is a field that treats ways to analyze, systematically extract "
    "information from, or otherwise deal with data sets that are too large or "
    "complex to be dealt with by traditional data-processing application software."
)
print(txt)

Big data is a field that treats ways to analyze, systematically extract information from, or otherwise deal with data sets that are too large or complex to be dealt with by traditional data-processing application software.


In [11]:
# Case normalization of the second sample.
txt = str.lower(txt)
print(txt)

big data is a field that treats ways to analyze, systematically extract information from, or otherwise deal with data sets that are too large or complex to be dealt with by traditional data-processing application software.


In [12]:
# Tokenize the second sample; hyphenated words such as 'data-processing'
# come back as a single token.
tk = nltk.tokenize.word_tokenize(txt)
print(tk)

['big', 'data', 'is', 'a', 'field', 'that', 'treats', 'ways', 'to', 'analyze', ',', 'systematically', 'extract', 'information', 'from', ',', 'or', 'otherwise', 'deal', 'with', 'data', 'sets', 'that', 'are', 'too', 'large', 'or', 'complex', 'to', 'be', 'dealt', 'with', 'by', 'traditional', 'data-processing', 'application', 'software', '.']


In [13]:
# Lemmatize every token of the second sample (no part-of-speech argument),
# e.g. 'treats' -> 'treat', 'ways' -> 'way'.
lemmatizer = nltk.stem.WordNetLemmatizer()
tk = list(map(lemmatizer.lemmatize, tk))
print(tk)

['big', 'data', 'is', 'a', 'field', 'that', 'treat', 'way', 'to', 'analyze', ',', 'systematically', 'extract', 'information', 'from', ',', 'or', 'otherwise', 'deal', 'with', 'data', 'set', 'that', 'are', 'too', 'large', 'or', 'complex', 'to', 'be', 'dealt', 'with', 'by', 'traditional', 'data-processing', 'application', 'software', '.']


In [14]:
# Drop English stop words, using the stopwords_en list built in an earlier cell.
tk = list(filter(lambda tok: tok not in stopwords_en, tk))
tk

['big',
 'data',
 'field',
 'treat',
 'way',
 'analyze',
 ',',
 'systematically',
 'extract',
 'information',
 ',',
 'otherwise',
 'deal',
 'data',
 'set',
 'large',
 'complex',
 'dealt',
 'traditional',
 'data-processing',
 'application',
 'software',
 '.']

In [17]:
# Keep only tokens consisting entirely of lowercase letters, underscores and
# hyphens (so 'data-processing' stays, ',' and '.' go).
tk = list(filter(re.compile(r'[a-z_-]+').fullmatch, tk))
print(tk)

['big', 'data', 'field', 'treat', 'way', 'analyze', 'systematically', 'extract', 'information', 'otherwise', 'deal', 'data', 'set', 'large', 'complex', 'dealt', 'traditional', 'data-processing', 'application', 'software']


* 해리포터 텍스트 파일 불러오기
  * 과목 홈페이지에서 파일 다운받아서 작업 경로 안에 두기
  * 좌측 패널에서 파일 찾은 후 경로 복사



In [19]:
# Colab-only step: mount Google Drive at /content/gdrive so that files stored
# there can be opened by path in the next cell.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [20]:
# Read the whole Harry Potter text file from the mounted Drive into `text`.
# The `with` block closes the file handle as soon as the read finishes (or
# fails), so no explicit close() call is needed.
# NOTE(review): the file is assumed to be UTF-8 (the default text encoding on
# Colab); confirm if it was saved with a different encoding.
with open('/content/gdrive/MyDrive/Colab Notebooks/BigData/harrypotter.txt', mode='r', encoding='utf-8') as r:
    text = r.read()
print(text)

Harry Potter is a series of seven fantasy novels written by British author, J. K. Rowling. The novels chronicle the lives of a young wizard, Harry Potter, and his friends Hermione Granger and Ron Weasley, all of whom are students at Hogwarts School of Witchcraft and Wizardry. The main story arc concerns Harry's struggle against Lord Voldemort, a dark wizard who intends to become immortal, overthrow the wizard governing body known as the Ministry of Magic and subjugate all wizards and Muggles (non-magical people).
Since the release of the first novel, Harry Potter and the Philosopher's Stone, on 26 June 1997, the books have found immense popularity, positive reviews, and commercial success worldwide. They have attracted a wide adult audience as well as younger readers and are often considered cornerstones of modern young adult literature. As of February 2018, the books have sold more than 500 million copies worldwide, making them the best-selling book series in history, and have been tr

In [22]:
# Full preprocessing pipeline for the Harry Potter text:
# lowercase -> tokenize -> drop stop words -> lemmatize -> drop stop words
# again -> keep word-like tokens only.
text = text.lower()
tokens = nltk.word_tokenize(text)

# A set gives constant-time membership tests; stopwords_en itself is left unchanged.
stopword_set = set(stopwords_en)

# Stop words are removed BEFORE lemmatizing: without a part-of-speech hint the
# WordNet lemmatizer treats words like 'was' or 'has' as plural nouns and
# returns 'wa' / 'ha', which are no longer recognised as stop words.
tokens = [t for t in tokens if t not in stopword_set]

lemmatizer = nltk.stem.WordNetLemmatizer()
tokens = [lemmatizer.lemmatize(t) for t in tokens]

# Second stop-word pass catches tokens whose lemma is itself a stop word; the
# regex keeps only tokens made of lowercase letters, underscores and hyphens.
tokens = [t for t in tokens if t not in stopword_set and re.fullmatch(r'[a-z_-]+', t)]
print(tokens)

['harry', 'potter', 'series', 'seven', 'fantasy', 'novel', 'written', 'british', 'author', 'rowling', 'novel', 'chronicle', 'life', 'young', 'wizard', 'harry', 'potter', 'friend', 'hermione', 'granger', 'ron', 'weasley', 'student', 'hogwarts', 'school', 'witchcraft', 'wizardry', 'main', 'story', 'arc', 'concern', 'harry', 'struggle', 'lord', 'voldemort', 'dark', 'wizard', 'intends', 'become', 'immortal', 'overthrow', 'wizard', 'governing', 'body', 'known', 'ministry', 'magic', 'subjugate', 'wizard', 'muggles', 'non-magical', 'people', 'since', 'release', 'first', 'novel', 'harry', 'potter', 'philosopher', 'stone', 'june', 'book', 'found', 'immense', 'popularity', 'positive', 'review', 'commercial', 'success', 'worldwide', 'attracted', 'wide', 'adult', 'audience', 'well', 'younger', 'reader', 'often', 'considered', 'cornerstone', 'modern', 'young', 'adult', 'literature', 'february', 'book', 'sold', 'million', 'copy', 'worldwide', 'making', 'best-selling', 'book', 'series', 'history', 't