## 6.1 텍스트 정제하기

In [2]:
text_data=[" Interrobang. By Aishwarya Henriette ",
          "Parking And Going. By Karl Gautier",
          " Today Is The night. By Jarek Prakash "]

In [4]:
# Trim leading and trailing whitespace from every raw title.
strip_whitespace = list(map(str.strip, text_data))

In [5]:
strip_whitespace

['Interrobang. By Aishwarya Henriette',
 'Parking And Going. By Karl Gautier',
 'Today Is The night. By Jarek Prakash']

In [6]:
# Drop every "." from each whitespace-stripped title.
remove_periods = [title.replace(".", "") for title in strip_whitespace]


In [7]:
remove_periods

['Interrobang By Aishwarya Henriette',
 'Parking And Going By Karl Gautier',
 'Today Is The night By Jarek Prakash']

In [10]:
def capitalizer(string: str) -> str:
    """Return ``string`` converted to upper case.

    Args:
        string: The text to convert.

    Returns:
        The result of ``string.upper()``.
    """
    uppercased = string.upper()
    return uppercased

In [11]:
# Apply capitalizer to every cleaned title and display the resulting list.
list(map(capitalizer, remove_periods))

['INTERROBANG BY AISHWARYA HENRIETTE',
 'PARKING AND GOING BY KARL GAUTIER',
 'TODAY IS THE NIGHT BY JAREK PRAKASH']

In [12]:
import re

In [13]:
def replace_letters_with_X(string: str) -> str:
    """Mask every ASCII letter (a-z, A-Z) in ``string`` with "X".

    Characters outside that range (digits, spaces, punctuation, non-ASCII
    letters) are left unchanged.

    Args:
        string: The text to mask.

    Returns:
        A copy of ``string`` with each matched letter replaced by "X".
    """
    ascii_letter = re.compile(r"[a-zA-Z]")
    return ascii_letter.sub("X", string)

In [14]:
# Mask the letters of every cleaned title and display the resulting list.
list(map(replace_letters_with_X, remove_periods))

['XXXXXXXXXXX XX XXXXXXXXX XXXXXXXXX',
 'XXXXXXX XXX XXXXX XX XXXX XXXXXXX',
 'XXXXX XX XXX XXXXX XX XXXXX XXXXXXX']

## 6.2 HTML 파싱과 정제하기

In [15]:
from bs4 import BeautifulSoup

In [22]:
# Sample markup for the parsing demo: a single <div class='full_name'> that
# wraps a bold <span> followed by plain text. Nothing follows the closing
# </div> on its line, so the snippet is well-formed HTML.
html = """
    <div class='full_name'><span style = 'font-weight:bold'>Masego</span> Azra</div>
    """

In [23]:
soup = BeautifulSoup(html, "lxml")

In [24]:
# Locate the first <div> whose class is "full_name" and show its text content.
soup.find("div", class_="full_name").text

'Masego Azra'

## 6.3 구두점 삭제하기

In [25]:
import unicodedata
import sys

In [26]:
text_data = ['Hi!!!! I. Love. This. Song....',
            '10000% Agree!!!! #LoveIT',
            'Right?!?!']

In [30]:
# Translation table for str.translate: every code point whose Unicode general
# category starts with "P" (punctuation) is mapped to None, which deletes it.
# range() excludes its stop value, so "+ 1" is required for the scan to reach
# sys.maxunicode itself.
punctuation = dict.fromkeys(
    code_point
    for code_point in range(sys.maxunicode + 1)
    if unicodedata.category(chr(code_point)).startswith('P')
)

In [31]:
# Delete every punctuation character from each sample text and display the result.
[text.translate(punctuation) for text in text_data]

['Hi I Love This Song', '10000 Agree LoveIT', 'Right']

## 6.4 텍스트 토큰화하기

In [32]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\MyCom\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [33]:
from nltk.tokenize import word_tokenize

In [34]:
string = "The science of today is the technology of tomorrow"

In [36]:
word_tokenize(string)

['The', 'science', 'of', 'today', 'is', 'the', 'technology', 'of', 'tomorrow']

In [38]:
from nltk.tokenize import sent_tokenize

In [40]:
string = "The science of today is the technology of tomorrow. Tomorrow is today."

In [41]:
sent_tokenize(string)

['The science of today is the technology of tomorrow.', 'Tomorrow is today.']

## 6.5 불용어 삭제하기

In [43]:
# Fetch NLTK's "stopwords" package (the download log is shown below), then
# import the corpus reader that exposes it.
nltk.download('stopwords')
from nltk.corpus import stopwords
# Pre-tokenized, lower-case sample sentence used for the stop-word filtering demo.
tokenized_words = ['i','am', 'going', 'to', 'go', 'to', 'the', 'store', 'and', 'park']

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\MyCom\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [44]:
stop_words = stopwords.words('english')

In [45]:
# Build a set once so each membership test is a hash lookup instead of a
# linear scan through the stop-word list.
stop_word_set = set(stop_words)
# Keep only the tokens that are not stop words, preserving their original order.
[word for word in tokenized_words if word not in stop_word_set]

['going', 'go', 'store', 'park']

In [46]:
stop_words[:5]

['i', 'me', 'my', 'myself', 'we']

In [47]:
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

In [49]:
len(ENGLISH_STOP_WORDS), len(stop_words)

(318, 179)

In [51]:
# Preview five entries of scikit-learn's English stop-word collection.
# NOTE(review): which five appear depends on the collection's iteration order;
# if ENGLISH_STOP_WORDS is an unordered set the output can differ between
# runs — confirm, and consider sorting for a reproducible preview.
list(ENGLISH_STOP_WORDS)[:5]

['also', 'since', 'here', 'nevertheless', 'besides']

## 6.6 어간 추출하기

In [52]:
from nltk.stem.porter import PorterStemmer
# Sample tokens to stem.
# NOTE(review): 'tradtional' and 'metting' look like misspellings of
# 'traditional' and 'meeting'. The stems displayed further down were produced
# from these exact spellings, so correcting them will change that output.
tokenized_words = ['i', 'am', 'humbled', 'by', 'this', 'tradtional', 'metting']

In [53]:
porter = PorterStemmer()

In [54]:
[porter.stem(word) for word in tokenized_words]

['i', 'am', 'humbl', 'by', 'thi', 'tradtion', 'met']

## 6.7 품사 태깅하기

In [1]:
import nltk
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\MyCom\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


True

In [3]:
from nltk import pos_tag
from nltk import word_tokenize

In [4]:
text_data = "Chris loved outdoor running"

In [5]:
text_tagged = pos_tag(word_tokenize(text_data))

In [6]:
text_tagged

[('Chris', 'NNP'), ('loved', 'VBD'), ('outdoor', 'RP'), ('running', 'VBG')]

In [7]:
# Keep only the tokens whose tag is one of the four noun tags listed here.
[token for token, pos in text_tagged if pos in ('NN', 'NNS', 'NNP', 'NNPS')]

['Chris']

In [8]:
from sklearn.preprocessing import MultiLabelBinarizer

In [9]:
tweets = ["I am eating a burrito for breakfast",
         "Political science is an amazing field",
         "San Francisco is an awesome city"]

In [10]:
tagged_tweets = []

In [12]:
# Build the per-tweet tag lists in a single assignment so that re-running this
# cell always yields exactly one entry per tweet. Appending to a list created
# in another cell would add duplicate entries on every re-run.
tagged_tweets = [
    # Keep only the tag from each (word, tag) pair.
    [tag for _, tag in nltk.pos_tag(word_tokenize(tweet))]
    for tweet in tweets
]

In [13]:
one_hot_multi = MultiLabelBinarizer()

In [17]:
one_hot_multi.fit_transform(tagged_tweets)

array([[1, 1, 0, 1, 0, 1, 1, 1, 0],
       [1, 0, 1, 1, 0, 0, 0, 0, 1],
       [1, 0, 1, 1, 1, 0, 0, 0, 1]])

In [19]:
one_hot_multi.classes_

array(['DT', 'IN', 'JJ', 'NN', 'NNP', 'PRP', 'VBG', 'VBP', 'VBZ'],
      dtype=object)

In [20]:
import nltk
nltk.download('brown')

[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\MyCom\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\brown.zip.


True

In [23]:
from nltk.corpus import brown
from nltk.tag import UnigramTagger
from nltk.tag import BigramTagger
from nltk.tag import TrigramTagger

In [25]:
sentences = brown.tagged_sents(categories='news')

In [26]:
train = sentences[:4000]

In [27]:
test=sentences[4000:]

In [28]:
# Chain three n-gram taggers, each built from `train`: the trigram tagger is
# given the bigram tagger as its backoff, and the bigram tagger is given the
# unigram tagger as its backoff.
unigram = UnigramTagger(train)
bigram = BigramTagger(train, backoff=unigram)
trigram = TrigramTagger(train, backoff=bigram)

In [29]:
# Score the trigram tagger on `test`, the sentences that were not in `train`.
# NOTE(review): newer NLTK releases reportedly deprecate `evaluate` in favour
# of `accuracy` — confirm against the installed NLTK version.
trigram.evaluate(test)

0.8174734002697437

In [30]:
pip install konlpy

Collecting konlpy
  Downloading konlpy-0.5.2-py2.py3-none-any.whl (19.4 MB)
Collecting JPype1>=0.7.0
  Downloading JPype1-1.0.2-cp37-cp37m-win_amd64.whl (1.6 MB)
Collecting tweepy>=3.7.0
  Downloading tweepy-3.9.0-py2.py3-none-any.whl (30 kB)
Collecting beautifulsoup4==4.6.0
  Downloading beautifulsoup4-4.6.0-py3-none-any.whl (86 kB)
Installing collected packages: JPype1, tweepy, beautifulsoup4, konlpy
  Attempting uninstall: beautifulsoup4
    Found existing installation: beautifulsoup4 4.8.2
    Uninstalling beautifulsoup4-4.8.2:
      Successfully uninstalled beautifulsoup4-4.8.2
Successfully installed JPype1-1.0.2 beautifulsoup4-4.6.0 konlpy-0.5.2 tweepy-3.9.0
Note: you may need to restart the kernel to use updated packages.


In [1]:
from konlpy.tag import Okt

In [2]:
# Raw string so the backslashes in this Windows path are taken literally
# rather than being parsed as (invalid) escape sequences.
# NOTE(review): nothing visible in this notebook reads `java_home`, and
# "ProgramFiles" has no space — confirm the intended path and whether it is
# meant to be exported as the JAVA_HOME environment variable.
java_home = r"C:\ProgramFiles\Java\jdk-14.0.1"

In [3]:
okt=Okt()

In [4]:
text = '태양계는 지금으로부터 약 46억 년 전, 거대한 분자 구름의 일부분이 중력 붕괴를 일으키면서 형성되었다'

In [5]:
okt.pos(text)

[('태양계', 'Noun'),
 ('는', 'Josa'),
 ('지금', 'Noun'),
 ('으로부터', 'Josa'),
 ('약', 'Noun'),
 ('46억', 'Number'),
 ('년', 'Noun'),
 ('전', 'Noun'),
 (',', 'Punctuation'),
 ('거대한', 'Adjective'),
 ('분자', 'Noun'),
 ('구름', 'Noun'),
 ('의', 'Josa'),
 ('일부분', 'Noun'),
 ('이', 'Josa'),
 ('중력', 'Noun'),
 ('붕괴', 'Noun'),
 ('를', 'Josa'),
 ('일으키면서', 'Verb'),
 ('형성', 'Noun'),
 ('되었다', 'Verb')]

In [6]:
okt.morphs(text)

['태양계',
 '는',
 '지금',
 '으로부터',
 '약',
 '46억',
 '년',
 '전',
 ',',
 '거대한',
 '분자',
 '구름',
 '의',
 '일부분',
 '이',
 '중력',
 '붕괴',
 '를',
 '일으키면서',
 '형성',
 '되었다']

In [7]:
okt.nouns(text)

['태양계', '지금', '약', '년', '전', '분자', '구름', '일부분', '중력', '붕괴', '형성']

## 6.8 텍스트를 BoW로 인코딩하기

In [6]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

In [7]:
text_data = np.array(['I love Brazil. Brazil!',
                     'Sweden is best',
                     'Germany beats both'])

In [8]:
count = CountVectorizer()
bag_of_words = count.fit_transform(text_data)

In [9]:
bag_of_words

<3x8 sparse matrix of type '<class 'numpy.int64'>'
	with 8 stored elements in Compressed Sparse Row format>

In [10]:
bag_of_words.toarray()

array([[0, 0, 0, 2, 0, 0, 1, 0],
       [0, 1, 0, 0, 0, 1, 0, 1],
       [1, 0, 1, 0, 1, 0, 0, 0]], dtype=int64)

In [11]:
# List the vocabulary terms learned by the vectorizer; the displayed order
# lines up with the columns of the count matrix shown above.
# NOTE(review): newer scikit-learn releases reportedly replace this method
# with `get_feature_names_out()` — confirm against the installed version.
count.get_feature_names()

['beats', 'best', 'both', 'brazil', 'germany', 'is', 'love', 'sweden']

In [14]:
# Vectorizer configured for 1- and 2-grams with English stop words removed,
# but restricted to an explicit vocabulary: only the term 'brazil' is counted,
# so the resulting matrix has a single column.
count_2gram = CountVectorizer(ngram_range=(1,2), stop_words="english", vocabulary=['brazil'])

In [15]:
bag=count_2gram.fit_transform(text_data)

In [16]:
bag.toarray()

array([[2],
       [0],
       [0]], dtype=int64)

In [18]:
count_2gram.vocabulary_

{'brazil': 0}

## 6.9 단어 중요도에 가중치 부여하기

In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [20]:
text_data = np.array(['I love Brazil. Brazil!',
                     'Sweden is best',
                     'Germany beats both'])

In [21]:
tfidf = TfidfVectorizer()
feature_matrix = tfidf.fit_transform(text_data)

In [23]:
feature_matrix

<3x8 sparse matrix of type '<class 'numpy.float64'>'
	with 8 stored elements in Compressed Sparse Row format>

In [24]:
feature_matrix.toarray()

array([[0.        , 0.        , 0.        , 0.89442719, 0.        ,
        0.        , 0.4472136 , 0.        ],
       [0.        , 0.57735027, 0.        , 0.        , 0.        ,
        0.57735027, 0.        , 0.57735027],
       [0.57735027, 0.        , 0.57735027, 0.        , 0.57735027,
        0.        , 0.        , 0.        ]])

In [25]:
tfidf.vocabulary_

{'love': 6,
 'brazil': 3,
 'sweden': 7,
 'is': 5,
 'best': 1,
 'germany': 4,
 'beats': 0,
 'both': 2}