# 텍스트 전처리
---
- 패키지 설치
    - NLTK: pip install nltk
    - Konply : pip install konply

## [1] 토큰화(Tokenization)
---
- 문장/문서를 의미르 지닌 작은 단위로 나누는 것
- 나누어진 단어를 토큰(Token)이라 함
- 종류
    - 문장 토큰화
    - 단어 토큰화

In [22]:
from nltk.tokenize import sent_tokenize, word_tokenize
import nltk

In [30]:
# nltk Corpus(말뭉치) 데이터셋 다운받기
nltk.download('all', quiet=True)

True

In [65]:
raw_text1='''
In Korea, more than half of residents speak Korean?
GitHub Actions makes it easy to automate? 
all your software workflows!
'''
raw_text2='''
GitHub Actions makes it easy to automate all your software workflows 
'''

In [66]:
result1 = word_tokenize(raw_text1)
print(result1)

['In', 'Korea', ',', 'more', 'than', 'half', 'of', 'residents', 'speak', 'Korean', '?', 'GitHub', 'Actions', 'makes', 'it', 'easy', 'to', 'automate', '?', 'all', 'your', 'software', 'workflows', '!']


In [67]:
result2 = word_tokenize(raw_text2)
print(result2)

['GitHub', 'Actions', 'makes', 'it', 'easy', 'to', 'automate', 'all', 'your', 'software', 'workflows']


In [68]:
text_raw = [raw_text1,raw_text2]

In [69]:
sent_result=sent_tokenize(raw_text1)

In [70]:
sent_result

['\nIn Korea, more than half of residents speak Korean?',
 'GitHub Actions makes it easy to automate?',
 'all your software workflows!']

In [71]:
len(sent_result)

3

## 여러 문장에 토큰 추출
---

In [72]:
# 문장 단위로 추출
for sent in text_raw:
    total_token=[]
    
    # 문장에서 추출한 토큰
    print(f'sent => {sent}')
    sentToken=word_tokenize(sent)
    print(sentToken,'-----',sep='\n')
    # 모든 문장의 토큰에 추가
    total_token.append(sentToken)
print('------')
print(total_token)

sent => 
In Korea, more than half of residents speak Korean?
GitHub Actions makes it easy to automate? 
all your software workflows!

['In', 'Korea', ',', 'more', 'than', 'half', 'of', 'residents', 'speak', 'Korean', '?', 'GitHub', 'Actions', 'makes', 'it', 'easy', 'to', 'automate', '?', 'all', 'your', 'software', 'workflows', '!']
-----
sent => 
GitHub Actions makes it easy to automate all your software workflows 

['GitHub', 'Actions', 'makes', 'it', 'easy', 'to', 'automate', 'all', 'your', 'software', 'workflows']
-----
------
[['GitHub', 'Actions', 'makes', 'it', 'easy', 'to', 'automate', 'all', 'your', 'software', 'workflows']]


## 한글
---

In [73]:
from konlpy.tag import Okt

# 형태소 분리 객체
okt = Okt()

In [74]:
result=okt.morphs("오늘은 월요일입니다")
print(result)

['오늘', '은', '월요일', '입니다']


In [75]:
# 형태소 분리 후 태깅(Tagging) => 품사
result2=okt.pos("오늘은 월요일입니다.", stem=True)
print(result2)

[('오늘', 'Noun'), ('은', 'Josa'), ('월요일', 'Noun'), ('이다', 'Adjective'), ('.', 'Punctuation')]


### [2] 정제 & 정규화
---
- 불영어 제거 => 노이즈 제거
- 텍스트의 동일화
    - 대문자 또는 소문자로 통일
    - 문장의 길이

### [2-1] 불용어(Stopword)

In [76]:
nltk.corpus.stopwords

<WordListCorpusReader in 'C:\\Users\\ss\\AppData\\Roaming\\nltk_data\\corpora\\stopwords'>

In [77]:
en_stopwords=nltk.corpus.stopwords.words('english')

In [78]:
len(en_stopwords)

179

In [79]:
en_stopwords[:10]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

### [2-2] 어간 및 표제어 처리
---

In [80]:
from nltk.stem import LancasterStemmer

In [81]:
lstem = LancasterStemmer()

In [82]:
lstem.stem('working'), lstem.stem('works'), lstem.stem('worked')

('work', 'work', 'work')

In [83]:
lstem.stem('amuse'), lstem.stem('amused')

('amus', 'amus')

In [84]:
lstem.stem('happy'), lstem.stem('happiness')

('happy', 'happy')

In [85]:
# 표제어(사전에 등록된 단어 추출)
from nltk.stem import WordNetLemmatizer

In [86]:
wlemma=WordNetLemmatizer()

In [87]:
wlemma.lemmatize('working','v'), wlemma.lemmatize('worked','v')

('work', 'work')

In [88]:
wlemma.lemmatize('amusing','v'), wlemma.lemmatize('amused','v')

('amuse', 'amuse')

### [3] 텍스트 벡터화
---
- 텍스트 => 수치화
- 희소벡터(OHE)BOW 방식 --> Count기반, TH-IDF 기반
- 밀집벡터 : Embedding 방식, Word2Vect

In [89]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [90]:
corpus=[raw_text1, raw_text2]
corpus

['\nIn Korea, more than half of residents speak Korean?\nGitHub Actions makes it easy to automate? \nall your software workflows!\n',
 '\nGitHub Actions makes it easy to automate all your software workflows \n']

In [91]:
ohe = CountVectorizer()

In [93]:
result = ohe.fit_transform(corpus)

In [94]:
print(result)

  (0, 6)	1
  (0, 8)	1
  (0, 11)	1
  (0, 16)	1
  (0, 5)	1
  (0, 12)	1
  (0, 13)	1
  (0, 15)	1
  (0, 9)	1
  (0, 4)	1
  (0, 0)	1
  (0, 10)	1
  (0, 7)	1
  (0, 3)	1
  (0, 17)	1
  (0, 2)	1
  (0, 1)	1
  (0, 19)	1
  (0, 14)	1
  (0, 18)	1
  (1, 4)	1
  (1, 0)	1
  (1, 10)	1
  (1, 7)	1
  (1, 3)	1
  (1, 17)	1
  (1, 2)	1
  (1, 1)	1
  (1, 19)	1
  (1, 14)	1
  (1, 18)	1


In [95]:
result=result.toarray()

In [97]:
print(result.shape, result)

(2, 20) [[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
 [1 1 1 1 1 0 0 1 0 0 1 0 0 0 1 0 0 1 1 1]]


In [98]:
# TF-IDF 기반
tfidf = TfidfVectorizer()

In [100]:
tf_corpus=tfidf.fit_transform(corpus)

In [101]:
type(tf_corpus)

scipy.sparse.csr.csr_matrix

In [102]:
tf_corpus=tf_corpus.toarray()

In [103]:
tf_corpus

array([[0.18641024, 0.18641024, 0.18641024, 0.18641024, 0.18641024,
        0.26199309, 0.26199309, 0.18641024, 0.26199309, 0.26199309,
        0.18641024, 0.26199309, 0.26199309, 0.26199309, 0.18641024,
        0.26199309, 0.26199309, 0.18641024, 0.18641024, 0.18641024],
       [0.30151134, 0.30151134, 0.30151134, 0.30151134, 0.30151134,
        0.        , 0.        , 0.30151134, 0.        , 0.        ,
        0.30151134, 0.        , 0.        , 0.        , 0.30151134,
        0.        , 0.        , 0.30151134, 0.30151134, 0.30151134]])

In [104]:
text='''
sent='Wiki is in Ward is original description: The simplest online database that could possibly work.\
Wiki is a piece of server software that allows users to freely create and edit Web page content using 
any Web browser. Wiki supports hyperlinks and has a simple text syntax for creating new pages and crosslinks 
between internal pages on the fly.\ Wiki is unusual among group communication mechanisms in that 
it allows the organization of contributions to be edited in addition to the content itself.Like many simple concepts, 
"open editing" has some profound and subtle effects on Wiki usage. Allowing everyday users to create and edit any page 
in a Web site is exciting in that it encourages democratic use of the Web and promotes content composition 
by nontechnical users.'
'''

In [105]:
sen_text = sent_tokenize(text)

In [106]:
sen_text

["\nsent='Wiki is in Ward is original description: The simplest online database that could possibly work.Wiki is a piece of server software that allows users to freely create and edit Web page content using \nany Web browser.",
 'Wiki supports hyperlinks and has a simple text syntax for creating new pages and crosslinks \nbetween internal pages on the fly.\\ Wiki is unusual among group communication mechanisms in that \nit allows the organization of contributions to be edited in addition to the content itself.Like many simple concepts, \n"open editing" has some profound and subtle effects on Wiki usage.',
 "Allowing everyday users to create and edit any page \nin a Web site is exciting in that it encourages democratic use of the Web and promotes content composition \nby nontechnical users.'"]

In [108]:
len(sen_text)

3

In [115]:
en_stopwords=nltk.corpus.stopwords.words('english')

In [124]:
result=[]
for sent in sen_text:
    word = word_tokenize(sent)
    print(word)
    print('--------------')
    if word not in en_stopwords:
        result.append(word)
print(result, len(result))

["sent='Wiki", 'is', 'in', 'Ward', 'is', 'original', 'description', ':', 'The', 'simplest', 'online', 'database', 'that', 'could', 'possibly', 'work.Wiki', 'is', 'a', 'piece', 'of', 'server', 'software', 'that', 'allows', 'users', 'to', 'freely', 'create', 'and', 'edit', 'Web', 'page', 'content', 'using', 'any', 'Web', 'browser', '.']
--------------
['Wiki', 'supports', 'hyperlinks', 'and', 'has', 'a', 'simple', 'text', 'syntax', 'for', 'creating', 'new', 'pages', 'and', 'crosslinks', 'between', 'internal', 'pages', 'on', 'the', 'fly.\\', 'Wiki', 'is', 'unusual', 'among', 'group', 'communication', 'mechanisms', 'in', 'that', 'it', 'allows', 'the', 'organization', 'of', 'contributions', 'to', 'be', 'edited', 'in', 'addition', 'to', 'the', 'content', 'itself.Like', 'many', 'simple', 'concepts', ',', "''", 'open', 'editing', "''", 'has', 'some', 'profound', 'and', 'subtle', 'effects', 'on', 'Wiki', 'usage', '.']
--------------
['Allowing', 'everyday', 'users', 'to', 'create', 'and', 'edit

## Tokenizer 객체 생성
---