In [1]:
"""
표제어(Lemmatization) 추출:서로 형태는 다르지만, root 단어를
가지고 비교해서, 전체적으로 단어의 개수를 줄이자.
am, are, is, was, were.. => be(표제어)

형태소 : stem(어간, 단어의 의미), affix(접사,부가적 의미)
형태소 파싱 : 어간, 접사를 분리하는 작업
dog(독립형태소)
dogs = dog(어간) + s(접사)

WordNetLemmatizer :  NLTK에 표제어 추출 도구
"""
from nltk.stem import WordNetLemmatizer
wnl=WordNetLemmatizer()

In [2]:
#wnl.lemmatize('watched') #watched
wnl.lemmatize('watched', 'v')
wnl.lemmatize('has','v')
wnl.lemmatize('dies','v')

'die'

In [3]:
#어간 추출
text="Python is an interpreted, high-level, general-purpose programming language."
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

In [4]:
ps=PorterStemmer()
words=word_tokenize(text)
print(words)

['Python', 'is', 'an', 'interpreted', ',', 'high-level', ',', 'general-purpose', 'programming', 'language', '.']


In [5]:
print([ps.stem(w) for w in words])

['python', 'is', 'an', 'interpret', ',', 'high-level', ',', 'general-purpos', 'program', 'languag', '.']


In [6]:
print(ps.stem('electricical'))
print(ps.stem('formalize'))
#구글 : 마틴 포터 or 포터스태머 검색

electric
formal


In [7]:
ps.stem('going') #go
ps.stem('gone') #gone
from nltk.stem import LancasterStemmer

In [8]:
ls=LancasterStemmer()
ls.stem('going') #going
ls.stem('gone') #gon
ls.stem('dies')

'die'

In [9]:
#불용어:stopwords
from nltk.corpus import stopwords

In [10]:
sw=stopwords.words('english')
ex="Family is not an important thing. It's everything"
wt=word_tokenize(ex)
res=[]
for w in wt:
    if w not in sw:
        res.append(w)
print(wt)
print(res)
        

['Family', 'is', 'not', 'an', 'important', 'thing', '.', 'It', "'s", 'everything']
['Family', 'important', 'thing', '.', 'It', "'s", 'everything']


In [11]:
#www.ranks.nl/stopwords/korean
ex="""
최근 코로나19로 인한 감염으로 인해 
확진자 및 사망자가 증가하고 있습니다. 
코로나19를 이겨냅시다.
"""
stop_words="인한 증가 최근 및"
stop_words=stop_words.split(" ")
wt=word_tokenize(ex)
print(wt)
res=[]
for w in wt:
    if w not in stop_words:
        res.append(w)        
print(res)

['최근', '코로나19로', '인한', '감염으로', '인해', '확진자', '및', '사망자가', '증가하고', '있습니다', '.', '코로나19를', '이겨냅시다', '.']
['코로나19로', '감염으로', '인해', '확진자', '사망자가', '증가하고', '있습니다', '.', '코로나19를', '이겨냅시다', '.']


In [12]:
from nltk.tokenize import sent_tokenize
text="Python is an interpreted, high-level, general-purpose programming language. Created by Guido van Rossum and first released in 1991, Python's design philosophy emphasizes code readability with its notable use of significant whitespace. Its language constructs and object-oriented approach aim to help programmers write clear, logical code for small and large-scale projects."
text=sent_tokenize(text) #3개 문장
text

['Python is an interpreted, high-level, general-purpose programming language.',
 "Created by Guido van Rossum and first released in 1991, Python's design philosophy emphasizes code readability with its notable use of significant whitespace.",
 'Its language constructs and object-oriented approach aim to help programmers write clear, logical code for small and large-scale projects.']

In [13]:
#모든 단어를 소문자, 불용어 제거, 길이가 2이하 제거
#print(sw)

voc={}
sentences=[]
for t in text:
    words=word_tokenize(t)
    res=[]
    for word in words:
        word=word.lower()
        if word not in sw:
            if len(word)>2:
                res.append(word)
                if word not in voc:
                    voc[word]=0
                voc[word]+=1
    sentences.append(res)
print(sentences) #[[문장1][문장2],[문장3]]
#voc=={'python':3, ...}        

[['python', 'interpreted', 'high-level', 'general-purpose', 'programming', 'language'], ['created', 'guido', 'van', 'rossum', 'first', 'released', '1991', 'python', 'design', 'philosophy', 'emphasizes', 'code', 'readability', 'notable', 'use', 'significant', 'whitespace'], ['language', 'constructs', 'object-oriented', 'approach', 'aim', 'help', 'programmers', 'write', 'clear', 'logical', 'code', 'small', 'large-scale', 'projects']]


In [14]:
vs=sorted(voc.items(), key=lambda x:x[1], reverse=True)
vs

[('python', 2),
 ('language', 2),
 ('code', 2),
 ('interpreted', 1),
 ('high-level', 1),
 ('general-purpose', 1),
 ('programming', 1),
 ('created', 1),
 ('guido', 1),
 ('van', 1),
 ('rossum', 1),
 ('first', 1),
 ('released', 1),
 ('1991', 1),
 ('design', 1),
 ('philosophy', 1),
 ('emphasizes', 1),
 ('readability', 1),
 ('notable', 1),
 ('use', 1),
 ('significant', 1),
 ('whitespace', 1),
 ('constructs', 1),
 ('object-oriented', 1),
 ('approach', 1),
 ('aim', 1),
 ('help', 1),
 ('programmers', 1),
 ('write', 1),
 ('clear', 1),
 ('logical', 1),
 ('small', 1),
 ('large-scale', 1),
 ('projects', 1)]

In [15]:
wi={}
i=0
for w,f in vs:
    if f>1 :
        i+=1
        wi[w]=i #index 부여
print(wi)
        
        

{'python': 1, 'language': 2, 'code': 3}


In [16]:
a=wi.items()
for w,i in a:
    print(i)

1
2
3


In [17]:
vocSize=2 #가장 많이 언급된 2개 단어만 추출

#단어의 인덱스가 vocSize를 초과하는 단어 추출
wordFreq=[w for w,i in wi.items() if i>vocSize]
print(wordFreq)
for w in wordFreq:
    del wi[w]
#인덱스(index)가 3번 이상인 단어는 제거(1번, 2번만 추출)

['code']


In [18]:
wi

{'python': 1, 'language': 2}

In [31]:
#OOV(out of voc, 단어집합에 없는 단어)
# 영수:철수야 안녕? (입력 데이터, x)
# 철수:응 너도 안녕.(출력 데이터, y)
# ...
# 철수야 안녕? -> 모델 -> 응 너도 안녕.

# 철수(1) 안녕(2)  ->     -> 응  너  안녕
# ...

# 철수 안녕 너 응 잘가 ...
#   1    2   3   4   5 ...

# 개체명인식

In [20]:
sentences

[['python',
  'interpreted',
  'high-level',
  'general-purpose',
  'programming',
  'language'],
 ['created',
  'guido',
  'van',
  'rossum',
  'first',
  'released',
  '1991',
  'python',
  'design',
  'philosophy',
  'emphasizes',
  'code',
  'readability',
  'notable',
  'use',
  'significant',
  'whitespace'],
 ['language',
  'constructs',
  'object-oriented',
  'approach',
  'aim',
  'help',
  'programmers',
  'write',
  'clear',
  'logical',
  'code',
  'small',
  'large-scale',
  'projects']]

In [21]:
#원핫인코딩
from konlpy.tag import Okt

In [22]:
okt=Okt()
tok=okt.morphs("나는 자연어 처리를 학습한다")
#원핫벡터:단어 집합을 벡터로 표현하는 방식

In [23]:
w2i={}
for v in tok:
    if v not in w2i.keys():
        w2i[v]=len(w2i)
print(w2i)

{'나': 0, '는': 1, '자연어': 2, '처리': 3, '를': 4, '학습': 5, '한다': 6}


In [24]:
#'자연어' -> 원핫 -> 0010000
def ohe(w, w2i):
    ohv=[0]*len(w2i)
    index=w2i[w]
    ohv[index]=1
    return ohv
print(ohe("자연어", w2i))

[0, 0, 1, 0, 0, 0, 0]


In [25]:
#케라스 원핫인코딩 : to_categorical()
text="데이터 분석은 판다스 최고야 판다스 곰이야"

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
tok=Tokenizer()
tok.fit_on_texts([text])
print(tok.word_index)
#단어집합(voc)



{'판다스': 1, '데이터': 2, '분석은': 3, '최고야': 4, '곰이야': 5}


In [26]:
sample="판다스 분석은 동물원에서 한다"
enc=tok.texts_to_sequences([sample])[0]

In [27]:
to_categorical(enc)

array([[0., 1., 0., 0.],
       [0., 0., 0., 1.]], dtype=float32)

In [28]:
#단어 분리(BPE) => 기계번역
#학습과정에서 사용되지 않은 단어가 테스트과정에서
#입력되면 -> OOV 문제 => 제대로 모델이 동작X

In [29]:
# run-length 기법  aaaabbbaaaaa => a4b3a5
# 허프만 트리(인코딩)를 이용한 압축
#a=> 101, b=10, c=1101...
#BPE 압축 알고리즘 => 단어 분리에 응용

In [30]:
#AAABDAAABAC
#BPE 압축
#연속적인 글자 쌍(2글자)을 구성했을때 
#가장 많이 등장
#1) AA가 가장 많이 등장 => 다른 글자로 치환
#=> 소문자 z로 치환
#zABDzABAC

#2) AB가 가장 많이 등장 => 다른 글자로 치환
#=> 소문자 y로 치환
#zyDzyAC

#3) zy가 가장 많이 등장 => 다른 글자로 치환
#=> 소문자 x로 치환
#xDxAC


In [None]:
#BPE : 단어 분리 알고리즘 => 글자 단위 -> 단어 집합

# ex)train data
# ={low:5, lower:2, newest:6, widest:3}
# 1)단어 집합(VOC)
# =>low, lower, newest,widest
# ex)lowest(테스트 과정) 입력 -> OOV 문제 
# 2)BPE 알고리즘(OOV 문제 해결)
#    lowest와 lower 같은 의미 단어로 

In [32]:
### 토픽 모델링 => LSA
from sklearn.datasets import fetch_20newsgroups

In [35]:
dataset=fetch_20newsgroups(shuffle=True,random_state=1,remove=("headers","footers","quotes"))

In [36]:
print(type(dataset))
documents=dataset.data
len(documents)# 11314건의 뉴스 기사

<class 'sklearn.utils.Bunch'>


11314

In [37]:
type(documents)
documents[0]

"Well i'm not sure about the story nad it did seem biased. What\nI disagree with is your statement that the U.S. Media is out to\nruin Israels reputation. That is rediculous. The U.S. media is\nthe most pro-israeli media in the world. Having lived in Europe\nI realize that incidences such as the one described in the\nletter have occured. The U.S. media as a whole seem to try to\nignore them. The U.S. is subsidizing Israels existance and the\nEuropeans are not (at least not to the same degree). So I think\nthat might be a reason they report more clearly on the\natrocities.\n\tWhat is a shame is that in Austria, daily reports of\nthe inhuman acts commited by Israeli soldiers and the blessing\nreceived from the Government makes some of the Holocaust guilt\ngo away. After all, look how the Jews are treating other races\nwhen they got power. It is unfortunate.\n"

In [38]:
dataset.target_names#뉴스 카테고리

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [39]:
documents[1]

"\n\n\n\n\n\n\nYeah, do you expect people to read the FAQ, etc. and actually accept hard\natheism?  No, you need a little leap of faith, Jimmy.  Your logic runs out\nof steam!\n\n\n\n\n\n\n\nJim,\n\nSorry I can't pity you, Jim.  And I'm sorry that you have these feelings of\ndenial about the faith you need to get by.  Oh well, just pretend that it will\nall end happily ever after anyway.  Maybe if you start a new newsgroup,\nalt.atheist.hard, you won't be bummin' so much?\n\n\n\n\n\n\nBye-Bye, Big Jim.  Don't forget your Flintstone's Chewables!  :) \n--\nBake Timmons, III"

In [None]:
# 토픽과 가장 관련성이 높은 단어를 10개씩 출력
# topic1:
# ~
# topic20

In [40]:
import pandas as pd
newsDf=pd.DataFrame({'document':documents})
newsDf

Unnamed: 0,document
0,Well i'm not sure about the story nad it did s...
1,"\n\n\n\n\n\n\nYeah, do you expect people to re..."
2,Although I realize that principle is not one o...
3,Notwithstanding all the legitimate fuss about ...
4,"Well, I will have to change the scoring on my ..."
...,...
11309,"Danny Rubenstein, an Israeli journalist, will ..."
11310,\n
11311,\nI agree. Home runs off Clemens are always m...
11312,I used HP DeskJet with Orange Micros Grappler ...


In [42]:
#특수문제 제거(영문자를 제외)
newsDf['clean_doc']=newsDf['document'].str.replace("[^a-zA-Z]"," ")

In [43]:
newsDf

Unnamed: 0,document,clean_doc
0,Well i'm not sure about the story nad it did s...,Well i m not sure about the story nad it did s...
1,"\n\n\n\n\n\n\nYeah, do you expect people to re...",Yeah do you expect people to read the ...
2,Although I realize that principle is not one o...,Although I realize that principle is not one o...
3,Notwithstanding all the legitimate fuss about ...,Notwithstanding all the legitimate fuss about ...
4,"Well, I will have to change the scoring on my ...",Well I will have to change the scoring on my ...
...,...,...
11309,"Danny Rubenstein, an Israeli journalist, will ...",Danny Rubenstein an Israeli journalist will ...
11310,\n,
11311,\nI agree. Home runs off Clemens are always m...,I agree Home runs off Clemens are always me...
11312,I used HP DeskJet with Orange Micros Grappler ...,I used HP DeskJet with Orange Micros Grappler ...


In [53]:
#3글자 이하 제거, 소문자 변환
newsDf['clean_doc']=newsDf['clean_doc'].apply(lambda x:' '.join([w for w in x.split() if len(w)>3]))

In [54]:
#소문자 변환
newsDf['clean_doc']=newsDf['clean_doc'].apply(lambda x:x.lower())

In [55]:
newsDf

Unnamed: 0,document,clean_doc
0,Well i'm not sure about the story nad it did s...,well sure about story seem biased what disagre...
1,"\n\n\n\n\n\n\nYeah, do you expect people to re...",yeah expect people read actually accept hard a...
2,Although I realize that principle is not one o...,although realize that principle your strongest...
3,Notwithstanding all the legitimate fuss about ...,notwithstanding legitimate fuss about this pro...
4,"Well, I will have to change the scoring on my ...",well will have change scoring playoff pool unf...
...,...,...
11309,"Danny Rubenstein, an Israeli journalist, will ...",danny rubenstein israeli journalist will speak...
11310,\n,
11311,\nI agree. Home runs off Clemens are always m...,agree home runs clemens always memorable kinda...
11312,I used HP DeskJet with Orange Micros Grappler ...,used deskjet with orange micros grappler syste...


In [61]:
#불용어 제거
sw=stopwords.words('english')
#토큰화
tokenizedDoc=newsDf['clean_doc'].apply(lambda x:x.split())

In [65]:
tokenizedDoc=tokenizedDoc.apply(lambda x:[item for item in x if item not in sw])

In [66]:
tokenizedDoc[1]

['yeah',
 'expect',
 'people',
 'read',
 'actually',
 'accept',
 'hard',
 'atheism',
 'need',
 'little',
 'leap',
 'faith',
 'jimmy',
 'logic',
 'runs',
 'steam',
 'sorry',
 'pity',
 'sorry',
 'feelings',
 'denial',
 'faith',
 'need',
 'well',
 'pretend',
 'happily',
 'ever',
 'anyway',
 'maybe',
 'start',
 'newsgroup',
 'atheist',
 'hard',
 'bummin',
 'much',
 'forget',
 'flintstone',
 'chewables',
 'bake',
 'timmons']

In [67]:
# TF-IDF 매트릭스 구성
# TFIDF는 토큰화가 안되어 있는 텍스트 데이터로 구성
# 토큰화 <-> 역토큰화(토큰화 취소)
newsDf['clean_doc'][1]

'yeah expect people read actually accept hard atheism need little leap faith jimmy your logic runs steam sorry pity sorry that have these feelings denial about faith need well just pretend that will happily ever after anyway maybe start newsgroup atheist hard bummin much forget your flintstone chewables bake timmons'

In [72]:
#역 토큰화
deTokenizedDoc=[]
for i in range(len(newsDf)):
    temp=' '.join(tokenizedDoc[i])
    deTokenizedDoc.append(temp)

In [73]:
newsDf['clean_doc'][1]

'yeah expect people read actually accept hard atheism need little leap faith jimmy your logic runs steam sorry pity sorry that have these feelings denial about faith need well just pretend that will happily ever after anyway maybe start newsgroup atheist hard bummin much forget your flintstone chewables bake timmons'

In [74]:
#tfidf 행렬 구성
from sklearn.feature_extraction.text import TfidfVectorizer
vector=TfidfVectorizer(stop_words='english',max_features=1000)#1000개 단어
res=vector.fit_transform(newsDf['clean_doc'])
res.shape

(11314, 1000)

In [75]:
#svd(full,truncated)
#특이값분해
#행렬=U*S*VT
#절단된 SVD -> 차원 축소
#

In [77]:
#토픽 숫자:n_components
from sklearn.decomposition import TruncatedSVD
svdModel=TruncatedSVD(n_components=20)

In [79]:
svdModel.fit(res)

TruncatedSVD(algorithm='randomized', n_components=20, n_iter=5,
             random_state=None, tol=0.0)

In [80]:
import numpy as np
np.shape(svdModel.components_)
#20개의 토픽과 1000개의 단어

(20, 1000)

In [81]:
terms=vector.get_feature_names()#1000개의 단어
terms

['ability',
 'able',
 'accept',
 'access',
 'according',
 'account',
 'action',
 'actions',
 'actual',
 'actually',
 'added',
 'addition',
 'additional',
 'address',
 'administration',
 'advance',
 'advice',
 'agencies',
 'agree',
 'algorithm',
 'allow',
 'allowed',
 'allows',
 'amendment',
 'america',
 'american',
 'americans',
 'analysis',
 'angeles',
 'anonymous',
 'answer',
 'answers',
 'anti',
 'anybody',
 'apparently',
 'appear',
 'appears',
 'apple',
 'application',
 'applications',
 'apply',
 'appreciate',
 'appreciated',
 'approach',
 'appropriate',
 'april',
 'arab',
 'archive',
 'area',
 'areas',
 'aren',
 'argument',
 'arguments',
 'armenia',
 'armenian',
 'armenians',
 'arms',
 'army',
 'article',
 'articles',
 'asked',
 'asking',
 'assume',
 'assuming',
 'atheism',
 'atheists',
 'attack',
 'attempt',
 'author',
 'authority',
 'available',
 'average',
 'avoid',
 'away',
 'background',
 'base',
 'baseball',
 'based',
 'basic',
 'basically',
 'basis',
 'begin',
 'beginning',

In [84]:
def getTopic(c, fName, n=10):
    for i, t in enumerate(c):
        print("토픽 %d:"% (i+1),[(fName[i],t[i].round(5)) for i in t.argsort()[:-n-1:-1]])
getTopic(svdModel.components_,terms)

토픽 1: [('just', 0.20887), ('like', 0.20469), ('know', 0.19349), ('people', 0.18318), ('think', 0.1697), ('does', 0.15336), ('good', 0.1438), ('time', 0.13656), ('thanks', 0.11063), ('make', 0.10461)]
토픽 2: [('thanks', 0.32759), ('windows', 0.28791), ('card', 0.18011), ('drive', 0.16867), ('mail', 0.15263), ('file', 0.14387), ('advance', 0.12512), ('files', 0.11334), ('software', 0.11272), ('does', 0.11075)]
토픽 3: [('game', 0.34024), ('team', 0.30231), ('year', 0.27044), ('games', 0.23506), ('drive', 0.17202), ('season', 0.17072), ('good', 0.15077), ('players', 0.14866), ('play', 0.14243), ('hockey', 0.12633)]
토픽 4: [('drive', 0.46219), ('scsi', 0.17163), ('disk', 0.14324), ('hard', 0.13746), ('problem', 0.12619), ('just', 0.11944), ('drives', 0.1194), ('card', 0.11135), ('controller', 0.08503), ('floppy', 0.08267)]
토픽 5: [('drive', 0.40148), ('know', 0.29064), ('thanks', 0.24761), ('does', 0.24387), ('just', 0.18098), ('scsi', 0.1523), ('drives', 0.10927), ('hard', 0.09927), ('controll

In [None]:
"""
low : 5
lower : 2
newest : 6
widest : 3
"""
# l,o,w,e,r,n,s,t,d

3
2
1


7