In [1]:
import warnings
# Install a blanket "ignore" filter: every warning raised after this cell runs is suppressed.
warnings.filterwarnings('ignore')

# 자연어 처리 기본1
자연어 처리 참고 도서 : https://wikidocs.net/173085

In [2]:
# Read the whole file into a single string; the context manager closes the handle afterwards.
with open('ml_datas/stevejobs.txt', 'r', encoding='utf8') as f:
    text = f.read()
# Preview the first 200 characters.
print(text[:200])

This is the text of the Commencement address by Steve Jobs, CEO of Apple Computer and of Pixar Animation Studios, delivered on June 12, 2005.

 
I am honored to be with you today at your commencement 


### 구두점이나 공백과 같은 특수 문자를 문자열에서 제거

In [6]:
import re

# Keep only spaces, ASCII letters, digits and periods. Every other run of
# characters (punctuation, apostrophes, newlines, ...) is deleted outright,
# not replaced by a space.
# The pattern is a raw string so that the backslash in `\.` reaches the regex
# engine as written; in a normal string literal `\.` is an invalid escape
# sequence that newer Python versions warn about.
compile_ = re.compile(r'[^ a-zA-Z0-9\.]+')

# Strip the unwanted characters, then lower-case the result.
text = compile_.sub('', text).lower()
# Preview the first 200 characters of the cleaned text.
print(text[:200])

this is the text of the commencement address by steve jobs ceo of apple computer and of pixar animation studios delivered on june 12 2005. i am honored to be with you today at your commencement from o


### Natural Language Toolkit(NLTK) 모듈을 가져오는 명령
- "punkt" 데이터 세트는 영어 문장을 토큰으로 분리하는 데 사용되는 데이터 세트
- 이 데이터 세트를 다운로드하면 NLTK 모듈의 "word_tokenize" 함수를 사용하여 문장을 토큰으로 분리할 수 있음

In [7]:
import nltk
# Fetch the "punkt" tokenizer data into the local nltk_data directory.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\khkim\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

### NLTK 라이브러리의 함수로, 문자열을 문장으로 분리
- nltk.sent_tokenize(text)

In [18]:
# Split the text into a list of sentences.
sentences = nltk.sent_tokenize(text)
# Show the first three sentences.
print(sentences[:3])

['this is the text of the commencement address by steve jobs ceo of apple computer and of pixar animation studios delivered on june 12 2005. i am honored to be with you today at your commencement from one of the finest universities in the world.', 'i never graduated from college.', 'truth be told this is the closest ive ever gotten to a college graduation.']


### 단어로 토큰화 하기
- nltk.word_tokenize(sentence)

In [19]:
# Tokenise every sentence into words: one token list per sentence.
word_token = [nltk.word_tokenize(sentence) for sentence in sentences]
# Show the token list of the first sentence only.
print(word_token[:1])

[['this', 'is', 'the', 'text', 'of', 'the', 'commencement', 'address', 'by', 'steve', 'jobs', 'ceo', 'of', 'apple', 'computer', 'and', 'of', 'pixar', 'animation', 'studios', 'delivered', 'on', 'june', '12', '2005.', 'i', 'am', 'honored', 'to', 'be', 'with', 'you', 'today', 'at', 'your', 'commencement', 'from', 'one', 'of', 'the', 'finest', 'universities', 'in', 'the', 'world', '.']]


In [7]:
# Plain split on single spaces; show the first ten pieces.
print(text.split(' ')[:10])

['this', 'is', 'the', 'text', 'of', 'the', 'commencement', 'address', 'by', 'steve']


### 토큰화 함수로 정의해서 실행하기

In [20]:
def tokenize_text(text):
    """Split ``text`` into sentences, then split each sentence into word tokens.

    Returns a list with one entry per sentence found by
    ``nltk.sent_tokenize``; each entry is the token list that
    ``nltk.word_tokenize`` produces for that sentence.
    """
    tokens_per_sentence = []
    for sentence in nltk.sent_tokenize(text):
        tokens_per_sentence.append(nltk.word_tokenize(sentence))
    return tokens_per_sentence


word_tokens = tokenize_text(text)
# Show the token lists of the first two sentences.
print(word_tokens[:2])

[['this', 'is', 'the', 'text', 'of', 'the', 'commencement', 'address', 'by', 'steve', 'jobs', 'ceo', 'of', 'apple', 'computer', 'and', 'of', 'pixar', 'animation', 'studios', 'delivered', 'on', 'june', '12', '2005.', 'i', 'am', 'honored', 'to', 'be', 'with', 'you', 'today', 'at', 'your', 'commencement', 'from', 'one', 'of', 'the', 'finest', 'universities', 'in', 'the', 'world', '.'], ['i', 'never', 'graduated', 'from', 'college', '.']]


### NLTK 라이브러리에서 불용어 다운로드

In [22]:
import nltk
# Fetch the stop-word lists read by nltk.corpus.stopwords.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\khkim\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [25]:
# English stop-word list as a plain Python list of strings.
stopwords = nltk.corpus.stopwords.words('english')
# Number of entries in the list.
len(stopwords)

179

In [26]:
# Peek at the first ten stop words.
stopwords[:10]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

### 불용어 제거하기

In [28]:
# Drop stop words from every tokenised sentence.
# Membership is tested against a set (constant-time lookup) instead of
# scanning the stop-word list once per token; the `stopwords` list itself is
# left untouched.
stopword_set = set(stopwords)
all_tokens = [
    [word for word in sentence if word not in stopword_set]
    for sentence in word_tokens
]
# Show the filtered tokens of the first two sentences.
print(all_tokens[:2])

[['text', 'commencement', 'address', 'steve', 'jobs', 'ceo', 'apple', 'computer', 'pixar', 'animation', 'studios', 'delivered', 'june', '12', '2005.', 'honored', 'today', 'commencement', 'one', 'finest', 'universities', 'world', '.'], ['never', 'graduated', 'college', '.']]


### 단어의 어근 추출
#### LancasterStemmer 클래스
- LancasterStemmer 클래스는 단어의 뿌리를 찾는 데 사용되는 스테머(stemmer)
- 스테머는 단어의 형식을 단순화하여 단어의 의미를 보존함
- from nltk.stem import LancasterStemmer

In [81]:
import pandas as pd

In [34]:
from nltk.stem import LancasterStemmer

stemmer = LancasterStemmer()

# Print the stem of each sample word, one per line.
for sample in ('running', 'doing'):
    print(stemmer.stem(sample))

run
doing


#### WordNetLemmatizer 클래스

- NLTK 라이브러리에서 WordNetLemmatizer 클래스를 가져오라는 명령
- 단어의 형식을 단순화하여 단어의 의미를 보존함
- WordNet 데이터는 단어의 의미를 이해하고 단어 간의 관계를 분석하는 데 사용됨

In [36]:
from nltk.stem import WordNetLemmatizer
# Fetch the WordNet corpus into the local nltk_data directory.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\khkim\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [40]:
lemma = WordNetLemmatizer()
# lemma.lemmatize('wore')
# The second argument is the part-of-speech tag; 'v' asks for the verb lemma.
lemma.lemmatize('wore', 'v')

'wear'

In [41]:
# Verb lemma of an irregular past participle.
lemma.lemmatize('done','v')

'do'

### 문서의 수량화 표현 : CountVectorizer, TfidfVectorizer
문서의 수량화 : 문자 -> 숫자
- CountVectorizer 클래스
    - 문서단어행렬(DTM, Document-Term Matrix) 만듦
    - 문서에서 단어의 빈도를 계산하여 단어 집합을 만드는 클래스
- TfidfVectorizer 클래스
    - TF-IDF(Term Frequency - Inverse Document Frequency)
    - CountVectorizer와 유사하지만, 단어의 중요도를 고려하여 단어 집합을 만듦

In [46]:
# Split the text into sentences; each sentence is passed to the vectorizers below as one document.
sentences = nltk.sent_tokenize(text)

In [47]:
# Peek at the first two sentences.
sentences[:2]

['this is the text of the commencement address by steve jobs ceo of apple computer and of pixar animation studios delivered on june 12 2005. i am honored to be with you today at your commencement from one of the finest universities in the world.',
 'i never graduated from college.']

In [61]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

#### 텍스트를 단어에 대한 빈도를 나타내는 행렬로 변환하는 것
- CountVectorizer

In [84]:
# Create the vectorizer that turns text into a matrix of per-document word counts.
cnt_vect = CountVectorizer()

# Fit: learn the vocabulary (the terms that become matrix columns) from the sentences.
# No matrix is produced by this step.
cnt_vect.fit(sentences)

# Transform: encode the same sentences as a sparse count matrix using the learned vocabulary.
word_vect = cnt_vect.transform(sentences)

In [63]:
# Summary repr of the count matrix: shape, dtype and number of stored elements.
word_vect

<139x715 sparse matrix of type '<class 'numpy.int64'>'
	with 1945 stored elements in Compressed Sparse Row format>

In [64]:
# Print the stored entries as "(row, column)  count" lines.
print(word_vect)

  (0, 1)	1
  (0, 6)	1
  (0, 14)	1
  (0, 31)	1
  (0, 35)	1
  (0, 38)	1
  (0, 43)	1
  (0, 53)	1
  (0, 62)	1
  (0, 97)	1
  (0, 111)	1
  (0, 129)	2
  (0, 131)	1
  (0, 162)	1
  (0, 247)	1
  (0, 264)	1
  (0, 316)	1
  (0, 328)	1
  (0, 339)	1
  (0, 345)	1
  (0, 346)	1
  (0, 438)	4
  (0, 443)	1
  (0, 444)	1
  (0, 474)	1
  :	:
  (134, 258)	1
  (134, 298)	1
  (134, 418)	1
  (134, 604)	1
  (134, 693)	1
  (135, 35)	1
  (135, 36)	1
  (135, 50)	1
  (135, 70)	1
  (135, 258)	1
  (135, 282)	1
  (135, 436)	1
  (135, 604)	1
  (135, 624)	1
  (135, 692)	1
  (135, 708)	2
  (136, 319)	1
  (136, 572)	1
  (137, 257)	1
  (137, 572)	1
  (138, 26)	1
  (138, 415)	1
  (138, 603)	1
  (138, 662)	1
  (138, 708)	1


In [94]:
# Control how NumPy abbreviates large arrays when printing.
import numpy as np
import sys
# Either of these would print arrays in full, without "..." truncation:
# np.set_printoptions(threshold=np.inf) 
# np.set_printoptions(threshold=sys.maxsize) 
# Active setting: arrays with more than 20 elements are summarised, not printed in full.
np.set_printoptions(threshold=20)

In [100]:
# Make pandas display frames without truncation.
# Show every row.
pd.set_option('display.max_rows', None)
# pd.set_option('display.max_rows', 10)
# Show every column.
pd.set_option('display.max_columns', None)
# pd.set_option('display.max_columns', 10)

In [104]:
# Dense array of the count matrix: one row per sentence, one column per vocabulary term,
# each cell holding how many times that term occurs in that sentence.
# print(word_vect.toarray())
dtm = word_vect.toarray()

In [105]:
# View the document-term matrix as a DataFrame, using the vocabulary terms as column labels.
cols = cnt_vect.get_feature_names_out()
# Show the first five rows (sentences).
pd.DataFrame(dtm, columns=cols).head(5)

Unnamed: 0,10,12,17,18,1960s,20,2005,30,33,35,4000,730,about,across,address,adopted,adoption,adult,adventurous,advised,affairs,after,again,age,agent,ago,all,almost,along,already,always,am,amazing,amount,an,and,anew,animated,animation,another,answer,any,apologize,apple,apples,application,approach,are,around,artistically,as,asked,asking,at,avoid,away,awful,baby,back,backwards,badly,baton,be,beautiful,beautifully,because,become,been,before,began,begin,beginner,being,believe,beneath,best,better,between,bibles,big,billion,biological,biopsy,birth,bit,board,bob,born,bottles,bought,boy,brand,brick,brought,but,buttoned,buy,by,call,called,calligraphed,calligraphy,came,cameras,campus,can,cancer,cant,capture,catalog,cells,ceo,certainly,certainty,change,changed,choices,chose,class,classes,clear,cleared,clearly,clears,closest,code,coke,college,combinations,commencement,company,computer,computers,concept,connect,connecting,convinced,copied,could,couldnt,country,courage,course,cover,create,created,creation,creative,crying,curable,curiosity,current,david,dawn,day,days,dead,deal,death,decades,decided,decisions,delivered,deposits,designed,designing,desktop,destination,destiny,devastating,developed,diagnosed,diagnosis,did,didnt,die,difference,different,directors,diverge,do,doctor,doctors,dogma,dont,dorm,dots,down,dramatic,drawer,drop,dropin,dropped,dropping,drown,during,each,earlier,early,earth,easy,else,elses,embarrassment,employees,encountered,endoscope,enter,entire,entrepreneurs,escaped,even,evening,events,eventually,ever,every,everything,example,except,expect,expectations,expensive,external,face,facing,failure,faith,fall,falling,family,far,farewell,fascinating,father,fear,feature,fell,fellow,felt,few,figure,fill,film,final,find,fine,finest,fired,first,five,floor,focus,follow,following,fonts,food,foolish,for,form,forward,found,freed,friends,from,future,garage,generation,get,gets,getting,girl,give,go,going,gone,good,goodbyes,google,got,gotten,gradually,graduate,graduated,graduates,gr
aduation,great,grew,grown,guess,gut,had,hadnt,hand,happened,hard,hare,has,have,havent,having,he,head,heart,heaven,heaviness,help,here,high,him,hired,his,historical,hitchhiking,hits,home,honored,hope,how,hungry,idea,idealistic,if,ill,im,important,impossible,impression,in,incurable,inner,instruction,intellectual,interest,interesting,intestines,into,intuition,invention,is,issue,issues,it,its,ive,jobs,june,just,karma,keep,kept,kids,kind,know,krishna,label,large,last,late,later,laurene,lawyer,learn,learned,leaving,less,let,letter,life,lifes,lightness,like,likely,limited,list,live,lived,living,long,longer,looked,looking,lose,loss,love,loved,lovers,lucky,mac,macintosh,made,make,makes,many,matters,me,meal,means,medicine,menlo,message,met,microscope,mid1970s,middle,might,miles,minute,mirror,money,months,more,morning,most,mother,much,multiple,my,myself,naively,naked,named,neat,need,needed,needle,never,new,next,night,no,noise,none,normal,not,notions,now,noyce,of,off,offered,ok,old,on,one,ones,only,opinions,or,order,other,others,our,out,over,overflowing,own,packard,pancreas,pancreatic,paperback,papers,parents,park,part,passed,past,patient,people,peoples,perhaps,periods,personal,photograph,pixar,poetic,polaroid,popped,possible,poster,practical,prepare,pretty,previous,priceless,pride,promised,proportionally,public,publication,publicly,publishing,purely,put,quit,quite,quote,rare,read,really,reason,reed,refused,rejected,relationship,released,relented,remarkable,remembering,renaissance,replaced,required,results,retuned,returned,right,road,roll,romantic,room,rooms,row,run,running,said,san,satisfied,saved,savings,say,scan,scary,school,science,scissors,screwing,second,secondary,sedated,see,serif,set,settle,several,share,she,should,showed,sided,sign,signed,since,single,six,slept,slowly,so,someday,somehow,someone,something,sometimes,soon,sorry,sort,space,spaced,spending,spent,stanford,start,started,stay,stayed,steve,stewart,still,stomach,stop,stories,story,strongly,stuck,student,studio,s
tudios,stumbled,subtle,successful,sunday,sure,surgery,take,taking,talented,tasting,team,technology,tell,temple,ten,text,than,thank,that,thats,the,their,them,then,there,these,they,thing,things,thinking,third,this,thought,three,throat,through,throughout,time,to,today,together,told,too,tool,tools,touch,town,toy,trap,trapped,tried,true,truly,trust,truth,try,tuition,tumor,turn,turned,two,type,typefaces,typewriters,typography,under,unexpected,universities,until,unwed,up,us,useful,valley,value,varying,very,viewed,visions,voice,waiting,walk,want,wanted,wants,was,wasnt,waste,way,we,week,well,went,were,what,whatever,when,whenever,where,which,who,whole,why,wife,will,windows,wish,wished,with,woman,wonderful,words,work,worked,workingclass,world,worlds,would,woz,year,years,yet,you,youd,youll,young,your,yourself,youve
0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,4,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


### TF-IDF 행렬로 만들기
- 단어 빈도와 역문서빈도를 곱하여 얻어진 행렬임
- 각 토큰이 여러개의 문서에 공통적으로 많이 사용되었으면 낮은 점수를 얻고
- 특정 문서에서만 자주 사용된 경우는 높은 점수를 얻는 구조의 행렬
- 조사나 반복사용되는 '으로, 에서, ㄴ다'와 같은 무의미한 용어들은 패널티를 받아 낮은 점수를 갖게 됨
- TfidfVectorizer 클래스

[참고] - DTM 행렬은 각 단어의 빈도수를 나타내는 행렬



In [106]:
# TF-IDF features over unigrams and bigrams, with sklearn's English stop-word list removed.
# max_df=0.8 drops terms that occur in more than 80% of the sentences;
# min_df=2 drops terms that occur in fewer than 2 sentences.
tfidf_vect = TfidfVectorizer(max_df = 0.8, min_df = 2, 
                             stop_words='english',ngram_range=(1,2))
# Learn the vocabulary and IDF weights, then encode the same sentences.
tfidf_vect.fit(sentences)
word_vect2 = tfidf_vect.transform(sentences)
# Print the stored entries as "(row, column)  weight" lines.
print(word_vect2)

  (0, 171)	0.40357836333025365
  (0, 153)	0.37960530168319284
  (0, 124)	0.37960530168319284
  (0, 63)	0.40357836333025365
  (0, 32)	0.3458171690075344
  (0, 7)	0.32184410736047364
  (0, 6)	0.40357836333025365
  (1, 77)	0.6369268867091887
  (1, 76)	0.6369268867091887
  (1, 30)	0.43433659985532014
  (2, 154)	0.4570930826831498
  (2, 89)	0.4570930826831498
  (2, 30)	0.33138824533344036
  (2, 29)	0.48595969914245674
  (2, 28)	0.48595969914245674
  (3, 164)	0.42679243020595825
  (3, 153)	0.46849226627295076
  (3, 147)	0.4980787708099867
  (3, 142)	0.4980787708099867
  (3, 94)	0.31968467917332616
  (5, 14)	1.0
  (6, 142)	0.7818304415465513
  (6, 90)	0.6234911071307471
  (7, 143)	0.7071067811865476
  (7, 48)	0.7071067811865476
  :	:
  (128, 88)	0.41632521781877263
  (128, 62)	0.41632521781877263
  (128, 61)	0.3915949770059662
  (128, 52)	0.41632521781877263
  (128, 36)	0.41632521781877263
  (129, 140)	0.600907342063449
  (129, 138)	0.5270870255550616
  (129, 84)	0.600907342063449
  (130, 139

In [107]:
from sklearn.datasets import fetch_20newsgroups

# Load both the train and test portions of the 20 Newsgroups corpus.
news_data = fetch_20newsgroups(subset = 'all', random_state = 156)

In [108]:
# Print the first raw post, including its headers and quoted text.
print(news_data.data[0])

From: egreen@east.sun.com (Ed Green - Pixel Cruncher)
Subject: Re: Observation re: helmets
Organization: Sun Microsystems, RTP, NC
Lines: 21
Distribution: world
Reply-To: egreen@east.sun.com
NNTP-Posting-Host: laser.east.sun.com

In article 211353@mavenry.altcit.eskimo.com, maven@mavenry.altcit.eskimo.com (Norman Hamer) writes:
> 
> The question for the day is re: passenger helmets, if you don't know for 
>certain who's gonna ride with you (like say you meet them at a .... church 
>meeting, yeah, that's the ticket)... What are some guidelines? Should I just 
>pick up another shoei in my size to have a backup helmet (XL), or should I 
>maybe get an inexpensive one of a smaller size to accomodate my likely 
>passenger? 

If your primary concern is protecting the passenger in the event of a
crash, have him or her fitted for a helmet that is their size.  If your
primary concern is complying with stupid helmet laws, carry a real big
spare (you can put a big or small head in a big helmet, bu

In [111]:
# Training split, with headers, footers and quoted replies stripped from each post.
train_news = fetch_20newsgroups(subset = 'train', remove=('headers', 'footers', 'quotes'), random_state=156)
# Raw post texts and their integer class labels.
X_train = train_news.data
y_train = train_news.target
# Show the first post followed by its label.
print(X_train[0], y_train[0])



What I did NOT get with my drive (CD300i) is the System Install CD you
listed as #1.  Any ideas about how I can get one?  I bought my IIvx 8/120
from Direct Express in Chicago (no complaints at all -- good price & good
service).

BTW, I've heard that the System Install CD can be used to boot the mac;
however, my drive will NOT accept a CD caddy is the machine is off.  How can
you boot with it then?

--Dave
 4


In [112]:
# Newsgroup names; an integer label is an index into this list.
train_news.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [113]:
# Test split, loaded with the same header/footer/quote stripping as the training split.
test_news = fetch_20newsgroups(subset = 'test', remove=('headers', 'footers', 'quotes'), random_state=156)
# Raw post texts and their integer class labels.
X_test = test_news.data
y_test = test_news.target
# Report how many posts each split contains.
print(f'train : {len(X_train)}, test : {len(X_test)}')

train : 11314, test : 7532


In [114]:
# A bare expression shows the string's repr, so line breaks appear as \n escapes.
X_train[0]

"\n\nWhat I did NOT get with my drive (CD300i) is the System Install CD you\nlisted as #1.  Any ideas about how I can get one?  I bought my IIvx 8/120\nfrom Direct Express in Chicago (no complaints at all -- good price & good\nservice).\n\nBTW, I've heard that the System Install CD can be used to boot the mac;\nhowever, my drive will NOT accept a CD caddy is the machine is off.  How can\nyou boot with it then?\n\n--Dave\n"

In [115]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words counts with sklearn's built-in English stop-word list removed.
cnt_vect = CountVectorizer(stop_words='english')

# fit_transform learns the vocabulary from the training posts and encodes them
# in a single pass, instead of tokenising the whole training set twice with
# fit() followed by transform().
X_train_cnt_vect = cnt_vect.fit_transform(X_train)
# The test posts are only transformed, so they are encoded with the vocabulary
# learned from the training posts.
X_test_cnt_vect = cnt_vect.transform(X_test)

In [116]:
# (number of training posts, vocabulary size)
X_train_cnt_vect.shape

(11314, 101322)

In [117]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Train a liblinear logistic regression on the count features; fit() returns
# the estimator itself, so construction and training are chained.
lr_clf = LogisticRegression(solver='liblinear').fit(X_train_cnt_vect, y_train)

# Predict the test posts and report accuracy rounded to two decimals.
pred = lr_clf.predict(X_test_cnt_vect)
test_accuracy = accuracy_score(y_test, pred)
print(round(test_accuracy, 2))

0.64


In [118]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features with sklearn's built-in English stop-word list removed.
tfidf_vect = TfidfVectorizer(stop_words='english')
# fit_transform learns the vocabulary and IDF weights from the training posts
# and encodes them in a single pass, instead of tokenising the whole training
# set twice with fit() followed by transform().
X_train_tfidf_vect = tfidf_vect.fit_transform(X_train)
# The test posts are only transformed, so they reuse the training vocabulary
# and IDF weights.
X_test_tfidf_vect = tfidf_vect.transform(X_test)


# Train a liblinear logistic regression on the TF-IDF features.
lr_clf = LogisticRegression(solver = 'liblinear')
lr_clf.fit(X_train_tfidf_vect, y_train)
pred = lr_clf.predict(X_test_tfidf_vect)
# Test accuracy rounded to two decimals.
print(round(accuracy_score(y_test, pred),2))

0.69


In [119]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

# Two-step pipeline: TF-IDF vectorisation followed by logistic regression.
# The pipeline takes raw text, so it is fitted on X_train directly.
news_pipe = Pipeline([
  ('tfidf_vect', TfidfVectorizer(stop_words='english')),
  ('lr_clf', LogisticRegression(solver='liblinear'))  
])
# Grid keys use "<step name>__<parameter>" (double underscore) to address a
# parameter of one pipeline step.
# The max_df values are integers, i.e. absolute document counts rather than proportions.
params = {'tfidf_vect__ngram_range' : [(1,1),(1,2),(1,3)],
          'tfidf_vect__max_df' : [100, 300, 700], 
          'lr_clf__C' : [1, 5, 10]}

# 3 x 3 x 3 = 27 parameter combinations, each evaluated with 3-fold CV on accuracy.
# NOTE(review): n_jobs is not set, so all fits run serially; expect a long run time.
grid_news_pipe = GridSearchCV(news_pipe, param_grid=params, cv = 3, scoring='accuracy', verbose = 0)

grid_news_pipe.fit(X_train, y_train)
# Best parameter combination and its mean cross-validated accuracy.
print(grid_news_pipe.best_params_, grid_news_pipe.best_score_)

# Predict the held-out test posts with the fitted search object.
pred = grid_news_pipe.predict(X_test)

# Accuracy on the test split (the printed label is Korean for "accuracy").
print(f'정확도 : {accuracy_score(y_test, pred)}')

{'lr_clf__C': 10, 'tfidf_vect__max_df': 700, 'tfidf_vect__ngram_range': (1, 2)} 0.7550828826229531
정확도 : 0.7019383961763144
