# 단어중요도 구하기 

TF-IDF를 이용해 각 문서에서 중요 키워드를 5개씩 추출하는 프로그램

# 1. Data preprocessing

In [88]:
import math
import re
from collections import Counter

import numpy as np
import pandas as pd
from nltk.stem.wordnet import WordNetLemmatizer

In [57]:
lemmatizer = WordNetLemmatizer()


def cleansing(text_file):
    """Load a text file and return its cleaned, lemmatized word list.

    Every whitespace-separated token is lower-cased, stripped of all
    characters outside ``a-z`` (digits, punctuation, non-ASCII letters) and
    passed through the module-level ``lemmatizer``. Results of length one or
    zero are discarded.

    Args:
        text_file: Path of a UTF-8 encoded text file.

    Returns:
        list[str]: The cleaned words, in the order they appear in the file.
    """
    # A context manager guarantees the handle is closed, even on error.
    with open(text_file, 'r', encoding='utf8') as f:
        tokens = []
        for line in f:
            # extend() appends in place instead of copying the list per line.
            tokens.extend(line.lower().split())

    word_corpus = []
    for token in tokens:
        # Remove non-letters first, then lemmatize what is left.
        word = lemmatizer.lemmatize(re.sub('[^a-z]', '', token))
        # Drop empty strings and single-letter leftovers.
        if len(word) > 1:
            word_corpus.append(word)

    # No extra whitespace cleanup is needed: tokens were produced by split()
    # and reduced to a-z only, so they cannot contain whitespace.
    return word_corpus

In [63]:
# Clean each of the three input documents into its own word list.
corpus1, corpus2, corpus3 = (
    cleansing(path) for path in ('문서1.txt', '문서2.txt', '문서3.txt')
)

# 2. Term frequency

In [112]:
# Vocabulary: every distinct word across the three documents, sorted.
keywords = sorted(set(corpus1 + corpus2 + corpus3))

# Count each corpus once with Counter, then look up every vocabulary word.
# A Counter returns 0 for absent words, matching list.count(), but avoids
# rescanning the whole corpus for each keyword.
tf1, tf2, tf3 = (
    [counts[keyword] for keyword in keywords]
    for counts in map(Counter, (corpus1, corpus2, corpus3))
)

# Rows are documents, columns are vocabulary words, cells are raw counts.
term_frequency = pd.DataFrame(
    [tf1, tf2, tf3], index=['doc1', 'doc2', 'doc3'], columns=keywords
)
term_frequency

Unnamed: 0,about,achieving,action,agent,ai,also,an,and,animal,any,...,training,undergraduate,university,unlike,used,usually,whereas,which,with,work
doc1,1,0,0,0,0,0,1,5,0,0,...,0,2,2,0,0,0,1,1,0,0
doc2,0,0,0,0,0,1,0,5,0,0,...,1,0,0,0,0,1,0,0,0,1
doc3,0,1,1,1,2,0,0,3,1,1,...,0,0,0,1,1,0,0,0,1,0


# 3. Inverse document frequency

In [109]:
def calidf(term, docs=None):
    """Return the inverse document frequency ``log(N / df)`` of *term*.

    ``N`` is the number of documents and ``df`` is the number of documents
    that contain *term* at least once.

    Args:
        term: The word to score.
        docs: Optional collection of documents, each a container of words.
            When omitted, the module-level ``corpus1``, ``corpus2`` and
            ``corpus3`` are used.

    Returns:
        float: The natural-log IDF. A term present in every document scores
        0.0. A term present in no document also scores 0.0 instead of
        raising ``ZeroDivisionError``; its term frequency is zero everywhere,
        so its TF-IDF weight is zero either way.
    """
    if docs is None:
        docs = [corpus1, corpus2, corpus3]

    N = len(docs)
    # Booleans sum as 0/1, so this counts the documents containing the term.
    df = sum(term in doc for doc in docs)

    if df == 0:
        return 0.0
    return math.log(N / df)

In [110]:
# IDF of every vocabulary word, in vocabulary order.
keyword_idf = [calidf(word) for word in keywords]

# IDF does not depend on the document, so the same row is repeated once per
# document label.
doc_labels = ['doc1', 'doc2', 'doc3']
idf = pd.DataFrame(
    [keyword_idf for _ in doc_labels], index=doc_labels, columns=keywords
)
idf

Unnamed: 0,about,achieving,action,agent,ai,also,an,and,animal,any,...,training,undergraduate,university,unlike,used,usually,whereas,which,with,work
doc1,1.098612,1.098612,1.098612,1.098612,1.098612,1.098612,1.098612,0.0,1.098612,1.098612,...,1.098612,1.098612,1.098612,1.098612,1.098612,1.098612,1.098612,1.098612,1.098612,1.098612
doc2,1.098612,1.098612,1.098612,1.098612,1.098612,1.098612,1.098612,0.0,1.098612,1.098612,...,1.098612,1.098612,1.098612,1.098612,1.098612,1.098612,1.098612,1.098612,1.098612,1.098612
doc3,1.098612,1.098612,1.098612,1.098612,1.098612,1.098612,1.098612,0.0,1.098612,1.098612,...,1.098612,1.098612,1.098612,1.098612,1.098612,1.098612,1.098612,1.098612,1.098612,1.098612


# 4. 단어 중요도 계산

In [154]:
# Pull the raw matrices out of both frames (documents x vocabulary).
tf_arr = term_frequency.to_numpy()
idf_arr = idf.to_numpy()

# TF-IDF is the element-wise product; transpose so that rows are words and
# columns are documents.
weights = tf_arr * idf_arr
tf_idf = pd.DataFrame(
    weights.T, index=keywords, columns=['doc1', 'doc2', 'doc3']
)
tf_idf

Unnamed: 0,doc1,doc2,doc3
about,1.098612,0.000000,0.000000
achieving,0.000000,0.000000,1.098612
action,0.000000,0.000000,1.098612
agent,0.000000,0.000000,1.098612
ai,0.000000,0.000000,2.197225
...,...,...,...
usually,0.000000,1.098612,0.000000
whereas,1.098612,0.000000,0.000000
which,1.098612,0.000000,0.000000
with,0.000000,0.000000,1.098612


In [174]:
# For each document, take the five words with the highest TF-IDF weight.
top_n = 5
main_words_1, main_words_2, main_words_3 = (
    list(tf_idf.sort_values(by=doc, ascending=False).head(top_n).index)
    for doc in ('doc1', 'doc2', 'doc3')
)

# One column per document, rows ranked 1..5.
answer = pd.DataFrame(
    np.transpose([main_words_1, main_words_2, main_words_3]),
    index=np.arange(1, top_n + 1),
    columns=['doc1', 'doc2', 'doc3'],
)
answer

Unnamed: 0,doc1,doc2,doc3
1,campus,engineering,intelligence
2,seoul,software,human
3,chungju,electronic,machine
4,student,design,it
5,college,computer,ai
