In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import normalize
import numpy as np
from nltk.corpus import stopwords 

from transformers import BertTokenizer

ModuleNotFoundError: No module named 'transformers'

In [21]:
class SentenceTokenizer(object):
    def __init__(self):
        self.stopwords = set(stopwords.words('english')) 
    
    def get_tokens(self, sentences):
        tokenizer = BertTokenizer(vocab_file='bert-base-uncased-vocab.txt')
        tokens = tokenizer.tokenize(sentences)  # word_piece_list
        
        # tokens = word_tokenize(sentences) #요부분!! 들어온 text를 단어로 쪼개는 부분만 바꿔주면 됨!!
        
        #이건 불용어 빼주는 작업
        tr_tk = []
        
        for w in tokens:
            if w not in self.stopwords:
                tr_tk.append(w)

        print(tr_tk)
        return tr_tk

In [22]:
class GraphMatrix(object):
    def __init__(self):
        self.tfidf = TfidfVectorizer()
        self.cnt_vec = CountVectorizer()
        self.graph_sentence = []

    def build_words_graph(self, sentence):
        cnt_vec_mat = normalize(self.cnt_vec.fit_transform(sentence).toarray().astype(float), axis=0)
        vocab = self.cnt_vec.vocabulary_
        return np.dot(cnt_vec_mat.T, cnt_vec_mat), {vocab[word] : word for word in vocab}

In [23]:
class Rank(object):
    def get_ranks(self, graph, d=0.85): # d = damping factor
        A = graph
        matrix_size = A.shape[0]
        for id in range(matrix_size):
            A[id, id] = 0 # diagonal 부분을 0으로
            link_sum = np.sum(A[:,id]) # A[:, id] = A[:][id]
            if link_sum != 0:
                A[:, id] /= link_sum
            A[:, id] *= -d
            A[id, id] = 1

        B = (1-d) * np.ones((matrix_size, 1))
        ranks = np.linalg.solve(A, B) # 연립방정식 Ax = b
        return {idx: r[0] for idx, r in enumerate(ranks)}

In [28]:
class TextRank(object):
    def keywords(self, text, word_num=1):

        tokens = SentenceTokenizer().get_tokens(text)
        words_graph, idx2word = GraphMatrix().build_words_graph(tokens)
        
        rank = Rank()
        rank_idx = rank.get_ranks(words_graph)
        sorted_rank_idx = sorted(rank_idx, key=lambda k: rank_idx[k], reverse=True)

        keywords = []
        index=[]
        for idx in sorted_rank_idx[:word_num]:
            index.append(idx)

        for idx in index:
            keywords.append(idx2word[idx])
        
        return keywords

In [29]:
text = 'We should identify all the victims of the massacre, and find out how the military fired machine-guns from helicopters and how the distortion and cover-up maneuvering were carried out," he said. "The purpose of the investigation is not to punish the responsible people, but to seek reconciliation and unity based on truth.'

result = TextRank().get_keyword(text)
print(result)

['We', 'identify', 'victims', 'massacre', ',', 'find', 'military', 'fired', 'machine-guns', 'helicopters', 'distortion', 'cover-up', 'maneuvering', 'carried', ',', "''", 'said', '.', '``', 'The', 'purpose', 'investigation', 'punish', 'responsible', 'people', ',', 'seek', 'reconciliation', 'unity', 'based', 'truth', '.']
['guns']
