In [None]:
!pip install newspaper3k
!pip install newspaper
!pip install jpype1
!pip install konlpy
!pip install scikit-learn

In [None]:
from newspaper import Article
from konlpy.tag import Kkma
from konlpy.tag import Twitter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import normalize
import numpy as np

# https://www.lucypark.kr/courses/2015-dm/text-mining.html
from nltk import regexp_tokenize
import nltk

# Verbose-mode ((?x)) tokenization regex; unescaped whitespace outside character
# classes is ignored, so the spaces around each `|` are only for readability.
# Alternatives, tried in order:
#   ([A-Z]\.)+          dotted capital-letter abbreviations, e.g. "U.S."
#   \w+(-\w+)*          words, optionally with internal hyphens
#   \$?\d+(\.\d+)?%?    numbers with optional leading '$', decimals, trailing '%'
#   \.\.\.              a literal ellipsis
#   [][.,;"'?():-_`]    a single punctuation character
# NOTE(review): inside the final character class `:-_` is a *range* from ':' to
# '_', not the three separate characters ':', '-', '_' — confirm this is intended.
# NOTE(review): in the visible code this pattern is referenced only from
# commented-out lines.
pattern = r'''(?x) ([A-Z]\.)+ | \w+(-\w+)* | \$?\d+(\.\d+)?%? | \.\.\. | [][.,;"'?():-_`]'''
# tokens_en = regexp_tokenize(doc_en, pattern)
# en = nltk.Text(tokens_en)
# print(len(en.tokens))       # returns number of tokens (document length)
# print(len(set(en.tokens)))  # returns number of unique tokens
# en.vocab()                  # returns frequency distribution

import re

class SentenceTokenizer(object):
    """Split text into sentences and reduce each sentence to keyword tokens.

    With ``lan == 'KR'`` sentences are produced by ``Kkma.sentences`` and
    tokens by ``Twitter.nouns``.  With any other ``lan`` the input is expected
    to be already split into sentences and tokens are extracted with a simple
    ASCII regular expression.
    """

    # Sentences whose length is at most this many characters are considered
    # fragments and are folded into the preceding sentence.
    _SHORT_SENTENCE_MAX_LEN = 10

    # Non-Korean token pattern: runs of ASCII letters, digits, '.', ',' and '-'.
    # `+` (instead of the former `*`) avoids producing empty matches; tokens of
    # length <= 1 are discarded afterwards either way.
    _EN_TOKEN = re.compile(r'[a-zA-Z0-9.,-]+')

    def __init__(self, lan = 'KR'):
        """Create the konlpy taggers and the stop-word list.

        Args:
            lan: language switch; ``'KR'`` selects the Korean pipeline, any
                other value selects the regex-based pipeline.
        """
        self.kkma = Kkma()
        self.twitter = Twitter()
        self.stopwords = [
            '중인', '만큼', '마찬가지', '꼬집었', "연합뉴스", "데일리", "동아일보", "중앙일보", "조선일보", "기자",
            "아", "휴", "아이구", "아이쿠", "아이고", "어", "나", "우리", "저희", "따라", "의해", "을", "를", "에", "의", "가",
        ]
        self.LAN = lan

    @classmethod
    def _merge_short_sentences(cls, sentences):
        """Return a copy of *sentences* with short fragments folded backwards.

        A sentence of at most ``_SHORT_SENTENCE_MAX_LEN`` characters is
        appended (space separated) to the closest preceding sentence that was
        kept, and its own slot is replaced by ``''`` so the list length is
        preserved.  A short sentence with no predecessor is kept unchanged
        (the previous implementation wrapped around and appended it to the
        *last* sentence).  The input list is never modified.
        """
        merged = list(sentences)
        last_kept = None
        for idx, sentence in enumerate(merged):
            if sentence == '':
                # Nothing to fold; leave the empty placeholder alone.
                continue
            if len(sentence) <= cls._SHORT_SENTENCE_MAX_LEN and last_kept is not None:
                merged[last_kept] += ' ' + sentence
                merged[idx] = ''
            else:
                last_kept = idx
        return merged

    def url2sentences(self, url):
        """Download the article at *url* and return its sentences.

        Returns:
            A list of sentences in which short fragments have been merged into
            the preceding sentence; merged slots are ``''``.
        """
        article = Article(url, language='ko')
        article.download()
        article.parse()
        sentences = self._merge_short_sentences(self.kkma.sentences(article.text))

        print('SentenceTokenizer>url2sentences len(sentences)', len(sentences))
        return sentences

    def text2sentences(self, text):
        """Split *text* into sentences.

        Args:
            text: a string when ``LAN == 'KR'``; otherwise a sequence of
                already-split sentences.  A bare string on the non-KR path is
                treated as a single sentence.

        Returns:
            A new list of sentences in which short fragments have been merged
            into the preceding sentence; merged slots are ``''``.
        """
        if self.LAN == 'KR':
            sentences = self.kkma.sentences(text)
        elif isinstance(text, str):
            sentences = [text]
        else:
            sentences = text  # already a sequence of sentences

        sentences = self._merge_short_sentences(sentences)
        print('SentenceTokenizer>text2sentences len(sentences)', len(sentences))
        print('sentences:', sentences)
        return sentences

    def get_nouns(self, sentences):
        """Return one space-joined token string per non-empty sentence.

        Empty sentences (``''``) are skipped, so the result can be shorter
        than *sentences*.  Tokens that are stop words or have length <= 1 are
        dropped.

        NOTE(review): a sentence whose tokenization raises is reported and
        skipped as well, which shifts the indices of all later entries
        relative to the non-empty input sentences.
        """
        stopwords = set(self.stopwords)
        nouns = []
        for sentence in sentences:
            if sentence == '':
                continue
            try:
                if self.LAN == 'KR':
                    candidates = self.twitter.nouns(sentence)
                else:
                    # Allowed characters: alphanumerics plus '.', ',' and '-'.
                    candidates = self._EN_TOKEN.findall(sentence)
                kept = [tok for tok in candidates
                        if tok not in stopwords and len(tok) > 1]
            except Exception as e:
                # Best effort: report the failing sentence and carry on.
                print('err:', str(e))
                continue
            nouns.append(' '.join(kept))
        return nouns

class GraphMatrix(object):
    """Turn token strings (one per sentence) into sentence and word graphs."""

    def __init__(self):
        """Create the vectorizers and an empty sentence-graph placeholder."""
        self.graph_sentence = []
        self.cnt_vec = CountVectorizer()
        self.tfidf = TfidfVectorizer()

    def build_sent_graph(self, sentence):
        """Return the sentence-by-sentence TF-IDF dot-product matrix.

        The TF-IDF vectorizer is fitted on *sentence* (an iterable of token
        strings); entry (i, j) of the result is the dot product of the TF-IDF
        rows of documents i and j.  The matrix is also stored on
        ``self.graph_sentence``.
        """
        weights = self.tfidf.fit_transform(sentence)
        dense = weights.toarray()
        similarity = np.dot(dense, dense.T)
        self.graph_sentence = similarity
        return similarity

    def build_words_graph(self, sentence):
        """Return ``(word_graph, idx2word)`` built from term counts.

        The count matrix is normalized along axis 0 (per vocabulary column)
        before taking the column-by-column dot product.  ``idx2word`` maps a
        vocabulary column index back to its word.
        """
        counts = self.cnt_vec.fit_transform(sentence).toarray().astype(float)
        scaled = normalize(counts, axis=0)
        word_graph = np.dot(scaled.T, scaled)

        idx2word = {}
        for word, column in self.cnt_vec.vocabulary_.items():
            idx2word[column] = word
        return word_graph, idx2word
 
class Rank(object):
    """Score graph nodes by solving the damped ranking linear system."""

    def get_ranks(self, graph, d=0.85): # d = damping factor
        """Return ``{node_index: rank}`` for a square weight matrix.

        The diagonal is zeroed, every column with a non-zero sum is divided by
        that sum, the matrix is scaled by ``-d`` and the diagonal is set to 1;
        the resulting system ``A x = (1 - d) * 1`` is solved directly.

        Args:
            graph: square 2-D array-like of edge weights.  It is copied (and
                converted to float), so the caller's matrix is left untouched
                and integer matrices are accepted.
            d: damping factor.

        Returns:
            dict mapping each row index to its rank value.

        Raises:
            numpy.linalg.LinAlgError: if the resulting system is singular.
        """
        # Work on a float copy: the previous implementation aliased `graph`
        # and overwrote the caller's matrix in place.
        A = np.array(graph, dtype=float)
        matrix_size = A.shape[0]

        np.fill_diagonal(A, 0)  # ignore self-links
        link_sums = A.sum(axis=0)
        # Column-normalize; columns that sum to zero are left as all zeros.
        np.divide(A, link_sums, out=A, where=link_sums != 0)
        A *= -d
        np.fill_diagonal(A, 1)

        B = (1 - d) * np.ones((matrix_size, 1))
        ranks = np.linalg.solve(A, B)  # solve the linear system A x = B

        return {idx: r[0] for idx, r in enumerate(ranks)}


class TextRank(object):
    """Rank the sentences and words of a document and expose summaries.

    Attributes set by ``__init__``:
        sentences: the non-empty sentences of the document, in order.
        nouns: one token string per sentence (from ``get_nouns``).
        sent_graph / words_graph: similarity matrices from ``GraphMatrix``.
        idx2word: vocabulary index -> word.
        sorted_sent_rank_idx / sorted_word_rank_idx: indices ordered from the
            highest to the lowest rank.
    """

    def __init__(self, text, lan='KR'):
        """Tokenize *text*, build both graphs and rank them.

        Args:
            text: either a URL (anything starting with ``http:`` / ``https``)
                to download, or the raw text to summarize.
            lan: language switch forwarded to ``SentenceTokenizer``.
        """
        self.sent_tokenize = SentenceTokenizer(lan)

        if text[:5] in ('http:', 'https'):
            sentences = self.sent_tokenize.url2sentences(text)
        else:
            sentences = self.sent_tokenize.text2sentences(text)

        # The tokenizer leaves '' placeholders for merged fragments and
        # get_nouns() skips them.  Drop them here so that row i of the
        # sentence graph corresponds to self.sentences[i]; otherwise
        # summarize() would pick the wrong (or empty) sentences.
        self.sentences = [sentence for sentence in sentences if sentence != '']

        self.nouns = self.sent_tokenize.get_nouns(self.sentences)
        print('get_nouns!')

        self.graph_matrix = GraphMatrix()
        self.sent_graph = self.graph_matrix.build_sent_graph(self.nouns)
        self.words_graph, self.idx2word = self.graph_matrix.build_words_graph(self.nouns)
        print('create graph!')

        self.rank = Rank()
        self.sent_rank_idx = self.rank.get_ranks(self.sent_graph)
        self.sorted_sent_rank_idx = sorted(self.sent_rank_idx, key=lambda k: self.sent_rank_idx[k], reverse=True)
        print('sorted_sent_rank_idx:', self.sorted_sent_rank_idx)

        self.word_rank_idx = self.rank.get_ranks(self.words_graph)
        self.sorted_word_rank_idx = sorted(self.word_rank_idx, key=lambda k: self.word_rank_idx[k], reverse=True)
        print('sorted_word_rank_idx:', self.sorted_word_rank_idx)

    def summarize(self, sent_num=3):
        """Return the *sent_num* top-ranked sentences in document order."""
        top_indices = sorted(self.sorted_sent_rank_idx[:sent_num])
        return [self.sentences[idx] for idx in top_indices]

    def keywords(self, word_num=10):
        """Return the *word_num* top-ranked words, best first.

        Reuses the word ranking computed in ``__init__`` instead of solving
        the ranking system a second time on ``self.words_graph``.
        """
        return [self.idx2word[idx] for idx in self.sorted_word_rank_idx[:word_num]]

In [None]:
# Demo: rank a tiny in-memory corpus.  To summarize a web article instead,
# enable the URL lines below and pass `url` to TextRank.
#url = 'http://v.media.daum.net/v/20170611192209012?rcmd=r'
corpus = [
    '신용카드',
    '김상조',
    '김 후보자',
    '월급쟁이라면',
]

# The corpus entries are joined into one space-separated, lower-cased document.
document = ' '.join(corpus).lower()
textrank = TextRank(document)
#textrank = TextRank(url)
print('init complete!')

# Print each of the top three sentences followed by a blank line.
for row in textrank.summarize(3):
    print(row, end='\n\n')

print('keywords :', textrank.keywords())