In [1]:
pip install newspaper3k



In [2]:
pip install jpype1



In [3]:
pip install konlpy



In [4]:
pip install scikit-learn



In [5]:
from newspaper import Article
from konlpy.tag import Kkma
from konlpy.tag import Okt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import normalize
import numpy as np
import re

In [12]:
class SentenceTokenizer(object):
  def __init__(self):
    self.kkma = Kkma()
    self.okt = Okt()

    text_file = open("/content/stop_words_file.txt", 'r')

    self.stopwords = [line.rstrip() for line in text_file]

  def url2sentences(self, url):
    article = Article(url, language='ko')
    article.download()
    article.parse()
    sentences = self.kkma.sentences(article.text)

    for idx in range(0, len(sentences)):
      if len(sentences[idx]) <= 10:
        sentences[idx-1] += (' ' + sentences[idx])
        sentences[idx] = ''
    return sentences

  def text2sentences(self, text):
    sentences = self.kkma.sentences(text)

    for idx in range(0, len(sentences)):
      if len(sentences[idx]) <= 10:
        sentences[idx-1] += (' ' + sentences[idx])
        sentences[idx] = ''
    return sentences

  def clean_text(self, text):
    # 개행문자 제거
    text = re.sub("\n", " ", text)
    text = re.sub("\v", " ", text)
    text = re.sub("\f", " ", text)
    # E-mail 제거#
    text = re.sub("([\w\d.]+@[\w\d.]+)", "", text)
    text = re.sub("([\w\d.]+@)", "", text)
    # 괄호 안 제거#
    text = re.sub("<[\w\s\d‘’=/·~:&,`]+>", "", text)
    text = re.sub("\([\w\s\d‘’=/·~:&,`]+\)", "", text)
    text = re.sub("\[[\w\s\d‘’=/·~:&,`]+\]", "", text)
    text = re.sub("【[\w\s\d‘’=/·~:&,`]+】", "", text)
    # 전화번호 제거#
    text = re.sub("(\d{2,3})-(\d{3,4}-\d{4})", "", text)  # 전화번호
    text = re.sub("(\d{3,4}-\d{4})", "", text)  # 전화번호
    # 홈페이지 주소 제거#
    text = re.sub("(www.\w.+)", "", text)
    text = re.sub("(.\w+.com)", "", text)
    text = re.sub("(.\w+.co.kr)", "", text)
    text = re.sub("(.\w+.go.kr)", "", text)
    # 기자 이름 제거#
    text = re.sub("/\w+[=·\w@]+\w+\s[=·\w@]+", "", text)
    text = re.sub("\w{2,4}\s기자", "", text)
    # 한자 제거#
    text = re.sub("[\u2E80-\u2EFF\u3400-\u4DBF\u4E00-\u9FBF\uF900]+", "", text)
    # 특수기호 제거#
    text = re.sub("[◇#/▶▲◆■●△①②③★○◎▽=▷☞◀ⓒ□?㈜♠☎]", "", text)
    # 따옴표 제거#
    text = re.sub("[\"'”“‘’]", "", text)
    return text

  def get_nouns(self, sentences):
    words = []
    #nouns = []

    for sentence in sentences:
      words.append(self.okt.nouns(str(sentence)))
    for i,v in enumerate(words):
      if len(v)<2:
        words.pop(i)
      if v in self.stopwords:
        words.pop(i)
    
    return words

In [36]:
class GraphMatrix(object):
  def __init__(self):
    self.tfidf = TfidfVectorizer()
    self.cnt_vec = CountVectorizer()
    self.graph_nouns = []

  def build_sent_graph(self, nouns):
    tfidf_mat = self.tfidf.fit_transform(nouns).toarray()
    self.graph_nouns = np.dot(tfidf_mat, tfidf_mat.T)
    return self.graph_nouns

  def build_words_graph(self, nouns):
    cnt_vec_mat = normalize(self.cnt_vec.fit_transform(nouns).toarray().astype(float), axis=0)
    vocab = self.cnt_vec.vocabulary_
    return np.dot(cnt_vec_mat.T, cnt_vec_mat), {vocab[word] : word for word in vocab}

In [9]:
class Rank(object):
  def get_ranks(self, graph, d=0.85): # d = damping factor
    A = graph
    matrix_size = A.shape[0]

    for id in range(matrix_size):
      A[id, id] = 0 # diagonal 부분을 0으로
      link_sum = np.sum(A[:,id]) # A[:, id] = A[:][id]
      if link_sum != 0:
        A[:, id] /= link_sum
      A[:, id] *= -d
      A[id, id] = 1

    B = (1-d) * np.ones((matrix_size, 1))
    ranks = np.linalg.solve(A, B) # 연립방정식 Ax = b
    return {idx: r[0] for idx, r in enumerate(ranks)}

In [10]:
class TextRank(object):
  def __init__(self, text):
    self.sent_tokenize = SentenceTokenizer()

    if text[:5] in ('http:', 'https'):
      self.sentences = self.sent_tokenize.url2sentences(text)
    else:
      self.sentences = self.sent_tokenize.text2sentences(text)
    
   # self.sentences = self.sent_tokenize.clean_text(self.sentences)
    self.nouns = self.sent_tokenize.get_nouns(self.sentences)
    
    self.graph_matrix = GraphMatrix()
    self.sent_graph = self.graph_matrix.build_sent_graph(self.nouns)    
    self.words_graph, self.idx2word = self.graph_matrix.build_words_graph(self.nouns)
    
    self.rank = Rank()
    self.sent_rank_idx = self.rank.get_ranks(self.sent_graph)
    self.sorted_sent_rank_idx = sorted(self.sent_rank_idx, key=lambda k: self.sent_rank_idx[k], reverse=True)
    
    self.word_rank_idx = self.rank.get_ranks(self.words_graph)
    self.sorted_word_rank_idx = sorted(self.word_rank_idx, key=lambda k: self.word_rank_idx[k], reverse=True)

  def summarize(self, sent_num=3):
    summary = []
    index=[]

    for idx in self.sorted_sent_rank_idx[:sent_num]:
      index.append(idx)

    index.sort()
    for idx in index:
      summary.append(self.sentences[idx])
    return summary

  def keywords(self, word_num=10):
    rank = Rank()
    rank_idx = rank.get_ranks(self.words_graph)
    sorted_rank_idx = sorted(rank_idx, key=lambda k: rank_idx[k], reverse=True)

    keywords = []
    index=[]

    for idx in sorted_rank_idx[:word_num]:
      index.append(idx)
    
    #index.sort()
    for idx in index:
      keywords.append(self.idx2word[idx])
    
    return keywords

In [37]:
url = 'http://v.media.daum.net/v/20170611192209012?rcmd=r'
textrank = TextRank(url)

for row in textrank.summarize(3):
  print(row)
  print()

print('keywords :',textrank.keywords())

AttributeError: ignored