In [166]:
import getpass
import os
import urllib.parse
from urllib.request import urlopen

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pymysql
from bs4 import BeautifulSoup
from IPython.core.display import display, HTML
from konlpy.tag import Kkma
from konlpy.tag import Okt
from newspaper import Article
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize

In [167]:
class SentenceTokenizer(object):
    """Split text into sentences and reduce each sentence to its nouns.

    Sentence splitting is delegated to ``Kkma.sentences`` and noun
    extraction to ``Okt.nouns``.
    """

    def __init__(self):
        self.kkma = Kkma()
        self.twitter = Okt()
        # TODO (original note): find a more efficient way to edit or extend the stopwords.
        self.stopwords = ['', '\n', '중인' ,'만큼', '마찬가지', '꼬집었', "한국경제", "데일리", "동아일보", "중앙일보", "조선일보", "기자", "어", "나", "우리", "저희", "따라", "의해", "을", "를", "에", "의", "가"]

    # TODO (original note): revise the crawling part.

    def text2sentences(self, text):
        """Return the list of sentences that ``Kkma.sentences`` finds in ``text``."""
        sentences = self.kkma.sentences(text)
        # Disabled step kept from the original: merge sentences of 10
        # characters or fewer into the previous sentence.
        # for idx in range(0, len(sentences)):
        #     if len(sentences[idx]) <= 10:
        #         sentences[idx-1] += (' ' + sentences[idx])
        #         sentences[idx] = ''
        return sentences

    def get_nouns(self, sentences):
        """Return one space-joined noun string per input sentence.

        Nouns that are listed in ``self.stopwords`` or that are a single
        character long are dropped.  The result always has the same length as
        ``sentences``: an empty sentence yields an empty string, so index ``i``
        of the result keeps referring to ``sentences[i]``.

        Args:
            sentences: iterable of sentences (each is passed through ``str``).

        Returns:
            list[str]: noun strings, positionally aligned with ``sentences``.
        """
        # Set lookup instead of scanning the stopword list for every noun.
        stopwords = set(self.stopwords)
        nouns = []
        for sentence in sentences:
            # Compare by value; ``is not ''`` tests object identity.
            if sentence == '':
                # Keep a placeholder so later rank indices still match
                # the positions in ``sentences``.
                nouns.append('')
                continue
            kept = [noun for noun in self.twitter.nouns(str(sentence))
                    if noun not in stopwords and len(noun) > 1]
            nouns.append(' '.join(kept))
        return nouns

In [168]:
class GraphMatrix(object):
    """Build sentence-level and word-level graphs from noun strings."""

    def __init__(self):
        self.tfidf = TfidfVectorizer()
        self.cnt_vec = CountVectorizer()
        self.graph_sentence = []

    def build_sent_graph(self, sentence):
        """Fit TF-IDF on ``sentence`` and return the matrix times its transpose.

        The result is also stored on ``self.graph_sentence``.
        """
        weights = self.tfidf.fit_transform(sentence).toarray()
        sent_graph = np.dot(weights, weights.T)
        self.graph_sentence = sent_graph
        return sent_graph

    def build_words_graph(self, sentence):
        """Return ``(word_graph, index_to_word)`` built from term counts.

        The count matrix is cast to float and passed through ``normalize``
        with ``axis=0``; the graph is its transpose times itself.  The second
        element maps each vocabulary index back to its word.
        """
        counts = self.cnt_vec.fit_transform(sentence).toarray().astype(float)
        scaled = normalize(counts, axis=0)
        word_graph = np.dot(scaled.T, scaled)
        index_to_word = {}
        for term, position in self.cnt_vec.vocabulary_.items():
            index_to_word[position] = term
        return word_graph, index_to_word

# TODO: decide whether the remaining reference code (the commented-out draw_graph below) is still needed.
#     def draw_graph(self, graph):
#         graph = nx.from_numpy_matrix(self.graph_sentence, create_using=nx.MultiDiGraph())
#         pos = nx.circular_layout(graph)
#         nx.draw_circular(graph)
#         labels = {i : i for i in graph.nodes()}
#         nx.draw_networkx_labels(graph, pos, labels, font_size=15)
#         plt.show()

In [169]:
class Rank(object):
    """Score the nodes of a weighted graph by solving a damped linear system."""

    def get_ranks(self, graph, d=0.85):
        """Return ``{node_index: rank}`` for a square weight matrix.

        The diagonal is cleared, every column with a non-zero sum is divided
        by that sum, and the system ``(I - d * M) x = (1 - d) * 1`` is solved
        with ``np.linalg.solve``.

        The computation runs on a float copy, so ``graph`` itself is never
        modified and integer-typed matrices are accepted.

        Args:
            graph: square 2-D array-like of edge weights.
            d: damping factor.

        Returns:
            dict: maps each row index to its rank value.

        Raises:
            numpy.linalg.LinAlgError: if the resulting system is singular.
        """
        # Float copy: leaves the caller's matrix intact and makes the
        # division below valid for integer input.
        A = np.array(graph, dtype=float)
        matrix_size = A.shape[0]

        np.fill_diagonal(A, 0.0)  # ignore self-links
        link_sums = A.sum(axis=0)
        # Columns whose sum is zero are left as they are (divide by 1).
        A /= np.where(link_sums != 0, link_sums, 1.0)
        A *= -d
        np.fill_diagonal(A, 1.0)

        B = (1 - d) * np.ones((matrix_size, 1))
        ranks = np.linalg.solve(A, B)  # linear system A x = B
        return {idx: r[0] for idx, r in enumerate(ranks)}

In [170]:
class TextRank(object):
    """Rank the sentences and words of a text and expose summaries/keywords.

    All ranking happens in ``__init__``; ``summarize`` and ``keywords`` only
    read the stored results.
    """

    def __init__(self, text):
        """Tokenize ``text``, build both graphs and rank their nodes."""
        self.sent_tokenize = SentenceTokenizer()

        self.sentences = self.sent_tokenize.text2sentences(text)
        self.nouns = self.sent_tokenize.get_nouns(self.sentences)

        self.graph_matrix = GraphMatrix()
        self.sent_graph = self.graph_matrix.build_sent_graph(self.nouns)
        self.words_graph, self.idx2word = self.graph_matrix.build_words_graph(self.nouns)

        self.rank = Rank()
        self.sent_rank_idx = self.rank.get_ranks(self.sent_graph)
        # Indices ordered from highest to lowest rank.
        self.sorted_sent_rank_idx = sorted(self.sent_rank_idx, key=lambda k: self.sent_rank_idx[k], reverse=True)

        self.word_rank_idx = self.rank.get_ranks(self.words_graph)
        self.sorted_word_rank_idx = sorted(self.word_rank_idx, key=lambda k: self.word_rank_idx[k], reverse=True)

    # TODO (original note): decide how the output format should change.
    def summarize(self, sent_num=3):
        """Return the top-ranked sentences in their original text order.

        Args:
            sent_num: number of sentences to return.  When the text has 50
                or more sentences this value is replaced by 5.

        Returns:
            list: the selected sentences, ordered by position in the text.
        """
        if len(self.sentences) >= 50:
            sent_num = 5
        # Pick by rank, then restore document order.
        chosen = sorted(self.sorted_sent_rank_idx[:sent_num])
        return [self.sentences[idx] for idx in chosen]

    # TODO (original note): not used in this project; check whether this and
    # any other parts can be removed.
    def keywords(self, word_num=10):
        """Return up to ``word_num`` words, highest rank first."""
        # Reuse the ranking computed in __init__ instead of solving the
        # same word graph a second time.
        return [self.idx2word[idx] for idx in self.sorted_word_rank_idx[:word_num]]


In [171]:
def get_art_body(URL):
    """Download and parse the article at ``URL``; return ``(title, text)``."""
    article = Article(URL, language='ko')
    article.download()
    article.parse()
    title, body = article.title, article.text
    return (title, body)

def get_articles_list(url="https://www.hankyung.com/society/1002?hkonly=true", max_articles=15):
    """Crawl an article listing page and summarize the linked articles.

    Args:
        url: listing page to fetch; anchors are selected with
            ``div.article > span > a``.
        max_articles: only the first ``max_articles`` anchors are considered.

    Returns:
        list: one ``[title, summary, link]`` entry per article, where
        ``summary`` is the TextRank sentences joined with ``"-"``.
    """
    with urlopen(url) as response:
        soup = BeautifulSoup(response.read(), 'html.parser')
    anchors = soup.select('div.article > span > a')

    articles = []
    for anchor in anchors[:max_articles]:
        href = anchor.get('href')
        if not href:
            # An anchor without a target cannot be downloaded.
            continue
        # Absolute hrefs pass through unchanged; relative ones are resolved
        # against the listing page.
        link = urllib.parse.urljoin(url, href)
        title, body = get_art_body(link)
        summary = "-".join(TextRank(body).summarize())
        articles.append([title, summary, link])  # [title, summary, link]

    return articles

In [173]:
# Crawl before connecting, so a failing crawl cannot leave a database
# connection open.
atcList = get_articles_list()

# Connect. Settings come from the environment; the password is never stored
# in the notebook and is prompted for when NEWS_DB_PASSWORD is not set.
conn = pymysql.connect(
    host=os.environ.get('NEWS_DB_HOST', '127.0.0.1'),
    port=int(os.environ.get('NEWS_DB_PORT', '3306')),
    user=os.environ.get('NEWS_DB_USER', 'root'),
    passwd=os.environ.get('NEWS_DB_PASSWORD') or getpass.getpass('MySQL password: '),
    db=os.environ.get('NEWS_DB_NAME', 'test'),
    charset='utf8',
)
try:
    sql = 'INSERT INTO news (title,content,link) VALUES (%s, %s, %s)'
    for a in atcList:
        with conn.cursor() as cursor:
            cursor.execute(sql, (a[0], a[1], a[2]))
            # Read the id while the cursor is still open.
            row_id = cursor.lastrowid
        conn.commit()
        print(row_id)
finally:
    conn.close()

"conn = pymysql.connect(host='127.0.0.1', port=3306, user='root', passwd='171217', db='test', charset='utf8')\natcList = get_articles_list()\ntry:\n  for a in atcList:\n    with conn.cursor() as cursor:\n      sql = 'INSERT INTO news (title,content,link) VALUES (%s, %s, %s)'\n      cursor.execute(sql, (a[0],a[1],a[2]))\n    conn.commit()\n    print(cursor.lastrowid)\n    \nfinally:\n    conn.close()"