In [1]:
try:
    import jpype
except:
    import jpype
from konlpy.tag import Kkma
from konlpy.tag import Okt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import normalize
import numpy as np
import re
import pandas as pd
import csv
import requests
from lxml.html import fromstring
from newspaper import Article

## CATEGORY
society: 사회 / sports: 스포츠 / politics: 정치 / economic: 경제 / foreign: 국제 / culture: 문화 / entertain: 연예 / digital: IT / editorial: 칼럼 / press: 보도자료

{CATEGORY}_texts: 카테고리 + 제목 + 내용

{CATEGORY}_summary: 요약

{CATEGORY}_keywords: 키워드

{CATEGORY}_sentiment: 감정분석

In [2]:
society = []
sports = []
politics = []
economic = []
foreign = []
culture = []
entertain = []
digital = []
editorial = []
press = []

In [3]:
def news_link(category):
    for sub in ['category']:
        for i in range(1,3):
            url_page = '?page='
            url = 'https://news.daum.net/breakingnews/' + sub + url_page + str(i)
            res = requests.get(url)
            
            parser = fromstring(res.text)
        
            article_list = parser.xpath('//div[@class="box_etc"]')

            parsed_articles = article_list[0].xpath('.//li')

            for article in parsed_articles:
                parsed_link = article.xpath('.//a[@href]')
                
                link = parsed_link[0].get('href')
                category.append(link)
    return category

In [4]:
news_link(society)
news_link(sports)
news_link(politics)
news_link(economic)
news_link(foreign)
news_link(culture)
news_link(entertain)
news_link(digital)
news_link(editorial)
news_link(press)

['https://v.daum.net/v/20210515191056033',
 'https://v.daum.net/v/20210515191034032',
 'https://v.daum.net/v/20210515191033031',
 'https://v.daum.net/v/20210515191010030',
 'https://v.daum.net/v/20210515191007029',
 'https://v.daum.net/v/20210515191004028',
 'https://v.daum.net/v/20210515191003027',
 'https://v.daum.net/v/20210515191001026',
 'https://v.daum.net/v/20210515190958025',
 'https://v.daum.net/v/20210515190943024',
 'https://v.daum.net/v/20210515190912023',
 'https://v.daum.net/v/20210515190903022',
 'https://v.daum.net/v/20210515190902021',
 'https://v.daum.net/v/20210515190901020',
 'https://v.daum.net/v/20210515190900019',
 'https://v.daum.net/v/20210515191056033',
 'https://v.daum.net/v/20210515191034032',
 'https://v.daum.net/v/20210515191033031',
 'https://v.daum.net/v/20210515191010030',
 'https://v.daum.net/v/20210515191007029',
 'https://v.daum.net/v/20210515191004028',
 'https://v.daum.net/v/20210515191003027',
 'https://v.daum.net/v/20210515191001026',
 'https://v

In [5]:
def split(category, category_ko):
    texts = []
    sentences = []
    for link in category:
        article = Article(link, language='ko')
        article.download()
        article.parse()
        titles = article.title
        sentence = article.text
        
        if sentence:    # 내용이 없으면 건너뜀
            text = "<" + category_ko + ">" + "제목:" + titles + "내용:" + sentence
            texts.append(text)
            sentences.append(sentence)

    return texts, sentences

In [6]:
society_texts, society_sentences = split(society, '사회')
sports_texts, sports_sentences = split(sports, '스포츠')
politics_texts, politics_sentences = split(politics, '정치')
economic_texts, economic_sentences = split(economic, '경제')
foreign_texts, foreign_sentences = split(foreign, '국제')
culture_texts, culture_sentences = split(culture, '문화')
entertain_texts, entertain_sentences = split(entertain, '연예')
digital_texts, digital_sentences = split(digital, 'IT')
editorial_texts, editorial_sentences = split(editorial, '칼럼')
press_texts, press_sentences = split(press, '보도자료')

In [7]:
def summary(_sentences, words):
    ## GraphMatrix
    tfidf = TfidfVectorizer()
    cnt_vec = CountVectorizer()
    sentence_graph = []

    # build_sent_graph
    tfidf_mat = tfidf.fit_transform(words).toarray()
    sentence_graph = np.dot(tfidf_mat, tfidf_mat.T)

    # build_words_graph
    data = cnt_vec.fit_transform(words).toarray()
    cnt_vec_mat = normalize(data, axis=0)
    vocab = cnt_vec.vocabulary_
    words_graph = np.dot(cnt_vec_mat.T, cnt_vec_mat)
    idx2word = {vocab[word] : word for word in vocab}
    
    ## Rank
    # sentence graph
    A = sentence_graph
    d = 0.85
    matrix_size = A.shape[0]

    for id in range(matrix_size):
        A[id, id] = 0 # diagonal 부분을 0으로
        link_sum = np.sum(A[:,id]) # A[:, id] = A[:][id]
        if link_sum != 0:
                A[:, id] /= link_sum
        A[:, id] *= -d
        A[id, id] = 1

    B = (1-d) * np.ones((matrix_size, 1))
    ranks = np.linalg.solve(A, B) # 연립방정식 Ax = b

    sentence_graph_idx = {idx: r[0] for idx, r in enumerate(ranks)}

    sorted_sentence_rank_idx = sorted(sentence_graph_idx, key=lambda k: sentence_graph_idx[k], reverse=True)

    # word graph
    A = words_graph
    d = 0.85
    matrix_size = A.shape[0]

    for id in range(matrix_size):
        A[id, id] = 0 # diagonal 부분을 0으로
        link_sum = np.sum(A[:,id]) # A[:, id] = A[:][id]
        if link_sum != 0:
                A[:, id] /= link_sum
        A[:, id] *= -d
        A[id, id] = 1

    B = (1-d) * np.ones((matrix_size, 1))
    ranks = np.linalg.solve(A, B) # 연립방정식 Ax = b

    words_graph_idx = {idx: r[0] for idx, r in enumerate(ranks)}

    sorted_words_rank_idx = sorted(words_graph_idx, key=lambda k: words_graph_idx[k], reverse=True)

    # summarize
    sent_num=3
    summary = []
    index=[]

    for idx in sorted_sentence_rank_idx[:sent_num]:
        index.append(idx)

    index.sort()
    for idx in index:
        summary.append(_sentences[idx])

    # keywords
    word_num=10
    keywords = []
    index=[]
    summarys=[]

    for idx in sorted_words_rank_idx[:word_num]:
        index.append(idx)
        
    #index.sort()
    for idx in index:
        keywords.append(idx2word[idx])

    count = 0
    
    for row in summary:
        print(row)
        summarys.append(row)
        print()
        count = count + 1
        if(count > 3):
            break
    
    print('keywords :', keywords)
    return summarys, keywords

In [8]:
def text_processing(start, end, _sentences):
    table = dict()
    with open('./polarity.csv', 'r', -1, 'utf-8') as polarity:
        next(polarity)
        
        for line in csv.reader(polarity):
            key = str()
            for word in line[0].split(';'):
                key += word.split('/')[0]
            table[key] = {'Neg': line[3], 'Neut': line[4], 'Pos': line[6]}

    columns=['negative', 'neutral', 'positive']
    df = pd.DataFrame(columns=columns)

    file_stop_word = open('./stop_words_file.txt', 'r', -1, 'utf-8')
    stop_word = file_stop_word.read()
    stop_word_list = []
    negative_list = []
    neutral_list = []
    positive_list = []
    for word in stop_word.split(','):
        if word not in stop_word_list:
            stop_word_list.append(word)
            file_stop_word.close()

    for i in range(start, end):
        words = str(_sentences)
        hangul = re.compile('[^ ㄱ-ㅣ가-힣]+')
        words = hangul.sub('', words)
        words_list = []
        for i in words:
            if i not in stop_word_list:
                words_list.append(i)

    negative = 0
    neutral = 0
    positive = 0

    for word in words_list:
        if word in table:
            negative += float(table[word]['Neg'])
            neutral += float(table[word]['Neut'])
            positive += float(table[word]['Pos'])
  
    negative_list.append(negative)
    neutral_list.append(neutral)
    positive_list.append(positive)

    df['negative'] = negative_list
    df['neutral'] = neutral_list
    df['positive'] = positive_list

    return df

In [9]:
def sentiment_analysis(_sentences):
    df = text_processing(0,366, _sentences)

    df.to_csv('./result.csv', index=False)

    ds = pd.read_csv('./result.csv')
    ds == ds.values.max()
    ids, cols = np.where(ds == ds.values.max())
    list(zip(ids, cols))
    sen = [ds.columns[c] for c in cols]
    return sen

In [10]:
okt = Okt()
kkma = Kkma()

In [11]:
def data(category_sentences):
    for i in category_sentences:
        _sentences = kkma.sentences(i)
        
        words = []

        for j in _sentences:
            word = okt.nouns(i)
            word_str = ' '.join(word)
            words.append(word_str)

        sentiment_result = sentiment_analysis(_sentences)
        summary_result, keywords = summary(_sentences, words)

    return summary_result, keywords, sentiment_result

In [14]:
society_summary, society_keywords, society_sentiment = data(society_sentences)
sports_summary, sports_keywords, sports_sentiment = data(sports_sentences)
politics_summary, politics_keywords, politics_sentiment = data(politics_sentences)
economic_summary, economic_keywords, economic_sentiment = data(economic_sentences)
foreign_summary, foreign_keywords, foreign_sentiment = data(foreign_sentences)
culture_summary, culture_keywords, culture_sentiment = data(culture_sentences)
entertain_summary, entertain_keywords, entertain_sentiment = data(entertain_sentences)
digital_summary, digital_keywords, digital_sentiment = data(digital_sentences)
editorial_summary, editorial_keywords, editorial_sentiment = data(editorial_sentences)
press_summary, press_keywords, press_sentiment = data(press_sentences)

배우 조진웅이 13일 오후 경기 고양시 일산 킨 텍스에서 열린 ' 제 57회 백상예술대상' 시상식 레드 카펫 행사에 참석해 포즈를 취하고 있다.

고양= 김 진경 기자 kim.jinkyung @jtbc .co .kr /2021.05 .13/

keywords : ['일산', '고양시', '김진경', '시상식', '참석', '고양', '레드카펫', '배우', '백상예술대상', '오후']
퀴 라 소 축구협회는 15일 공식 페이스 북에 이 같은 사실을 알리며 “ 패트릭 클 루이 베르트가 임시 감독으로 팀을 이끌 것” 이라고 전했다.

히 딩크 감독은 70대 중반의 고령에도 지난해 8월 퀴 라 소 축구 대표팀 감독직과 기술위원장을 맡았다.

퀴 라 소 축구협회는 이날 “ 히 딩크 감독의 빠른 쾌유를 바란다 ”며 “ 클 루이 베르트가 기꺼이 팀을 맡아 준 것에 감사를 표한다” 고 밝혔다.

keywords : ['거스', '구단', '국가', '달라', '선두', '진출', '감사', '공식', '다음', '소년']
강원 FC의 부진이 길어 지고 있다.

강원은 15일 오후 4시 30분에 춘천 송 암 스포츠 타운에서 열린 ' 하나 원 큐 K 리그 1 2021' 16 라운드에서 수원 FC와 0-0으로 비겼다.

이후 수원 FC에게 1-2 패배를 시작으로 광주에게 0-1로 패하며 연패에 빠졌다.

keywords : ['강원', '결과', '광주', '수원', '스포츠', '시작', '연패', '전체', '강호', '기반']
배우 김수 현이 13일 오후 경기 고양시 일산 킨 텍스에서 열린 ' 제 57회 백상예술대상' 시상식 레드 카펫 행사에 참석해 포즈를 취하고 있다.

고양= 김 진경 기자 kim.jinkyung @jtbc .co .kr /2021.05 .13/

keywords : ['오후', '고양시', '김수현', '백상예술대상', '참석', '고양', '김진경', '레드카펫', '배우', '시상식']
이에 관계자는 " 마 독스가 직접 참여한 가사를 통해 따뜻한 위

NameError: name 'editorial_sentences' is not defined