In [1]:
try:
    import jpype
except:
    import jpype
from konlpy.tag import Kkma
from konlpy.tag import Okt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import normalize
import numpy as np
import re
import pandas as pd
import csv
import requests
from lxml.html import fromstring
from newspaper import Article

## CATEGORY
society: 사회 / sports: 스포츠 / politics: 정치 / economic: 경제 / foreign: 국제 / culture: 문화 / entertain: 연예 / digital: IT / editorial: 칼럼 / press: 보도자료

{CATEGORY}_texts: 카테고리 + 제목 + 내용

{CATEGORY}_summary: 요약

{CATEGORY}_keywords: 키워드

{CATEGORY}_sentiment: 감정분석

In [2]:
# One independent, initially empty link list per news section.
# A generator is unpacked so every name gets its own list object.
(society, sports, politics, economic, foreign,
 culture, entertain, digital, editorial, press) = ([] for _ in range(10))

In [3]:
def news_link(category, sub='category', pages=range(1, 3)):
    """Append article links scraped from Daum breaking-news listing pages.

    Parameters
    ----------
    category : list
        Target list; every link found is appended to it in place.
    sub : str, optional
        Path segment inserted after ``/breakingnews/`` in the listing URL
        (for example ``'society'``). The default is the literal
        ``'category'`` that this function previously hard-coded, so existing
        one-argument calls request the same URL as before. Pass the section
        slug explicitly to scrape a specific section.
    pages : iterable of int, optional
        Listing page numbers to request. Defaults to pages 1 and 2.

    Returns
    -------
    list
        The same ``category`` list that was passed in.

    Raises
    ------
    requests.RequestException
        If a request times out, fails, or returns an HTTP error status.
    """
    base_url = 'https://news.daum.net/breakingnews/'

    for page in pages:
        url = base_url + sub + '?page=' + str(page)
        # A timeout keeps a stalled connection from hanging the notebook.
        res = requests.get(url, timeout=10)
        res.raise_for_status()

        parser = fromstring(res.text)
        article_list = parser.xpath('//div[@class="box_etc"]')
        if not article_list:
            # No listing container on this page; nothing to collect.
            continue

        for article in article_list[0].xpath('.//li'):
            parsed_link = article.xpath('.//a[@href]')
            if not parsed_link:
                # List item without an anchor; skip instead of raising IndexError.
                continue
            category.append(parsed_link[0].get('href'))

    return category

In [4]:
# Fill each section's list; news_link() appends to the list it receives.
for link_list in (society, sports, politics, economic, foreign,
                  culture, entertain, digital, editorial):
    news_link(link_list)

# Kept as the cell's final expression so the notebook still displays the result.
news_link(press)

['https://v.daum.net/v/20210510190429775',
 'https://v.daum.net/v/20210510190419774',
 'https://v.daum.net/v/20210510190418773',
 'https://v.daum.net/v/20210510190417772',
 'https://v.daum.net/v/20210510190410771',
 'https://v.daum.net/v/20210510190409770',
 'https://v.daum.net/v/20210510190408769',
 'https://v.daum.net/v/20210510190405767',
 'https://v.daum.net/v/20210510190402766',
 'https://v.daum.net/v/20210510190401765',
 'https://v.daum.net/v/20210510190401764',
 'https://v.daum.net/v/20210510190344763',
 'https://v.daum.net/v/20210510190342762',
 'https://v.daum.net/v/20210510190339761',
 'https://v.daum.net/v/20210510190326760',
 'https://v.daum.net/v/20210510190429775',
 'https://v.daum.net/v/20210510190419774',
 'https://v.daum.net/v/20210510190418773',
 'https://v.daum.net/v/20210510190417772',
 'https://v.daum.net/v/20210510190410771',
 'https://v.daum.net/v/20210510190409770',
 'https://v.daum.net/v/20210510190408769',
 'https://v.daum.net/v/20210510190405767',
 'https://v

In [14]:
def split(category, category_ko):
    """Download each linked article and return labelled texts plus raw bodies.

    Parameters
    ----------
    category : iterable of str
        Article URLs.
    category_ko : str
        Category label placed between ``<`` and ``>`` at the start of each text.

    Returns
    -------
    tuple of (list of str, list of str)
        ``texts`` holds the label, title and body joined into one string per
        article; ``sentences`` holds only the article bodies, in the same order.
    """
    texts, sentences = [], []

    for url in category:
        page = Article(url, language='ko')
        page.download()
        page.parse()

        headline = page.title
        body = page.text

        texts.append("".join(("<", category_ko, ">", "제목:", headline, "내용:", body)))
        sentences.append(body)

    return texts, sentences

In [15]:
# Download every category's articles.
# <name>_texts: labelled "<category>제목:...내용:..." strings,
# <name>_sentences: raw article bodies consumed by the analysis cells below.
society_texts, society_sentences = split(society, '사회')
sports_texts, sports_sentences = split(sports, '스포츠')
politics_texts, politics_sentences = split(politics, '정치')
economic_texts, economic_sentences = split(economic, '경제')
foreign_texts, foreign_sentences = split(foreign, '국제')
# Name fixed (was `culture__sentences`), since later cells read `culture_sentences`.
culture_texts, culture_sentences = split(culture, '문화')
# Entertain and editorial each get their own names; both were previously
# assigned to `split_texts, split_sentences`, so one overwrote the other.
entertain_texts, entertain_sentences = split(entertain, '연예')
digital_texts, digital_sentences = split(digital, 'IT')
editorial_texts, editorial_sentences = split(editorial, '칼럼')
press_texts, press_sentences = split(press, '보도자료')

In [17]:
def _rank_nodes(graph, d=0.85):
    """Return node indices of ``graph`` ordered from highest to lowest rank.

    Builds the damped linear system from a square similarity matrix and
    solves it with ``np.linalg.solve``. Ties keep their original index order.
    """
    A = np.array(graph, dtype=float)  # copy, so the caller's matrix is not modified
    matrix_size = A.shape[0]

    for col in range(matrix_size):
        A[col, col] = 0  # zero the diagonal before normalising the column
        link_sum = np.sum(A[:, col])
        if link_sum != 0:
            A[:, col] /= link_sum
        A[:, col] *= -d
        A[col, col] = 1

    B = (1 - d) * np.ones((matrix_size, 1))
    ranks = np.linalg.solve(A, B)  # solve the linear system A x = B
    scores = ranks[:, 0]

    return sorted(range(matrix_size), key=lambda k: scores[k], reverse=True)


def summary(_sentences, words, sent_num=3, word_num=10):
    """Pick the top-ranked sentences and keywords of one document.

    Parameters
    ----------
    _sentences : sequence of str
        Sentences of the document; the summary is drawn from these.
    words : sequence of str
        One space-separated token string per sentence, in the same order as
        ``_sentences``. These strings are what gets vectorized.
    sent_num : int, optional
        Number of top-ranked sentences to return (default 3).
    word_num : int, optional
        Number of top-ranked keywords to return (default 10).

    Returns
    -------
    tuple of (list of str, list of str)
        The selected sentences in document order, and the keywords in
        descending rank order.

    Notes
    -----
    Prints each selected sentence followed by a blank line, then the keywords.
    """
    tfidf = TfidfVectorizer()
    cnt_vec = CountVectorizer()

    # Sentence graph: TF-IDF rows multiplied with their transpose.
    tfidf_mat = tfidf.fit_transform(words).toarray()
    sentence_graph = np.dot(tfidf_mat, tfidf_mat.T)

    # Word graph: column-normalised count matrix multiplied with itself.
    counts = cnt_vec.fit_transform(words).toarray()
    cnt_vec_mat = normalize(counts, axis=0)
    vocab = cnt_vec.vocabulary_
    words_graph = np.dot(cnt_vec_mat.T, cnt_vec_mat)
    idx2word = {vocab[word]: word for word in vocab}

    sorted_sentence_rank_idx = _rank_nodes(sentence_graph)
    sorted_words_rank_idx = _rank_nodes(words_graph)

    # Top sentences are restored to document order; keywords stay in rank order.
    top_sentence_idx = sorted(sorted_sentence_rank_idx[:sent_num])
    summarys = [_sentences[idx] for idx in top_sentence_idx]
    keywords = [idx2word[idx] for idx in sorted_words_rank_idx[:word_num]]

    for row in summarys:
        print(row)
        print()

    print('keywords :', keywords)
    return summarys, keywords

In [18]:
def text_processing(start, end, _sentences,
                    polarity_path='f:/eunjin/OSSW-Project/Sentiment-analysis/polarity.csv',
                    stop_words_path='f:/eunjin/OSSW-Project/Sentiment-analysis/stop_words_file.txt'):
    """Sum lexicon polarity scores over the words of the given sentences.

    Parameters
    ----------
    start, end : int
        Slice bounds; only ``_sentences[start:end]`` is scored.
    _sentences : sequence of str or str
        Sentences to score. A single string is treated as one sentence.
    polarity_path : str, optional
        CSV lexicon with a header row. Column 0 holds ``;``-separated
        ``morpheme/tag`` items whose morphemes are concatenated into the lookup
        key; columns 3, 4 and 6 are read as the negative, neutral and positive
        scores. The default is the machine-specific path this notebook used.
    stop_words_path : str, optional
        Text file of comma-separated stop words. The default is the
        machine-specific path this notebook used.

    Returns
    -------
    pandas.DataFrame
        One row with the columns ``negative``, ``neutral`` and ``positive``.
    """
    # Load the polarity lexicon.
    table = dict()
    with open(polarity_path, 'r', encoding='utf-8', newline='') as polarity:
        reader = csv.reader(polarity)
        next(reader, None)  # skip the header row (tolerates an empty file)

        for line in reader:
            if len(line) < 7:
                # Blank or truncated row: the score columns are missing.
                continue
            key = ''.join(word.split('/')[0] for word in line[0].split(';'))
            table[key] = {'Neg': line[3], 'Neut': line[4], 'Pos': line[6]}

    # Load stop words; the context manager closes the file on every path.
    with open(stop_words_path, 'r', encoding='utf-8') as file_stop_word:
        stop_words = {word.strip() for word in file_stop_word.read().split(',')}
    stop_words.discard('')

    if isinstance(_sentences, str):
        _sentences = [_sentences]

    # Keep only Hangul and spaces, then split into whitespace-delimited words.
    # The previous version iterated over single characters, so neither stop
    # words nor multi-character lexicon keys could ever match.
    # NOTE(review): morpheme-level tokenisation may match the lexicon better
    # than whitespace tokens - confirm against the lexicon's key format.
    hangul = re.compile('[^ ㄱ-ㅣ가-힣]+')
    words_list = []
    for sentence in _sentences[start:end]:
        cleaned = hangul.sub('', str(sentence))
        words_list.extend(token for token in cleaned.split() if token not in stop_words)

    negative = 0.0
    neutral = 0.0
    positive = 0.0

    for word in words_list:
        scores = table.get(word)
        if scores is not None:
            negative += float(scores['Neg'])
            neutral += float(scores['Neut'])
            positive += float(scores['Pos'])

    columns = ['negative', 'neutral', 'positive']
    return pd.DataFrame(
        {'negative': [negative], 'neutral': [neutral], 'positive': [positive]},
        columns=columns,
    )

In [19]:
def sentiment_analysis(_sentences):
    """Return the polarity column name(s) holding the highest score.

    Scores come from ``text_processing(0, 366, _sentences)``. They are written
    to ``./result.csv`` and read back before the maximum is located.

    Returns
    -------
    list
        Labels of every column whose value equals the overall maximum; more
        than one label is returned when scores tie.
    """
    scores = text_processing(0,366, _sentences)
    scores.to_csv('./result.csv', index=False)

    reloaded = pd.read_csv('./result.csv')
    peak = reloaded.values.max()
    _, col_positions = np.where(reloaded == peak)

    return [reloaded.columns[pos] for pos in col_positions]

In [20]:
# Shared analyzer instances, created once: data() uses okt for noun
# extraction and kkma for sentence splitting.
okt, kkma = Okt(), Kkma()

In [25]:
def data(category_sentences):
    """Run sentiment analysis, summarisation and keyword extraction per article.

    Parameters
    ----------
    category_sentences : iterable of str
        Article bodies of one category.

    Returns
    -------
    tuple
        ``(summary_result, keywords, sentiment_result)`` of the last article
        that could be processed, or three empty lists when there was none.

    Notes
    -----
    NOTE(review): every article is analysed (and its summary printed), but
    only the last article's results are returned. The return shape is kept
    for existing callers - confirm whether per-article results should be
    collected instead.
    """
    summary_result, keywords, sentiment_result = [], [], []

    for article_text in category_sentences:
        _sentences = kkma.sentences(article_text)

        # One space-joined noun string per sentence. Nouns are taken from the
        # sentence itself; previously they came from the whole article for
        # every sentence, which made all rows identical.
        words = [' '.join(okt.nouns(sentence)) for sentence in _sentences]

        if not any(words):
            # No sentences or no nouns: there is nothing to vectorize, so skip.
            continue

        sentiment_result = sentiment_analysis(_sentences)
        summary_result, keywords = summary(_sentences, words)

    return summary_result, keywords, sentiment_result

In [None]:
# Analyse each category: data() returns a (summary, keywords, sentiment) triple
# that is unpacked into the per-category result names.
# NOTE(review): every <name>_sentences variable read here must have been
# assigned by the earlier split() cell - verify the names match exactly.
society_summary, society_keywords, society_sentiment = data(society_sentences)
sports_summary, sports_keywords, sports_sentiment = data(sports_sentences)
politics_summary, politics_keywords, politics_sentiment = data(politics_sentences)
economic_summary, economic_keywords, economic_sentiment = data(economic_sentences)
foreign_summary, foreign_keywords, foreign_sentiment = data(foreign_sentences)
culture_summary, culture_keywords, culture_sentiment = data(culture_sentences)
entertain_summary, entertain_keywords, entertain_sentiment = data(entertain_sentences)
digital_summary, digital_keywords, digital_sentiment = data(digital_sentences)
editorial_summary, editorial_keywords, editorial_sentiment = data(editorial_sentences)
press_summary, press_keywords, press_sentiment = data(press_sentences)