In [None]:
import re
import requests
from bs4 import BeautifulSoup
from konlpy.tag import Okt
from collections import Counter
import re

class NewsCrawler(object):
    """Crawl one press's ranking page on Naver News and fetch the listed articles.

    Calling the instance returns three parallel lists
    ``(news_list, title_list, section_list)`` holding, for every article that
    passed the filters in ``__call__``, the cleaned body text, the title and
    the section label.

    Args:
        press_url: URL of the press ranking page to crawl.
        timeout: Seconds to wait for each HTTP request before giving up.
    """

    BASE_URL = 'https://news.naver.com'
    HEADERS = {"User-Agent": "Mozilla/5.0"}
    # Articles whose cleaned body is not longer than this are dropped.
    MIN_ARTICLE_LENGTH = 150

    # Same character set as before, now a raw string so Python no longer
    # sees invalid escape sequences such as "\{" in a normal literal.
    _PUNCTUATION_PATTERN = r'[\{\}\[\]\/?.,;:|\)*~`!^\-_+<>◀◁▷▶■♥⑤◆※@\#$%&\\\=\(\"\"(\n)(\t)]'
    # Boilerplate phrases stripped from every article, applied in this order.
    _BOILERPLATE = (
        "오류를 우회하기 위한 함수 추가",
        "사진연합뉴스",
        "무단 전재 및 재배포 금지",
        "동영상 뉴스",
        "앵커",
    )

    def __init__(
            self,
            press_url: str,
            timeout: float = 10.0,
    ) -> None:
        self.press_url = press_url
        self.timeout = timeout

    def __call__(self):
        """Crawl the ranking page and return ``(news_list, title_list, section_list)``."""
        news_list = list()
        title_list = list()
        section_list = list()

        for url in self._url_crawling():
            news, section, title = self._news_crawling(url)
            # Skip pages that could not be parsed, bodies starting with 'if',
            # and bodies that are too short.
            if news is None or news[:2] == 'if' or len(news) <= self.MIN_ARTICLE_LENGTH:
                continue
            news_list.append(news)
            section_list.append(section)
            title_list.append(title)

        return news_list, title_list, section_list

    def _url_crawling(self) -> list:
        """Return the absolute article URLs linked from the ranking list.

        Returns an empty list when the ranking ``<ul>`` is not present in the
        page instead of failing on a ``None`` lookup result.
        """
        response = requests.get(self.press_url, headers=self.HEADERS, timeout=self.timeout)
        soup = BeautifulSoup(response.text, 'html.parser')

        ranking = soup.find('ul', class_='rankingnews_list type_detail')
        if ranking is None:
            return []

        url_list = list()
        for anchor in ranking.find_all('a'):
            # .get() returns None for anchors without an href; attrs['href'] would raise.
            href = anchor.get('href')
            if href is not None:
                url_list.append(self.BASE_URL + href)

        return url_list

    @staticmethod
    def _clean_text(text):
        """Remove Latin letters, listed punctuation/symbols and boilerplate phrases."""
        text = re.sub('[a-zA-Z]', "", text)
        text = re.sub(NewsCrawler._PUNCTUATION_PATTERN, "", text)
        for phrase in NewsCrawler._BOILERPLATE:
            text = text.replace(phrase, "")
        return text

    def _news_crawling(self, url: str):
        """Fetch one article and return ``(body, section, title)``.

        Returns ``(None, None, None)`` when the request fails or when the page
        lacks the body, title or section element, so that a single unusual
        page does not abort the crawl of the whole press.
        """
        try:
            response = requests.get(url, headers=self.HEADERS, timeout=self.timeout)
        except requests.RequestException:
            return None, None, None

        soup = BeautifulSoup(response.text, 'html.parser')
        body = soup.find(id='articleBodyContents')
        title = soup.find(id='articleTitle')
        section = soup.find('em', class_='guide_categorization_item')
        if body is None or title is None or section is None:
            return None, None, None

        all_news = self._clean_text(body.text).strip()
        return all_news, section.text.strip(), title.text.strip()

In [None]:
import pandas as pd
class NewsCollector(object):
    """Collect ranked articles of one press through ``NewsCrawler``.

    Args:
        press_name: One of the names returned by ``available_presses()``.

    Raises:
        KeyError: If ``press_name`` is not a known press.
    """

    BASE_URL = 'https://news.naver.com/main/ranking/office.nhn?officeId='

    # Single source of truth: press name -> office id appended to BASE_URL.
    _PRESS_CODES = {
        'KBS': '056', 'MBC': '214', 'SBS': '055', 'JTBC': '437', 'YTN': '052', '한국경제TV': '215',
        'TV조선': '448', '연합뉴스': '001', '연합뉴스TV': '422', '뉴스1': '421', '뉴시스': '003',
        '중앙일보': '025', '동아일보': '020', '매일경제': '009', '전자신문': '030', '이데일리': '018',
        '서울경제': '011', '머니투데이': '008', '채널A': '449',
    }

    def __init__(
            self,
            press_name: str,
    ) -> None:
        if press_name not in self.available_presses():
            raise KeyError('Unknown press : {}, available presses are {}'.format(press_name, self.available_presses()))

        self.press_name = press_name
        self.base_url = self.BASE_URL
        self.press_url = self.base_url + self._PRESS_CODES[self.press_name]

    def collect(self) -> list:
        """Return the unique ``(title, section, news)`` tuples of this press.

        Duplicates are removed while keeping first-seen order; a ``set`` would
        return the articles in an arbitrary order that changes between runs.
        """
        collector = NewsCrawler(self.press_url)
        news_list, title_list, section_list = collector()

        return list(dict.fromkeys(zip(title_list, section_list, news_list)))

    @staticmethod
    def available_presses() -> list:
        """Return the supported press names."""
        return list(NewsCollector._PRESS_CODES)

In [None]:
import pandas as pd

# Crawl every supported press and gather the articles into one DataFrame with
# the columns Title / Section / News.
dic = {'Title': list(), 'Section': list(), 'News': list()}
for press in NewsCollector.available_presses():
    news_collector = NewsCollector(press)
    try:
        arr = news_collector.collect()
    except Exception as e:
        # Crawling stays best-effort, but report which press was skipped and
        # why instead of hiding the failure.
        print('[skip] {}: {}: {}'.format(press, type(e).__name__, e))
        continue

    for title, section, news in arr:
        dic['Title'].append(title)
        dic['Section'].append(section)
        dic['News'].append(news)

df = pd.DataFrame(dic)

In [None]:
# Morphological analyser shared by the cells below.
okt = Okt()
# Whitespace-separated stopword list; the context manager closes the file,
# which the bare open() call never did.
with open("stopwords.txt", "r", encoding = 'UTF8') as fp:
    stopwords = fp.read().split()

def get_nouns(x):
    """Return the nouns of ``x`` longer than one character that are not stopwords."""
    return [
        token
        for token in okt.nouns(x)  # noun extraction
        if len(token) > 1 and token not in stopwords
    ]

def text_cleaning(text):#한글만 남기기
    hangul = re.compile('[^ ㄱ-ㅣ가-힣]+') 
    result = hangul.sub('', text)
    return result

def start(df,section):
    """Build the analysis frame for one section.

    Args:
        df: DataFrame with the columns 'Title', 'Section' and 'News'.
        section: Value of 'Section' whose rows are kept.

    Returns:
        A new DataFrame (``df`` is not modified) holding the matching rows,
        with 'News' replaced by title + body and the extra columns
        'only_hangul' and 'nouns'.
    """
    # Filter first, then copy: the new columns are written to an independent
    # frame rather than to a slice of another frame.
    selected = df[df['Section'] == section].copy()
    # Join with a space so the last word of the title and the first word of
    # the body are not fused into one token.
    selected['News'] = selected['Title'] + ' ' + selected['News']
    selected['only_hangul'] = selected['News'].apply(text_cleaning)  # Hangul only
    selected['nouns'] = selected['only_hangul'].apply(get_nouns)  # nouns only
    return selected

# Section analysed by the rest of the notebook; change it here to analyse
# another section.
TARGET_SECTION = '사회'

df2 = start(df, TARGET_SECTION)
# Plain list of the combined title + body texts, one entry per article.
data_split = df2['News'].tolist()

In [None]:
#topic class
class topic(object):
    """Frequency-based topic extraction over a list of article texts.

    Relies on the notebook globals ``okt`` (noun extractor) and ``stop_words``
    (words to ignore), which must be defined before any method is called.
    """

    # Maximum number of most frequent nouns kept per article.
    POCKET_SIZE = 20

    @staticmethod
    def _top_nouns(nouns, pocket_size):
        """Return the ``pocket_size`` most common usable nouns as (noun, count) pairs."""
        usable = [word for word in nouns if len(word) > 1 and word not in stop_words]
        return Counter(usable).most_common(pocket_size)

    def get_freq(self, data, pocket_size=POCKET_SIZE):
        """Return the most frequent nouns of one article.

        Args:
            data: Article text.
            pocket_size: Maximum number of (noun, count) pairs to return.

        Returns:
            List of (noun, count) pairs, most frequent first. One-character
            nouns and stopwords are excluded.
        """
        return self._top_nouns(okt.nouns(data), pocket_size)

    def get_topic(self, data, amount):
        """Return the ``amount`` nouns that are frequent in the most articles.

        A noun is counted once for every article in which it is among that
        article's most frequent nouns.
        """
        total = Counter()
        for article in data:
            total.update(word for word, _ in self.get_freq(article))

        return total.most_common(amount)

    # Extract the words associated with a specific keyword
    def get_topic_include(self, data, amount, keyword):
        """Like ``get_topic`` but restricted to articles whose nouns contain ``keyword``."""
        total = Counter()
        for article in data:
            # Tokenise once and reuse the result for both the membership test
            # and the frequency count.
            nouns = okt.nouns(article)
            if keyword not in nouns:
                continue

            total.update(word for word, _ in self._top_nouns(nouns, self.POCKET_SIZE))

        return total.most_common(amount)

In [None]:
#weight calculater
class calc(object):
    """Score articles against a list of weighted keywords.

    Args:
        data: List of article texts.

    Relies on the notebook global ``okt`` for noun extraction.
    """

    def __init__(self, data):
        self.data = data

    def cal_weight(self, includes):
        """Return one score per article in ``self.data``.

        Args:
            includes: Sequence of (word, weight) pairs.

        Returns:
            List with, for each article, the sum over all pairs of
            ``weight * (occurrences of word among the article's nouns /
            len(article))``. An empty article scores 0 instead of raising
            ZeroDivisionError.
        """
        result_list = []
        for article in self.data:
            if len(article) == 0:
                result_list.append(0)
                continue

            # Tokenise once per article instead of once per keyword.
            noun_counts = Counter(okt.nouns(article))
            result_sum = 0
            for words in includes:
                result_sum += words[1] * (noun_counts[words[0]] / len(article))
            result_list.append(result_sum)
        return result_list

In [None]:
#result
def result_generator(selec):
    """Score every article against the words related to one main topic.

    Uses the notebook globals ``main_topic``, ``topics``, ``data_split`` and
    ``calc``.

    Args:
        selec: Index into ``main_topic`` of the selected topic.

    Returns:
        List with one score per article of ``data_split``. The scores and the
        1-based position of the highest one are also printed.
    """
    keyword = main_topic[selec][0]
    include = topics.get_topic_include(data_split, 10, keyword)
    result_list = list(calc(data_split).cal_weight(include))

    # Print the selected keyword itself: include[0][0] is only the most common
    # related word, and indexing it fails when include is empty.
    print(keyword,"에 대해서")
    print(include)

    max_ = 0
    index_ = 0
    for i, value in enumerate(result_list, start=1):
        print(i,"번째기사 =", value)
        if value > max_:
            max_ = value
            index_ = i
    print("max:", index_,"번째기사 =", max_)
    return result_list

In [None]:
#main frame 1 - load data and declare shared objects
# NOTE(review): `doc` is not read by any code visible in this notebook — confirm before removing.
doc = []
# Topic extractor used by the cells below.
topics = topic()
# topic.get_freq reads the global name `stop_words`; alias the loaded stopword list to it.
stop_words = stopwords

In [None]:
#main frame 2 - topics over all articles > hot issues
# Top 5 (noun, count) pairs returned by topics.get_topic for the article list.
main_topic = topics.get_topic(data_split, 5)
print(main_topic)

In [None]:
#main frame 3 - related keywords per hot issue (re-analyse only the articles containing the word)
for issue in main_topic:
    # Up to 10 (noun, count) pairs from the articles whose nouns contain this issue word.
    include = topics.get_topic_include(data_split, 10, issue[0])
    print("<Topic:", issue[0],">")
    print(include, "\n")

In [None]:
#test1_사용자 입력
# NOTE(review): `status` and `choice` are not read by any code visible in this
# notebook; kept in case other cells rely on them — confirm before removing.
status = True
j = 0
choice = "hello"
# Show the numbered menu of main topics on one line ("1:word 2:word ...").
for j, words in enumerate(main_topic, start=1):
    print(j,":", words[0], sep ='', end = ' ')

print("\n입력:",end = '')
sel = int(input())-1
# Reject out-of-range choices explicitly; without this check an input of 0
# becomes index -1 and silently selects the last topic.
if not 0 <= sel < len(main_topic):
    raise ValueError('Selection must be between 1 and {}'.format(len(main_topic)))
rl = result_generator(sel)

In [None]:
def min_max_normalize(target):
    """Rescale the values of ``target`` linearly to the range 0..1.

    Args:
        target: Iterable of numbers.

    Returns:
        List of floats in the same order. An empty input gives an empty list.
        When all values are equal the range is zero, so every value maps to
        0.0 instead of raising ZeroDivisionError.
    """
    values = list(target)
    if not values:
        return []

    # Computed once rather than for every element.
    lowest = min(values)
    span = max(values) - lowest
    if span == 0:
        return [0.0 for _ in values]

    return [(value - lowest) / span for value in values]

In [None]:
from apyori import apriori
def Apriori(df2, threshold=0.2, min_support=0.06, min_confidence=0.05, min_lift=1.0):
    """Find frequently co-occurring noun pairs with the Apriori algorithm.

    Args:
        df2: DataFrame with a 'nouns' column holding one list of nouns per row.
        threshold: Minimum support a pair needs to be kept in the result.
        min_support: Passed to ``apriori``.
        min_confidence: Passed to ``apriori``.
        min_lift: Passed to ``apriori``.

    Returns:
        DataFrame with the columns 'items', 'support', 'length', 'source' and
        'target', one row per kept pair, sorted by descending support with a
        fresh 0..n-1 index. It is empty (same columns) when no pair qualifies.
    """
    columns = ['items', 'support', 'length', 'source', 'target']
    # Drop empty noun lists so they do not become empty transactions.
    transactions = [transaction for transaction in df2['nouns'].tolist() if transaction]

    results = apriori(transactions, min_support=min_support,
                      min_confidence=min_confidence,
                      min_lift=min_lift,
                      max_length=2)

    # Collect plain records and build the frame once: DataFrame.append was
    # removed in pandas 2.0, and a single frame avoids joining two separately
    # sorted frames by position.
    rows = []
    for result in results:
        if len(result.items) == 2 and result.support >= threshold:
            # `items` is unordered; which word becomes source or target
            # follows its iteration order.
            source, target = list(result.items)
            rows.append({
                'items': result.items,
                'support': result.support,
                'length': 2,
                'source': source,
                'target': target,
            })

    network_df = pd.DataFrame(rows, columns=columns)
    return network_df.sort_values(by='support', ascending=False).reset_index(drop=True)

network_df = Apriori(df2)
topic_userinput=main_topic[sel][0]
df2 = df2.reset_index(drop = True)

# A pair's two words land in 'source' and 'target' in arbitrary order, so look
# for the selected topic in either column; matching 'source' alone misses
# pairs where the topic happens to be stored as the target.
involves_topic = (network_df['source'] == topic_userinput) | (network_df['target'] == topic_userinput)
network_df2 = network_df[involves_topic]
# The partner word of every pair that contains the topic, plus the topic itself.
topic_about = [
    target if source == topic_userinput else source
    for source, target in zip(network_df2['source'], network_df2['target'])
]
topic_about.append(topic_userinput)

# For each article, count how many of its nouns are the topic or a related word.
related_words = set(topic_about)
df3 = df2.copy()
count_list = [sum(noun in related_words for noun in nouns) for nouns in df3['nouns']]

# Min-max normalise both indicators to 0..1 so they contribute equally.
df3["value"] = min_max_normalize(rl)
df3['count']= min_max_normalize(count_list)
df3

In [None]:
#Article extraction
# Combined score per article; this adds the 'add' column to df3 in place.
df3['add']= (df3['value']+df3['count'])/2 #mean of the two indicators
#df3['mul']= df3['value']*df3['count'] #product of the two indicators
# Rank the articles by the combined score, best first.
df_result = df3.sort_values(by='add', ascending=False)
df_result

In [None]:
#요약하기-카카오브레인 pororo모델
from pororo import Pororo
summary=Pororo(task="summary", lang="kr")
# The column is named "News" ("NEWS" does not exist), and .iloc[0] takes the
# first row of the sorted frame, i.e. the top-ranked article; [0] would look
# up the row labelled 0 regardless of its rank.
result=summary(df_result["News"].iloc[0])
result

In [None]:
#시각화 -참고
import networkx as nx
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
from matplotlib import rc
import os

# Korean-capable font used for the graph labels. The path only exists on
# Windows, so fall back to matplotlib's current default family elsewhere
# instead of failing while loading the font file.
FONT_PATH = "c:/Windows/Fonts/malgun.ttf"
if os.path.exists(FONT_PATH):
    font_name = fm.FontProperties(fname=FONT_PATH).get_name()
    rc('font', family=font_name)
else:
    font_name = plt.rcParams['font.family'][0]
    print('Font file not found: {} - using "{}"; Hangul labels may not render.'.format(FONT_PATH, font_name))

def Visualization(network_df):
    """Draw the word co-occurrence network.

    Args:
        network_df: DataFrame whose 'items' column holds one word pair per
            row; every pair becomes an edge.

    Nodes are coloured by PageRank and sized by min-max scaled PageRank
    (0..2000). Nothing is drawn when there are no pairs.
    """
    G = nx.Graph()
    G.add_edges_from(network_df['items'])
    if G.number_of_nodes() == 0:
        # min()/max() below would fail on an empty score array.
        print('No word pairs to visualize.')
        return

    pr = nx.pagerank(G)
    scores = np.array(list(pr.values()))
    span = scores.max() - scores.min()
    if span > 0:
        nsize = 2000 * (scores - scores.min()) / span
    else:
        # All nodes share one score: use a uniform size rather than dividing
        # by zero, which yields NaN sizes.
        nsize = np.full(len(scores), 1000.0)

    pos = nx.random_layout(G)

    plt.figure(figsize=(16,12))
    plt.axis('off')
    nx.draw_networkx(G, font_family=font_name, font_size=16,
                     pos=pos, node_color=list(pr.values()), node_size=nsize,
                     alpha=0.7, edge_color='.5')
    plt.show()
# Draw the network for the noun pairs computed by Apriori(df2) in the earlier cell.
Visualization(network_df)