In [47]:
from bs4 import BeautifulSoup
import urllib.request
import requests
import re
from kornounextractor.noun_extractor import extract
import konlpy.tag
from collections import Counter 
from ckonlpy.tag import Twitter
from kr_sna import do_kr_sna
import nltk
import networkx as nx
import itertools

## 1. 기사 내용 수집하기

In [48]:
#### 인터넷 뉴스 기사를 추출하는 함수 생성
def get_article(url):
    """Download a news page and return its title, publisher and body text.

    Args:
        url: Address of the article page to fetch.

    Returns:
        A ``(news_title, publisher, news_content)`` tuple of strings:
        the text of the page's ``<title>``, the ``content`` attribute of the
        ``<meta name="twitter:creator">`` tag, and the stripped body text
        taken from ``<div id="articleBodyContents">``.

    Raises:
        AttributeError: If the title, the publisher meta tag or the body div
            is missing from the page (the lookup returns ``None``).
    """
    # Close the HTTP response as soon as the document has been parsed.
    with urllib.request.urlopen(url) as response:
        soup = BeautifulSoup(response, 'lxml', from_encoding='utf-8')

    news_title = soup.title.text
    publisher = soup.find('meta', attrs={'name':'twitter:creator'}).get('content')
    body_text = soup.find('div', attrs = {'id':'articleBodyContents'}).text

    # The article text is the segment that follows the first '{}' marker in
    # the body div.  When the marker is absent, keep the whole body text
    # instead of failing with an IndexError.
    segments = body_text.split('{}')
    news_content = (segments[1] if len(segments) > 1 else segments[0]).strip()
    return news_title, publisher, news_content

#### 조선(cs), 동아(da), 한겨레(hkr), 경향(kh)

In [49]:
# Download every article listed in Chosun.txt (one URL per line) and collect
# the cleaned body text of each one in full_content_cs.
full_content_cs = []
with open('Chosun.txt','r',encoding='utf8') as f:
    for line in f:
        cs = line.strip()
        if not cs:
            # A blank line is not a URL; skip it instead of requesting ''.
            continue
        title_cs, publisher_cs, content_cs = get_article(cs)
        # Replace everything except whitespace, word characters and . ? ! ,
        # with a space.
        filtered_content_cs = re.sub(r'[^\s\d\w\.\?\!\,]',' ',content_cs)
        # Rewrite the title/surname variants to the single token '양진호' so
        # they are counted as one term (note: every bare '회장' is replaced too).
        filtered_content_cs = filtered_content_cs.replace("양 회장","양진호").replace("양회장","양진호").replace("회장","양진호").replace("양씨","양진호")
        full_content_cs.append(filtered_content_cs)

In [50]:
# Download every article listed in Donga.txt (one URL per line) and collect
# the cleaned body text of each one in full_content_da.
full_content_da = []
with open('Donga.txt','r',encoding='utf8') as f:
    for line in f:
        da = line.strip()
        if not da:
            # A blank line is not a URL; skip it instead of requesting ''.
            continue
        title_da, publisher_da, content_da = get_article(da)
        # Replace everything except whitespace, word characters and . ? ! ,
        # with a space.
        filtered_content_da = re.sub(r'[^\s\d\w\.\?\!\,]',' ',content_da)
        # Rewrite the title/surname variants (including the spaced '양 씨') to
        # the single token '양진호' so they are counted as one term
        # (note: every bare '회장' is replaced too).
        filtered_content_da = filtered_content_da.replace("양 회장","양진호").replace("양 씨","양진호").replace("양회장","양진호").replace("회장","양진호").replace("양씨","양진호")
        full_content_da.append(filtered_content_da)

In [51]:
# Download every article listed in Hankyoreh.txt (one URL per line) and
# collect the cleaned body text of each one in full_content_hkr.
full_content_hkr = []
with open('Hankyoreh.txt','r',encoding='utf8') as f:
    for line in f:
        hkr = line.strip()
        if not hkr:
            # A blank line is not a URL; skip it instead of requesting ''.
            continue
        title_hkr, publisher_hkr, content_hkr = get_article(hkr)
        # Replace everything except whitespace, word characters and . ? ! ,
        # with a space.
        filtered_content_hkr = re.sub(r'[^\s\d\w\.\?\!\,]',' ',content_hkr)
        # Rewrite the title/surname variants to the single token '양진호' and
        # join '헤비 업로더' into '헤비업로더' so each is counted as one term
        # (note: every bare '회장' is replaced too).
        filtered_content_hkr = filtered_content_hkr.replace("양 회장","양진호").replace("양회장","양진호").replace("회장","양진호").replace("양씨","양진호").replace("헤비 업로더","헤비업로더")
        full_content_hkr.append(filtered_content_hkr)

In [12]:
# Download every article listed in Kyunghyang.txt (one URL per line) and
# collect the cleaned body text of each one in full_content_kh.
full_content_kh = []
with open('Kyunghyang.txt','r',encoding='utf8') as f:
    for line in f:
        kh = line.strip()
        if not kh:
            # A blank line is not a URL; skip it instead of requesting ''.
            continue
        title_kh, publisher_kh, content_kh = get_article(kh)
        # Replace everything except whitespace, word characters and . ? ! ,
        # with a space.
        filtered_content_kh = re.sub(r'[^\s\d\w\.\?\!\,]',' ',content_kh)
        # Rewrite the title/surname variants to the single token '양진호' so
        # they are counted as one term (note: every bare '회장' is replaced too).
        filtered_content_kh = filtered_content_kh.replace("양 회장","양진호").replace("양회장","양진호").replace("회장","양진호").replace("양씨","양진호")
        full_content_kh.append(filtered_content_kh)

## 2. 단어 빈도 분석_명사, 형용사, 동사

In [13]:
## 빈도분석을 위한 함수 생성_명사
def count_noun(text, n=None):
    """Print the most frequent nouns in ``text``.

    A Komoran user dictionary (``dic.txt``) is rebuilt on every call from the
    words returned by ``extract(text, freq=1.0)`` plus the fixed entry
    '헤비업로더'.  Nouns are then extracted with Komoran, one-character nouns
    and stopwords are dropped, and the frequency table is printed.

    Args:
        text: The text to analyse.
        n: Number of top entries to print; ``None`` prints every noun.

    Returns:
        None. The ``(noun, count)`` list is printed, not returned.
    """
    # Register every extracted word as a general noun (NNG).  The extractor
    # output '갑지' is corrected to '갑질' before being written; the original
    # code assigned the correction but never wrote it to the dictionary.
    dictionary_words = {
        '갑질' if word == '갑지' else word
        for word in extract(text, freq=1.0)
    }
    with open('dic.txt', 'w', encoding='utf8') as f:
        f.write('헤비업로더'+'\tNNG\n')
        for word in sorted(dictionary_words):
            f.write(word+'\tNNG\n')

    komoran = konlpy.tag.Komoran(userdic='dic.txt')

    # Stopwords: publisher names, reporter names and page boilerplate.
    stopwords = {'조선일보','전효진','김우영','기자','뉴스타파','캡처','싸이월드','이날','조선닷컴','바로가기','구독신청하기',
                 'Copyrights','무단','전재','재배포','금지','수원','권상은','성남','윤민혁','박성우','chosunbiz','.com',
                 'chosun','young','동아일보','donga','한겨레','경향신문','www','khan','.co','.kr','신문구독','아이폰XS',
                 '라고','공식 페이스북','hani','무료만화','구독','무단전재'}

    # Keep nouns that are not a single character and not a stopword, in one
    # pass instead of repeatedly calling list.remove().
    final_nouns = [
        noun for noun in komoran.nouns(text)
        if len(noun) != 1 and noun not in stopwords
    ]

    # most_common(None) already returns every entry, so no branch is needed.
    c_noun = Counter(final_nouns)
    print(c_noun.most_common(n))

In [14]:
###빈도분석을 위한 함수 생성_형용사, 동사
def count_av(text , n = None):
    """Print the most frequent adjectives, then the most frequent verbs.

    Args:
        text: The text to tag.
        n: Number of top entries to print per list; ``None`` prints all.

    Returns:
        None. Two ``(word, count)`` lists are printed: adjectives first,
        verbs second.
    """
    # ckonlpy tagger extended with a few custom dictionary entries.
    tagger = Twitter()
    custom_entries = (
        ("있다","Verb"),
        ("없다","Verb"),
        ("있었다","Verb"),
        ("갑질","Noun"),
    )
    for entry, tag in custom_entries:
        tagger.add_dictionary(entry, tag)

    # Sort the tagged tokens into one bucket per part of speech of interest.
    buckets = {'Adjective': [], 'Verb': []}
    for token, tag in tagger.pos(text):
        if tag in buckets:
            buckets[tag].append(token)

    # Frequency tables, adjectives before verbs.
    adjective_counts = Counter(buckets['Adjective'])
    print(adjective_counts.most_common(n))

    verb_counts = Counter(buckets['Verb'])
    print(verb_counts.most_common(n))

In [64]:
# Print the 10 most frequent nouns per publisher (Chosun, Donga, Hankyoreh,
# Kyunghyang).  str() passes the repr of each article list, not joined text.
# NOTE(review): confirm that passing the list repr is intended.
for corpus in (full_content_cs, full_content_da, full_content_hkr, full_content_kh):
    count_noun(str(corpus), 10)

[('양진호', 74), ('폭행', 31), ('위디스크', 30), ('직원', 22), ('경찰', 19), ('위반', 15), ('영상', 14), ('한국미래기술', 13), ('혐의', 13), ('파일노리', 12)]
[('양진호', 79), ('직원', 32), ('폭행', 26), ('위디스크', 19), ('교수', 18), ('사무실', 14), ('영상', 13), ('압수수색', 12), ('혐의', 11), ('한국미래기술', 10)]
[('웹하드', 57), ('업체', 52), ('양진호', 41), ('필터링', 31), ('불법', 23), ('직원', 20), ('영상물', 19), ('위디스크', 19), ('유통', 17), ('폭행', 16)]
[('양진호', 34), ('직원', 30), ('폭행', 21), ('상사', 17), ('폭언', 15), ('처벌', 14), ('근로기준법', 14), ('직장 내 괴롭힘', 11), ('직장', 10), ('사용자', 9)]


In [16]:
# Print the 10 most frequent adjectives and verbs per publisher (Chosun,
# Donga, Hankyoreh, Kyunghyang).  str() passes the repr of each article list.
# NOTE(review): confirm that passing the list repr is intended.
for corpus in (full_content_cs, full_content_da, full_content_hkr, full_content_kh):
    count_av(str(corpus), 10)

  warn('"Twitter" has changed to "Okt" since KoNLPy v0.4.5.')


[('있는', 9), ('같은', 3), ('선한', 3), ('입니다', 1), ('당했던', 1), ('인해', 1), ('좋겠다', 1), ('빠르면', 1), ('굳게', 1), ('검으', 1)]
[('했다', 24), ('있다', 14), ('할', 11), ('하는', 11), ('된', 10), ('알려졌다', 6), ('는', 6), ('하기', 6), ('되는', 5), ('들', 4)]
[('있는', 12), ('같은', 5), ('많다', 3), ('강한', 2), ('좋겠다', 2), ('그런', 2), ('당했다는', 1), ('없는', 1), ('이러한', 1), ('인해', 1)]
[('했다', 57), ('들', 11), ('했다고', 10), ('된', 6), ('와', 6), ('할', 6), ('있다', 6), ('한다', 6), ('밝혔다', 5), ('한', 5)]
[('있는', 12), ('같은', 5), ('이런', 5), ('있', 3), ('없는', 3), ('어떤', 2), ('선한', 2), ('많은', 2), ('있지', 1), ('있었', 1)]
[('했다', 28), ('있다', 20), ('한다', 12), ('된', 10), ('되는', 10), ('하는', 10), ('할', 9), ('는', 9), ('들', 7), ('한', 7)]
[('있는', 8), ('같은', 5), ('있', 4), ('아닌', 3), ('있을', 2), ('많다', 2), ('있고', 2), ('어렵다', 2), ('안되면', 2), ('부끄러운', 1)]
[('했다', 26), ('할', 21), ('있다', 20), ('는', 18), ('하는', 15), ('없다', 12), ('한다', 8), ('가', 7), ('한', 7), ('된', 7)]


## 3. 네트워크 분석

In [58]:
#Semantic network analysis
def get_words_list(counter_list):
    """Return only the words from a sequence of ``(word, count)`` pairs.

    Args:
        counter_list: Iterable of two-item pairs, e.g. the result of
            ``Counter.most_common()``.

    Returns:
        A list with the first item of every pair, in the original order.
    """
    return [entry for entry, _ in counter_list]

def get_sentences(content):
    """Split ``content`` into sentences.

    A boundary is a '.', '?' or '!' followed by at least one whitespace
    character; the punctuation and the whitespace are dropped from the
    result.

    Args:
        content: The text to split.

    Returns:
        A list of sentence strings.
    """
    boundary = re.compile(r'[\.\?\!]\s+')
    return boundary.split(content)

def add_ties(g, sentence, komoran):
    """Connect the graph nodes that occur together in one sentence.

    Nouns are extracted from ``sentence`` with ``komoran.nouns``.  Every
    unordered pair of distinct nouns that are already nodes of ``g`` gets an
    edge: a new edge starts with ``weight=1`` and an existing edge has its
    ``weight`` incremented by one.  Nouns that are not nodes are ignored, so
    no new nodes are introduced.

    Args:
        g: Graph to update in place.
        sentence: A single sentence of text.
        komoran: Tagger object providing a ``nouns(text)`` method.

    Returns:
        The same graph object ``g``.
    """
    # Deduplicate first so a noun repeated within the sentence counts once
    # and no self-pairs can be produced.
    selected_words = [noun for noun in set(komoran.nouns(sentence)) if noun in g]

    for first, second in itertools.combinations(selected_words, 2):
        if g.has_edge(first, second):
            g[first][second]['weight'] += 1
        else:
            g.add_edge(first, second, weight=1)

    return g

def form_network(g, document, komoran):
    """Add co-occurrence ties to ``g`` for every sentence in ``document``.

    Args:
        g: Graph whose nodes are the words of interest.
        document: Iterable of sentence strings.
        komoran: Tagger object forwarded to ``add_ties``.

    Returns:
        The graph returned by the last ``add_ties`` call, or ``g`` itself
        when ``document`` is empty.
    """
    network = g
    for sent in document:
        network = add_ties(network, sent, komoran)
    return network

def do_kr_sna(text, final_nouns, stopwords, fre=2, num=20):
    """Build a co-occurrence network of the most frequent nouns in ``text``.

    The ``num`` most common entries of ``final_nouns`` become the nodes.  The
    text is split into sentences, and nodes that appear in the same sentence
    are tied together (see ``form_network`` / ``add_ties``).

    Args:
        text: Source text.
        final_nouns: Already filtered list of nouns used for the frequency
            count that selects the nodes.
        stopwords: Not used by this function; kept so existing callers keep
            working.
        fre: Value passed to ``extract`` as ``freq`` when the Komoran user
            dictionary is rebuilt.
        num: Number of most frequent nouns to use as nodes.

    Returns:
        The graph holding the selected nouns as nodes and weighted
        co-occurrence edges.
    """
    text = text.replace('\n', ' ')

    # Rebuild the Komoran user dictionary (this overwrites dic.txt).
    with open('dic.txt', 'w', encoding='utf8') as f:
        for word in sorted(extract(text, freq=fre)):
            f.write(word+'\tNNG\n')

    komoran = konlpy.tag.Komoran(userdic='dic.txt')

    # Frequency analysis: the num most common nouns become the nodes.
    c = Counter(final_nouns)
    list_of_words = get_words_list(c.most_common(num))

    # To find ties between words the text is first split into sentences:
    # drop everything except . ? ! whitespace and word characters, then make
    # sure each terminator is followed by a space so get_sentences can split.
    text1 = re.sub(r'[^\.\?\!\s\w\d]', ' ', text)
    text2 = re.sub(r'([\.\?\!])',r'\1 ', text1)
    article_sentences = get_sentences(text2)

    # Build the network over the selected nouns.
    G = nx.Graph()
    G.add_nodes_from(list_of_words)
    G = form_network(G, article_sentences, komoran)

    return G

In [62]:
def out_g(text):
    """Build the noun co-occurrence graph for ``text``.

    A Komoran user dictionary (``dic.txt``) is written from
    ``extract(text, freq=1.0)`` plus the fixed entry '헤비업로더'.  Nouns are
    extracted, one-character nouns and stopwords are dropped, and the
    remaining nouns are handed to ``do_kr_sna`` with ``num=20``.

    Args:
        text: The text to analyse.

    Returns:
        The graph returned by ``do_kr_sna``.
    """
    with open('dic.txt', 'w', encoding='utf8') as f:
        f.write('헤비업로더'+'\tNNG\n')
        for word in sorted(extract(text, freq=1.0)):
            # '갑지' is left out of the dictionary, exactly as before.
            # NOTE(review): the original assigned '갑질' here without writing
            # it; confirm whether the corrected word should be registered.
            if word != '갑지':
                f.write(word+'\tNNG\n')

    komoran = konlpy.tag.Komoran(userdic='dic.txt')

    # Stopwords: publisher names, reporter names and page boilerplate.
    stopwords = ['조선일보','전효진','김우영','기자','뉴스타파','캡처','싸이월드','이날','조선닷컴','바로가기','구독신청하기',
                 'Copyrights','무단','전재','재배포','금지','수원','권상은','성남','윤민혁','박성우','chosunbiz','.com',
                 'chosun','young','동아일보','donga','한겨레','경향신문','www','khan','.co','.kr','신문구독','아이폰XS',
                 '라고','공식 페이스북','hani','무료만화','구독','무단전재']
    excluded = set(stopwords)

    # Keep nouns that are not a single character and not a stopword, in one
    # pass instead of repeatedly calling list.remove().
    final_nouns = [
        noun for noun in komoran.nouns(text)
        if len(noun) != 1 and noun not in excluded
    ]

    g = do_kr_sna(text, final_nouns, stopwords, num=20)

    return g

In [65]:
# Build one co-occurrence graph per publisher, in the order Chosun, Donga,
# Hankyoreh, Kyunghyang.  Each corpus is passed as the str() of its list.
g_cs, g_da, g_hkr, g_kh = [
    out_g(str(corpus))
    for corpus in (full_content_cs, full_content_da, full_content_hkr, full_content_kh)
]

In [33]:
# Show the neighbours of the '양진호' node and their edge weights in the Chosun graph.
g_cs['양진호']

AtlasView({'피해자': {'weight': 4}, '폭행': {'weight': 25}, '경찰': {'weight': 14}, '직원': {'weight': 15}, '한국미래기술': {'weight': 11}, '웹하드': {'weight': 6}, '위디스크': {'weight': 21}, '업체': {'weight': 6}, '불법': {'weight': 4}, '영상': {'weight': 7}, '사무실': {'weight': 10}, '2015년 4월': {'weight': 7}, '공개': {'weight': 5}, '위반': {'weight': 6}, '혐의': {'weight': 9}, '파일노리': {'weight': 7}, '음란물': {'weight': 4}, '유통': {'weight': 3}, '포르노': {'weight': 4}})

In [32]:
# Show the neighbours of the '양진호' node and their edge weights in the Donga graph.
g_da['양진호']

AtlasView({'직원': {'weight': 14}, '폭행': {'weight': 21}, '한국미래기술': {'weight': 9}, '웹하드': {'weight': 3}, '위디스크': {'weight': 11}, '영상': {'weight': 9}, '불법': {'weight': 5}, '자신': {'weight': 7}, '조사': {'weight': 5}, '사무실': {'weight': 8}, '경찰': {'weight': 8}, '압수수색': {'weight': 9}, '자택': {'weight': 6}, '혐의': {'weight': 10}, '수사': {'weight': 6}, '강요': {'weight': 5}, '위반': {'weight': 4}, '교수': {'weight': 12}, '페이스북': {'weight': 3}})

In [31]:
# Show the neighbours of the '양진호' node and their edge weights in the Hankyoreh graph.
g_hkr['양진호']

AtlasView({'필터링': {'weight': 5}, '위디스크': {'weight': 9}, '음란물': {'weight': 3}, '사실': {'weight': 3}, '파일노리': {'weight': 7}, '불법': {'weight': 8}, '웹하드': {'weight': 6}, '유통': {'weight': 7}, '업체': {'weight': 6}, '영상물': {'weight': 5}, '직원': {'weight': 8}, '경찰': {'weight': 7}, '폭행': {'weight': 12}, '피해자': {'weight': 5}, '피해': {'weight': 2}, '영상': {'weight': 4}, '수사': {'weight': 4}, '카르테': {'weight': 2}})

In [30]:
# Show the neighbours of the '양진호' node and their edge weights in the Kyunghyang graph.
g_kh['양진호']

AtlasView({'직원': {'weight': 10}, '한국미래기술': {'weight': 6}, '갑지': {'weight': 4}, '공개': {'weight': 6}, '폭행': {'weight': 9}, '직장갑질119': {'weight': 6}, '직장 내 괴롭힘': {'weight': 4}, '직장': {'weight': 2}, '괴롭힘': {'weight': 2}, '피해자': {'weight': 3}, '근로기준법': {'weight': 3}, '처벌': {'weight': 3}, '폭언': {'weight': 3}, '회사': {'weight': 5}, '상사': {'weight': 1}, '사용자': {'weight': 1}, '경찰': {'weight': 4}, '페이스북': {'weight': 2}})