# 나무위키 크롤러 생성

## 1. 나무위키 구조 정의

<p>
    나무위키 콘텐츠 구조<br>
    소제목 : h2(class:wiki-heading) -> span(id=소제목)<br>
    내용 : div(class:wiki-heading-content) -> div(class:wiki-paragraph)
</p>


In [None]:
#테스트
import requests
from bs4 import BeautifulSoup

# Exploratory check: print the text of the node that follows every section
# heading (h2/h3/h4 with class "wiki-heading") of a single Namuwiki article.
# The timeout keeps the cell from hanging forever if the site never answers.
req = requests.get('https://namu.wiki/w/테스터훈', timeout=10)
bs = BeautifulSoup(req.text, 'html.parser')

# Visit the levels one after another so the output order is unchanged:
# every h2 section first, then every h3, then every h4.
for heading_tag in ("h2", "h3", "h4"):
    targets = bs.find_all(heading_tag, class_="wiki-heading")
    for target in targets:
        sibling = target.next_sibling
        # A heading that is the last node of its parent has no next sibling;
        # skip it instead of failing on None.get_text().
        if sibling is None:
            continue
        realtarget = sibling.get_text()
        print(realtarget)


## 2. 크롤러 클래스 생성

In [None]:
#크롤러
import requests
from bs4 import BeautifulSoup

class Namu_Crawler:
    """Fetch one Namuwiki article and collect the text under its headings.

    Args:
        keyword: Article title, appended verbatim to ``BASE_URL``.
        timeout: Seconds to wait for the HTTP response before giving up.
    """

    BASE_URL = 'https://namu.wiki/w/'
    # Heading levels are visited in this order, one full pass per level.
    HEADING_TAGS = ("h2", "h3", "h4")

    def __init__(self, keyword, timeout=10):
        self.keyword = keyword
        self.timeout = timeout

    def getPage(self):
        """Download the article and parse it with ``html.parser``.

        Returns:
            The parsed ``BeautifulSoup`` document, or ``None`` (after printing
            'request error') when the request raised a ``RequestException``.
        """
        try:
            req = requests.get(self.BASE_URL + self.keyword, timeout=self.timeout)
        except requests.exceptions.RequestException:
            print('request error')
            return None
        return BeautifulSoup(req.text, 'html.parser')

    def crawl(self):
        """Return the text of the node following each "wiki-heading" heading.

        Returns:
            A list of strings: all h2 sections first, then h3, then h4.
            The list is empty when the page could not be fetched.
        """
        bs = self.getPage()
        # getPage() signals a failed request with None; there is nothing to parse.
        if bs is None:
            return []

        youtuberstring = []
        for heading_tag in self.HEADING_TAGS:
            for target in bs.find_all(heading_tag, class_="wiki-heading"):
                sibling = target.next_sibling
                # A heading with no following node has no content to collect.
                if sibling is None:
                    continue
                youtuberstring.append(sibling.get_text())
        return youtuberstring




## 3. 크롤링 대상

In [None]:
# Batch analysis of the top game YouTubers (the original note says "top 50";
# this list holds 43 article titles, with spaces already written as %20).
gameyoutubers = [
    '도티', '잠뜰', '감스트', '김블루', '대도서관', '양띵',
    '김재원(샌드박스%20네트워크)', '뜨뜨뜨뜨', '태경', '악어(인터넷%20방송인)',
    '테스터훈', '우주하마', '우왁굳', '마이린', '릴카', '겜브링', '킴성태',
    '테드%20TV', '쁘허', '기리TV', '캐릭온', '텔론', '램램', '마재',
    'makeUmove', '연다', '울산큰고래', '머독', '혜안(유튜버)', '잉여맨',
    '군림보', '과로사', '침착맨', '김왼팔', 'Cowsep', '돼지저금통(유튜버)',
    '괴물쥐', '만두민', '문호준', '강지', '개리형', '로이조', '삼식',
]

# Sections are joined with a space (as in the single-channel cell) so the last
# word of one section cannot fuse with the first word of the next one.
# Collecting the pieces and joining once avoids rebuilding the string per page.
collected = []
for youtuber in gameyoutubers:
    crawler = Namu_Crawler(youtuber)
    one_string = " ".join(crawler.crawl())
    collected.append(one_string)
strings = " ".join(collected)

In [None]:
# Single-channel analysis: crawl one article and space-join its section texts.
crawler = Namu_Crawler('테스터훈')
section_texts = crawler.crawl()
strings = " ".join(section_texts)

In [None]:
# Export the crawled text to a UTF-8 text file.
# The context manager closes the handle even when write() raises.
with open('./crawling_result_text/testerhoon_namu.txt', mode='wt', encoding='utf-8') as f:
    f.write(strings)

## 4. 불용어 사전

In [None]:
# Stop-word dictionary (hand-built): nouns listed here are dropped before the
# frequency count. Order and contents match the original single-line literal.
stopwords = [
    '라', '중', '이', '때문', '지', '이상', '등', '수', '것', '시작', '부분',
    '당시', '경우', '이후', '오브', '리그', '편', '위', '정도', '활동', '전', '둘',
    '한', '도중', '자체', '경기', '방송이', '때', '녹두', '이유', '문단을', '대부분',
    '번', '일', '리', '두', '초', '나', '문단', '문', '방송', '적',
    '후', '단어', '게임', '유튜브', '영상', '말', '역사', '시청자들', '본인', '이전',
    '시청자', '2020년', '2018년', '2019년', '2017년', '2016년', '업로드', '콘텐츠', '듯', '현재',
    '명', '들', '사람', '1', '개', '관련', '채널', '모습', '주', '거',
    '내', '사이', '자신', '유튜버', '컨텐츠', '생각',
]
# Month names '1월' .. '12월'.
stopwords += [f'{month}월' for month in range(1, 13)]
stopwords += ['테스터훈', '한동숙']
# The original also carried a disabled "+ gameyoutubers" term; it stays disabled.

## 5. 워드 클라우드 생성

In [None]:
#워드클라우드
# -*- encoding:utf8 -*-

from collections import Counter
import random

from konlpy.tag import Hannanum
import pytagcloud
import webbrowser
import os

def r():
    """Return a random integer between 0 and 255, both ends included."""
    return random.randint(0, 255)


def color():
    """Return a 3-tuple made of three independent r() draws."""
    return (r(), r(), r())

def get_tags(text, ntags=50, multiplier=3):
    """Build tag dicts for the most frequent nouns in ``text``.

    Nouns are extracted with Hannanum, entries found in the module-level
    ``stopwords`` are discarded, and the rest are counted.

    Args:
        text: Raw text to analyse.
        ntags: Maximum number of tags to return.
        multiplier: Factor applied to a noun's count to obtain its size.

    Returns:
        A list of dicts with the keys 'color', 'tag' and 'size', in the order
        given by ``Counter.most_common``.
    """
    h = Hannanum()
    # Build the lookup set once and filter in a single pass; this keeps the
    # surviving nouns in their original order without the repeated
    # list.remove() calls, each of which rescans the whole list.
    blocked = set(stopwords)
    nouns = [noun for noun in h.nouns(text) if noun not in blocked]

    count = Counter(nouns)
    return [
        {'color': color(), 'tag': n, 'size': int(c * multiplier)}
        for n, c in count.most_common(ntags)
    ]
                

def draw_cloud(tags, filename, fontname='Korean', size=(800, 600)):
    """Render ``tags`` into an image file and open that file.

    Args:
        tags: Tag dicts handed to ``pytagcloud.create_tag_image``.
        filename: Path of the image to create.
        fontname: Font name forwarded to pytagcloud.
        size: Image size forwarded to pytagcloud.
    """
    pytagcloud.create_tag_image(tags, filename, fontname=fontname, size=size)
    # The path handed to webbrowser.open is relative to the working directory.
    relative_path = os.path.relpath(filename)
    webbrowser.open(relative_path)


# Build the tags from the crawled text, echo them, then render the word cloud.
output_image = './crawling_result_wordcloud/WC_testerhoon.png'
tags = get_tags(strings)
print(tags)
draw_cloud(tags, output_image)

