In [279]:
from konlpy.tag import Kkma
from konlpy.utils import pprint
from konlpy.tag import Okt
import datetime as dt
from collections import Counter
from wordcloud import WordCloud
from matplotlib import pyplot as plt
from urllib.request import urlopen
from bs4 import BeautifulSoup
import requests

In [280]:

# Shared Okt tagger instance; the word-cloud functions below call okt.nouns() on the collected titles.
okt = Okt()

### to_datetime(dateTime)
dateTime: string in 'YYYY-MM-DD HH:MM:SS' format <br>
returns a `datetime.datetime` object built from (YYYY, MM, DD, HH, MM, SS)

In [281]:
def to_datetime(dateTime):
    """Convert a 'YYYY-MM-DD HH:MM:SS' string into a ``datetime.datetime``.

    The string is split on single spaces; the first token is read as the
    date ('-' separated) and the second as the time (':' separated). Any
    further tokens are ignored.

    Raises IndexError when the time token or one of the six fields is
    missing, and ValueError when a field is not an integer.
    """
    tokens = dateTime.split(' ')
    date_fields = tokens[0].split('-')
    clock_fields = tokens[1].split(':')

    # Explicit indexing (rather than slicing) keeps the IndexError for short input.
    year, month, day = (int(date_fields[i]) for i in range(3))
    hour, minute, second = (int(clock_fields[i]) for i in range(3))

    return dt.datetime(year, month, day, hour, minute, second)
    

### return_page(start_time):
start_time: string in 'YYYY-MM-DD HH:MM:SS' format <br>
returns the number of the first list page whose oldest article is older than start_time

In [289]:

def return_page(start_time, gall_id="skwyverns"):
    """Return the number of the first list page whose oldest article predates ``start_time``.

    Pages are requested one at a time, starting at page 1, until the last
    article row on a page carries a timestamp older than ``start_time``.
    NOTE(review): this relies on list pages being ordered newest first.

    Parameters
    ----------
    start_time : str
        Timestamp in 'YYYY-MM-DD HH:MM:SS' format.
    gall_id : str, optional
        Gallery id used in the list URL. Defaults to "skwyverns", the value
        that used to be hard-coded here.

    Returns
    -------
    int
        The page number at which the search stopped. If a page contains no
        article rows at all, the search stops there and that page number is
        returned.
    """
    start_time = to_datetime(start_time)
    root_url = "https://gall.dcinside.com/board/lists/?id="

    page_num = 1
    while True:
        target_url = root_url + gall_id + "&page=" + str(page_num)
        # A timeout keeps an unresponsive server from hanging the notebook forever.
        req = requests.get(target_url, headers={'User-Agent': 'test'}, timeout=10)
        soup = BeautifulSoup(req.text, 'html.parser')
        # Same row types as return_texts, so "no rows" really means "no articles".
        article_list = soup.findAll("tr", {"data-type": ["icon_txt", "icon_pic"]})
        if not article_list:
            # Nothing left to inspect; previously this raised IndexError.
            break
        # Only the last row matters for the stop condition.
        oldest_on_page = to_datetime(article_list[-1].find("td", {"class": "gall_date"})["title"])
        if start_time > oldest_on_page:
            break
        page_num += 1
    return page_num


### return_texts(start_time, end_time, gall_id):
start_time, end_time: string in 'YYYY-MM-DD hh:mm:ss' format <br>
gall_id: string of gallery id<br>
concatenates the titles of articles written strictly between start_time and end_time (each title followed by a space) and returns the result as a single string

In [290]:
def return_texts(start_time, end_time, gall_id):
    """Collect the titles of articles posted strictly between two timestamps.

    Walks the list pages of the given gallery from page 1 onwards and stops
    at the first page whose last article row is older than ``start_time`` (or
    at the first page without any article rows).
    NOTE(review): this relies on list pages being ordered newest first.

    Parameters
    ----------
    start_time, end_time : str
        Bounds in 'YYYY-MM-DD HH:MM:SS' format; both are exclusive.
    gall_id : str
        Gallery id used in the list URL.

    Returns
    -------
    str
        The matching titles in page order, each followed by a single space
        (an empty string when nothing matches).
    """
    # Parse the bounds once instead of once per article.
    window_start = to_datetime(start_time)
    window_end = to_datetime(end_time)
    root_url = "https://gall.dcinside.com/board/lists/?id="

    titles = []
    page_num = 1
    while True:
        target_url = root_url + gall_id + "&page=" + str(page_num)
        # A timeout keeps an unresponsive server from hanging the notebook forever.
        req = requests.get(target_url, headers={'User-Agent': 'test'}, timeout=10)
        soup = BeautifulSoup(req.text, 'html.parser')
        article_list = soup.findAll("tr", {"data-type": ["icon_txt", "icon_pic"]})
        if not article_list:
            break

        posted_at = None
        for row in article_list:
            posted_at = to_datetime(row.find("td", {"class": "gall_date"})["title"])
            # Titles are kept in a list rather than a dict keyed by timestamp,
            # so articles posted in the same second no longer overwrite each other.
            if window_start < posted_at < window_end:
                titles.append(row.find("a", {"class": None}).get_text())

        # posted_at now holds the timestamp of the last row on this page.
        if window_start > posted_at:
            break
        page_num += 1

    return "".join(title + " " for title in titles)

### create_wordcloud(start_time, end_time, gall_id)
creates a word cloud from the nouns in the collected titles and saves it as `<start date>_<gall_id>.png` in the current working directory

In [297]:
def create_wordcloud(start_time, end_time, gall_id):
    """Render a word cloud of the nouns found in the collected article titles.

    The titles come from ``return_texts(start_time, end_time, gall_id)``,
    nouns are extracted with ``okt.nouns`` and counted, and the image is
    written to '<date part of start_time>_<gall_id>.png' (a path relative to
    the current working directory). The file name is printed afterwards.
    Returns None.
    """
    noun_counts = Counter(okt.nouns(return_texts(start_time, end_time, gall_id)))

    cloud_options = dict(
        font_path='./NanumGothic.ttf',
        background_color='white',
        width=1000,
        height=800,
        max_words=100,
        max_font_size=250,
    )
    cloud = WordCloud(**cloud_options)
    cloud.generate_from_frequencies(noun_counts)

    date_part = start_time.split(' ')[0]
    file_name = "{!s}_{!s}.png".format(date_part, gall_id)
    cloud.to_file(file_name)
    print("file saved as", file_name)

#### meaningless_words
Words that are removed from the 'clean' word cloud because they carry no useful meaning. <br>
This list is not exhaustive and has to be kept up to date.

In [298]:
# Tokens excluded from the 'clean' word cloud; create_wordcloud_clean drops any noun found in this list.
meaningless_words = ['왜', '또', '개', '경기', '때', '우리', '네', '년', '놈', '니', '지랄', '좆', '씨발', '존나', '병신', '새끼', '시발', '진짜', '오늘', '그냥', '뭐', '저', '너', '더', '해', '함', '쟤', '이']

### create_wordcloud_clean
creates a 'clean' word cloud (nouns listed in `meaningless_words` are left out) and saves it as `<start date>_<gall_id>.png` in the current working directory

In [299]:


def create_wordcloud_clean(start_time, end_time, gall_id):
    """Render a word cloud of title nouns, leaving out ``meaningless_words``.

    The titles come from ``return_texts(start_time, end_time, gall_id)``,
    nouns are extracted with ``okt.nouns`` and counted, and every noun listed
    in ``meaningless_words`` is dropped before rendering. The image is written
    to '<date part of start_time>_<gall_id>.png' (a path relative to the
    current working directory) and the file name is printed. Returns None.

    Note: this is the same file name ``create_wordcloud`` uses, so for
    identical arguments the two functions overwrite each other's output.
    """
    titles = return_texts(start_time, end_time, gall_id)
    noun_counts = Counter(okt.nouns(titles))

    # Keep only the nouns that are not on the exclusion list.
    kept_counts = {}
    for noun, freq in noun_counts.items():
        if noun in meaningless_words:
            continue
        kept_counts[noun] = freq

    cloud = WordCloud(
        font_path='./NanumGothic.ttf',
        background_color='white',
        width=1000,
        height=800,
        max_words=100,
        max_font_size=250,
    )
    cloud.generate_from_frequencies(kept_counts)

    file_name = "_".join([str(start_time.split(' ')[0]), str(gall_id)]) + ".png"
    cloud.to_file(file_name)
    print("file saved as filename:", file_name)