# 필요한 library 설치

In [None]:
# Install the third-party packages this notebook relies on (IPython shell magics).
# NOTE(review): pandas is imported further down but is not installed here —
# presumably it is already present in the environment; confirm.
!pip install selenium
!pip install beautifulsoup4
# NOTE(review): nothing visible in this file imports pickle — confirm this package is needed.
!pip install pickle-mixin
!pip install requests

# 크롤링 시작

In [None]:
#selenium 불러오기
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
import pandas as pd
import time
import requests
import re

In [None]:
# Chrome options meant to make the automated browser look like a human user.
options = webdriver.ChromeOptions()
options.add_argument("--no-sandbox") # needed when the browser is run in administrator mode

# A site may block clients it believes are bots, so browser-like options are set.
# NOTE(review): Chrome documents the window size switch as "--window-size=1920,1080";
# confirm that the "x" separator used here is actually honored.
options.add_argument("window-size=1920x1080") 
options.add_argument("lang=ko_KR")
options.add_argument("user-agent=Chrome/89.0.4389.114")

# to save error log
service_args = ['--verbose']   
service_log_path = "./chromedriver.log"  # lets the driver write a log when an error occurs


In [None]:
# Open a Chrome window using the chromedriver binary in the current directory,
# with the options and logging settings configured above.
# NOTE(review): the executable_path / service_args / service_log_path keyword
# arguments belong to the older Selenium API (newer releases expect a Service
# object) — confirm against the installed Selenium version.
driver = webdriver.Chrome(executable_path ="./chromedriver",
                         options = options,
                         service_args = service_args,
                         service_log_path = service_log_path)

## 키워드를 통해 검색  
+ 브런치가 정한 키워드와 유저가 직접 키워드를 지정해 검색하는 방법을 나눈다.
+ 브런치는 스크롤을 통해 글들을 불러오는 형식이므로 스크롤을 자동으로 내려주는 코드를 추가한다.
+ 키워드에 해당하는 글을 가져오기보다 글들의 url을 저장한다.

In [None]:
def brunch_url_keyword(keyword, user_selected = True):
    """Collect the article URLs Brunch lists for ``keyword`` and save them to a file.

    The page is opened with the module-level ``driver`` and scrolled to the
    bottom repeatedly until its height stops growing (Brunch loads more
    articles on scroll), then the article list is parsed and every article
    URL is written, one per line, to ``"<keyword>_url.txt"`` in the current
    working directory.

    Args:
        keyword: Search term, or the name of a Brunch-defined keyword page.
        user_selected: ``True`` to use the free-text search page, ``False``
            to use the Brunch-defined keyword page.

    Returns:
        list[str]: Absolute article URLs in page order.
    """
    from urllib.parse import urljoin

    if user_selected:
        url = "https://brunch.co.kr/search?q="+ keyword
    else:
        url = "https://brunch.co.kr/keyword/" + keyword + "?q=g"
    driver.get(url)

    # Height of the page before any scrolling.
    last_height = driver.execute_script("return document.body.scrollHeight")

    for _ in range(1000):
        # Jump to the bottom to trigger loading of the next batch of articles.
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

        # Wait for the page to load, nudge the scroll position, and wait again.
        time.sleep(3)
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight-50);")
        time.sleep(3)

        # Stop once scrolling no longer makes the page taller.
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break

        last_height = new_height

    bs = BeautifulSoup(driver.page_source, 'html.parser')

    # The driver is intentionally NOT quit here: it is a shared module-level
    # object and this function is called several times in a row. Quitting it
    # would make every call after the first one fail. Call driver.quit() once
    # after the last crawl instead.

    # Raw string: the class name literally contains '#', which must stay
    # backslash-escaped inside the CSS selector.
    items = bs.select(r'#wrapArticle > div.wrap_article_list.\#keyword_related_contents > ul > li')

    # Save URLs rather than the list-item tags themselves.
    # Assumes the first link inside each list item is the article link — TODO confirm.
    urls = []
    for item in items:
        link = item.find('a', href=True)
        if link is not None:
            # hrefs may be site-relative; make them absolute so they can be fetched later.
            urls.append(urljoin("https://brunch.co.kr", link['href']))
    print(len(urls))

    # Save to a file, one URL per line.
    filename = keyword + "_url.txt"
    with open(filename, 'w', encoding='utf-8') as f:
        f.writelines(article_url + "\n" for article_url in urls)

    return urls

In [None]:
# Crawl keyword pages predefined by Brunch (user_selected=False).
# NOTE(review): every call uses the module-level `driver`; these back-to-back
# calls only work if the driver session is still open after the previous call.
brunch_url_keyword("감성_에세이",False)
brunch_url_keyword("문화·예술",False)
brunch_url_keyword("취향저격_영화_리뷰",False)
brunch_url_keyword("사랑·이별",False)

In [None]:
# Crawl search results for keywords chosen by the user (user_selected defaults to True).
# NOTE(review): every call uses the module-level `driver`; these back-to-back
# calls only work if the driver session is still open after the previous call.
brunch_url_keyword("기쁨")
brunch_url_keyword("슬픔")
brunch_url_keyword("분노")
brunch_url_keyword("공포")
brunch_url_keyword("사랑")

In [None]:
def read_url(keyword):
    """Return the URLs stored, one per line, in the URL file for ``keyword``.

    The primary location is ``./브런치데이터/<keyword>_url.txt``. If that file
    does not exist, these fallbacks are tried in order so that both the files
    written to the working directory and plain file names are accepted:
    ``<keyword>_url.txt``, ``./브런치데이터/<keyword>`` and ``<keyword>``.

    Args:
        keyword: Keyword the URL file was saved under, or the file's name.

    Returns:
        list[str]: The non-empty lines of the file with surrounding
        whitespace removed.

    Raises:
        FileNotFoundError: If none of the candidate files exists (reported
            for the primary location).
    """
    import os

    data_dir = './브런치데이터/'
    default_path = data_dir + keyword + "_url.txt"
    candidates = (
        default_path,
        keyword + "_url.txt",
        data_dir + keyword,
        keyword,
    )
    file_name = next(
        (path for path in candidates if os.path.isfile(path)),
        default_path,
    )

    with open(file_name, 'r', encoding='utf-8') as f:
        # Blank lines are skipped so they are not later treated as URLs.
        return [line.strip() for line in f if line.strip()]
    

In [None]:
# Compiled once at import time; remove_emoji is called for every paragraph.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # regional indicators (flags)
    "\U00002500-\U00002BEF"  # box drawing through miscellaneous symbols and arrows
    "\U00002702-\U000027B0"  # dingbats
    "\U0001f926-\U0001f937"  # people/gesture pictographs
    "\U00010000-\U0010ffff"  # every code point outside the Basic Multilingual Plane
    "\u2640-\u2642"          # gender signs
    "\u2600-\u2B55"          # miscellaneous symbols
    "\u200d"                 # zero-width joiner
    "\u23cf"                 # eject symbol
    "\u23e9"                 # fast-forward symbol
    "\u231a"                 # watch
    "\ufe0f"                 # variation selector-16
    "\u3030"                 # wavy dash
    "\xa0"                   # no-break space
    "\ucdee"                 # a single Hangul syllable code point
    "\ude0a"                 # a lone low surrogate
    "]+",
    flags=re.UNICODE,
)


def remove_emoji(string):
    """Return ``str(string)`` with emoji and the other listed characters removed.

    Args:
        string: Value to clean; it is converted with ``str()`` first, so
            non-string input is accepted.

    Returns:
        str: The text with every run of matching characters deleted (nothing
        is inserted in their place).
    """
    return _EMOJI_PATTERN.sub('', str(string))

In [None]:
# Runs of characters that are NOT Hangul, digits, or one of . ? ! , ^
_NON_KOREAN_PATTERN = re.compile(r'[^가-힣0-9ㄱ-ㅎㅏ-ㅣ\.\?\!,^]+')


def _clean_paragraph(text):
    """Clean one paragraph: normalize odd characters, strip emoji, keep Korean text."""
    text = (
        text.replace('\xa0', ' ')  # real no-break space, not the literal text "xa0"
        .replace('&nbsp;', ' ')
        .replace('\udd25', ' ')
        .replace('\ucdee', ' ')
    )
    text = remove_emoji(text)
    # Everything outside the allowed character set collapses to a single space.
    return _NON_KOREAN_PATTERN.sub(' ', text)


def get_rawText_req(url_list):
    """Download each article in ``url_list`` and return its cleaned text.

    For every URL the article body is located in the page; the text of its
    ``h4`` elements is used if there are any, otherwise the text of its ``p``
    elements. Each piece is cleaned and the pieces are concatenated into one
    string per article. Pages without an article body are skipped, and so are
    URLs whose request fails (a message is printed for the latter).

    Args:
        url_list: Iterable of article URLs.

    Returns:
        pandas.DataFrame: A single ``text`` column with one row per article,
        duplicate rows dropped.
    """
    texts = []
    for url in url_list:

        # Fetch the article; a timeout keeps one stuck request from hanging the crawl.
        try:
            response = requests.get(url, timeout=10)
        except requests.RequestException as err:
            print("skipped " + str(url) + ": " + str(err))
            continue
        time.sleep(0.03)
        bs = BeautifulSoup(response.text, 'html.parser')

        # Locate the article body.
        doc = bs.select('body > div.service_contents.article_contents > div.wrap_view_article > div.wrap_body')
        if not doc:
            continue

        # Prefer h4 elements; fall back to p elements when there are none.
        body = doc[0]
        paragraphs = body.select('h4') or body.select('p')
        raw_doc = "".join(_clean_paragraph(p.get_text()) for p in paragraphs)

        print(raw_doc + "\n")
        texts.append(raw_doc)
        time.sleep(0.05)

    # Build the frame once at the end instead of appending row by row.
    doc_df = pd.DataFrame({'text': texts})
    print(doc_df)

    return doc_df.drop_duplicates()

In [None]:
# Crawl every saved URL list and export the extracted text to its own Excel file.
# NOTE(review): read_url builds its path from the name passed in — confirm that
# each name below resolves to the intended URL file.
url_files_to_excel = (
    ('url_scary_keyword.txt', 'scary.xlsx'),
    ('url_love_and_farewell.txt', 'love_farewell.xlsx'),
    ('url_movie_review.txt', 'movie_review.xlsx'),
    ('url_senti_essay.txt', 'senti_essay.xlsx'),
    ('url_happy_keyword.txt', 'happy.xlsx'),
    ('url_angry_keyword.txt', 'angry.xlsx'),
    ('url_sad_keyword.txt', 'sad.xlsx'),
)
for url_file, excel_file in url_files_to_excel:
    article_texts = get_rawText_req(read_url(url_file))
    article_texts.to_excel(excel_file)