# 에브리타임 크롤링

In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
from tqdm import tqdm

import matplotlib.pyplot as plt
import pandas as pd
import random
import time
import requests

In [None]:
# User-agent string presented to the site (an Android mobile Chrome build).
user_agent = "Mozilla/5.0 (Linux; Android 9; SM-G975F) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.83 Mobile Safari/537.36"

# Chrome launch options carrying the custom user-agent.
webdriver_options = Options()
webdriver_options.add_argument(f'user-agent={user_agent}')

# Let webdriver_manager supply the chromedriver binary, then start Chrome.
chrome_service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=chrome_service, options=webdriver_options)

# Element lookups wait up to 1 second before giving up.
driver.implicitly_wait(1)

# Open the login page. NOTE(review): nothing here submits credentials, so the
# login is presumably completed by hand in the opened browser window.
url = 'https://everytime.kr/login'
driver.get(url)

In [None]:
# Crawl the depression board: visit pages 1-10, collect each post's title
# and body text, then save everything to a CSV file.
base_url = 'https://everytime.kr/413131'
posts_per_page = 20  # at most this many posts are taken from each page

title = []
contents = []

# try/finally guarantees the browser is closed even if a page load or the
# parsing raises; otherwise a Chrome process would be left running.
try:
    driver.get(base_url)

    for page in tqdm(range(1, 11)):
        url = base_url + f'/p/{page}'

        # Random pause before each request so requests are not evenly spaced.
        rand_value = random.uniform(5, 8)
        time.sleep(rand_value)
        driver.get(url)

        # Give the page a moment to finish rendering before reading its HTML.
        time.sleep(3)
        soup = BeautifulSoup(driver.page_source, 'html.parser')

        # Title and body tags of the posts on this page. Slicing (instead of
        # indexing 0..19) tolerates pages that hold fewer than 20 posts.
        h2_tags = soup.find_all('h2')[:posts_per_page]
        p_tags = soup.find_all('p', attrs={'class': 'medium'})[:posts_per_page]

        # zip keeps titles and bodies paired, so both lists always have the
        # same length and the DataFrame below can be built.
        for h2_tag, p_tag in zip(h2_tags, p_tags):
            title.append(h2_tag.text)
            contents.append(p_tag.text)

        # Second pause of the same length, kept so the overall request pacing
        # matches the original crawl.
        time.sleep(rand_value)
finally:
    driver.quit()

df = pd.DataFrame({'title': title, 'content': contents})
df.to_csv('everytime_depression.csv', index=False)

In [None]:
# Draw a histogram of text lengths (in characters) for the combined text of
# each post.
# The DataFrame built by the crawl only has 'title' and 'content' columns, so
# df['combined'] would raise KeyError. Use an existing 'combined' column when
# there is one; otherwise join title and content.
# NOTE(review): the single-space separator is an assumption - confirm how the
# 'combined' column was meant to be defined.
if 'combined' in df.columns:
    combined_text = df['combined']
else:
    combined_text = df['title'].fillna('') + ' ' + df['content'].fillna('')

plt.hist(combined_text.apply(len), bins=20, color='blue', edgecolor='black')
plt.xlabel('Text Length')
plt.ylabel('Frequency')
plt.title('Text Length Histogram for Combined Column')
plt.show()

# 네이버 크롤링

In [None]:
import pandas as pd
import numpy as np
import os
import sys
import urllib.request
from urllib.request import urlopen
from bs4 import BeautifulSoup
import datetime
import csv
import requests
from collections import Counter

In [None]:
# Collect question URLs from the Naver Knowledge-iN search API for one
# keyword and store them in a one-column DataFrame (`df['url']`).
import json
import urllib.error
import urllib.parse

# Each user must issue their own Naver API credentials.
client_id = ""
client_secret = ""
encText = urllib.parse.quote("우울증")  # search keyword, URL-encoded
links = []

# Fetch results 100 at a time: start = 1, 101, ..., 1001.
# NOTE(review): Naver documents a maximum `start` of 1000, so the last request
# is presumably rejected and simply ends the loop - confirm.
for i in range(11):
    start = 1 + 100 * i
    url = "https://openapi.naver.com/v1/search/kin?query=" + encText + "&display=100" + f"&start={start}"  # JSON result
    request = urllib.request.Request(url)
    request.add_header("X-Naver-Client-Id", client_id)
    request.add_header("X-Naver-Client-Secret", client_secret)

    # Only network/HTTP errors are caught; anything else (including
    # KeyboardInterrupt) propagates instead of being silently swallowed.
    try:
        with urllib.request.urlopen(request) as response:
            rescode = response.getcode()
            response_body = response.read()
    except urllib.error.URLError as err:
        print(f"Request failed at start={start}: {err}")
        break

    if rescode != 200:
        # str() is required: concatenating the int status raised TypeError.
        print("Error Code:" + str(rescode))
        break

    # Parse the body as JSON instead of splitting on '{' and slicing fixed
    # offsets; json.loads also turns the escaped '\/' back into '/'.
    try:
        payload = json.loads(response_body.decode('utf-8'))
    except ValueError as err:
        print(f"Could not parse response at start={start}: {err}")
        break

    links.extend(item['link'] for item in payload.get('items', []) if item.get('link'))

# Naming the column up front also works when no links were collected.
df = pd.DataFrame({'url': links})

In [None]:
# Scrape every collected Knowledge-iN page: extract the question title and
# body plus each answer's text. One row is recorded per non-empty answer.

# Result lists; titles/questions/answers/qa_urls grow in lockstep.
titles = []
questions = []
answers = []
qa_urls = []
failed_urls = []  # URLs that could not be fetched or whose question could not be parsed
a_number = []

# The question heading appears under several CSS variants. They are tried in
# this order; the flag says whether a separate body element is read. For the
# last two variants only the title is read and a fixed placeholder is stored
# as the question body.
heading_base = "#content > div.question-content > div > div.c-heading._questionContentsArea."
heading_variants = [
    ("c-heading--default-old", True),
    ("c-heading--multiple-old", True),
    ("c-heading--default", False),
    ("c-heading--multiple", False),
]
title_suffix = " > div.c-heading__title > div.c-heading__title-inner > div.title"
content_suffix = " > div.c-heading__content"

# Class attributes of the element holding an answer's text, in lookup order.
answer_text_classes = (
    'se-module se-module-text',
    '_endContentsText c-heading-answer__content-user',
)

for i, url in enumerate(df.url):
    print("---------------------------",i,"---------------------------")
    print(url)

    # A timeout keeps one stalled connection from hanging the whole crawl, and
    # a network error skips this URL instead of aborting the loop.
    try:
        response = requests.get(url, timeout=10)
    except requests.RequestException as err:
        print(f"request failed: {err}")
        failed_urls.append(url)
        continue

    if response.status_code != 200:
        print(response.status_code)
        continue

    soup = BeautifulSoup(response.text, 'html.parser')

    # Title and question: take the first heading variant that is fully present.
    title = None
    question = None
    for variant, has_body in heading_variants:
        heading = heading_base + variant
        title_node = soup.select_one(heading + title_suffix)
        if title_node is None:
            continue
        if has_body:
            body_node = soup.select_one(heading + content_suffix)
            if body_node is None:
                continue
            question = body_node.get_text().strip()
        else:
            question = '제목과 내용 동일'
        title = title_node.get_text().strip()
        break

    if title is None:
        failed_urls.append(url)
        continue

    # Answers are looked up as #answer_1 .. #answer_9; stop at the first one
    # that is missing or has no recognizable text element.
    for j in range(1, 10):
        answer_node = soup.select_one('#answer_' + str(j))
        if answer_node is None:
            break

        text_node = None
        for css_class in answer_text_classes:
            text_node = answer_node.find('div', {'class': css_class})
            if text_node is not None:
                break
        if text_node is None:
            break

        temp = text_node.text
        if temp != '':
            titles.append(title)
            questions.append(question)
            answers.append(temp)
            qa_urls.append(url)

# Check saved data
print(len(titles))
print(len(questions))
print(len(answers))
print(len(qa_urls))
print(len(failed_urls))

In [None]:
# Assemble the scraped lists into a table, one column per list.
kin_columns = {
    'title': titles,
    'question': questions,
    'answer': answers,
    'url': qa_urls,
}
kin = pd.DataFrame(kin_columns)

# Write the table out as both CSV and Excel, without the index column.
kin.to_csv('naver_kin.csv', index=False)
kin.to_excel('naver_kin.xlsx', index=False)