In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
import time
from bs4 import BeautifulSoup as bs
from tqdm import tqdm
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

### 리뷰 크롤링할 링크 수집

In [145]:
from urllib.parse import urljoin

# Search-result feed to harvest product links from.
SEARCH_URL = 'https://ohou.se/productions/feed?query=%ED%86%A0%ED%8D%BC&search_affect_type=Typing'
BASE_URL = 'https://ohou.se'
# CSS selector for the anchor inside each product card on the feed.
PRODUCT_LINK_SELECTOR = 'article.css-7k423j.etj6rb20>a'
# Number of scroll-to-bottom passes; raise it to load (and collect) more products.
SCROLL_COUNT = 3


def _visible_product_links(page_source):
    """Return the href of every product-card anchor present in ``page_source``.

    Anchors without an ``href`` attribute are skipped instead of raising KeyError.
    """
    soup = bs(page_source, 'lxml')
    return [a['href'] for a in soup.select(PRODUCT_LINK_SELECTOR) if a.get('href')]


driver = webdriver.Chrome()
try:
    driver.get(SEARCH_URL)
    driver.maximize_window()

    # Links that are present before any scrolling.
    hrefs = _visible_product_links(driver.page_source)

    # Each scroll to the bottom is followed by a re-parse of the page,
    # so links that appear after scrolling are picked up as well.
    for _ in range(SCROLL_COUNT):
        driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')
        time.sleep(1)
        hrefs.extend(_visible_product_links(driver.page_source))
finally:
    # This browser is only needed for link collection; close it so it does not
    # linger once the crawl step opens its own driver.
    driver.quit()

# dict.fromkeys removes duplicates while keeping first-seen order, so the
# resulting list is the same from run to run (a plain set() is unordered).
# urljoin turns site-relative hrefs into full URLs and leaves absolute ones intact.
url_list = [urljoin(BASE_URL, href) for href in dict.fromkeys(hrefs)]

In [150]:
import pickle

# Persist the collected product URLs to topper_url.pkl so a later cell can reload them.
with open('topper_url.pkl', mode='wb') as url_file:
    url_file.write(pickle.dumps(url_list))

### 크롤링

In [None]:
import pickle

# Restore the product URL list that was pickled into topper_url.pkl.
with open('topper_url.pkl', mode='rb') as url_file:
    url_list = pickle.loads(url_file.read())

In [None]:
# Initialize the Chrome WebDriver; collect_reviews() reads it through the module-level name `driver`.
driver = webdriver.Chrome()

# Review scraper for a single product page.
def collect_reviews(url, max_reviews=200):
    """Collect star labels, review texts, user names and dates for one product.

    Drives the module-level Selenium ``driver``: opens ``url``, tries to click
    the review tab, then repeatedly clicks a pagination button and parses the
    review fields out of the rendered HTML with BeautifulSoup.

    Args:
        url: Product page URL to open.
        max_reviews: Paging stops once at least this many reviews have been
            gathered. Whole pages are appended, so the result may overshoot.

    Returns:
        ``(star_list, review_list, user_list, date_list)``: four lists of
        strings that always have the same length. A field that could not be
        found for a review is stored as ``''`` so the columns stay aligned
        in count and can be put into one DataFrame.
    """
    review_tab_xpath = '/html/body/div[1]/div/div/div[2]/div[4]/div/nav/ol/li[2]/a'
    pagination_xpath = '/html/body/div[1]/div/div/div[2]/div[5]/div/div[1]/div/section[2]/div/ul/li[{}]/button'
    # Give up on a product after this many consecutive pages with no review
    # fields at all; otherwise a selector that no longer matches could keep the
    # loop clicking forever.
    max_empty_pages = 3

    driver.get(url)
    # Maximizing can fail (e.g. window already maximized); that is not fatal.
    try:
        driver.maximize_window()
    except Exception as e:
        print(f"Maximize window failed: {e}")
    time.sleep(1)

    # Best effort: products without a review tab are simply paged as they are.
    # `Exception` (not a bare except) so Ctrl-C still interrupts the crawl.
    try:
        driver.find_element(By.XPATH, review_tab_xpath).click()
    except Exception as e:
        print(f"Review tab not clicked for {url}: {type(e).__name__}")

    star_list = []
    review_list = []
    user_list = []
    date_list = []

    # NOTE(review): the loop clicks a pagination button before the first scrape
    # and starts at index 2, so the page shown right after opening the review
    # tab may never be collected. Confirm against the live DOM.
    current_page = 2
    seen_rows = set()
    empty_streak = 0

    try:
        while len(review_list) < max_reviews:
            time.sleep(0.5)

            # Indices up to 5 address the button directly; from 6 on the
            # numbering shifts and the 7th list item is clicked instead.
            button_index = current_page if current_page <= 5 else 7

            try:
                WebDriverWait(driver, 10).until(
                    EC.element_to_be_clickable((By.XPATH, pagination_xpath.format(button_index)))
                ).click()
            except Exception as e:
                print(f"Failed to click next page button on page {current_page}: {e}")
                break

            time.sleep(1)  # wait for the page to load

            soup = bs(driver.page_source, 'lxml')

            # A star element without aria-label contributes '' rather than
            # being dropped, so the star column keeps one entry per element.
            stars = [
                tag.get('aria-label') or ''
                for tag in soup.find_all('span', class_='production-review-item__writer__info__total-star')
            ]
            texts = [tag.text for tag in soup.select('.production-review-item__description')]
            users = [tag.text for tag in soup.select('.production-review-item__writer__info__name')]
            dates = [tag.text for tag in soup.select('.production-review-item__writer__info__date')]

            columns = (stars, texts, users, dates)
            page_size = max(len(column) for column in columns)

            if page_size == 0:
                empty_streak += 1
                if empty_streak >= max_empty_pages:
                    print(f"No reviews found on {empty_streak} consecutive pages of {url}, moving to next URL")
                    break
                current_page += 1
                continue
            empty_streak = 0

            if len({len(column) for column in columns}) > 1:
                # The fields are selected independently, so a missing element
                # makes the columns uneven. Pad with '' to keep equal lengths;
                # rows on this page may be shifted, hence the warning.
                print(f"Uneven field counts on page {current_page} of {url}; padding with empty strings")
                for column in columns:
                    column.extend([''] * (page_size - len(column)))

            # Detect a page that did not advance by comparing whole
            # (user, date, text) rows. Matching on text alone would stop early
            # whenever two different people wrote the same short review.
            page_rows = list(zip(users, dates, texts))
            if any(row in seen_rows for row in page_rows):
                print(f"Duplicate reviews found on page {current_page} of {url}, moving to next URL")
                break
            seen_rows.update(page_rows)

            star_list.extend(stars)
            review_list.extend(texts)
            user_list.extend(users)
            date_list.extend(dates)

            current_page += 1

            # Report progress; the while condition ends the loop once the
            # target count has been reached.
            print(f"Collected {len(review_list)} reviews from {url}")
            if len(review_list) >= max_reviews:
                print(f"Stopping as collected {len(review_list)} reviews from {url}")
                break

    except Exception as e:
        print(f"Error on page {current_page} of {url}: {e}")

    return star_list, review_list, user_list, date_list

# Accumulators for the reviews of every product.
all_star_list = []
all_review_list = []
all_user_list = []
all_date_list = []

# Iterate the list loaded from topper_url.pkl. The previous name,
# `iot_url_list`, is not defined anywhere in this notebook and raised a
# NameError on a fresh kernel.
try:
    for url in tqdm(url_list):
        star_list, review_list, user_list, date_list = collect_reviews(url)
        all_star_list.extend(star_list)
        all_review_list.extend(review_list)
        all_user_list.extend(user_list)
        all_date_list.extend(date_list)
finally:
    # Close the browser even if the crawl is interrupted or fails part-way;
    # whatever was collected so far stays in the all_* lists.
    driver.quit()

# Summary of what was collected (for a quick sanity check).
print(f"Collected {len(all_star_list)} stars, {len(all_review_list)} reviews, {len(all_user_list)} users, and {len(all_date_list)} dates in total")

### 데이터 전처리

In [None]:
# Drop newline characters from every review text (in place, same list object).
all_review_list[:] = [text.replace('\n', '') for text in all_review_list]

# Remove the ' ∙ 오늘의집 구매' / ' ∙ 오늘의집 비구매' markers from the date strings.
for marker in (' ∙ 오늘의집 구매', ' ∙ 오늘의집 비구매'):
    all_date_list[:] = [stamp.replace(marker, '') for stamp in all_date_list]

# Strip the '별점 ' prefix and the '점' character from each star label.
all_star_list[:] = [
    label.replace('별점 ', '').replace('점', '')
    for label in all_star_list
]

### 데이터 프레임으로 저장

In [None]:
import pandas as pd

# Column name -> collected values; each list becomes one DataFrame column.
topper_review = dict(
    zip(
        ('작성자', '작성날짜', '별점', '내용'),
        (all_user_list, all_date_list, all_star_list, all_review_list),
    )
)
topper = pd.DataFrame(data=topper_review)