# 쿠팡 상품 후기 크롤링 프로그램 (리뷰 내용 제외 버전)

## 프로그램 설명
쿠팡 상품 페이지의 리뷰를 자동으로 수집하되, 리뷰 내용은 제외하고 메타데이터만 수집하는 프로그램입니다. PyQt5를 활용한 GUI 인터페이스를 제공합니다.

## 주요 기능
- 쿠팡 상품 리뷰 메타데이터 수집 (리뷰 내용 제외)
- PyQt5 기반 GUI 인터페이스
- 진행 상황 표시 (프로그레스 바)
- 결과 미리보기 (테이블)
- CSV 파일 저장 (엑셀에서 열 수 있도록 UTF-8 BOM 인코딩 사용)

## 사용 방법
1. 프로그램 실행 (GUI 창 열림)
2. 쿠팡 상품 URL 입력
3. 크롤링할 페이지 수 입력
4. 크롤링 시작 버튼 클릭
5. 결과 확인 후 저장 대화상자에서 파일명을 지정하여 저장

## 필요 라이브러리
- selenium: 웹 크롤링
- pandas: 데이터 처리
- PyQt5: GUI 인터페이스

## 주의사항
- 쿠팡 웹사이트 구조 변경 시 수정이 필요할 수 있습니다
- ChromeDriver 경로가 설정되어 있어야 합니다

In [None]:
import sys
import os
import subprocess
import re
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import NoSuchElementException, WebDriverException, TimeoutException, ElementClickInterceptedException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import logging
import pandas as pd
from datetime import datetime
import time
from PyQt5.QtWidgets import QApplication, QWidget, QVBoxLayout, QHBoxLayout, QLabel, QLineEdit, QPushButton, QTextEdit, QFileDialog, QProgressBar, QTableWidget, QTableWidgetItem, QHeaderView
from PyQt5.QtCore import Qt, QThread, pyqtSignal

# Constants
# Location of the ChromeDriver executable. The historical Windows path stays
# the default; set the CHROME_DRIVER_PATH environment variable to run the
# crawler on a machine where the driver lives elsewhere.
CHROME_DRIVER_PATH = os.environ.get("CHROME_DRIVER_PATH", r"C:\chromedriver-win64\chromedriver.exe")
# User-agent string passed to Chrome via the "user-agent" command-line switch.
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.150 Safari/537.36"
# File name pre-filled in the "save file" dialog.
DEFAULT_FILENAME = "쿠팡상품후기크롤링"
# Timeout (seconds) given to WebDriverWait while waiting for reviews to appear.
WAIT_TIME = 60
# Pause (seconds) after scrolling to the bottom of the page.
SCROLL_PAUSE_TIME = 2
# Pause (seconds) after clicking a pagination button.
PAGE_LOAD_PAUSE_TIME = 5

# Logging setup
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# WebDriver options setup (shared by every Chrome instance the crawler starts)
options = Options()
options.add_argument("--disable-blink-features=AutomationControlled")
options.add_argument(f"user-agent={USER_AGENT}")

class CrawlerThread(QThread):
    """Background thread that collects Coupang review metadata via Selenium.

    Only rating, author, date, product name and seller name are read from
    each review element; the review body is never accessed. Progress, log
    lines and results are reported through the Qt signals declared below.
    """

    # Percentage (0-100) of the requested pages processed so far.
    update_progress = pyqtSignal(int)
    # One status line for the log view.
    update_log = pyqtSignal(str)
    # One record as returned by extract_review_info().
    update_table = pyqtSignal(dict)
    # All collected records; emitted by run() when at least one exists.
    crawling_finished = pyqtSignal(list)
    # Emitted by run() when no record was collected.
    crawling_stopped = pyqtSignal()

    def __init__(self, product_url, pages_to_crawl):
        """Store the crawl parameters.

        Args:
            product_url: URL that the browser opens.
            pages_to_crawl: Maximum number of review pages to visit.
        """
        super().__init__()
        self.product_url = product_url
        self.pages_to_crawl = pages_to_crawl
        self.is_running = True

    def run(self):
        """Thread entry point: crawl, then report the outcome with a signal."""
        review_list = self.crawl_reviews()
        if review_list:
            self.crawling_finished.emit(review_list)
        else:
            self.crawling_stopped.emit()

    def stop(self):
        """Request the crawl to stop; the flag is checked before each page."""
        self.is_running = False

    def crawl_reviews(self):
        """Open the product page and collect review metadata page by page.

        The browser is always closed, even when Selenium raises, and the
        records gathered before a failure are returned instead of being
        discarded.

        Returns:
            list[dict]: Records produced by extract_review_info(); empty when
            nothing could be collected.
        """
        review_list = []
        driver = None
        try:
            driver = webdriver.Chrome(service=Service(CHROME_DRIVER_PATH), options=options)
            driver.get(self.product_url)

            for current_page in range(1, self.pages_to_crawl + 1):
                if not self.is_running:
                    self.update_log.emit("크롤링이 사용자에 의해 중지되었습니다.")
                    break

                # Only the wait is guarded here so that the "page did not
                # load" message is not reported for unrelated timeouts.
                try:
                    WebDriverWait(driver, WAIT_TIME).until(
                        EC.presence_of_element_located((By.CLASS_NAME, "js_reviewArticleRatingValue"))
                    )
                except TimeoutException:
                    self.update_log.emit(f"리뷰 페이지 {current_page}가 로드되지 않았습니다. 크롤링을 종료합니다.")
                    break

                review_elements = driver.find_elements(By.CLASS_NAME, "sdp-review__article__list__info")
                self.update_log.emit(f"페이지 {current_page}에서 찾은 리뷰 수: {len(review_elements)}")

                for review_element in review_elements:
                    review_info = self.extract_review_info(review_element, len(review_list) + 1)
                    if review_info:
                        review_list.append(review_info)
                        self.update_table.emit(review_info)

                if current_page < self.pages_to_crawl and not self.go_to_next_page(driver, current_page):
                    self.update_log.emit("더 이상 다음 페이지로 이동할 수 없습니다. 크롤링을 종료합니다.")
                    break
                self.update_progress.emit(int((current_page / self.pages_to_crawl) * 100))

        except WebDriverException as e:
            self.update_log.emit(f"웹 드라이버 오류: {e}")
        finally:
            # Always release the browser process, whatever ended the crawl.
            if driver is not None:
                try:
                    driver.quit()
                except WebDriverException as e:
                    self.update_log.emit(f"웹 드라이버 오류: {e}")

        return review_list

    def extract_review_info(self, review_element, review_number):
        """Build one record from a review element.

        Args:
            review_element: Selenium element containing a single review header.
            review_number: Sequence number stored under the "순번" key.

        Returns:
            dict | None: The record, or None (after logging) when any of the
            expected child elements is missing.
        """
        try:
            rating = review_element.find_element(By.CLASS_NAME, "js_reviewArticleRatingValue").get_attribute("data-rating")
            user_name = review_element.find_element(By.CLASS_NAME, "sdp-review__article__list__info__user__name").text
            review_date = review_element.find_element(By.CLASS_NAME, "sdp-review__article__list__info__product-info__reg-date").text
            product_name = review_element.find_element(By.CLASS_NAME, "sdp-review__article__list__info__product-info__name").text
            seller_name = review_element.find_element(By.CLASS_NAME, "sdp-review__article__list__info__product-info__seller_name").text
        except NoSuchElementException as e:
            self.update_log.emit(f"리뷰 정보를 찾을 수 없습니다: {e}")
            return None

        return {
            "순번": review_number,
            "상품 정보": product_name,
            "작성 일자": review_date,
            "작성자": user_name,
            "평점": rating,
            "판매자 정보": seller_name
        }

    def go_to_next_page(self, driver, current_page):
        """Click the pagination control that leads to ``current_page + 1``.

        Args:
            driver: Active Selenium WebDriver.
            current_page: 1-based number of the page currently shown.

        Returns:
            bool: True after the click and the post-click pause; False (after
            logging) when the control cannot be found or clicked in time.
        """
        try:
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(SCROLL_PAUSE_TIME)

            # After every tenth page the "next" button is used instead of a
            # numbered button (presumably the pager lists ten pages at a
            # time - confirm against the live site).
            if current_page % 10 == 0:
                selector = 'button.sdp-review__article__page__next'
            else:
                selector = f'button.sdp-review__article__page__num[data-page="{current_page + 1}"]'

            next_page = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.CSS_SELECTOR, selector))
            )
            driver.execute_script("arguments[0].click();", next_page)
            time.sleep(PAGE_LOAD_PAUSE_TIME)
            return True
        except (TimeoutException, NoSuchElementException, ElementClickInterceptedException) as e:
            # WebDriverWait.until() signals a missing button with
            # TimeoutException, so it has to be handled here as well.
            self.update_log.emit(f"다음 페이지로 이동 중 오류 발생: {e}")
            return False

class CoupangReviewCrawlerGUI(QWidget):
    """Main window: collects crawl parameters, shows progress and saves results."""

    # Column headers of the preview table and of the exported CSV, in order.
    COLUMNS = ["순번", "상품 정보", "작성 일자", "작성자", "평점", "판매자 정보"]

    def __init__(self):
        super().__init__()
        # Worker created by start_crawling(); None until the first crawl.
        self.crawler_thread = None
        self.initUI()

    def initUI(self):
        """Build the widgets and wire the button signals."""
        self.setWindowTitle('쿠팡 리뷰 크롤러')
        self.setGeometry(300, 300, 1000, 800)

        layout = QVBoxLayout()

        # Product URL row
        url_layout = QHBoxLayout()
        self.url_input = QLineEdit()
        url_layout.addWidget(QLabel('상품 URL:'))
        url_layout.addWidget(self.url_input)
        layout.addLayout(url_layout)

        # Page-count row
        pages_layout = QHBoxLayout()
        self.pages_input = QLineEdit()
        pages_layout.addWidget(QLabel('크롤링할 페이지 수:'))
        pages_layout.addWidget(self.pages_input)
        layout.addLayout(pages_layout)

        # Action buttons
        button_layout = QHBoxLayout()
        self.crawl_button = QPushButton('크롤링 시작')
        self.crawl_button.clicked.connect(self.start_crawling)
        self.reset_button = QPushButton('입력 초기화')
        self.reset_button.clicked.connect(self.reset_inputs)
        self.exit_button = QPushButton('종료')
        self.exit_button.clicked.connect(self.close)
        for button in (self.crawl_button, self.reset_button, self.exit_button):
            button_layout.addWidget(button)
        layout.addLayout(button_layout)

        self.progress_bar = QProgressBar()
        layout.addWidget(self.progress_bar)

        self.log_output = QTextEdit()
        self.log_output.setReadOnly(True)
        self.log_output.setMaximumHeight(100)
        layout.addWidget(self.log_output)

        self.result_table = QTableWidget()
        self.result_table.setColumnCount(len(self.COLUMNS))
        self.result_table.setHorizontalHeaderLabels(self.COLUMNS)
        self.result_table.horizontalHeader().setSectionResizeMode(QHeaderView.Stretch)
        layout.addWidget(self.result_table)

        self.setLayout(layout)

    def start_crawling(self):
        """Validate the inputs and start the background crawl.

        Invalid input is reported in the log view instead of raising inside
        the Qt slot; nothing is cleared or started in that case.
        """
        product_url = self.url_input.text().strip()
        if not product_url:
            self.update_log("상품 URL을 입력하세요.")
            return

        try:
            pages_to_crawl = int(self.pages_input.text().strip())
        except ValueError:
            pages_to_crawl = 0  # handled by the range check below
        if pages_to_crawl < 1:
            self.update_log("크롤링할 페이지 수는 1 이상의 정수여야 합니다.")
            return

        self.result_table.setRowCount(0)  # Clear existing rows
        self.progress_bar.setValue(0)
        self.log_output.clear()

        self.crawler_thread = CrawlerThread(product_url, pages_to_crawl)
        self.crawler_thread.update_progress.connect(self.update_progress_bar)
        self.crawler_thread.update_log.connect(self.update_log)
        self.crawler_thread.update_table.connect(self.update_table)
        self.crawler_thread.crawling_finished.connect(self.save_results)
        self.crawler_thread.crawling_stopped.connect(self.handle_crawling_stopped)
        self.crawler_thread.start()

        # While a crawl is running the same button acts as "stop".
        self.crawl_button.setText('크롤링 중지')
        self.crawl_button.clicked.disconnect()
        self.crawl_button.clicked.connect(self.stop_crawling)

    def stop_crawling(self):
        """Ask the worker to stop and disable the button until it reports back."""
        self.crawler_thread.stop()
        self.crawl_button.setEnabled(False)

    def reset_inputs(self):
        """Clear both inputs, the log, the progress bar and the table."""
        self.url_input.clear()
        self.pages_input.clear()
        self.log_output.clear()
        self.progress_bar.setValue(0)
        self.result_table.setRowCount(0)

    def update_progress_bar(self, value):
        """Show ``value`` (a percentage) on the progress bar."""
        self.progress_bar.setValue(value)

    def update_log(self, message):
        """Append ``message`` to the log view and scroll to the newest line."""
        self.log_output.append(message)
        scroll_bar = self.log_output.verticalScrollBar()
        scroll_bar.setValue(scroll_bar.maximum())

    def update_table(self, review_info):
        """Append one record to the preview table.

        Cells are filled by column header rather than by dict order, so the
        table stays correct whatever the key order of ``review_info`` is.
        """
        row = self.result_table.rowCount()
        self.result_table.insertRow(row)
        for col, header in enumerate(self.COLUMNS):
            self.result_table.setItem(row, col, QTableWidgetItem(str(review_info.get(header, ""))))
        self.result_table.scrollToBottom()

    def save_results(self, review_list):
        """Offer to save ``review_list`` as CSV, then restore the start button.

        The button is restored even when saving fails, so the window never
        stays stuck in the "stop" state.
        """
        try:
            if review_list:
                self._export_reviews(review_list)
            else:
                self.update_log("크롤링된 리뷰가 없습니다.")
        finally:
            self._restore_crawl_button()

    def _export_reviews(self, review_list):
        """Ask for a target file, write the CSV and open it; log any write error."""
        dialog_options = QFileDialog.Options()
        fileName, _ = QFileDialog.getSaveFileName(self, "Save File", DEFAULT_FILENAME, "CSV Files (*.csv)", options=dialog_options)
        if not fileName:
            return  # dialog cancelled

        df = pd.DataFrame(review_list)[self.COLUMNS]
        try:
            # utf-8-sig writes a BOM in front of the data.
            df.to_csv(fileName, index=False, encoding='utf-8-sig')
        except OSError as e:
            # e.g. the target file is locked by another program.
            self.update_log(f"파일을 저장할 수 없습니다: {e}")
            return

        self.update_log(f"크롤링 결과를 {fileName} 파일로 저장했습니다.")
        self.open_file(fileName)

    def _restore_crawl_button(self):
        """Turn the start/stop button back into an enabled "start" button."""
        self.crawl_button.setText('크롤링 시작')
        self.crawl_button.clicked.disconnect()
        self.crawl_button.clicked.connect(self.start_crawling)
        self.crawl_button.setEnabled(True)

    def handle_crawling_stopped(self):
        """Rebuild the records from the preview table and hand them to save_results()."""
        self.update_log("크롤링이 중지되었습니다. 지금까지 수집된 데이터를 저장합니다.")
        review_list = []
        for row in range(self.result_table.rowCount()):
            review_info = {}
            for col, header in enumerate(self.COLUMNS):
                item = self.result_table.item(row, col)
                review_info[header] = item.text() if item is not None else ""
            review_list.append(review_info)
        self.save_results(review_list)

    def open_file(self, file_path):
        """Open ``file_path`` with the platform's default application (best effort)."""
        try:
            if sys.platform.startswith('darwin'):  # macOS
                subprocess.call(('open', file_path))
            elif sys.platform.startswith('win'):  # Windows
                os.startfile(file_path)
            else:  # linux
                subprocess.call(('xdg-open', file_path))
            self.update_log(f"{file_path} 파일이 열렸습니다.")
        except Exception as e:
            # Opening the file is a convenience only; report and carry on.
            self.update_log(f"파일을 열 수 없습니다: {e}")

def run_gui():
    """Show the crawler window, reusing an existing QApplication if there is one.

    Returns:
        tuple: ``(app, window)`` - the QApplication and the shown
        CoupangReviewCrawlerGUI instance.
    """
    application = QApplication.instance() or QApplication(sys.argv)
    window = CoupangReviewCrawlerGUI()
    window.show()
    return application, window

# Launch the GUI and exit with the event loop's return code.
if __name__ == "__main__":
    app, ex = run_gui()
    exit_code = app.exec_()
    sys.exit(exit_code)

In [1]:
import sys
import os
import subprocess
import re
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import NoSuchElementException, WebDriverException, TimeoutException, ElementClickInterceptedException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import logging
import pandas as pd
from datetime import datetime
import time
from PyQt5.QtWidgets import QApplication, QWidget, QVBoxLayout, QHBoxLayout, QLabel, QLineEdit, QPushButton, QTextEdit, QFileDialog, QProgressBar, QTableWidget, QTableWidgetItem, QHeaderView
from PyQt5.QtCore import Qt, QThread, pyqtSignal

# Constants
# Location of the ChromeDriver executable. The historical Windows path stays
# the default; set the CHROME_DRIVER_PATH environment variable to run the
# crawler on a machine where the driver lives elsewhere.
CHROME_DRIVER_PATH = os.environ.get("CHROME_DRIVER_PATH", r"C:\chromedriver-win64\chromedriver.exe")
# User-agent string passed to Chrome via the "user-agent" command-line switch.
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.150 Safari/537.36"
# File name pre-filled in the "save file" dialog.
DEFAULT_FILENAME = "쿠팡상품후기크롤링"
# Timeout (seconds) given to WebDriverWait while waiting for reviews to appear.
WAIT_TIME = 60
# Pause (seconds) after scrolling to the bottom of the page.
SCROLL_PAUSE_TIME = 2
# Pause (seconds) after clicking a pagination button.
PAGE_LOAD_PAUSE_TIME = 5

# Logging setup
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# WebDriver options setup (shared by every Chrome instance the crawler starts)
options = Options()
options.add_argument("--disable-blink-features=AutomationControlled")
options.add_argument(f"user-agent={USER_AGENT}")

class CrawlerThread(QThread):
    """Background thread that collects Coupang review metadata via Selenium.

    Only rating, author, date, product name and seller name are read from
    each review element; the review body is never accessed. Progress, log
    lines and results are reported through the Qt signals declared below.
    """

    # Percentage (0-100) of the requested pages processed so far.
    update_progress = pyqtSignal(int)
    # One status line for the log view.
    update_log = pyqtSignal(str)
    # One record as returned by extract_review_info().
    update_table = pyqtSignal(dict)
    # All collected records; emitted by run() when at least one exists.
    crawling_finished = pyqtSignal(list)
    # Emitted by run() when no record was collected.
    crawling_stopped = pyqtSignal()

    def __init__(self, product_url, pages_to_crawl):
        """Store the crawl parameters.

        Args:
            product_url: URL that the browser opens.
            pages_to_crawl: Maximum number of review pages to visit.
        """
        super().__init__()
        self.product_url = product_url
        self.pages_to_crawl = pages_to_crawl
        self.is_running = True

    def run(self):
        """Thread entry point: crawl, then report the outcome with a signal."""
        review_list = self.crawl_reviews()
        if review_list:
            self.crawling_finished.emit(review_list)
        else:
            self.crawling_stopped.emit()

    def stop(self):
        """Request the crawl to stop; the flag is checked before each page."""
        self.is_running = False

    def crawl_reviews(self):
        """Open the product page and collect review metadata page by page.

        The browser is always closed, even when Selenium raises, and the
        records gathered before a failure are returned instead of being
        discarded.

        Returns:
            list[dict]: Records produced by extract_review_info(); empty when
            nothing could be collected.
        """
        review_list = []
        driver = None
        try:
            driver = webdriver.Chrome(service=Service(CHROME_DRIVER_PATH), options=options)
            driver.get(self.product_url)

            for current_page in range(1, self.pages_to_crawl + 1):
                if not self.is_running:
                    self.update_log.emit("크롤링이 사용자에 의해 중지되었습니다.")
                    break

                # Only the wait is guarded here so that the "page did not
                # load" message is not reported for unrelated timeouts.
                try:
                    WebDriverWait(driver, WAIT_TIME).until(
                        EC.presence_of_element_located((By.CLASS_NAME, "js_reviewArticleRatingValue"))
                    )
                except TimeoutException:
                    self.update_log.emit(f"리뷰 페이지 {current_page}가 로드되지 않았습니다. 크롤링을 종료합니다.")
                    break

                review_elements = driver.find_elements(By.CLASS_NAME, "sdp-review__article__list__info")
                self.update_log.emit(f"페이지 {current_page}에서 찾은 리뷰 수: {len(review_elements)}")

                for review_element in review_elements:
                    review_info = self.extract_review_info(review_element, len(review_list) + 1)
                    if review_info:
                        review_list.append(review_info)
                        self.update_table.emit(review_info)

                if current_page < self.pages_to_crawl and not self.go_to_next_page(driver, current_page):
                    self.update_log.emit("더 이상 다음 페이지로 이동할 수 없습니다. 크롤링을 종료합니다.")
                    break
                self.update_progress.emit(int((current_page / self.pages_to_crawl) * 100))

        except WebDriverException as e:
            self.update_log.emit(f"웹 드라이버 오류: {e}")
        finally:
            # Always release the browser process, whatever ended the crawl.
            if driver is not None:
                try:
                    driver.quit()
                except WebDriverException as e:
                    self.update_log.emit(f"웹 드라이버 오류: {e}")

        return review_list

    def extract_review_info(self, review_element, review_number):
        """Build one record from a review element.

        Args:
            review_element: Selenium element containing a single review header.
            review_number: Sequence number stored under the "순번" key.

        Returns:
            dict | None: The record, or None (after logging) when any of the
            expected child elements is missing.
        """
        try:
            rating = review_element.find_element(By.CLASS_NAME, "js_reviewArticleRatingValue").get_attribute("data-rating")
            user_name = review_element.find_element(By.CLASS_NAME, "sdp-review__article__list__info__user__name").text
            review_date = review_element.find_element(By.CLASS_NAME, "sdp-review__article__list__info__product-info__reg-date").text
            product_name = review_element.find_element(By.CLASS_NAME, "sdp-review__article__list__info__product-info__name").text
            seller_name = review_element.find_element(By.CLASS_NAME, "sdp-review__article__list__info__product-info__seller_name").text
        except NoSuchElementException as e:
            self.update_log.emit(f"리뷰 정보를 찾을 수 없습니다: {e}")
            return None

        return {
            "순번": review_number,
            "상품 정보": product_name,
            "작성 일자": review_date,
            "작성자": user_name,
            "평점": rating,
            "판매자 정보": seller_name
        }

    def go_to_next_page(self, driver, current_page):
        """Click the pagination control that leads to ``current_page + 1``.

        Args:
            driver: Active Selenium WebDriver.
            current_page: 1-based number of the page currently shown.

        Returns:
            bool: True after the click and the post-click pause; False (after
            logging) when the control cannot be found or clicked in time.
        """
        try:
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(SCROLL_PAUSE_TIME)

            # After every tenth page the "next" button is used instead of a
            # numbered button (presumably the pager lists ten pages at a
            # time - confirm against the live site).
            if current_page % 10 == 0:
                selector = 'button.sdp-review__article__page__next'
            else:
                selector = f'button.sdp-review__article__page__num[data-page="{current_page + 1}"]'

            next_page = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.CSS_SELECTOR, selector))
            )
            driver.execute_script("arguments[0].click();", next_page)
            time.sleep(PAGE_LOAD_PAUSE_TIME)
            return True
        except (TimeoutException, NoSuchElementException, ElementClickInterceptedException) as e:
            # WebDriverWait.until() signals a missing button with
            # TimeoutException, so it has to be handled here as well.
            self.update_log.emit(f"다음 페이지로 이동 중 오류 발생: {e}")
            return False

class CoupangReviewCrawlerGUI(QWidget):
    """Main window: collects crawl parameters, shows progress and saves results."""

    # Column headers of the preview table and of the exported CSV, in order.
    COLUMNS = ["순번", "상품 정보", "작성 일자", "작성자", "평점", "판매자 정보"]

    def __init__(self):
        super().__init__()
        # Worker created by start_crawling(); None until the first crawl.
        self.crawler_thread = None
        self.initUI()

    def initUI(self):
        """Build the widgets and wire the button signals."""
        self.setWindowTitle('쿠팡 리뷰 크롤러')
        self.setGeometry(300, 300, 1000, 800)

        layout = QVBoxLayout()

        # Product URL row
        url_layout = QHBoxLayout()
        self.url_input = QLineEdit()
        url_layout.addWidget(QLabel('상품 URL:'))
        url_layout.addWidget(self.url_input)
        layout.addLayout(url_layout)

        # Page-count row
        pages_layout = QHBoxLayout()
        self.pages_input = QLineEdit()
        pages_layout.addWidget(QLabel('크롤링할 페이지 수:'))
        pages_layout.addWidget(self.pages_input)
        layout.addLayout(pages_layout)

        # Action buttons
        button_layout = QHBoxLayout()
        self.crawl_button = QPushButton('크롤링 시작')
        self.crawl_button.clicked.connect(self.start_crawling)
        self.reset_button = QPushButton('입력 초기화')
        self.reset_button.clicked.connect(self.reset_inputs)
        self.exit_button = QPushButton('종료')
        self.exit_button.clicked.connect(self.close)
        for button in (self.crawl_button, self.reset_button, self.exit_button):
            button_layout.addWidget(button)
        layout.addLayout(button_layout)

        self.progress_bar = QProgressBar()
        layout.addWidget(self.progress_bar)

        self.log_output = QTextEdit()
        self.log_output.setReadOnly(True)
        self.log_output.setMaximumHeight(100)
        layout.addWidget(self.log_output)

        self.result_table = QTableWidget()
        self.result_table.setColumnCount(len(self.COLUMNS))
        self.result_table.setHorizontalHeaderLabels(self.COLUMNS)
        self.result_table.horizontalHeader().setSectionResizeMode(QHeaderView.Stretch)
        layout.addWidget(self.result_table)

        self.setLayout(layout)

    def start_crawling(self):
        """Validate the inputs and start the background crawl.

        Invalid input is reported in the log view instead of raising inside
        the Qt slot; nothing is cleared or started in that case.
        """
        product_url = self.url_input.text().strip()
        if not product_url:
            self.update_log("상품 URL을 입력하세요.")
            return

        try:
            pages_to_crawl = int(self.pages_input.text().strip())
        except ValueError:
            pages_to_crawl = 0  # handled by the range check below
        if pages_to_crawl < 1:
            self.update_log("크롤링할 페이지 수는 1 이상의 정수여야 합니다.")
            return

        self.result_table.setRowCount(0)  # Clear existing rows
        self.progress_bar.setValue(0)
        self.log_output.clear()

        self.crawler_thread = CrawlerThread(product_url, pages_to_crawl)
        self.crawler_thread.update_progress.connect(self.update_progress_bar)
        self.crawler_thread.update_log.connect(self.update_log)
        self.crawler_thread.update_table.connect(self.update_table)
        self.crawler_thread.crawling_finished.connect(self.save_results)
        self.crawler_thread.crawling_stopped.connect(self.handle_crawling_stopped)
        self.crawler_thread.start()

        # While a crawl is running the same button acts as "stop".
        self.crawl_button.setText('크롤링 중지')
        self.crawl_button.clicked.disconnect()
        self.crawl_button.clicked.connect(self.stop_crawling)

    def stop_crawling(self):
        """Ask the worker to stop and disable the button until it reports back."""
        self.crawler_thread.stop()
        self.crawl_button.setEnabled(False)

    def reset_inputs(self):
        """Clear both inputs, the log, the progress bar and the table."""
        self.url_input.clear()
        self.pages_input.clear()
        self.log_output.clear()
        self.progress_bar.setValue(0)
        self.result_table.setRowCount(0)

    def update_progress_bar(self, value):
        """Show ``value`` (a percentage) on the progress bar."""
        self.progress_bar.setValue(value)

    def update_log(self, message):
        """Append ``message`` to the log view and scroll to the newest line."""
        self.log_output.append(message)
        scroll_bar = self.log_output.verticalScrollBar()
        scroll_bar.setValue(scroll_bar.maximum())

    def update_table(self, review_info):
        """Append one record to the preview table.

        Cells are filled by column header rather than by dict order, so the
        table stays correct whatever the key order of ``review_info`` is.
        """
        row = self.result_table.rowCount()
        self.result_table.insertRow(row)
        for col, header in enumerate(self.COLUMNS):
            self.result_table.setItem(row, col, QTableWidgetItem(str(review_info.get(header, ""))))
        self.result_table.scrollToBottom()

    def save_results(self, review_list):
        """Offer to save ``review_list`` as CSV, then restore the start button.

        The button is restored even when saving fails, so the window never
        stays stuck in the "stop" state.
        """
        try:
            if review_list:
                self._export_reviews(review_list)
            else:
                self.update_log("크롤링된 리뷰가 없습니다.")
        finally:
            self._restore_crawl_button()

    def _export_reviews(self, review_list):
        """Ask for a target file, write the CSV and open it; log any write error."""
        dialog_options = QFileDialog.Options()
        fileName, _ = QFileDialog.getSaveFileName(self, "Save File", DEFAULT_FILENAME, "CSV Files (*.csv)", options=dialog_options)
        if not fileName:
            return  # dialog cancelled

        df = pd.DataFrame(review_list)[self.COLUMNS]
        try:
            # utf-8-sig writes a BOM in front of the data.
            df.to_csv(fileName, index=False, encoding='utf-8-sig')
        except OSError as e:
            # e.g. the target file is locked by another program.
            self.update_log(f"파일을 저장할 수 없습니다: {e}")
            return

        self.update_log(f"크롤링 결과를 {fileName} 파일로 저장했습니다.")
        self.open_file(fileName)

    def _restore_crawl_button(self):
        """Turn the start/stop button back into an enabled "start" button."""
        self.crawl_button.setText('크롤링 시작')
        self.crawl_button.clicked.disconnect()
        self.crawl_button.clicked.connect(self.start_crawling)
        self.crawl_button.setEnabled(True)

    def handle_crawling_stopped(self):
        """Rebuild the records from the preview table and hand them to save_results()."""
        self.update_log("크롤링이 중지되었습니다. 지금까지 수집된 데이터를 저장합니다.")
        review_list = []
        for row in range(self.result_table.rowCount()):
            review_info = {}
            for col, header in enumerate(self.COLUMNS):
                item = self.result_table.item(row, col)
                review_info[header] = item.text() if item is not None else ""
            review_list.append(review_info)
        self.save_results(review_list)

    def open_file(self, file_path):
        """Open ``file_path`` with the platform's default application (best effort)."""
        try:
            if sys.platform.startswith('darwin'):  # macOS
                subprocess.call(('open', file_path))
            elif sys.platform.startswith('win'):  # Windows
                os.startfile(file_path)
            else:  # linux
                subprocess.call(('xdg-open', file_path))
            self.update_log(f"{file_path} 파일이 열렸습니다.")
        except Exception as e:
            # Opening the file is a convenience only; report and carry on.
            self.update_log(f"파일을 열 수 없습니다: {e}")

def run_gui():
    """Show the crawler window, reusing an existing QApplication if there is one.

    Returns:
        tuple: ``(app, window)`` - the QApplication and the shown
        CoupangReviewCrawlerGUI instance.
    """
    application = QApplication.instance() or QApplication(sys.argv)
    window = CoupangReviewCrawlerGUI()
    window.show()
    return application, window

# Launch the GUI and exit with the event loop's return code.
if __name__ == "__main__":
    app, ex = run_gui()
    exit_code = app.exec_()
    sys.exit(exit_code)

SystemExit: 0

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
