In [20]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import time
import re

In [21]:

class MusinsaCrawling:
    def __init__(self, base_url, driver_path):
        self.base_url = base_url
        service = Service(driver_path)
        self.driver = webdriver.Chrome(service=service)
    
    def get_urls(self, num_items):
        url_data = []
        self.driver.get(self.base_url)
        
        # 페이지가 완전히 로드될 때까지 대기
        WebDriverWait(self.driver, 20).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, 'a.sc-eldOKa.eYuOFs.gtm-view-item-list.gtm-select-item'))
        )
        
        while len(url_data) < num_items:
            # 현재 로드된 모든 상품 링크를 찾기
            selected_links = self.driver.find_elements(By.CSS_SELECTOR, 'a.sc-eldOKa.eYuOFs.gtm-view-item-list.gtm-select-item')
            
            # 현재까지 수집된 URL 수 출력
            print(f"수집된 URL 수: {len(url_data)}")
            
            for link in selected_links:
                href = link.get_attribute('href')
                if href and href not in url_data:  # 중복 방지
                    url_data.append(href)
                if len(url_data) >= num_items:
                    break
            
            if len(url_data) < num_items:
                # 더 많은 상품을 로드하기 위해 스크롤
                self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                time.sleep(0.5)  # 로딩 시간을 충분히 대기
        
        return url_data
    
    def close(self):
        self.driver.quit()

class MusinsaDetailScraper:
    def __init__(self, driver_path):
        service = Service(driver_path)
        self.driver = webdriver.Chrome(service=service)
    
    def scrape_product_details(self, product_url):
        try:
            self.driver.get(product_url)
            
            # 페이지가 완전히 로드될 때까지 대기
            WebDriverWait(self.driver, 30).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, 'span.text-lg.font-medium.break-all.flex-1.pr-3.font-pretendard'))
            )
        except Exception as e:
            print(f"Failed to load {product_url}: {e}")
            return None
        
        product_details = {}

        # 상품명
        try:
            product_name = self.driver.find_element(By.CSS_SELECTOR, 'span.text-lg.font-medium.break-all.flex-1.pr-3.font-pretendard').text
        except Exception as e:
            product_name = None
            print(f"Failed to retrieve product name for {product_url}: {e}")
        product_details['상품명'] = product_name
        
        # 상품 URL
        product_details['상품URL'] = product_url
        
        # 브랜드
        try:
            brand_element = self.driver.find_element(By.CSS_SELECTOR, 'div.sc-11x022e-2.dVnbGG span.text-sm.font-medium.font-pretendard')
            brand = brand_element.text
            product_details['브랜드'] = brand
        except Exception as e:
            print(f"Failed to retrieve brand for {product_url}: {e}")
            product_details['브랜드'] = None
        
        # 품번
        try:
            product_code_element = self.driver.find_element(By.XPATH, '//span[text()="품번"]/following-sibling::span')
            product_code = product_code_element.text
            product_details['품번'] = product_code
        except Exception as e:
            print(f"Failed to retrieve product code for {product_url}: {e}")
            product_details['품번'] = None
        
        # 성별
        try:
            gender_element = self.driver.find_element(By.XPATH, '//span[text()="성별"]/following-sibling::span')
            gender = gender_element.text
            product_details['성별'] = gender
        except Exception as e:
            print(f"Failed to retrieve gender for {product_url}: {e}")
            product_details['성별'] = None
        
        # 시즌
        try:
            season_element = self.driver.find_element(By.XPATH, '//span[text()="시즌"]/following-sibling::span')
            season = season_element.text
            product_details['시즌'] = season
        except Exception as e:
            print(f"Failed to retrieve season for {product_url}: {e}")
            product_details['시즌'] = None
        
        # 1달간 조회수
        try:
            views_1month_element = self.driver.find_element(By.XPATH, '//span[text()="조회수"]/following-sibling::span')
            views_1month = views_1month_element.text
            product_details['1달간 조회수'] = views_1month
        except Exception as e:
            print(f"Failed to retrieve views (1 month) for {product_url}: {e}")
            product_details['1달간 조회수'] = None
        
        # 1년간 구매수
        try:
            purchases_1year_element = self.driver.find_element(By.XPATH, '//span[text()="구매수"]/following-sibling::span')
            purchases_1year = purchases_1year_element.text
            product_details['1년간 구매수'] = purchases_1year
        except Exception as e:
            print(f"Failed to retrieve purchases (1 year) for {product_url}: {e}")
            product_details['1년간 구매수'] = None
        
        # 상품 판매가 (항상 존재)
        try:
            sale_price_element = self.driver.find_element(By.CSS_SELECTOR, 'span.text-lg.font-semibold.text-black.font-pretendard')
            sale_price = re.sub(r'\D', '', sale_price_element.text)  # 숫자만 추출
            product_details['상품판매가'] = sale_price
        except Exception as e:
            print(f"Failed to retrieve sale price for {product_url}: {e}")
            product_details['상품판매가'] = None
        
        # 할인율
        try:
            discount_rate_element = self.driver.find_element(By.CSS_SELECTOR, 'span.text-lg.font-semibold.mr-1.text-red.font-pretendard')
            discount_rate = discount_rate_element.text
            product_details['할인율'] = discount_rate
        except Exception as e:
            print(f"Failed to retrieve discount rate for {product_url}: {e}")
            product_details['할인율'] = None

        # 상세 이미지 URL 추출
        detailed_images = self.scrape_detailed_images()
        product_details['상세이미지'] = ', '.join(detailed_images) if detailed_images else None

        # 스냅사진 URL 추출
        snap_photos = self.scrape_snap_photos(product_url)
        product_details['스냅사진'] = ', '.join(snap_photos) if snap_photos else None
        
        return product_details
    
    def scrape_detailed_images(self):
        try:
            image_elements = self.driver.find_elements(By.CSS_SELECTOR, 'div.sc-1q6s5gw-0.cuvDxl img')
            detailed_images = [img.get_attribute('src') for img in image_elements]
            return detailed_images
        except Exception as e:
            print(f"Failed to retrieve detailed images: {e}")
            return []
    
    def scrape_snap_photos(self, product_url):
        # 상품번호 추출
        goods_no = re.search(r'goods/(\d+)', product_url).group(1)
        snap_url = f"https://www.musinsa.com/snap/goods?goodsNo={goods_no}&goodsPlatform=MUSINSA"
        
        try:
            self.driver.get(snap_url)
            
            # 스냅사진 페이지가 로드될 때까지 대기
            WebDriverWait(self.driver, 10).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, 'img.object-cover'))
            )
            
            snap_images = self.driver.find_elements(By.CSS_SELECTOR, 'img.object-cover')
            snap_urls = [img.get_attribute('src') for img in snap_images[:5]]  # 전체 URL을 그대로 저장
            
            return snap_urls
        
        except Exception as e:
            print(f"Failed to retrieve snap photos for {snap_url}: {e}")
            return []
    
    def close(self):
        self.driver.quit()



In [22]:
# 크롤링할 카테고리와 URL
categories = {
    "맨투맨/스웨트": "https://www.musinsa.com/category/001005?gf=A&sortCode=REVIEW",
    "셔츠/블라우스": "https://www.musinsa.com/category/001002?gf=A&sortCode=REVIEW",
    "니트/스웨터": "https://www.musinsa.com/category/001006?gf=A&sortCode=REVIEW"
}

driver_path = '/opt/homebrew/bin/chromedriver'

# 전체 데이터를 저장할 리스트
all_data = []

# 각 카테고리에 대해 크롤링 수행
for category_name, category_url in categories.items():
    # URL 크롤링 클래스 인스턴스 생성
    url_scraper = MusinsaCrawling(category_url, driver_path)
    product_urls = url_scraper.get_urls(num_items=500) # 상품 수
    url_scraper.close()
    
    # 상세 정보 크롤링 클래스 인스턴스 생성
    detail_scraper = MusinsaDetailScraper(driver_path)
    
    # 각 상품의 정보를 저장할 리스트
    category_data = []
    
    # 각 URL에 대해 상세 정보를 크롤링
    for url in product_urls:
        details = detail_scraper.scrape_product_details(url)
        if details:  # 상세 정보를 성공적으로 가져온 경우만 추가
            details['카테고리'] = category_name  # 카테고리 추가
            category_data.append(details)
        time.sleep(1)  # 각 페이지 사이에 잠깐 대기 시간을 줌
    
    # 카테고리별 데이터를 전체 리스트에 추가
    all_data.extend(category_data)
    
    # 드라이버 종료
    detail_scraper.close()

# 수집한 데이터를 데이터프레임으로 변환
df = pd.DataFrame(all_data)

# '카테고리' 열을 첫 번째로 이동
df = df[['카테고리'] + [col for col in df.columns if col != '카테고리']]


수집된 URL 수: 0
Failed to retrieve season for https://www.musinsa.com/app/goods/947067: Message: no such element: Unable to locate element: {"method":"xpath","selector":"//span[text()="시즌"]/following-sibling::span"}
  (Session info: chrome=128.0.6613.85); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#no-such-element-exception
Stacktrace:
	GetHandleVerifier [0x00007FF67B04B632+29090]
	(No symbol) [0x00007FF67AFBE6E9]
	(No symbol) [0x00007FF67AE7B1CA]
	(No symbol) [0x00007FF67AECEFD7]
	(No symbol) [0x00007FF67AECF22C]
	(No symbol) [0x00007FF67AF197F7]
	(No symbol) [0x00007FF67AEF672F]
	(No symbol) [0x00007FF67AF165D9]
	(No symbol) [0x00007FF67AEF6493]
	(No symbol) [0x00007FF67AEC09B1]
	(No symbol) [0x00007FF67AEC1B11]
	GetHandleVerifier [0x00007FF67B36881D+3294093]
	GetHandleVerifier [0x00007FF67B3B4403+3604339]
	GetHandleVerifier [0x00007FF67B3AA2C7+3563063]
	GetHandleVerifier [0x00007FF67B106F16+797318]
	(No symbol) 

Failed to retrieve season for https://www.musinsa.com/app/goods/728000: Message: no such element: Unable to locate element: {"method":"xpath","selector":"//span[text()="시즌"]/following-sibling::span"}
  (Session info: chrome=128.0.6613.85); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#no-such-element-exception
Stacktrace:
	GetHandleVerifier [0x00007FF67B04B632+29090]
	(No symbol) [0x00007FF67AFBE6E9]
	(No symbol) [0x00007FF67AE7B1CA]
	(No symbol) [0x00007FF67AECEFD7]
	(No symbol) [0x00007FF67AECF22C]
	(No symbol) [0x00007FF67AF197F7]
	(No symbol) [0x00007FF67AEF672F]
	(No symbol) [0x00007FF67AF165D9]
	(No symbol) [0x00007FF67AEF6493]
	(No symbol) [0x00007FF67AEC09B1]
	(No symbol) [0x00007FF67AEC1B11]
	GetHandleVerifier [0x00007FF67B36881D+3294093]
	GetHandleVerifier [0x00007FF67B3B4403+3604339]
	GetHandleVerifier [0x00007FF67B3AA2C7+3563063]
	GetHandleVerifier [0x00007FF67B106F16+797318]
	(No symbol) [0x00007FF67A

In [24]:
# 데이터프레임을 CSV 파일로 저장
df.to_csv('musinsa_image_crawling.csv', index=False, encoding='utf-8-sig')

# 결과 출력
print(df)

       카테고리                                                상품명  \
0   맨투맨/스웨트                                                      
1   맨투맨/스웨트                                  980G 피그먼트 맨투맨-챠콜-   
2   맨투맨/스웨트  [2PACK] 유니)C/P기모쭈리 - LOCATION 맨투맨(세미오버핏)(15COL...   
3   맨투맨/스웨트                      헤비 코튼 오버 럭비 맨투맨_Midnight Blue   
4   맨투맨/스웨트                                     베츠 어센틱 맨투맨 그레이   
5   셔츠/블라우스                      [링클프리] 오버핏 옥스포드 셔츠_SPYWE49C51   
6   셔츠/블라우스                                릴렉스 핏 옥스포드 셔츠 [화이트]   
7   셔츠/블라우스                               솔리드 옥스포드 오버셔츠(스카이블루)   
8   셔츠/블라우스                                                      
9   셔츠/블라우스                                  베이식 옥스포드 셔츠 [화이트]   
10   니트/스웨터                                  [세트] 하프 폴라 니트 티셔츠   
11   니트/스웨터                                 [리뉴얼] 하프 터틀넥 니트 세트   
12   니트/스웨터                               워셔블 하찌 니트 - 12 COLOR   
13   니트/스웨터                                      화란 세미오버 니트 블랙   
14   니트/스웨