In [7]:
import requests
import pandas as pd
import csv
from datetime import datetime

def scrape_reviews_api(goods_no, max_pages=10, page_size=10, timeout=10):
    """Collect reviews for one product from the Musinsa review-list API.

    Pages are requested one after another starting at page 1. Collection
    stops early when a request raises, a non-200 status comes back, or a page
    contains no items; everything gathered up to that point is still returned.

    Args:
        goods_no: Product number sent as the ``goodsNo`` query parameter.
        max_pages: Maximum number of pages to request.
        page_size: Number of reviews requested per page (``pageSize``).
        timeout: Seconds to wait on each HTTP request before giving up.

    Returns:
        A list of dicts, one per review, keyed by the Korean column names
        used for the CSV export.
    """
    reviews = []
    base_url = "https://goods.musinsa.com/api2/review/v1/view/list"
    headers = {
        "user-agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/1337.36 Safari/537.36"
        )
    }

    for page in range(1, max_pages + 1):
        params = {"page": page, "pageSize": page_size, "goodsNo": goods_no}
        try:
            # Without a timeout a stalled connection would block forever.
            res = requests.get(base_url, params=params, headers=headers, timeout=timeout)
        except requests.RequestException as exc:
            # Keep the pages already collected instead of losing them all.
            print(f"[Error] {page}페이지 요청 실패:", exc)
            break

        if res.status_code != 200:
            print(f"[Error] {page}페이지 요청 실패:", res.status_code)
            break

        # ``or`` guards against keys that are present but null in the JSON;
        # a ``.get(key, {})`` default only covers keys that are missing.
        data = res.json().get("data") or {}
        items = data.get("list") or []
        if not items:
            print(f"[End] {page}페이지 리뷰 없음, 종료")
            break

        for r in items:
            # Normalise the creation timestamp to a plain YYYY-MM-DD string.
            raw_date = r.get("createDate")
            if raw_date:
                dt = datetime.fromisoformat(raw_date.replace("Z", "+00:00"))
                new_date = dt.strftime("%Y-%m-%d")
            else:
                new_date = None

            # Product and author sub-objects (may be null in the payload).
            goods_info = r.get("goods") or {}
            user_info = r.get("userProfileInfo") or {}

            reviews.append({
                "상품번호": goods_info.get("goodsNo"),
                "상품이미지": goods_info.get("goodsImageFile"),
                "브랜드명": goods_info.get("brandName"),
                "상품명": goods_info.get("goodsName"),
                "작성자": user_info.get("userNickName"),
                "작성일": new_date,
                "평점": r.get("grade"),
                "리뷰내용": r.get("content"),
            })

    return reviews

if __name__ == "__main__":
    # Product number to crawl.
    goods_no = 3976350

    # Fetch up to 10 pages of 10 reviews each and load them into a frame.
    reviews = scrape_reviews_api(goods_no, max_pages=10, page_size=10)
    df = pd.DataFrame(reviews)

    # utf-8-sig adds a BOM; QUOTE_ALL quotes every field in the output.
    df.to_csv(
        f"musinsa_{goods_no}_reviews.csv",
        index=False,
        encoding="utf-8-sig",
        quoting=csv.QUOTE_ALL,
    )

    print(f" 총 {len(reviews)}개 리뷰 저장 완료 → musinsa_{goods_no}_reviews.csv")

 총 100개 리뷰 저장 완료 → musinsa_3976350_reviews.csv


In [None]:
import re
import requests
import pandas as pd
import streamlit as st
from datetime import datetime

def scrape_reviews_api(goods_no, max_pages=10, page_size=10, timeout=10):
    """Collect reviews for one product, reporting progress through Streamlit.

    Pages are requested one after another starting at page 1. Collection
    stops early when a request raises, a non-200 status comes back, or a page
    contains no items; everything gathered up to that point is still returned.
    Failures are shown with ``st.error`` and the end-of-data notice with
    ``st.info``.

    Args:
        goods_no: Product number sent as the ``goodsNo`` query parameter.
        max_pages: Maximum number of pages to request.
        page_size: Number of reviews requested per page (``pageSize``).
        timeout: Seconds to wait on each HTTP request before giving up.

    Returns:
        A list of dicts, one per review, keyed by the Korean column names.
    """
    reviews = []
    base_url = "https://goods.musinsa.com/api2/review/v1/view/list"
    headers = {"user-agent": ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                              "AppleWebKit/537.36 (KHTML, like Gecko) "
                              "Chrome/1337.36 Safari/537.36")}

    for page in range(1, max_pages + 1):
        params = {"page": page, "pageSize": page_size, "goodsNo": goods_no}
        try:
            # Without a timeout a stalled connection would block forever.
            res = requests.get(base_url, params=params, headers=headers, timeout=timeout)
        except requests.RequestException as exc:
            # Keep the pages already collected instead of losing them all.
            st.error(f"[Error] {page}페이지 요청 실패: {exc}")
            break

        if res.status_code != 200:
            st.error(f"[Error] {page}페이지 요청 실패: {res.status_code}")
            break

        # ``or`` guards against keys that are present but null in the JSON;
        # a ``.get(key, {})`` default only covers keys that are missing.
        data = res.json().get("data") or {}
        items = data.get("list") or []
        if not items:
            st.info(f"[End] {page}페이지 리뷰 없음, 종료")
            break

        for r in items:
            # Normalise the creation timestamp to a plain YYYY-MM-DD string.
            raw_date = r.get("createDate")
            if raw_date:
                dt = datetime.fromisoformat(raw_date.replace("Z", "+00:00"))
                new_date = dt.strftime("%Y-%m-%d")
            else:
                new_date = None

            # Product and author sub-objects (may be null in the payload).
            goods_info = r.get("goods") or {}
            user_info = r.get("userProfileInfo") or {}

            reviews.append({
                "상품번호": goods_info.get("goodsNo"),
                "상품이미지": goods_info.get("goodsImageFile"),
                "브랜드명": goods_info.get("brandName"),
                "상품명": goods_info.get("goodsName"),
                "작성자": user_info.get("userNickName"),
                "작성일": new_date,
                "평점": r.get("grade"),
                "리뷰내용": r.get("content"),
            })
    return reviews

def extract_goods_no(url: str) -> int:
    """Return the product number embedded in a product link.

    The number is the first run of digits that directly follows
    ``/products/`` anywhere in ``url``.

    Raises:
        ValueError: If ``url`` contains no ``/products/<digits>`` segment.
    """
    hit = re.compile(r"/products/(\d+)").search(url)
    if hit is not None:
        return int(hit[1])
    raise ValueError("상품번호를 찾을 수 없습니다. 올바른 상품 링크를 입력하세요.")


In [13]:
import requests
import pandas as pd
import csv
from datetime import datetime

def scrape_reviews_api(goods_no, max_pages=10, page_size=10, timeout=10):
    """Collect reviews and the products they refer to from the Musinsa API.

    Pages are requested one after another starting at page 1, always for the
    ``goods_no`` that was passed in. Collection stops early when a request
    raises, a non-200 status comes back, or a page contains no items;
    everything gathered up to that point is still returned.

    Args:
        goods_no: Product number sent as the ``goodsNo`` query parameter.
        max_pages: Maximum number of pages to request.
        page_size: Number of reviews requested per page (``pageSize``).
        timeout: Seconds to wait on each HTTP request before giving up.

    Returns:
        A ``(products, reviews)`` tuple. ``products`` maps each product number
        seen in the review items to a product record (stored once per
        product); ``reviews`` is a list with one dict per review.
    """
    reviews = []
    products = {}
    base_url = "https://goods.musinsa.com/api2/review/v1/view/list"
    headers = {
        "user-agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/1337.36 Safari/537.36"
        )
    }

    for page in range(1, max_pages + 1):
        params = {"page": page, "pageSize": page_size, "goodsNo": goods_no}
        try:
            # Without a timeout a stalled connection would block forever.
            res = requests.get(base_url, params=params, headers=headers, timeout=timeout)
        except requests.RequestException as exc:
            # Keep the pages already collected instead of losing them all.
            print(f"[Error] {page}페이지 요청 실패:", exc)
            break

        if res.status_code != 200:
            print(f"[Error] {page}페이지 요청 실패:", res.status_code)
            break

        # ``or`` guards against keys that are present but null in the JSON;
        # a ``.get(key, {})`` default only covers keys that are missing.
        data = res.json().get("data") or {}
        items = data.get("list") or []
        if not items:
            print(f"[End] {page}페이지 리뷰 없음, 종료")
            break

        for r in items:
            # Normalise the creation timestamp to a plain YYYY-MM-DD string.
            raw_date = r.get("createDate")
            if raw_date:
                dt = datetime.fromisoformat(raw_date.replace("Z", "+00:00"))
                new_date = dt.strftime("%Y-%m-%d")
            else:
                new_date = None

            # Product info (stored in the products table only once).
            # A separate name is used on purpose: rebinding ``goods_no`` here
            # would change which product the next page request asks for.
            goods_info = r.get("goods") or {}
            item_goods_no = goods_info.get("goodsNo")

            if item_goods_no not in products:
                products[item_goods_no] = {
                    "product_id": item_goods_no,
                    "brandName": goods_info.get("brandName"),
                    "goodsName": goods_info.get("goodsName"),
                    "thumbnail": goods_info.get("goodsImageFile"),
                    "goodsLinkUrl": f"https://www.musinsa.com/products/{item_goods_no}",
                }

            # Review info.
            user_info = r.get("userProfileInfo") or {}
            reviews.append({
                "review_no": r.get("no"),
                "product_id": item_goods_no,
                "createDate": new_date,
                "userNickName": user_info.get("userNickName"),
                "reviewSex": user_info.get("sex"),
                "grade": r.get("grade"),
                "content": r.get("content"),
            })

    return products, reviews


if __name__ == "__main__":
    import os

    # Product number to crawl.
    goods_no = 3976350
    products, reviews = scrape_reviews_api(goods_no, max_pages=10, page_size=60)

    # Convert to DataFrames.
    product_df = pd.DataFrame(list(products.values()))
    review_df = pd.DataFrame(reviews)

    # Save as CSV. ``to_csv`` does not create missing directories, so make
    # sure the output folder exists before writing.
    os.makedirs("data", exist_ok=True)
    product_df.to_csv("data/products.csv", index=False, encoding="utf-8-sig", quoting=csv.QUOTE_ALL)
    review_df.to_csv("data/reviews.csv", index=False, encoding="utf-8-sig", quoting=csv.QUOTE_ALL)

    print(f"상품 {len(product_df)}개, 리뷰 {len(review_df)}개 저장 완료")


상품 4개, 리뷰 600개 저장 완료


In [None]:
import requests
import pandas as pd
import csv
from datetime import datetime

def scrape_reviews_api(goods_no, max_pages=10, page_size=10, timeout=10):
    """Collect de-duplicated reviews and their products from the Musinsa API.

    Pages are requested one after another starting at page 1, always for the
    ``goods_no`` that was passed in. Collection stops early when a request
    raises, a non-200 status comes back, or a page contains no items;
    everything gathered up to that point is still returned.

    Only one review is kept per ``(product number, userNickName)`` pair. A
    later-seen review replaces the stored one only when both have a date and
    the new date is strictly later.

    Args:
        goods_no: Product number sent as the ``goodsNo`` query parameter.
        max_pages: Maximum number of pages to request.
        page_size: Number of reviews requested per page (``pageSize``).
        timeout: Seconds to wait on each HTTP request before giving up.

    Returns:
        A ``(products, reviews)`` tuple. ``products`` maps each product number
        seen in the review items to a product record; ``reviews`` is the list
        of retained review dicts.
    """
    reviews_dict = {}  # (product_id, userNickName) -> retained review
    products = {}
    base_url = "https://goods.musinsa.com/api2/review/v1/view/list"
    headers = {
        "user-agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/1337.36 Safari/537.36"
        )
    }

    for page in range(1, max_pages + 1):
        params = {"page": page, "pageSize": page_size, "goodsNo": goods_no}
        try:
            # Without a timeout a stalled connection would block forever.
            res = requests.get(base_url, params=params, headers=headers, timeout=timeout)
        except requests.RequestException as exc:
            # Keep the pages already collected instead of losing them all.
            print(f"[Error] {page}페이지 요청 실패:", exc)
            break

        if res.status_code != 200:
            print(f"[Error] {page}페이지 요청 실패:", res.status_code)
            break

        # ``or`` guards against keys that are present but null in the JSON;
        # a ``.get(key, {})`` default only covers keys that are missing.
        data = res.json().get("data") or {}
        items = data.get("list") or []
        if not items:
            print(f"[End] {page}페이지 리뷰 없음, 종료")
            break

        for r in items:
            # Normalise the creation timestamp to a plain YYYY-MM-DD string.
            raw_date = r.get("createDate")
            if raw_date:
                dt = datetime.fromisoformat(raw_date.replace("Z", "+00:00"))
                new_date = dt.strftime("%Y-%m-%d")
            else:
                new_date = None

            # Product info (stored in the products table only once).
            # A separate name is used on purpose: rebinding ``goods_no`` here
            # would change which product the next page request asks for.
            goods_info = r.get("goods") or {}
            item_goods_no = goods_info.get("goodsNo")

            if item_goods_no not in products:
                products[item_goods_no] = {
                    "product_id": item_goods_no,
                    "brandName": goods_info.get("brandName"),
                    "goodsName": goods_info.get("goodsName"),
                    "thumbnail": goods_info.get("goodsImageFile"),
                    "goodsLinkUrl": f"https://www.musinsa.com/products/{item_goods_no}",
                }

            # Review key (product + user combination).
            user_info = r.get("userProfileInfo") or {}
            user = user_info.get("userNickName")
            review_key = (item_goods_no, user)

            # Candidate review record.
            review_data = {
                "review_no": r.get("no"),
                "product_id": item_goods_no,
                "createDate": new_date,
                "userNickName": user,
                # Check both keys; fall back to "sex" when "reviewSex" is empty.
                "reviewSex": user_info.get("reviewSex") or user_info.get("sex"),
                "grade": r.get("grade"),
                "content": r.get("content"),
            }

            # If this (product, user) pair was seen before, compare dates.
            # The YYYY-MM-DD strings sort chronologically, so a plain string
            # comparison is enough.
            existing = reviews_dict.get(review_key)
            if existing is None:
                reviews_dict[review_key] = review_data
            else:
                old_date = existing["createDate"]
                if old_date and new_date and new_date > old_date:
                    reviews_dict[review_key] = review_data

    # dict -> list conversion.
    reviews = list(reviews_dict.values())
    return products, reviews


if __name__ == "__main__":
    import os

    # Product number to crawl.
    goods_no = 3976350
    products, reviews = scrape_reviews_api(goods_no, max_pages=10, page_size=60)

    # Convert to DataFrames.
    product_df = pd.DataFrame(list(products.values()))
    review_df = pd.DataFrame(reviews)

    # Save as CSV. ``to_csv`` does not create missing directories, so make
    # sure the output folder exists before writing.
    os.makedirs("data", exist_ok=True)
    product_df.to_csv("data/products.csv", index=False, encoding="utf-8-sig", quoting=csv.QUOTE_ALL)
    review_df.to_csv("data/reviews.csv", index=False, encoding="utf-8-sig", quoting=csv.QUOTE_ALL)

    print(f"상품 {len(product_df)}개, 리뷰 {len(review_df)}개 저장 완료")


dict_keys(['no', 'type', 'typeName', 'subType', 'content', 'commentCount', 'grade', 'goods', 'userImageFile', 'goodsOption', 'commentReplyCount', 'userStaffYn', 'images', 'likeCount', 'userReactionType', 'createDate', 'goodsThumbnailImageUrl', 'userId', 'encryptedUserId', 'userProfileInfo', 'orderOptionNo', 'channelSource', 'channelSourceName', 'channelActivityId', 'relatedNo', 'isFirstReview', 'reviewProfileTypeEnum', 'specialtyCodes', 'reviewerWeeklyRanking', 'reviewerMonthlyRanking', 'reviewSurveySatisfaction', 'reviewSurveyRepurchase', 'showUserProfile'])
{'userNickName': 'ctrl0', 'userLevel': 4, 'userOutYn': 'N', 'userStaffYn': 'N', 'reviewSex': '', 'userWeight': 0, 'userHeight': 0, 'userSkinInfo': None, 'skinType': None, 'skinTone': None, 'skinWorry': None}
dict_keys(['no', 'type', 'typeName', 'subType', 'content', 'commentCount', 'grade', 'goods', 'userImageFile', 'goodsOption', 'commentReplyCount', 'userStaffYn', 'images', 'likeCount', 'userReactionType', 'createDate', 'goodsTh