In [1]:
import os
# Move the working directory three levels up from where the kernel started
# (presumably the repository root, so that the `lib` and `crawling` packages
# imported by this notebook resolve -- confirm against the repo layout).
# The starting directory is captured only once: resolving against os.getcwd()
# on every run would climb three more levels each time this cell is re-executed
# in the same kernel.
if 'NOTEBOOK_START_DIR' not in globals():
    NOTEBOOK_START_DIR = os.getcwd()
os.chdir(os.path.join(NOTEBOOK_START_DIR, os.pardir, os.pardir, os.pardir))

In [2]:
# lib
from lib.python.log import get_logger

# crawling
from crawling.naver_shopping_review.utils.crawl import get_product, get_topic_code, get_review

# parallel
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor, as_completed

# default
import time
import datetime
import logging

# Fetch the loggers of the HTTP / browser-automation libraries and raise their
# level to WARNING so their INFO/DEBUG records are not emitted.
for noisy_logger_name in ("requests", "urllib3", "selenium", "seleniumwire"):
    logging.getLogger(noisy_logger_name).setLevel(logging.WARNING)

In [4]:
# Run parameters shared by the cells below.
# Search keyword (Korean for "moisturizing cream"); passed to the crawler calls
# and embedded in the log / result file names.
keyword = '수분크림'
# Passed together with `keyword` to get_product / NaverShoppingReviewGetter
# (presumably the number of search-result pages to crawl -- confirm).
n_page = 1
# Forwarded to the get_review calls.
verbose = False
# Thread count for the ThreadPoolExecutor in the review loop; also forwarded to
# NaverShoppingReviewGetter.get_review.
max_workers = 1
# Forwarded to NaverShoppingReviewGetter.get_review only.
collect_all_topics = False

In [5]:
# Run the crawl through the packaged pipeline class, using the run parameters
# defined in the configuration cell.
from crawling.naver_shopping_review.pipeline import NaverShoppingReviewGetter

extractor = NaverShoppingReviewGetter(keyword, n_page)
# Arguments are positional: (collect_all_topics, verbose, max_workers).
# NOTE(review): the signature lives in pipeline.py -- confirm the order there.
extractor.get_review(collect_all_topics, verbose, max_workers)

[2024-04-13 16:16:35,103 pipeline.py:144][INFO] <<Crawling>> Naver Shopping Reviews: self.keyword='수분크림', self.n_page=1
[2024-04-13 16:16:35,105 pipeline.py:145][INFO] 
[2024-04-13 16:16:35,106 pipeline.py:148][INFO] >>> Get product
[2024-04-13 16:16:38,416 log.py:38][INFO] Retry 1/500, Error: IP has been blocked (ip=204.236.176.61:80, remaining=299)
[2024-04-13 16:16:40,134 log.py:38][INFO] Retry 2/500, Error: IP has been blocked (ip=122.10.225.55:8000, remaining=298)
[2024-04-13 16:16:41,848 log.py:38][INFO] Retry 3/500, Error: IP has been blocked (ip=50.174.7.156:80, remaining=297)
[2024-04-13 16:16:43,561 log.py:38][INFO] Retry 4/500, Error: IP has been blocked (ip=103.127.1.130:80, remaining=296)
[2024-04-13 16:16:45,292 log.py:38][INFO] Retry 5/500, Error: IP has been blocked (ip=117.54.114.96:80, remaining=295)
[2024-04-13 16:16:46,999 log.py:38][INFO] Retry 6/500, Error: IP has been blocked (ip=50.221.230.186:80, remaining=294)
[2024-04-13 16:16:48,791 log.py:38][INFO] Retry 7/

In [None]:
# Run date as YYYYMMDD; used in the log file name and the result directory.
nowdate = datetime.datetime.now().strftime('%Y%m%d')

# Log file path.
log_path = f'.logs/crawling_naver_review_{nowdate}_{keyword}_{n_page}.log'
# Create the log directory up front: get_logger is defined elsewhere and it is
# not visible here whether it creates missing parent directories itself.
os.makedirs(os.path.dirname(log_path), exist_ok=True)
logger = get_logger(save_path=log_path)
trace_func = logger.info

# Result directory (one per run date).
save_dir = f'crawling/naver_shopping_review/.result/{nowdate}/'
os.makedirs(save_dir, exist_ok=True)

# Output path template. The two '{}' placeholders are intentionally left
# unfilled here (product and topic, going by the file-name pattern).
review_path_format = save_dir + f'{keyword}_' + 'product{}_topic{}.parquet'

# Start crawling.
print(f'<<Crawling>> Naver Shopping Reviews: {keyword=}, {n_page=}')
print('')

# Fetch the product list for the keyword.
print('>>> Get product')
naver_all_df = get_product(keyword, n_page)
# Keep only rows whose isAd flag is 0 (presumably non-advertisement listings).
naver_df = naver_all_df[naver_all_df.isAd==0]

print(f'Length of products: {len(naver_df):,}')
print('')

In [None]:
# Crawl reviews one product at a time.
# Timing notes from earlier runs: 16 products took 1030 minutes,
# 33 products took 1700 minutes.

# Pause between two consecutive products, in seconds
# (presumably a cool-down to reduce the chance of being blocked -- confirm).
PRODUCT_COOLDOWN_SEC = 300

n_products = len(naver_df)
# Build the (id, title) array once instead of rebuilding it on every iteration.
product_rows = naver_df[['id','productTitle']].values

for product_iter in range(n_products):
    product_id, product_title = product_rows[product_iter]
    # 1-based counter, zero-padded to the width of the total product count.
    product_iter_str = str(product_iter+1).zfill(len(str(n_products)))

    print('')
    print("=====================================================================================================")
    print(">>> [{}/{}] Get Review (id='{}', title='{}')".format(product_iter_str,n_products,product_id,product_title))
    print("=====================================================================================================")
    print('')

    print('> Get Topic Code')
    print('')
    # Takes roughly 10 minutes.
    topic_info_list = get_topic_code(keyword,product_id)

    print('> Get Review')
    print('')
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # One task per topic of the current product.
        review_futures = [
            executor.submit(get_review, naver_df, product_iter, topic_info_list, topic_iter, review_path_format, verbose)
            for topic_iter in range(len(topic_info_list))
        ]
        for future in as_completed(review_futures):
            # result() re-raises any exception raised inside the worker.
            future.result()

    # Cool down before the next product; nothing follows the last one, so the
    # wait is skipped there.
    if product_iter < n_products - 1:
        time.sleep(PRODUCT_COOLDOWN_SEC)

In [None]:
# Disabled scratch cell: loads one saved review parquet and drops unused columns.
# NOTE(review): relies on `pd`, which is not imported in the cells shown -- add `import pandas as pd` before re-enabling.
# unuse_columns = ['review_page','id','title','userId','mallLogoUrl','topicCount','topicYn','videoYn','videos',
#                  'aidaModifyTime','esModifyTime','modifyDate','registerDate','imageYn','images','nvMid',
#                  'mallId','mallProductId','mallReviewId','mallSeq','matchNvMid','pageUrl']
# review_df = pd.read_parquet(review_path_format.format(1,'주제전체'))
# review_df.drop(columns=unuse_columns, inplace=True)

# review_df.head(1)

In [None]:
# Disabled scratch cell: concatenates every parquet file of the 20240322 run into one frame.
# NOTE(review): relies on `pd`, which is not imported in the cells shown -- add `import pandas as pd` before re-enabling.
# import glob
# review_paths = glob.glob('crawling/naver_shopping_review/.result/20240322/*.parquet')
# review_df = pd.concat([pd.read_parquet(path) for path in review_paths],axis=0)