In [1]:
# Patch the standard library for gevent before the remaining imports so that
# modules imported below (e.g. requests) pick up the cooperative versions.
# thread=False leaves the thread/threading modules unpatched.
from gevent import monkey
monkey.patch_all(thread=False)

# Greenlet pool shared by the whole cell; at most 30 crawler calls run at once.
from gevent.pool import Pool
pool = Pool(30)

# One single-slot semaphore per piece of shared state touched by crawler():
# the success list, the error list and the remaining-rows counter.
from gevent.lock import BoundedSemaphore
semaphore_correct, semaphore_error, semaphore_remain = BoundedSemaphore(1), BoundedSemaphore(1), BoundedSemaphore(1)

# clear_output periodically wipes the notebook cell output; datetime stamps
# the progress lines and the output file name.
from IPython.display import clear_output
from datetime import datetime

# = = =  = = =  = = =

import pandas as pd

# = = =  = = =  = = =

from urllib.parse import urlparse, parse_qs, urlencode

import requests

# Make the shared tools directory importable, then pull in the project helpers
# that supply the request headers and proxies for every request.
import sys
sys.path.append('../00.Tools')
from crawler_configuration import get_header, get_proxy

import math

# = = =  = = =  = = =

# Accumulators filled by crawler(): one dict per successful row, one Series
# per failed row.
list_dict_correct = []
list_series_error = []

print('Data Loading...')
# Read every cell as text and replace missing cells with empty strings.
df_input = pd.read_excel('./file/input.xlsx', header=0, dtype=str)
df_input = df_input.fillna('')
print('Loading Done !\n')

# 'No' is the numeric row identifier used to sort the exported results.
df_input = df_input.astype({'No': int})

# Countdown of rows still to be processed, shown in the progress lines.
crawler_remain = df_input.shape[0]

print(f'总数量：{crawler_remain}\n')

# = = =  = = =  = = =

def crawler(crawler_tuple):
    """Fetch the SKU count for one input row and record the outcome.

    The row's catalogue URL is translated into a part-search API request,
    which is retried until the API answers with HTTP 200 and a positive
    ``maxRows``. On success a dict with ``No``, ``Url``, ``SKU Count`` and
    ``Page`` (count divided by the page size of 96, rounded up) is appended
    to ``list_dict_correct``. On failure the row, extended with
    ``Request_Url`` and ``Check_Url``, is appended to ``list_series_error``.
    Either way a progress line is printed and ``crawler_remain`` is
    decremented.

    Args:
        crawler_tuple: ``(index, row)`` pair as produced by
            ``DataFrame.iterrows()``; the row must provide ``No`` and ``Url``.

    Returns:
        None. Results are delivered through the module-level lists.
    """
    global list_dict_correct, list_series_error, crawler_remain

    crawler_status, (crawler_index, crawler_series) = False, crawler_tuple

    # Pre-bind everything the error handler and the progress line read, so a
    # failure before the first request cannot raise UnboundLocalError there.
    url_request, url_check, crawler_retry, count = '', '', 0, None

    # = = =  = = =  = = =

    try:
        url_check = crawler_series['Url']

        dict_param = parse_qs(urlparse(url_check).query)

        # Built once, outside the retry loop: a URL without a 'part' parameter
        # is a permanent error and must reach the error handler instead of
        # being retried forever in a loop that never yields to other greenlets.
        url_request = f'''https://ecatalog.smpcorp.com/V2/STD/api/part/partsearch?filter={dict_param['part'][0]}&filterType={dict_param['search'][0] if 'search' in dict_param else 'n'}&searchType={dict_param['type'][0] if 'type' in dict_param else 'null'}&imageSize=80&start=0&limit=96&sort=3&catFilter=-All-&yearFilter=-All-&makeFilter=-All-&modelFilter=-All-&engineFilter=-All-&attrCodeFilter=-All-&attrValueFilter=-All-&plkEngineMakeFilter=-All-&plkEngineModelFilter=-All-&plkEngineDispFilter=-All-'''

        # NOTE(review): the retry loop is intentionally left unbounded, as in
        # the original; a row whose search genuinely has no results will keep
        # being requested. Consider adding an attempt limit.
        while True:
            crawler_retry += 1

            try:
                resp = requests.get(url_request,
                                    headers=get_header(),
                                    proxies=get_proxy(),
                                    timeout=(5, 15))

                if resp.status_code == 200:
                    count = resp.json()[0]['maxRows']
                    if count > 0:
                        break
            except KeyboardInterrupt:
                # Stop retrying; discard any stale count so the row is
                # reported as failed rather than as a success.
                count = None
                break
            except Exception:
                # Network, proxy or payload problem: try again. Exception (not
                # a bare except) lets GreenletExit/SystemExit stop the worker.
                continue

        if count is None:
            raise RuntimeError('interrupted before a valid response was received')

        # = = =  = = =  = = =

        with semaphore_correct:
            list_dict_correct.append({'No': crawler_series['No'],
                                      'Url': crawler_series['Url'],
                                      'SKU Count': count,
                                      # 96 matches the limit=96 page size requested above.
                                      'Page': math.ceil(count / 96)})

        # = = =  = = =  = = =

        crawler_status = True
    except Exception:
        crawler_series['Request_Url'], crawler_series['Check_Url'] = url_request, url_check

        with semaphore_error:
            list_series_error.append(crawler_series)

    # = = =  = = =  = = =

    # Wipe the notebook output every 30 rows so the cell does not grow unbounded.
    if crawler_remain % 30 == 0:
        clear_output()
    print(f'''[{crawler_status}] - [请求{crawler_retry}次] - [剩余{crawler_remain - 1}条] - [{datetime.now().strftime('%Y/%m/%d %H:%M:%S')}] - {crawler_series['No']}. {url_check}\n''')

    with semaphore_remain:
        crawler_remain -= 1

# = = =  = = =  = = =

# Run crawler() once per input row on the shared greenlet pool.
pool.map(crawler, df_input.iterrows())

print('Data Outputting...')
if list_dict_correct:
    # Successful rows, ordered by their input number, go to a timestamped file.
    df_correct = pd.DataFrame(list_dict_correct)
    df_correct = df_correct.sort_values(by='No', ascending=True, ignore_index=True)
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    df_correct.to_excel(f'./file/1.page-{timestamp}.xlsx', index=False)
print('Output Finished !\n')

if list_series_error:
    # Failed rows are written to a fixed file name, again ordered by 'No'.
    df_error = pd.DataFrame(list_series_error)
    df_error = df_error.sort_values(by='No', ascending=True, ignore_index=True)
    df_error.to_excel('./file/page_error.xlsx', index=False)
    print('- Error Occurred -\n')

# = = =  = = =  = = =

print('Done ~')

Data Loading...
Loading Done !

总数量：1

[True] - [请求1次] - [剩余0条] - [2024/12/11 21:51:52] - 1. https://www.standardbrand.com/en/ecatalog?part=Accelerator%20Pedal%20Sensor%20(APS)&type=p

Data Outputting...
Output Finished !

Done ~
