In [1]:
# Patch the standard library for gevent first, before any network library
# is imported further down, so that blocking calls made later can yield to
# other greenlets. thread=False leaves the threading modules unpatched.
# NOTE(review): presumably so the notebook kernel's own threads keep
# working — confirm.
from gevent import monkey
monkey.patch_all(thread=False)

# Greenlet pool used to run crawler(); at most 30 greenlets run at once.
from gevent.pool import Pool
pool = Pool(30)

# One-slot semaphores used as locks inside crawler():
#   semaphore_error  guards appends to list_series_error,
#   semaphore_remain guards the decrement of crawler_remain.
from gevent.lock import BoundedSemaphore
semaphore_error, semaphore_remain = BoundedSemaphore(1), BoundedSemaphore(1)

# clear_output() wipes the notebook cell output; datetime supplies the
# timestamps used in progress lines and output file names.
from IPython.display import clear_output
from datetime import datetime

# Start every run with an empty output directory: anything left over from a
# previous run is deleted before the directory is recreated.
import os
import shutil
directory_out = './file/1.part_number'
if os.path.exists(directory_out):
    shutil.rmtree(directory_out)
os.makedirs(directory_out)
import re

# = = =  = = =  = = =

import pandas as pd

# = = =  = = =  = = =

import requests

import sys
sys.path.append('../00.Tools')
from crawler_configuration import get_header, get_proxy

from bs4 import BeautifulSoup
from lxml import etree

import re

# = = =  = = =  = = =

import json

# = = =  = = =  = = =

# Load the crawl targets. Every cell is read as text and empty cells are
# replaced by '' so later string handling never meets NaN.
print('Data Loading...')
df_input = (
    pd.read_excel('./file/input.xlsx', header=0, dtype=str)
    .fillna('')
)
# Rows that crawler() fails on are collected here for the error report.
list_series_error = []
print('Loading Done !')
print()

# 'No' is the numeric row identifier: it names the per-row output file and
# is the sort key of the error report.
df_input['No'] = df_input['No'].astype(int)

# Countdown of rows still to be processed, shown in the progress lines.
crawler_remain = df_input.shape[0]

print(f'总数量：{crawler_remain}')
print()

# = = =  = = =  = = =

def crawler(crawler_tuple, max_retry=None):
    """Fetch one listing page and export its part numbers to an .xlsx file.

    The page at the row's 'Url' is requested repeatedly until a response
    arrives with status 200 whose heading equals
    '<manufacturer> <type> Part Numbers' (built from the selected options of
    the page's own search form). The part entries embedded in that page are
    then written to ``directory_out`` as '<No>-<timestamp>.xlsx'.

    Args:
        crawler_tuple: An ``(index, row)`` pair as produced by
            ``DataFrame.iterrows()``. The row must provide 'Url' and 'No'.
        max_retry: Optional upper bound on the number of requests made for
            this row. ``None`` (the default) retries without limit.

    Returns:
        None. The outcome is reported through side effects: on success a
        workbook is written; on failure the row, extended with a
        'Request_Url' field, is appended to ``list_series_error``. In both
        cases a progress line is printed and ``crawler_remain`` is
        decremented.

    Only ``Exception`` subclasses are treated as a failure of the row.
    ``SystemExit``, gevent's ``GreenletExit``/``Timeout`` and similar
    ``BaseException`` signals propagate, so the greenlet can still be killed
    or timed out from outside.
    """
    global list_series_error, crawler_remain

    _, crawler_series = crawler_tuple
    # Bound up front so the error handler and the progress line can never
    # hit an unbound name, whatever fails first.
    crawler_status, crawler_retry, url_request = False, 0, ''

    # = = =  = = =  = = =

    try:
        # Loop-invariant. A row without 'Url' fails once here and is
        # recorded as an error instead of being retried forever.
        url_request = crawler_series['Url']

        # html/type_ are only assigned from a page that passed the heading
        # check, so a rejected attempt can never leak into the export below.
        html, type_ = None, None
        while html is None:
            if max_retry is not None and crawler_retry >= max_retry:
                raise RuntimeError(f'no valid page after {crawler_retry} requests')
            crawler_retry += 1

            try:
                resp = requests.get(url_request,
                                    headers=get_header(),
                                    proxies=get_proxy(),
                                    timeout=(5, 15))
                if resp.status_code != 200:
                    continue

                soup = BeautifulSoup(resp.text, 'lxml')
                page = etree.HTML(str(soup))

                manufacturer = page.xpath('//select[@id="manufacturer_partsearch_007"]/option[@selected]/@value')[0].strip()
                page_type = page.xpath('//select[@id="parttype_partsearch_007"]/option[@selected]/text()')[0].strip()
                heading = page.xpath('//div[@id="nav_o[1]"]/div[@class="inner"]/div/div[@style="display: inline-block; padding: 0em 5em 0em 5em; "]/text()')[0].strip()

                # Accept the page only when its heading matches the selected
                # manufacturer and part type (runs of spaces are collapsed
                # on both sides before comparing).
                if re.sub(r' +', ' ', f'{manufacturer} {page_type} Part Numbers') == re.sub(r' +', ' ', heading):
                    html, type_ = page, page_type
            except KeyboardInterrupt:
                # Manual interrupt: stop retrying this row.
                break
            except Exception:
                # Network, proxy or parsing problem: try again.
                continue

        if html is None:
            raise RuntimeError('interrupted before a valid page was fetched')

        # = = =  = = =  = = =

        group = html.xpath('//select[@id="partgroup_partsearch_007"]/option[@selected]/@value')[0].strip()

        # = = =  = = =  = = =

        # Each part entry carries its data as a JSON string in a hidden input.
        list_dict = [json.loads(value) for value in html.xpath('//div[@id="navchildren[1]"]/div[contains(@id, "nav[")]/input[contains(@id, "jsn[")]/@value')]

        # = = =  = = =  = = =

        if not list_dict:
            raise ValueError('page lists no part entries')

        # = = =  = = =  = = =

        list_url = [f'https://www.rockauto.com{href.strip()}' for href in html.xpath('//div[@id="navchildren[1]"]/div[contains(@id, "nav[")]/div[contains(@id, "nav_o[")]/div/table/tr/td[@class="nlabel nlbl-docolumns"]/a/@href')]

        # = = =  = = =  = = =

        print('Data Outputting...')
        # If the number of links differs from the number of JSON entries,
        # pandas raises ValueError here and the row is recorded as an error.
        df_correct = pd.DataFrame({'No': range(1, len(list_url) + 1),
                                   'Group': group,
                                   'Type': type_,
                                   'Manufacturer': [dict_['catalogname'] for dict_ in list_dict],
                                   'Part Number': [dict_['partnumber'] for dict_ in list_dict],
                                   'Url': list_url,
                                   'Type Code': [dict_['parttype'] for dict_ in list_dict]})
        df_correct.to_excel(os.path.join(directory_out, f'''{crawler_series['No']}-{datetime.now().strftime('%Y%m%d_%H%M%S')}.xlsx'''), index=False)
        print('Output Finished !')
        print()

        # = = =  = = =  = = =

        crawler_status = True
    except Exception:
        crawler_series['Request_Url'] = url_request

        with semaphore_error:
            list_series_error.append(crawler_series)

    # = = =  = = =  = = =

    # Read and decrement the shared counter in one critical section so every
    # worker reports a distinct remaining count.
    with semaphore_remain:
        remain_before = crawler_remain
        crawler_remain -= 1

    if remain_before % 30 == 0:
        clear_output()
    print(f'''[{crawler_status}] - [请求{crawler_retry}次] - [剩余{remain_before - 1}条] - [{datetime.now().strftime('%Y/%m/%d %H:%M:%S')}] - {crawler_series['No']}. {url_request}\n''')

# = = =  = = =  = = =

# Run crawler() once per input row on the greenlet pool; pool.map returns
# only after every row has been processed.
pool.map(crawler, df_input.iterrows())

# Rows that could not be exported are written, ordered by 'No', to a report
# next to the input workbook.
if list_series_error:
    df_error = pd.DataFrame(list_series_error).sort_values(by=['No'],
                                                           ascending=[True],
                                                           ignore_index=True)
    # The report is named after the input workbook ('input.xlsx'). It must
    # not depend on the loop variable `file` below, which does not exist yet
    # at this point on a fresh kernel.
    input_file = 'input.xlsx'
    df_error.to_excel(f'''./file/{input_file.removesuffix('.xlsx')}-part_number_error.xlsx''', index=False)
    print('- Error Occurred -')
    print()

# = = =  = = =  = = =

# Strip the '-YYYYMMDD_HHMMSS' timestamp from every exported workbook so the
# final files are simply '<No>.xlsx'.
list_file = [file for file in sorted(os.listdir(directory_out)) if file not in ['.DS_Store'] and os.path.isfile(os.path.join(directory_out, file))]
for file in list_file:
    # The dot before 'xlsx' is escaped so only a real '.xlsx' suffix matches.
    os.rename(os.path.join(directory_out, file), os.path.join(directory_out, re.sub(r'-[0-9]{8}_[0-9]{6}\.xlsx$', '.xlsx', file)))

# = = =  = = =  = = =

print('Done ~')

Data Loading...
Loading Done !

总数量：2

Data Outputting...
Output Finished !

[True] - [请求1次] - [剩余1条] - [2024/12/11 21:04:30] - 1. https://www.rockauto.com/en/parts/STANDARD%20MOTOR%20PRODUCTS,Accelerator%20Pedal%20Position%20Sensor,5061

Data Outputting...
Output Finished !

[True] - [请求1次] - [剩余0条] - [2024/12/11 21:04:32] - 2. https://www.rockauto.com/en/parts/WVE,Accelerator%20Pedal%20Position%20Sensor,5061

Done ~
