In [1]:
from gevent import monkey
# Patch the standard library for gevent before anything else is imported;
# thread patching is explicitly left off (thread=False).
monkey.patch_all(thread=False)

from gevent.lock import BoundedSemaphore
from gevent.pool import Pool

# Greenlet pool created with a size of 30; crawler() is mapped over it below.
pool = Pool(30)

# One single-slot semaphore per piece of shared state touched by crawler():
# the result list, the error list and the remaining-rows counter.
semaphore_correct = BoundedSemaphore(1)
semaphore_error = BoundedSemaphore(1)
semaphore_remain = BoundedSemaphore(1)

from IPython.display import clear_output
from datetime import datetime

import os
import shutil
directory_out = './file/3.vehicle'
# Every run starts from an empty output directory: whatever is already at
# that path is removed first, then the directory is created afresh.
stale_output_present = os.path.exists(directory_out)
if stale_output_present:
    shutil.rmtree(directory_out)
os.makedirs(directory_out)
import re

# = = =  = = =  = = =

import pandas as pd

# = = =  = = =  = = =

import json

import requests

import sys
sys.path.append('../00.Tools')
from crawler_configuration import get_header, get_proxy

# = = =  = = =  = = =

from bs4 import BeautifulSoup
from lxml import etree

# = = =  = = =  = = =

directory_in = './file/2.part'
# Input workbooks: the sorted top-level file names of directory_in, skipping
# '.DS_Store' and anything that is not a regular file.
list_file = [
    name
    for name in sorted(list(os.walk(directory_in))[0][2])
    if name not in ['.DS_Store'] and os.path.isfile(os.path.join(directory_in, name))
]
for file in list_file:

    # = = =  = = =  = = =

    # Read the workbook with every cell as text; empty cells become ''.
    print('Data Loading...')
    path_input = os.path.join(directory_in, file)
    df_input = pd.read_excel(path_input, header=0, dtype=str).fillna('')
    # Per-file accumulators filled in by crawler(): parsed vehicle rows and
    # the input rows that could not be processed.
    list_dict_correct, list_series_error = [], []
    print('Loading Done !')
    print()

    # 'No' was loaded as text; convert it to integers.
    df_input['No'] = df_input['No'].astype(int)

    # Rows still to be processed for this file; crawler() counts it down.
    crawler_remain = len(df_input)

    print(f'总数量：{crawler_remain}')
    print()

    # = = =  = = =  = = =

    def crawler(crawler_tuple):
        """Fetch and record the buyer's-guide vehicle list for one input row.

        Posts a ``getbuyersguide`` request built from the row's 'Type Code',
        'Part Code', 'Part Number' and 'Manufacturer' columns, retrying until
        a response passes the validity checks. The returned HTML table is then
        expanded into one record per (row, year) and appended to the
        module-level ``list_dict_correct``. If anything fails, the input row
        (with an added 'Check_Url' field) is appended to ``list_series_error``
        instead. A progress line is always printed and ``crawler_remain`` is
        decremented.

        Args:
            crawler_tuple: An ``(index, row)`` pair as produced by
                ``DataFrame.iterrows()``. The index is unpacked but not used.

        Returns:
            None. Results are reported through the shared module-level lists.
        """
        global list_dict_correct, list_series_error, crawler_remain

        crawler_status, (crawler_index, crawler_series) = False, crawler_tuple

        # Bound before the try block so that the error handler and the
        # progress line can always use them, even when building the payload
        # fails. Reading 'Url' here (rather than inside the retry loop) also
        # keeps a missing column from being retried forever.
        crawler_retry = 0
        url_check = crawler_series.get('Url', '')

        # = = =  = = =  = = =

        try:
            data = {'func': 'getbuyersguide',
                    'payload': json.dumps({'partData': {'listing_data_essential': {'parttype': crawler_series['Type Code'],
                                                                                   'partkey': crawler_series['Part Code']},
                                                        'listing_data_supplemental': {'partnumber': crawler_series['Part Number'],
                                                                                      'catalogname': crawler_series['Manufacturer']}}}),
                    'api_json_request': '1'}

            # Only set once a response has passed every check, so a rejected
            # response from an earlier attempt can never be parsed by mistake.
            dict_ = None
            while True:
                crawler_retry += 1

                try:
                    resp = requests.post('https://www.rockauto.com/catalog/catalogapi.php',
                                         data=data,
                                         headers=get_header(),
                                         proxies=get_proxy(),
                                         timeout=(5, 15))

                    if resp.status_code == 200:
                        dict_candidate = resp.json()

                        if ('redirect_to_url' not in dict_candidate
                                and dict_candidate['buyersguidepieces']['title'].startswith('Buyer\'s Guide :')
                                and 'collected_javascript' in dict_candidate):
                            dict_ = dict_candidate
                            break
                except KeyboardInterrupt:
                    # Stop retrying; the row is reported as an error below.
                    break
                except Exception:
                    # Any request/decoding/shape problem: try again.
                    continue

            if dict_ is None:
                raise RuntimeError('interrupted before a valid response was received')

            # = = =  = = =  = = =

            soup = BeautifulSoup(dict_['buyersguidepieces']['body'], 'lxml')
            html = etree.HTML(str(soup))

            # = = =  = = =  = = =

            list_tr = html.xpath('//tr')

            # = = =  = = =  = = =

            # An empty table is only acceptable when the page says so itself.
            if not list_tr:
                list_p_text = html.xpath('//p/text()')
                if not (list_p_text and list_p_text[0].strip() == 'No applications found.'):
                    raise ValueError('response has neither table rows nor the "No applications found." notice')

            # = = =  = = =  = = =

            list_dict_correct_temp = []
            for i, tr in enumerate(list_tr):
                list_text = [text.strip() for text in tr.xpath('./td/text()')]

                # Rows are either (make, years) or (make, model, years).
                if len(list_text) not in (2, 3):
                    raise ValueError(f'unexpected cells in row {i + 1}: {list_text!r}')

                # The last cell is a single year or a 'begin-end' range.
                list_year = list_text[-1].split('-')
                year_begin, year_end = int(list_year[0]), int(list_year[-1])

                list_dict_correct_temp.extend([{'JOIN_MPNTCPC': crawler_series['JOIN_MPNTCPC'],
                                                'Vehicle No': i + 1,
                                                'Make': list_text[0],
                                                'Model': list_text[1] if len(list_text) == 3 else '',
                                                'Year': year} for year in range(year_begin, year_end + 1)])

            # = = =  = = =  = = =

            # Publish only after the whole table parsed, so a failing row
            # never leaves partial results behind.
            with semaphore_correct:
                list_dict_correct.extend(list_dict_correct_temp)

            # = = =  = = =  = = =

            crawler_status = True
        except Exception:
            crawler_series['Check_Url'] = url_check

            with semaphore_error:
                list_series_error.append(crawler_series)

        # = = =  = = =  = = =

        # Clear the notebook output on the last row of every third file
        # (except the first file) and whenever the remaining count is a
        # multiple of 30.
        file_index = list_file.index(file)
        is_last_row_of_third_file = file_index % 3 == 0 and file_index != 0 and crawler_remain == 1
        if is_last_row_of_third_file or crawler_remain % 30 == 0:
            clear_output()
        print(f'''[{crawler_status}] - [请求{crawler_retry}次] - [剩余{crawler_remain - 1}条] - [{file.removesuffix('.xlsx')}（{(file_index + 1) / len(list_file) * 100:.2f}%）] - [{datetime.now().strftime('%Y/%m/%d %H:%M:%S')}] - {crawler_series['No']}. {url_check}\n''')

        with semaphore_remain:
            crawler_remain -= 1
    # = = =  = = =  = = =

    # Apply crawler() to every (index, row) pair of the input via the pool.
    pool.map(crawler, df_input.iterrows())

    print('Data Outputting...')
    if list_dict_correct:
        # One row per (part, vehicle, year); years are listed newest first.
        df_correct = pd.DataFrame(list_dict_correct)
        df_correct = df_correct.sort_values(by=['JOIN_MPNTCPC', 'Vehicle No', 'Year'],
                                            ascending=[True, True, False],
                                            ignore_index=True)
        name_correct = f'''{file.removesuffix('.xlsx')}-{datetime.now().strftime('%Y%m%d_%H%M%S')}.xlsx'''
        df_correct.to_excel(os.path.join(directory_out, name_correct), index=False)
    print('Output Finished !')
    print()
    if list_series_error:
        # Input rows that could not be processed, in their original 'No' order.
        df_error = pd.DataFrame(list_series_error)
        df_error = df_error.sort_values(by=['No'],
                                        ascending=[True],
                                        ignore_index=True)
        path_error = f'''./file/{file.removesuffix('.xlsx')}-vehicle_error.xlsx'''
        df_error.to_excel(path_error, index=False)
        print('- Error Occurred -')
        print()

# = = =  = = =  = = =

# Final pass over the output directory: replace the '-YYYYMMDD_HHMMSS'
# timestamp suffix of each result workbook with '-vehicle'.
list_file = [file for file in sorted(list(os.walk(directory_out))[0][2]) if file not in ['.DS_Store'] and os.path.isfile(os.path.join(directory_out, file))]
# The dot is escaped so that only a literal '.xlsx' extension matches.
pattern_timestamp = re.compile(r'-[0-9]{8}_[0-9]{6}\.xlsx$')
for file in list_file:
    file_renamed = pattern_timestamp.sub('-vehicle.xlsx', file)
    # Names without a timestamp suffix are left untouched.
    if file_renamed != file:
        os.rename(os.path.join(directory_out, file), os.path.join(directory_out, file_renamed))

# = = =  = = =  = = =

print('Done ~')

[True] - [请求2次] - [剩余29条] - [2（100.00%）] - [2024/12/11 21:10:02] - 355. https://www.rockauto.com/en/parts/wve,5S7937,accelerator+pedal+position+sensor,5061

[True] - [请求1次] - [剩余28条] - [2（100.00%）] - [2024/12/11 21:10:02] - 295. https://www.rockauto.com/en/parts/wve,5S15771,accelerator+pedal+position+sensor,5061

[True] - [请求2次] - [剩余27条] - [2（100.00%）] - [2024/12/11 21:10:02] - 278. https://www.rockauto.com/en/parts/wve,5S14923,accelerator+pedal+position+sensor,5061

[True] - [请求2次] - [剩余26条] - [2（100.00%）] - [2024/12/11 21:10:02] - 354. https://www.rockauto.com/en/parts/wve,5S7934,accelerator+pedal+position+sensor,5061

[True] - [请求1次] - [剩余25条] - [2（100.00%）] - [2024/12/11 21:10:03] - 370. https://www.rockauto.com/en/parts/wve,5S8768,accelerator+pedal+position+sensor,5061

[True] - [请求1次] - [剩余24条] - [2（100.00%）] - [2024/12/11 21:10:03] - 398. https://www.rockauto.com/en/parts/wve,5S8828,accelerator+pedal+position+sensor,5061

[True] - [请求2次] - [剩余23条] - [2（100.00%）] - [2024/12/11 2