In [1]:
# Patch the standard library for gevent before the remaining imports run;
# thread=False leaves the threading module unpatched.
from gevent import monkey
monkey.patch_all(thread=False)

# Greenlet pool of size 5; it is used below to map `crawler` over the input rows.
from gevent.pool import Pool
pool = Pool(5)

# Two single-slot semaphores: one is held while the shared error list is
# appended to, the other while the remaining-row counter is decremented.
from gevent.lock import BoundedSemaphore
semaphore_error, semaphore_remain = BoundedSemaphore(1), BoundedSemaphore(1)

from IPython.display import clear_output
from datetime import datetime

import os
import shutil
# Directory that receives one workbook per input row.
directory_out = './file/1.item_number'
# Start every run from an empty directory: anything left over from a
# previous run is deleted before the directory is recreated.
if os.path.exists(directory_out):
    shutil.rmtree(directory_out)
os.makedirs(directory_out)
import re

# = = =  = = =  = = =

import pandas as pd

# = = =  = = =  = = =

from urllib.parse import urlparse, parse_qs, urlencode

import requests

import sys
sys.path.append('../00.Tools')
from crawler_configuration import get_header, get_proxy

from bs4 import BeautifulSoup
from lxml import etree

# = = =  = = =  = = =

print('Data Loading...')
# Rows that fail inside `crawler` are collected here for the error report.
list_series_error = []
# Every cell is read as text; missing cells become empty strings.
df_input = pd.read_excel('./file/input.xlsx', header=0, dtype=str)
df_input = df_input.fillna('')
print('Loading Done !')
print()

# 'No' is the only column converted back to integers.
df_input = df_input.astype({'No': int})

# Number of rows that still have to be processed; decremented by `crawler`.
crawler_remain = df_input.shape[0]

print(f'总数量：{crawler_remain}')
print()

# = = =  = = =  = = =

# Values forced onto the `_ul` and `_stpos` query parameters, keyed by the
# URL prefix of the site (presumably country code / postal code - confirm).
# The prefixes end in '/', so at most one of them can match a given URL.
_SITE_LOCATION = {
    'https://www.ebay.com/': ('US', '91710'),
    'https://www.ebay.de/': ('DE', '10115'),
    'https://www.ebay.co.uk/': ('GB', 'BS81QU'),
    'https://www.ebay.com.au/': ('AU', '2019'),
    'https://www.ebay.ca/': ('CA', 'M5S2E8'),
}


def _build_search_url(url):
    """Return `url` with normalised query parameters and no `_pgn`.

    Any existing `_pgn` is dropped, `orig_cvip` is set to 'true', `_ipg` to
    '240', and, for the sites listed in `_SITE_LOCATION`, `_ul` / `_stpos`
    are overwritten. The fragment of the original URL is not carried over.
    The result always contains a query string, so the caller can append
    '&_pgn=<page>' directly.
    """
    url_parse = urlparse(url)

    dict_param = parse_qs(url_parse.query)
    dict_param.pop('_pgn', None)
    dict_param['orig_cvip'] = 'true'
    dict_param['_ipg'] = '240'
    for prefix, (value_ul, value_stpos) in _SITE_LOCATION.items():
        if url.startswith(prefix):
            dict_param['_ul'] = value_ul
            dict_param['_stpos'] = value_stpos
            break

    return f'{url_parse.scheme}://{url_parse.netloc}{url_parse.path}?{urlencode(dict_param, doseq=True)}'


def crawler(crawler_tuple):
    """Collect the item numbers behind one input row and write them to Excel.

    Args:
        crawler_tuple: An ``(index, row)`` pair as produced by
            ``DataFrame.iterrows()``; the row must provide 'Url' and 'No'.

    The search URL is paged through (``_pgn=1, 2, ...``) until the page has
    no "next" pagination link, and the whole pagination is repeated 10
    times; item numbers from all passes are merged, keeping first-seen order
    without duplicates. An item number is the last path segment of an
    ``s-item__link`` href and is kept only if it is 12 characters long.

    On success the numbers are written to
    ``crawler_<No>-<timestamp>.xlsx`` inside ``directory_out``. If an
    ordinary exception occurs, the row (with an added 'Request_Url' field
    holding the last requested URL, or '' if nothing was requested yet) is
    appended to ``list_series_error``. ``crawler_remain`` is decremented in
    every case.

    NOTE: the per-page retry loop has no upper bound. A page that never
    returns HTTP 200 with at least one item number is requested forever.
    """
    global crawler_remain

    _, crawler_series = crawler_tuple

    # Bound before the try block so the error handler can always record it,
    # even when the failure happens before the first request is built.
    url_request = ''

    # = = =  = = =  = = =

    try:
        url_base = _build_search_url(crawler_series['Url'])

        list_item_number = []
        for crawler_cycle in range(10):

            page = 1
            while True:
                url_request = f'{url_base}&_pgn={page}'

                crawler_retry = 0
                while True:
                    crawler_retry += 1

                    try:
                        resp = requests.get(url_request,
                                            headers=get_header(),
                                            proxies=get_proxy(),
                                            timeout=(5, 15))

                        if resp.status_code != 200:
                            continue

                        soup = BeautifulSoup(resp.text, 'lxml')
                        html = etree.HTML(str(soup))

                        list_item_number_temp = [
                            item_number_temp
                            for item_number_temp in (
                                href.split('?')[0].split('/')[-1].strip()
                                for href in html.xpath('//a[@class="s-item__link"]/@href')
                            )
                            if len(item_number_temp) == 12
                        ]
                    except Exception:
                        # Request or parsing failure: try the same page again.
                        # KeyboardInterrupt / SystemExit / GreenletExit are not
                        # Exception subclasses and are deliberately let through.
                        continue

                    # A page without any item number counts as a failed attempt.
                    if list_item_number_temp:
                        break

                # = = =  = = =  = = =

                # Merge while preserving first-seen order and dropping duplicates.
                list_item_number.extend(list_item_number_temp)
                list_item_number = list(dict.fromkeys(list_item_number))

                # = = =  = = =  = = =

                if crawler_remain % 30 == 0:
                    clear_output()
                print(f'''[第{crawler_cycle + 1}轮] - [第{page}页] - [请求{crawler_retry}次] - [{len(list_item_number)}] - [剩余{crawler_remain - 1}条] - [{datetime.now().strftime('%Y/%m/%d %H:%M:%S')}] - {crawler_series['No']}. {url_request}\n''')

                # = = =  = = =  = = =

                page += 1

                # = = =  = = =  = = =

                # Stop paging once the page offers no "next" link.
                if not html.xpath('//nav[@class="pagination"]/a[@type="next"]'):
                    break

        # = = =  = = =  = = =

        if not list_item_number:
            raise ValueError('no item number collected')

        # = = =  = = =  = = =

        print('Data Outputting...')
        df_correct = pd.DataFrame({'No': range(1, len(list_item_number) + 1),
                                   'Item Number': list_item_number})
        # The timestamp suffix is removed again by the rename step after the pool finishes.
        df_correct.to_excel(os.path.join(directory_out, f'''crawler_{crawler_series['No']}-{datetime.now().strftime('%Y%m%d_%H%M%S')}.xlsx'''), index=False)
        print('Output Finished !')
        print()
    except Exception:
        # Record the failed row together with the last URL that was requested.
        crawler_series['Request_Url'] = url_request

        with semaphore_error:
            list_series_error.append(crawler_series)
    finally:
        # = = =  = = =  = = =

        with semaphore_remain:
            crawler_remain -= 1

# = = =  = = =  = = =

# Run the crawler over every input row through the greenlet pool.
pool.map(crawler, df_input.iterrows())

# Rows that failed are written out, ordered by their 'No' column.
if len(list_series_error) > 0:
    df_error = pd.DataFrame(list_series_error)
    df_error = df_error.sort_values(by='No', ignore_index=True)
    df_error.to_excel('./file/item_number_error.xlsx', index=False)
    print('- Error Occurred -')
    print()

# = = =  = = =  = = =

# Strip the '-YYYYMMDD_HHMMSS' timestamp from every workbook in the output
# directory. The list is built before renaming so the directory is not
# modified while it is being enumerated.
list_file = [file for file in sorted(os.listdir(directory_out)) if file not in ['.DS_Store'] and os.path.isfile(os.path.join(directory_out, file))]
for file in list_file:
    # The dot before 'xlsx' is escaped so that only a literal '.xlsx' suffix matches.
    os.rename(os.path.join(directory_out, file), os.path.join(directory_out, re.sub(r'-[0-9]{8}_[0-9]{6}\.xlsx$', '.xlsx', file)))

# = = =  = = =  = = =

print('Done ~')

Data Loading...
Loading Done !

总数量：2

[第1轮] - [第1页] - [请求1次] - [240] - [剩余1条] - [2024/12/05 19:51:01] - 2. https://www.ebay.de/sch/i.html?_dkr=1&iconV2Request=true&_blrs=recall_filtering&_ssn=atp-autoteile&store_name=atpautoteile&_oac=1&_nkw=t%C3%BCrschloss&rt=nc&LH_BIN=1&orig_cvip=true&_ipg=240&_ul=DE&_stpos=10115&_pgn=1

[第1轮] - [第2页] - [请求1次] - [439] - [剩余1条] - [2024/12/05 19:51:09] - 2. https://www.ebay.de/sch/i.html?_dkr=1&iconV2Request=true&_blrs=recall_filtering&_ssn=atp-autoteile&store_name=atpautoteile&_oac=1&_nkw=t%C3%BCrschloss&rt=nc&LH_BIN=1&orig_cvip=true&_ipg=240&_ul=DE&_stpos=10115&_pgn=2

[第1轮] - [第1页] - [请求2次] - [240] - [剩余1条] - [2024/12/05 19:51:19] - 1. https://www.ebay.com/sch/i.html?_dkr=1&iconV2Request=true&_blrs=recall_filtering&_ssn=scitoo-autoparts&store_cat=0&store_name=cnbatteries&_oac=1&_nkw=ac+compressor&orig_cvip=true&_ipg=240&_ul=US&_stpos=91710&_pgn=1

[第1轮] - [第3页] - [请求1次] - [623] - [剩余1条] - [2024/12/05 19:51:25] - 2. https://www.ebay.de/sch/i.html?_d