In [1]:
# Apply gevent's monkey patching ahead of the remaining imports in this cell;
# thread patching is explicitly switched off (thread=False).
from gevent import monkey
monkey.patch_all(thread=False)

# Pool created with size 5; it is used further down to map `crawler` over the
# input rows.
from gevent.pool import Pool
pool = Pool(5)

# One single-slot semaphore per piece of shared state touched by `crawler`:
# `semaphore_error` wraps appends to `list_series_error`,
# `semaphore_remain` wraps the decrement of `count_remain`.
from gevent.lock import BoundedSemaphore
semaphore_error, semaphore_remain = BoundedSemaphore(1), BoundedSemaphore(1)

from IPython.display import clear_output
from datetime import datetime

# = = = = = = = = = = = = = = =

import os
import shutil

# Folder that receives one workbook per successfully crawled input row.
directory_out = './file/1.url'

# Every run starts from an empty folder: results left behind by an earlier
# run are removed before the folder is created again.
if not os.path.exists(directory_out):
    os.makedirs(directory_out)
else:
    shutil.rmtree(directory_out)
    os.makedirs(directory_out)

# = = = = = = = = = = = = = = =

import pandas as pd

# = = = = = = = = = = = = = = =

from urllib.parse import urlparse, parse_qs, urlencode

import requests

import sys
sys.path.append('../00.Tools')
from crawler_configuration import get_header, get_proxy

from bs4 import BeautifulSoup
from lxml import etree

# = = = = = = = = = = = = = = =

from tqdm import tqdm
import re

# = = = = = = = = = = = = = = =

# Input rows whose crawl failed are appended here by `crawler`.
list_series_error = []

# Load the work list with every cell read as text, and replace missing cells
# with empty strings.
df_input = pd.read_excel('./file/input.xlsx', header=0, dtype=str)
df_input = df_input.fillna('')

# 'No' is converted to int (the error report is later sorted on this column).
df_input['No'] = df_input['No'].astype(int)

# Countdown of input rows that have not finished yet; decremented by `crawler`.
count_remain = len(df_input.index)

print(f'总数量：{count_remain}')
print()

# = = = = = = = = = = = = = = =

def crawler(crawler_tuple, max_retry=20):
    """Crawl one category listing and save all of its product links.

    The listing URL is rewritten to sort alphabetically with 96 items per
    page (any existing ``page`` parameter is dropped), then requested page by
    page until the response no longer contains a "next" pagination item. The
    de-duplicated links are written to a timestamped workbook inside
    ``directory_out``.

    Failures do not raise: the input row is appended to
    ``list_series_error`` with the last requested URL stored under
    ``Request_Url``. ``count_remain`` is decremented once the row has been
    handled, whether it succeeded or failed.

    Args:
        crawler_tuple: ``(index, row)`` pair as produced by
            ``DataFrame.iterrows()``; the row must provide ``Url`` and ``No``.
        max_retry: Maximum number of requests made for a single page before
            the whole row is given up and recorded as an error. Without a
            bound, a page that never returns product links would be retried
            forever.

    Returns:
        None.
    """
    global list_series_error, count_remain

    _, crawler_series = crawler_tuple

    # Bound before the try block so the error handler below can always read
    # it, even when the failure happens before the first request URL exists.
    url_request = ''

    # = = = = = = = = = = = = = = =

    try:
        url_parse = urlparse(crawler_series['Url'])

        dict_param = parse_qs(url_parse.query)
        dict_param.pop('page', None)
        dict_param['sort'] = 'alphaasc'
        dict_param['limit'] = '96'

        url_base = f'{url_parse.scheme}://{url_parse.netloc}{url_parse.path}?{urlencode(dict_param, doseq=True)}'

        list_url, page = [], 1
        while True:
            url_request = f'{url_base}&page={page}'

            html, list_href, count_retry = None, [], 0
            while count_retry < max_retry:
                count_retry += 1

                try:
                    resp = requests.get(url_request,
                                        headers=get_header(),
                                        proxies=get_proxy(),
                                        timeout=(5, 15))

                    if resp.status_code != 200:
                        continue

                    soup = BeautifulSoup(resp.text, 'lxml')
                    html = etree.HTML(str(soup))

                    list_href = html.xpath('//h4[@class="card-title NoMargin"]/a/@href')
                except Exception:
                    # Best effort: any request/parse failure just costs one
                    # attempt. KeyboardInterrupt and other BaseExceptions are
                    # deliberately not swallowed here.
                    continue

                if list_href:
                    break
            else:
                # Loop ended without `break`: every attempt failed.
                raise RuntimeError(f'no product links after {max_retry} requests: {url_request}')

            # = = = = = = = = = = = = = = =

            # dict.fromkeys() drops duplicates while keeping first-seen order.
            list_url.extend(list_href)
            list_url = list(dict.fromkeys(list_url))

            # = = = = = = = = = = = = = = =

            if count_remain % 30 == 0 and count_remain != 0:
                clear_output()
            print(f'''[第{page}页] - [请求{count_retry}次] - [剩余{count_remain - 1}条] - [{datetime.now().strftime('%Y/%m/%d %H:%M:%S')}] - {crawler_series['No']}. {url_request}\n''')

            # = = = = = = = = = = = = = = =

            page += 1

            # = = = = = = = = = = = = = = =

            # No "next" pagination item on this page means it was the last one.
            if not html.xpath('//li[@class="pagination-item pagination-item--next"]'):
                break

        # = = = = = = = = = = = = = = =

        df_correct = pd.DataFrame({'No': range(1, len(list_url) + 1),
                                   'Url': list_url})
        df_correct.to_excel(f'''{directory_out}/crawler_{crawler_series['No']}-{datetime.now().strftime('%Y%m%d_%H%M%S')}.xlsx''', index=False)
    except Exception:
        # Record the row for the error report instead of aborting the pool.
        crawler_series['Request_Url'] = url_request

        with semaphore_error:
            list_series_error.append(crawler_series)

    # = = = = = = = = = = = = = = =

    with semaphore_remain:
        count_remain -= 1

# = = = = = = = = = = = = = = =

# Run the crawler over every input row through the gevent pool.
pool.map(crawler, df_input.iterrows())

print('输出ing...')
print()
if list_series_error:
    # Failed rows go to their own workbook, ordered by their input number.
    df_error = pd.DataFrame(list_series_error).sort_values(by=['No'],
                                                           ascending=[True],
                                                           ignore_index=True)
    df_error.to_excel('./file/item_url_error.xlsx', index=False)
    print('爬虫存在error')
    print()

# = = = = = = = = = = = = = = =

# Strip the "-YYYYMMDD_HHMMSS" suffix that `crawler` adds to each workbook
# name. The dot before "xlsx" is escaped so it only matches a literal dot.
pattern_timestamp = re.compile(r'-[0-9]{8}_[0-9]{6}\.xlsx$')

# Only the regular files directly inside the output folder are renamed.
list_file = [file for file in sorted(os.listdir(directory_out))
             if file != '.DS_Store' and os.path.isfile(os.path.join(directory_out, file))]
for file in tqdm(list_file, desc='Progress', ncols=77):
    os.rename(os.path.join(directory_out, file),
              os.path.join(directory_out, pattern_timestamp.sub('.xlsx', file)))

print('Done ~')

总数量：2

[第1页] - [请求1次] - [剩余1条] - [2024/10/24 14:05:36] - 2. https://www.cardone.com/drivetrain/driveline-axles/driveshaft-prop-shaft/?limit=96&sort=alphaasc&page=1

[第2页] - [请求1次] - [剩余1条] - [2024/10/24 14:05:38] - 2. https://www.cardone.com/drivetrain/driveline-axles/driveshaft-prop-shaft/?limit=96&sort=alphaasc&page=2

[第3页] - [请求1次] - [剩余1条] - [2024/10/24 14:05:41] - 2. https://www.cardone.com/drivetrain/driveline-axles/driveshaft-prop-shaft/?limit=96&sort=alphaasc&page=3

[第4页] - [请求1次] - [剩余1条] - [2024/10/24 14:05:46] - 2. https://www.cardone.com/drivetrain/driveline-axles/driveshaft-prop-shaft/?limit=96&sort=alphaasc&page=4

[第1页] - [请求2次] - [剩余1条] - [2024/10/24 14:05:51] - 1. https://www.cardone.com/motors/wiper-and-washer/windshield-wiper-motor/?sort=alphaasc&limit=96&page=1

[第5页] - [请求2次] - [剩余1条] - [2024/10/24 14:05:54] - 2. https://www.cardone.com/drivetrain/driveline-axles/driveshaft-prop-shaft/?limit=96&sort=alphaasc&page=5

[第2页] - [请求1次] - [剩余1条] - [2024/10/24 14:05:55]

Progress: 100%|██████████████████████████████| 2/2 [00:00<00:00, 3419.73it/s]

Done ~



