In [15]:
import requests
from bs4 import BeautifulSoup
import csv
import re
import glob
import json
import concurrent.futures
import time
import datetime
import pandas as pd

In [5]:
def get_page(url):
    response = requests.get(url)
    response.encoding = 'utf8'
    return BeautifulSoup(response.text, 'html.parser')

In [6]:
def get_brand_links():
    url = f'https://auto.ru'
    soup_main = get_page(url)

    all_brand_links = []

    for brand in soup_main.find_all('a', class_='IndexMarks__item'):
        all_brand_links.append(brand.get('href').replace('all', 'used'))

    return all_brand_links

In [7]:
def check_number(url):
    strr = 'ButtonWithLoader__content'
    page = get_page(url)
    cntt = page.find(class_=strr).text.split(' ')[1].split('\xa0')
    if len(cntt[1]) > 3:
        cnt = int(cntt[0])
    else:
        cnt = int(cntt[0]+cntt[1])
    return cnt

In [8]:
def get_page_car_links(page_url):
    page = get_page(page_url)
    if page == None:
        return []
    all_link = page.find_all(class_='ListingItemTitle-module__link')
    links = []
    for link in all_link:
        links.append(link['href'])
    return links

In [9]:
def get_car_links(url):
    try:
        strr = 'ListingPagination-module__page'
        max_page_num = int(get_page(url).find_all(class_=strr)[-1].text)
    except:
        max_page_num = 0

    links = []

    page_urls = [url + '?page=' + str(i) for i in range(1, max_page_num+1)]

    with concurrent.futures.ThreadPoolExecutor() as executor:
        futures = []
        for url in page_urls:
            futures.append(executor.submit(get_page_car_links, page_url=url))
        for feature in concurrent.futures.as_completed(futures):
            links += feature.result()
    return links

In [10]:
def get_links(brand_links):
    car_body = ['sedan', 'hatchback', 'allroad', 'wagon',
                'coupe', 'minivan', 'pickup', 'limousine', 'van', 'cabrio']
    car_links = []
    for link in brand_links:
        cnt = check_number(link)
        if cnt <= 3700:
            car_links += get_car_links(link)
        else:
            for year in range(1999, 2022):
                if year > 2000:
                    aa = link.split('/')
                    url_1 = '/'.join(aa[:5])+'/'+str(year)+'-year/used/'
                else:
                    url_1 = link+'?year_to=2000'
                cnt = check_number(url_1)
                if cnt <= 3700:
                    car_links += get_car_links(url_1)
                else:
                    for cb in car_body:
                        if year > 2000:
                            url_2 = url_1+f'body-{cb}/'
                        else:
                            url_2 = link+f'body-{cb}/?year_to=2000'
                        car_links += get_car_links(url_2)
    return car_links

In [11]:
def get_car_info(car_link):
    soup_car = get_page(car_link)
    car_info = {}
    cls_str = 'CardInfoRow_'
    span_str = 'CardInfoRow__cell'
    regex = re.compile('.*__info-item.*')
    sd = 'sale-data-attributes'
    
    if soup_car.find('div', class_='CardSold') == None:

        car_info['datetime'] = datetime.datetime.now(
            datetime.timezone(datetime.timedelta(hours=3)))
        car_info['parsing_unixtime'] = int(time.time())
        car_info['card'] = soup_car.find(
            'div', class_='CardSidebarActions__title').text
        car_info['region'] = soup_car.find(
            'span', class_='MetroListPlace__regionName MetroListPlace_nbsp').text
        if soup_car.find('div', class_='CardSellerNamePlace__name'):
            car_info['sellerName'] = soup_car.find(
                'div', class_='CardSellerNamePlace__name').text
        elif soup_car.find('a', class_='CardSellerNamePlace__name_dealer'):
            car_info['dealerName'] = soup_car.find(
                'a', class_='CardSellerNamePlace__name_dealer').text
        car_info['sell_id'] = soup_car.find(
            'div', title='Идентификатор объявления').text
        car_info['car_url'] = car_link
        car_info['price'] = soup_car.find(
            'span', class_='OfferPriceCaption__price').text
        car_info['description'] = soup_car.find(
            'div', class_='CardDescription__textInner').text
        car_info['image'] = 'https:' + \
            soup_car.find(
                'img', class_='ImageGalleryDesktop__image').get('src')

        soup_name = soup_car.find(
            'div', class_='CardBreadcrumbs').find_all('a')

        car_info['bodyType'] = soup_car.find(
            'li', class_=cls_str+'bodytype').find('a').text
        car_info['color'] = soup_car.find(
            'li', class_=cls_str+'color').find('a').text
        car_info['engine'] = soup_car.find(
            'li', class_=cls_str+'engine').find('div').text
        car_info['engineDisplacement'], car_info['enginePower'], car_info['fuelType'] = car_info['engine'].split(
            '/')
        car_info['mileage'] = soup_car.find(
            'li', class_=cls_str+'kmAge').find_all('span')[1].text
        car_info['productionDate'] = soup_car.find(
            'li', class_=cls_str+'year').find_all('span')[1].text
        car_info['vehicleTransmission'] = soup_car.find(
            'li', class_=cls_str+'transmission').find_all('span')[1].text
        car_info['Владельцы'] = soup_car.find(
            'li', class_=cls_str+'ownersCount').find_all('span')[1].text
        car_info['ПТС'] = soup_car.find(
            'li', class_=cls_str+'pts').find_all('span')[1].text
        car_info['Привод'] = soup_car.find(
            'li', class_=cls_str+'drive').find_all('span')[1].text
        car_info['Руль'] = soup_car.find(
            'li', class_=cls_str+'wheel').find_all('span')[1].text
        car_info['Состояние'] = soup_car.find(
            'li', class_=cls_str+'state').find_all('span')[1].text
        car_info['Таможня'] = soup_car.find(
            'li', class_=cls_str+'customs').find_all('span')[1].text
        car_info['card_type'] = soup_name[1].text
        car_info['brand'] = soup_name[2].text
        car_info['model_name'] = soup_name[3].text
        car_info['model_name2'] = soup_name[4].text
        car_info['name'] = soup_name[6].text
        car_info['equipment_dict'] = sale_data = json.loads(
            soup_car.find(id=sd)['data-bem'])[sd]

        car_info['public_date'] = soup_car.find_all(
            'div', {"class": regex})[0].text
        car_info['nview'] = soup_car.find_all('div', {"class": regex})[
            1].text.split(' ')[0]
        car_info['model_name_full'] = soup_car.find('h1').text
        url_2 = soup_car.find('a', class_='SpoilerLink')['href']
        pag = get_page(url_2)
        car_info['confDict'] = json.loads(pag.find(id=sd)['data-bem'])[sd]
    else:
        car_info = {}

    return car_info

In [12]:
def get_data(urls):
    data = []
    with concurrent.futures.ThreadPoolExecutor() as executor:
        future_to_url = {executor.submit(
            get_car_info, url): url for url in urls}
        for future in concurrent.futures.as_completed(future_to_url):
            url = future_to_url[future]
            try:
                data.append(future.result())
            except Exception as exc:
                print('%r generated an exception: %s' % (url, exc))
    return data

In [17]:
for link in br:
    print(link, check_number(link))
    threaded_start = time.time()
    brand = link.split('/')[4]
    try:
        if len(res_dict[brand])==0:
            res_dict[brand] = get_links([link])
    except:
        res_dict[brand]=[]
    print("Load time:", time.time() - threaded_start)

https://auto.ru/cars/vaz/used/ 65250


NameError: name 'res_dict' is not defined