In [1]:
import warnings
warnings.simplefilter('ignore')

import pandas as pd
from pandas import CategoricalDtype
from matplotlib import pyplot as plt
import seaborn as sns
import re

from selenium.webdriver import Chrome

In [2]:
# Scrape targets: the base listing URL and the model slugs that get appended to it.
url = 'https://auto.ru/rossiya/cars/toyota/'
models = [
    'camry',
    'corolla',
    'land_cruiser',
    'land_cruiser_prado',
    'rav_4',
]

# Launch the Chrome WebDriver session used by every scraping cell below.
# 'chromedriver' is passed as the first positional argument
# (presumably the driver executable path, Selenium 3 style — TODO confirm
# against the installed Selenium version).
driver = Chrome('chromedriver')

In [3]:
def _optional_text(block, class_name):
    """Return the text of the first element with `class_name` inside `block`, or None.

    The lookup is best-effort: any ordinary failure (typically "no such element")
    yields None. Unlike a bare `except:`, KeyboardInterrupt and SystemExit still
    propagate, so a long scrape can be interrupted cleanly.
    """
    try:
        return block.find_element_by_class_name(class_name).text
    except Exception:
        return None


def _digits(text):
    """Concatenate every run of digits in `text` (e.g. '1 850 000 ₽' -> '1850000')."""
    return ''.join(re.findall(r'\d+', text))


def parse_block(block, model=None):
    """Extract one listing card into a flat dict.

    Parameters
    ----------
    block : element-like
        A listing card exposing the `find_element(s)_by_class_name` /
        `find_element(s)_by_tag_name` lookups used below.
    model : str, optional
        Model slug stored in the row. When omitted, the notebook-level
        variable `model` (the scrape loop's loop variable) is used, which
        keeps the legacy call `parse_block(block)` working.

    Returns
    -------
    dict
        Keys: 'model', 'name', 'url', 'eng', 'drive', 'color', 'price',
        'year', 'mileage', 'city'. 'price' and 'mileage' are strings of
        digits; 'year', 'mileage' and 'city' are None when the
        corresponding element cannot be read.

    Raises
    ------
    KeyError
        If `model` is omitted and no notebook-level `model` exists.
    ValueError
        If the card does not contain exactly two tech-summary columns, or
        the second column does not contain exactly two `div` elements.
    """
    if model is None:
        # Legacy behaviour: pick up the loop variable from the notebook namespace.
        model = globals()['model']

    summary = block.find_element_by_class_name('ListingItem__summary')
    title = summary.find_element_by_tag_name('h3')
    name = title.text
    url = title.find_element_by_tag_name('a').get_attribute('href')

    # Fixed: the original looked these columns up on the undefined name
    # `dfblock`, which raised NameError on every call.
    engine_column, details_column = block.find_elements_by_class_name(
        'ListingItemTechSummaryDesktop__column')
    eng = engine_column.find_element_by_class_name('ListingItemTechSummaryDesktop__cell').text

    drive, color = (div.text for div in details_column.find_elements_by_tag_name('div'))

    # The price is taken from the first <div> inside the price block.
    price = _digits(block.find_element_by_class_name('ListingItem__priceBlock')
                    .find_element_by_tag_name('div')
                    .text)

    year = _optional_text(block, 'ListingItem__yearBlock')

    mileage_text = _optional_text(block, 'ListingItem__kmAge')
    mileage = _digits(mileage_text) if mileage_text is not None else None

    city = _optional_text(block, 'MetroListPlace__regionName')

    return {
        'model': model,
        'name': name,
        'url': url,
        'eng': eng,
        'drive': drive,
        'color': color,
        'price': price,
        'year': year,
        'mileage': mileage,
        'city': city,
    }

In [4]:
columns = ['model', 'name', 'url', 'eng', 'drive', 'color', 'price', 'year', 'mileage', 'city']

# Parsed rows are collected in a plain list and the frame is built once at the
# end. Appending to the DataFrame row by row copies the whole frame on every
# call (quadratic over thousands of listings), and DataFrame.append no longer
# exists in pandas >= 2.0. If the scrape is interrupted, `rows` still holds
# everything parsed so far.
rows = []

for model in models:
    # `model` must remain a notebook-level name: parse_block(block) reads it
    # to label each row.
    model_url = url + model + '/used'
    driver.get(model_url)

    # The last line of the pagination element's text is parsed as the number
    # of the final page.
    last_page = int(driver.find_element_by_class_name('ListingPagination__pages').text.split('\n')[-1])
    model_pages = [model_url] + [model_url + '?page=' + str(p) for p in range(2, last_page + 1)]

    for page in model_pages:
        driver.get(page)
        blocks = driver.find_elements_by_class_name('ListingItem')

        for block in blocks:
            rows.append(parse_block(block))

df = pd.DataFrame(rows, columns=columns)

In [6]:
# Remove exact duplicate rows, then cache the scrape result on disk and reload
# it; read_csv re-infers the column dtypes from the text.
df = df.drop_duplicates()
# index=False keeps the row index out of the file; otherwise read_csv brings it
# back as a spurious 'Unnamed: 0' column.
df.to_csv('data_.csv', index=False)
df = pd.read_csv('data_.csv')

In [7]:
# Report how many rows were loaded and how many have no missing values.
total_rows = df.shape[0]
complete_rows = df.dropna().shape[0]
print(f'Строк в исходном файле: {total_rows}\nСтрок после удаления N/A: {complete_rows}')

Строк в исходном файле: 12402
Строк после удаления N/A: 12158


In [8]:
# Keep only the rows in which every column has a value.
df = df.dropna(axis=0, how='any')

In [9]:
# Remove every column whose name contains 'Unnamed'
# (e.g. an index column written into the CSV and read back).
unnamed_columns = [name for name in df.columns if 'Unnamed' in name]
df.drop(columns=unnamed_columns, inplace=True)

In [10]:
# Cut a fixed-length prefix off the listing title, leaving the rest of the name.
# Presumably the prefix is 'Toyota <Model> ': len('Toyota ') == 7, plus one
# separator after the model — this assumes the displayed model name is as long
# as its slug in `model`. TODO confirm against the scraped titles.
df['name'] = df.apply(lambda r: r['name'][8+len(r['model']):], axis=1)

# `eng` is split on '/' into three fields: engine volume, horsepower, fuel.
df[['eng_vol', 'hp', 'fuel']] = df['eng'].str.split('/', expand=True)

# regex=False makes the unit suffixes literal text. The default of `regex`
# differs between pandas versions, and as a regex the dots in ' л.с.' would
# match any character.
df['eng_vol'] = df['eng_vol'].str.replace(' л', '', regex=False).astype('float')
df.drop('eng', axis=1, inplace=True)
df['hp'] = df['hp'].str.replace(' л.с.', '', regex=False).astype('int')
# NOTE(review): `fuel` is stored exactly as split, so it may carry surrounding
# whitespace if the source separator is ' / ' — verify before comparing values.

In [11]:
# Column groups by intended type.
int_feat = ['price', 'year', 'mileage', 'hp']
float_feat = ['eng_vol']
nom_feat = ['model', 'name', 'url', 'city']
unordered_feat = ['drive', 'color', 'fuel']

# Resolve the target dtype of every column first, then cast them in one pass
# (same column order as the groups above).
target_dtypes = {}
target_dtypes.update((col, 'int') for col in int_feat)
target_dtypes.update((col, 'float') for col in float_feat)
target_dtypes.update((col, 'category') for col in nom_feat)
# Explicitly unordered categoricals whose categories are the values observed
# in the column, as returned by .unique().
target_dtypes.update(
    (col, CategoricalDtype(df[col].unique(), ordered=False)) for col in unordered_feat
)

for col, dtype in target_dtypes.items():
    df[col] = df[col].astype(dtype)

In [12]:
# Persist the cleaned frame with DataFrame.to_pickle.
# NOTE(review): despite the '.csv' name this file is a pickle — it has to be
# loaded with pd.read_pickle, not pd.read_csv. The path is kept unchanged so
# existing readers keep working.
output_path = 'data.csv'
df.to_pickle(output_path)