In [96]:
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
import os
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
import psycopg2
from psycopg2 import sql
from urllib.parse import urlparse
from urllib.parse import parse_qs
from urllib.parse import urlencode
import time
import random
import re

In [114]:
SITE_CFG = {
    'origin': 'https://market.yandex.ru',
    'url': 'https://market.yandex.ru/catalog--kvadrokoptery/18042097/list',
    'params': {'glfilter': '23674510%3A2~4000',
               'hid': '12410815',
               'pricefrom': '1',
               'onstock': '0',
               'qrfrom': '2',
               'local-offers-first': '0',
               'viewtype': 'list'}
}

DB_CFG = {
    'dbname': 'diploma',
    'user': 'diploma',
    'password': 'diploma',
    'host': 'localhost'
}
PROPS = {
    'search_links_class': '_24Q6d',  #'2f75n'
    'next_page_class': '_3OFYT'
}

DRIVER = webdriver.Chrome(executable_path='/Users/17dzat/chromedriver', options=Options())

In [167]:
class Parser:

    def __init__(self, driver, site_cfg, db_cfg):
        self.driver = driver
        self.site_cfg = site_cfg
        self.db_cfg = db_cfg
        self.driver = self.init_driver()
        self.links = []
        self.helicopters = []

    def parse_links(self):
        self.links = []
        page = 1
        while True:
            page_params = {**self.site_cfg['params'], **{'page': page}}
            page_url = f"{self.site_cfg['url']}?{urlencode(page_params)}"
            self.driver.get(page_url)
            page += 1

            search_links = self.driver.find_elements(by=By.CLASS_NAME, value=PROPS['search_links_class'])
            for x in search_links:
                self.links.append(x.get_attribute('href'))

            try:
                self.driver.find_element(by=By.CSS_SELECTOR,
                                         value=f"a.{PROPS['next_page_class']}[aria-label='Следующая страница']")

                time.sleep(random.randrange(2, 6))

            except NoSuchElementException:
                break

    def save_links(self):
        conn = psycopg2.connect(**self.db_cfg)
        with conn.cursor() as cursor:
            conn.autocommit = True
            l = [(x,) for x in self.links]
            insert = sql.SQL('INSERT INTO links (link) VALUES {}').format(
                sql.SQL(',').join(map(sql.Literal, l))
            )
            cursor.execute(insert)

    def delete_link(self, link_id):
        conn = psycopg2.connect(**self.db_cfg)
        with conn.cursor() as cursor:
            conn.autocommit = True

            delete = """Delete
                        From links
                        WHERE id = %s"""

            cursor.execute(delete, (link_id,))

    def get_not_processed_links(self):
        conn = psycopg2.connect(**self.db_cfg)
        cursor = conn.cursor()

        cursor.execute('SELECT id, link FROM links WHERE processed = FALSE')
        return cursor.fetchall()

    def save_helicopter(self, h, link_id):
        conn = psycopg2.connect(**self.db_cfg)
        with conn.cursor() as cursor:
            conn.autocommit = True
            helicopter = (
                h['name'], h['weight'], h['duration'], h['distance'], h['height'], h['speed'], h['pixels'], h['fps'],
                h['rating'],
                h['price'],
                link_id)
            insert = sql.SQL(
                'INSERT INTO helicopters (name, weight, duration, distance, height, speed, pixels, fps, rating, price, link_id) VALUES {}').format(
                sql.Literal(helicopter)
            )
            cursor.execute(insert)

            update = """UPDATE links
                SET processed = %s
                WHERE id = %s"""

            cursor.execute(update, (True, link_id))

    def get_helicopters(self):
        links = self.get_not_processed_links()
        for link in links:
            link_id = link[0]
            url = link[1]
            self.driver.get(url)
            time.sleep(random.randrange(3,5))
            h = {}
            h['name'] = self.driver.find_element(by=By.CSS_SELECTOR, value='h1._1BWd_').text
            try:
                rtng = self.driver.find_element(by=By.CSS_SELECTOR, value='span._2v4E8')
            except:
                self.delete_link(link_id)
                continue
            h['rating'] = rtng.text or rtng.get_attribute('textContent')
            h['price'] = int(''.join(
                self.driver.find_element(by=By.CSS_SELECTOR, value='div._3NaXx._3kWlK > span > span').get_attribute(
                    'textContent').split(
                    ' ')))

            h['props'] = {}
            for el in self.driver.find_elements(by=By.CSS_SELECTOR, value='table.Ksay3 > tbody > tr._2oLGf'):
                row_name = el.find_element(by=By.CSS_SELECTOR, value='td._2trXG').get_attribute('textContent')
                h['props'][row_name] = el.find_element(by=By.CSS_SELECTOR, value='td._3M0mF').get_attribute(
                    'textContent')

            if h['props'].get('полет'):
                flight = h['props']['полет']
                try:
                    h['duration'] = int(re.findall(r"(\d+)\s*мин", flight)[0])
                except:
                    h['duration'] = 15
                try:
                    h['distance'] = int(re.findall(r"дальность полета\s*(\d+)\s*м", flight)[0])
                except:
                    h['distance'] = 50
                try:
                    h['height'] = int(re.findall(r"высота\s*(\d+)\s*м", flight)[0])
                except:
                    h['height'] = h['distance']
                try:
                    h['speed'] = int(re.findall(r"скорость\s*(\d+)\s*м/с", flight)[0])
                except:
                    h['speed'] = 1

            if h['props'].get('видео'):
                videos = h['props']['видео'].split(',')
                specs = []
                for spec in videos:
                    only_pixels = re.findall(r"(\d+)p", spec)
                    only_k = re.findall(r"(\d+)K", spec)
                    pixels_multi = re.findall(r"(\d{2,5})x(\d{2,5})", spec)
                    # fps_with_p = re.match(r"\s*(?P<fps>\d+)\s*к/с.+?(?P<pixels>\d+)p\s*", spec)
                    # fps_with_k = re.match(r"\s*(?P<fps>\d+)\s*к/с.+?(?P<k>\d+)K\s*", spec)
                    fps = re.findall(r"\s*(\d+)\s*к/с", spec)
                    if len(fps) > 0:
                        fps_val = int(fps[0])
                        if len(only_pixels) > 0:
                            specs.append({'pixels': int(only_pixels[0]), 'fps': fps_val})
                        elif len(only_k) > 0:
                            specs.append({'pixels': int(int(only_k[0]) / 2 * 1024), 'fps': fps_val})
                        elif len(pixels_multi) > 0:
                            specs.append({'pixels': int(pixels_multi[0][1]), 'fps': fps_val})
                    else:
                        if len(only_pixels) > 0:
                            specs.append({'pixels': int(only_pixels[0]), 'fps': 30})
                        elif len(only_k) > 0:
                            specs.append({'pixels': int(int(only_k[0]) / 2 * 1024), 'fps': 30})
                        elif len(pixels_multi) > 0:
                            specs.append({'pixels': int(pixels_multi[0][1]), 'fps': 30})

                with_max_fps = max(specs, key=lambda x: x['fps'])
                h['fps'] = with_max_fps['fps']
                h['pixels'] = with_max_fps['pixels']
            else:
                self.delete_link(link_id)
                continue

            if h['props'].get('вес'):
                h['weight'] = int(re.findall(r"(\d+)\s*г", h['props'].get('вес'))[0])

            self.save_helicopter(h, link_id)

In [168]:
parser = Parser(DRIVER, SITE_CFG, DB_CFG)

  return webdriver.Chrome(executable_path='/Users/17dzat/chromedriver', options=Options())


In [174]:
parser.get_helicopters()

In [110]:
conn = psycopg2.connect(**DB_CFG)
cursor = conn.cursor()

cursor.execute('SELECT id, link FROM links')
links = cursor.fetchall()

In [99]:
driver.get(links[0][0])

In [100]:
# time.sleep(5)
h = {}
h['name'] = driver.find_element(by=By.CSS_SELECTOR, value='h1._1BWd_').text
rtng = driver.find_element(by=By.CSS_SELECTOR, value='span._2v4E8')
h['rating'] = rtng.text or rtng.get_attribute('textContent')
h['price'] = int(''.join(
    driver.find_element(by=By.CSS_SELECTOR, value='div._3NaXx._3kWlK > span > span').get_attribute('textContent').split(
        ' ')))

h['props'] = {}
for el in driver.find_elements(by=By.CSS_SELECTOR, value='table.Ksay3 > tbody > tr._2oLGf'):
    row_name = el.find_element(by=By.CSS_SELECTOR, value='td._2trXG').get_attribute('textContent')
    h['props'][row_name] = el.find_element(by=By.CSS_SELECTOR, value='td._3M0mF').get_attribute('textContent')

if h['props'].get('полет'):
    flight = h['props']['полет']
    try:
        h['duration'] = int(re.findall(r"(\d+)\s*мин", flight)[0])
    except:
        h['duration'] = 15
    try:
        h['distance'] = int(re.findall(r"дальность полета\s*(\d+)\s*м", flight)[0])
    except:
        h['distance'] = 50
    try:
        h['height'] = int(re.findall(r"высота\s*(\d+)\s*м", flight)[0])
    except:
        h['height'] = h['distance']
    try:
        h['speed'] = int(re.findall(r"скорость\s*(\d+)\s*м/с", flight)[0])
    except:
        h['speed'] = 1

if h['props'].get('видео'):
    videos = h['props']['видео'].split(',')
    specs = []
    for spec in videos:
        only_pixels = re.findall(r"(\d+)p", spec)
        only_k = re.findall(r"(\d+)K", spec)
        pixels_multi = re.findall(r"(\d{2,5})x(\d{2,5})", spec)
        # fps_with_p = re.match(r"\s*(?P<fps>\d+)\s*к/с.+?(?P<pixels>\d+)p\s*", spec)
        # fps_with_k = re.match(r"\s*(?P<fps>\d+)\s*к/с.+?(?P<k>\d+)K\s*", spec)
        fps = re.findall(r"\s*(\d+)\s*к/с", spec)
        if len(fps) > 0:
            fps_val = int(fps[0])
            if len(only_pixels) > 0:
                specs.append({'pixels': int(only_pixels[0]), 'fps': fps_val})
            elif len(only_k) > 0:
                specs.append({'pixels': int(int(only_k[0]) / 2 * 1024), 'fps': fps_val})
            elif len(pixels_multi) > 0:
                specs.append({'pixels': int(pixels_multi[0][1]), 'fps': fps_val})
        else:
            if len(only_pixels) > 0:
                specs.append({'pixels': int(only_pixels[0]), 'fps': 30})
            elif len(only_k) > 0:
                specs.append({'pixels': int(int(only_k[0]) / 2 * 1024), 'fps': 30})
            elif len(pixels_multi) > 0:
                specs.append({'pixels': int(pixels_multi[0][1]), 'fps': 30})

    with_max_fps = max(specs, key=lambda x: x['fps'])
    h['fps'] = with_max_fps['fps']
    h['pixels'] = with_max_fps['pixels']

if h['props'].get('вес'):
    h['weight'] = int(re.findall(r"(\d+)\s*г", h['props'].get('вес'))[0])

In [156]:
conn = psycopg2.connect(**DB_CFG)
with conn.cursor() as cursor:
    conn.autocommit = True

    delete = """Delete
                From links
                WHERE id = %s"""

    cursor.execute(delete, (46))

TypeError: 'int' object does not support indexing