In [2]:
#First stage
from lxml import etree, html as lhtml
from bs4 import BeautifulSoup
import pandas as pd
from html.parser import HTMLParser
from tqdm import tqdm_notebook
import json
import requests

In [3]:
# Base address of the per-author catalogue pages; an author's numeric id is
# appended to it to form that author's page URL.
html = "https://www.moscowbooks.ru/catalog/author/"

# Author display name -> numeric id appended to the base address above.
author_id = {
    "Достоевский Ф. М.": 9150,
    "Роллинс Дж.": 59396,
    "Фицджеральд Ф. С.": 28727,
    "Глуховский Д. А.": 53427,
    "Стругацкий А. Н.": 26268,
    "Лукьяненко С. В.": 16626,
    "Фрай М.": 28927,
    "Хантер Э.": 37969,
    "Роулинг Дж. К.": 104832,
}

# Author display name -> full URL of the author's catalogue page.
author_url = {
    author: f"{html}{ident}"
    for author, ident in author_id.items()
}

# Author display name -> raw markup of the author's catalogue page.
# NOTE(review): no later cell visible here reads html_a — confirm it is still
# needed, since building it costs one HTTP request per author.
html_a = {
    author: requests.get(link).text
    for author, link in author_url.items()
}

In [4]:
# Collect, for every author, the product ids of all books listed on the
# author's catalogue pages: result maps author name -> list of id strings.
result = {}
for name, url in author_url.items():
    # The un-paged author URL is fetched first, only to read the pager.
    first_page = BeautifulSoup(requests.get(url).text, 'lxml')

    # Page numbers are read from the `data-ajaxpage` attribute. Testing for
    # that attribute directly (instead of counting attributes) skips pager
    # elements that lack it rather than failing with KeyError.
    pages = [
        int(p['data-ajaxpage'])
        for p in first_page.find_all(class_='pager__text')
        if p.has_attr('data-ajaxpage')
    ]
    # No numbered pager entries means the author has a single page.
    max_page = max(pages, default=1)

    res_author = []
    for i in range(1, max_page + 1):
        # A dedicated name is used for the page markup so the module-level
        # `html` (the catalogue base address) is not overwritten.
        page_html = requests.get(url + "?page=" + str(i)).text
        page_soup = BeautifulSoup(page_html, 'lxml')
        # The product id is taken from the `data-productid` attribute of
        # each buy button on the page.
        for book_info in page_soup.find_all(class_='book-preview__buy-button button button_primary tocart_btn'):
            res_author.append(book_info.attrs['data-productid'])
    result[name] = res_author

In [5]:
# Flatten the per-author id lists into one list, keeping the authors' order
# in `result` and the order of ids within each author.
book_ids = [book_id for author_books in result.values() for book_id in author_books]
len(book_ids)

243

In [6]:
def get_name(soup):
    """Return the book title as a one-entry dict keyed 'Название'.

    The title is the text of the first element found with
    ``itemprop='name'``; an empty string is used when there is none.
    """
    title_tag = soup.find(itemprop='name')
    return {'Название': "" if title_tag is None else title_tag.text}

In [7]:
def get_author(soup):
    """Return the author line as a one-entry dict keyed 'Автор'.

    The value is the text of the first element with class ``author-name``,
    or an empty string when no such element is found.
    """
    author_tag = soup.find(class_='author-name')
    if author_tag is None:
        return {'Автор': ""}
    return {'Автор': author_tag.text}

In [24]:
def get_avability(soup):
    """Return the stock flag as a one-entry dict keyed 'Наличие'.

    The value is the string 'true' when the first element with class
    ``instock1`` contains the text "В наличии", and 'false' otherwise
    (including when no such element is found).
    """
    aval = soup.find(class_='instock1')
    if aval is None:
        return {'Наличие': 'false'}
    # Use a containment test: str.find() returns 0 (falsy) for a match at
    # the start of the text and -1 (truthy) for no match, so its result
    # cannot be used directly as a boolean.
    in_stock = "В наличии" in aval.text
    return {'Наличие': 'true' if in_stock else 'false'}

In [25]:
def get_descr(soup):
    """Return the book description as a one-entry dict keyed 'Описание'.

    The description is cut out of the text of the first element with class
    ``book__description collapsed js-book-description``: it starts right
    after the first ':' (or at the beginning when there is no colon) and
    ends at the next carriage return (or at the end of the text when there
    is none). Surrounding whitespace is stripped. An empty string is
    returned when the element is not found.
    """
    descr = soup.find(class_="book__description collapsed js-book-description")
    if descr is None:
        return {'Описание': ""}

    text = descr.text
    # find() gives -1 when there is no colon, so start becomes 0.
    start = text.find(':') + 1
    # Look for the terminator only after the start position, and fall back
    # to the end of the text: a raw -1 used as a slice bound would silently
    # drop the last character.
    end = text.find('\r', start)
    if end == -1:
        end = len(text)
    return {'Описание': text[start:end].strip()}

In [26]:
def get_rating(soup):
    """Return the book rating as a one-entry dict keyed 'Рейтинг'.

    The value is the ``data-rate`` attribute of the first element with class
    ``book___rating-stars rating-stars rating-stars_lg``, or an empty string
    when that element is not found.
    """
    stars = soup.find(class_='book___rating-stars rating-stars rating-stars_lg')
    rating = "" if stars is None else stars['data-rate']
    return {'Рейтинг': rating}

In [27]:
def get_cover(soup):
    """Return the cover image URL as a one-entry dict keyed 'Обложка'.

    The ``href`` of the first element with class ``link_gallery`` is
    appended to the site root; an empty string is returned when the element
    is not found.
    """
    gallery_link = soup.find(class_='link_gallery')
    if gallery_link is None:
        return {'Обложка': ""}
    return {'Обложка': 'https://www.moscowbooks.ru' + gallery_link['href']}

In [28]:
import re
def get_stickers(soup):
    """Return the book's sticker words as a one-entry dict keyed 'Стикеры'.

    Every run of word characters in the text of the first element with class
    ``book__stickers`` is extracted, and the runs are joined with ', '. An
    empty string is returned when the element is not found.
    """
    sticker_box = soup.find(class_='book__stickers')
    if sticker_box is None:
        return {'Стикеры': ""}
    words = re.findall(r'\w+', sticker_box.text)
    return {'Стикеры': ', '.join(words)}

In [29]:
def get_price(soup):
    """Return the book price as a one-entry dict keyed 'Цена'.

    The value is the first run of digits (as a string) in the text of the
    first element with class ``book__price``. An empty string is returned
    when the element is not found or its text contains no digits.
    """
    price = soup.find(class_='book__price')
    if price is None:
        return {'Цена': ""}
    digit_runs = re.findall(r'\d+', price.text)
    # Guard against a price element without any digits, which would
    # otherwise fail with IndexError on the [0] lookup.
    # NOTE(review): only the first digit run is used, so a price written
    # with a separator inside the number (e.g. "1 234") would yield "1" —
    # confirm against the page format.
    return {'Цена': digit_runs[0] if digit_runs else ""}

In [30]:
def get_details(soup):
    """Return the book's detail table as a dict of label -> value.

    Labels are the stripped texts of all elements with class
    ``book__details-name`` and values the stripped texts of all elements
    with class ``book__details-value``; the two lists are paired by
    position. When the lists differ in length the surplus entries are
    dropped by ``zip``. An empty dict is returned when nothing is found.
    """
    # find_all() always returns a list (possibly empty), never None, so no
    # None check is needed; `class_=` is the documented way to filter by
    # CSS class.
    name_tags = soup.find_all(class_='book__details-name')
    value_tags = soup.find_all(class_='book__details-value')
    labels = [tag.text.strip() for tag in name_tags]
    values = [tag.text.strip() for tag in value_tags]
    return dict(zip(labels, values))

In [31]:
def extract_book_info(book_id):
    """Download one book page and gather all scraped fields into a dict.

    ``book_id`` is the product id as a string; it is appended to the book
    URL prefix and also stored under 'Код товара'. The page is parsed once
    and handed to every field extractor in turn; each extractor returns a
    dict that is merged into the record, so a key produced by a later
    extractor replaces the same key from an earlier one.
    """
    page = requests.get("https://www.moscowbooks.ru/book/" + book_id)
    soup = BeautifulSoup(page.text, 'lxml')

    record = {'Код товара': book_id}
    extractors = (
        get_name,
        get_author,
        get_avability,
        get_descr,
        get_rating,
        get_cover,
        get_stickers,
        get_price,
        get_details,
    )
    for extract in extractors:
        record.update(extract(soup))
    return record

In [32]:
from multiprocessing import Pool, Lock, Value
from time import sleep

# Progress state for the scraping workers: a lock plus an integer counter
# (typecode 'i', starting at 0) created with multiprocessing.Value.
mutex = Lock()
n_processed = Value('i', 0)


def func_wrapper(uid):
    """Scrape one book and bump the processed-items counter.

    Calls ``extract_book_info(uid)`` and returns its result unchanged. After
    the scrape, the counter is incremented while holding ``mutex``; on every
    tenth item a progress message is written over the current output line.
    """
    book_record = extract_book_info(uid)
    with mutex:
        # The counter object is only mutated, never rebound, so no `global`
        # declaration is required.
        n_processed.value += 1
        done = n_processed.value
        if done % 10 == 0:
            print(f"\r{done} objects are processed...", end='', flush=True)
    return book_record

# Scrape all collected book ids using a pool of five worker processes.
# pool.map blocks until every id is handled and returns the per-book dicts
# as a list in the same order as book_ids.
with Pool(processes=5) as pool:
    res = pool.map(func_wrapper, book_ids)

240 objects are processed...

In [33]:
# One row per scraped book, ordered by product code. The codes are the id
# strings read from the listing pages, so the ordering is lexicographic
# rather than numeric. sort_values returns a new frame, which keeps this
# cell free of in-place mutation.
df = pd.DataFrame(res).sort_values(by=['Код товара'])

# newline='' is what pandas asks for when to_csv is given an already-open
# text file: the CSV writer emits its own line terminators, and without it
# the file layer may translate them again (blank lines between rows on
# platforms that use \r\n).
with open('hw_3.csv', mode='w', encoding='utf-8', newline='') as f_csv:
    df.to_csv(f_csv, index=False)