In [1]:
from bs4 import BeautifulSoup
import requests as req
import tqdm
import pandas as pd
import re
from multiprocessing.dummy import Pool, Lock, Value
from time import sleep
from requests.exceptions import ConnectionError

In [2]:
def get_page(url, n_attempts=5, t_sleep=2, exc_counter=0):
    """Fetch ``url`` with retries and return the response.

    Two kinds of failure are retried:

    * a falsy response (``requests`` responses are falsy for error status
      codes): up to ``n_attempts`` requests, sleeping ``t_sleep`` seconds
      after each falsy one;
    * a ``ConnectionError``: the whole fetch is restarted after sleeping
      ``t_sleep + 1`` seconds, until ``n_attempts`` such errors have occurred.

    Args:
        url: Address to request.
        n_attempts: Maximum number of requests per round and maximum number
            of connection errors tolerated.
        t_sleep: Base delay in seconds between attempts.
        exc_counter: Number of connection errors already seen (kept for
            backward compatibility with the former recursive implementation).

    Returns:
        The response of the last request (it may still be falsy if every
        attempt returned an error status), or ``False`` if the connection
        kept failing.
    """
    while True:
        try:
            resp = False  # returned as-is when n_attempts is 0
            for _ in range(n_attempts):
                resp = req.get(url)
                if resp:
                    break
                sleep(t_sleep)
            return resp
        except ConnectionError as e:  # in case the website timed us out
            print(url, ' ', e)
            exc_counter += 1
            sleep(t_sleep + 1)
            # Retry in a loop with the caller's settings preserved; give up
            # once the connection-error budget is spent.
            if exc_counter >= n_attempts:
                return False

In [3]:
# Serialises updates to the shared progress counters and the progress output
# across worker threads.
mutex = Lock()

def parse_page(url, book_list):
    """Collect the book entries of one listing page into ``book_list``.

    Args:
        url: Address of the listing page.
        book_list: List that receives one parsed element per book entry.

    Returns:
        The "next page" link element if the page has one, otherwise ``None``.
        ``None`` is also returned when the page could not be fetched or does
        not contain a product listing.
    """
    global n_processed_links
    resp = get_page(url)
    if not resp:
        print('couldn\'t get page with list of books ', url)
        return
    soup0 = BeautifulSoup(resp.text, 'lxml')
    listing = soup0.find('div', class_="rd-page-listing__products")
    if listing is None:
        # Without this guard the chained .find_all() raises AttributeError,
        # which would abort the whole pool.map() crawl.
        print('no product listing found on page ', url)
        return
    books = listing.find_all('div', class_="rd-listing-product-item-data-wrap")
    next_p_button = soup0.find('a', class_="pagination-next")
    for book in books:
        book_list.append(book)
        with mutex:
            n_processed_links.value += 1
            print(f"\r{n_processed_links.value} links are processed...", end='', flush=True)
    return next_p_button

In [4]:
def parse_book(book_url):
    """Scrape one book page and append its record to ``res_list``.

    ``book_url`` is a parsed listing element; the href of its first ``<a>``
    is joined with ``template_link`` to get the book page address. The
    resulting dict (fixed fields, description, every listed characteristic,
    price, optional old price / rating / preview link) is appended to the
    module-level ``res_list`` and the shared progress counter is advanced.
    Returns ``None``; if the page cannot be fetched a message is printed and
    nothing is appended.
    """
    global n_processed_books

    page_url = template_link + book_url.find('a').get('href')
    response = get_page(page_url)
    if not response:
        print('couldn\'t get page of book ', page_url)
        return
    soup = BeautifulSoup(response.text, 'lxml')

    # Fixed fields, inserted in the order they should appear as columns.
    record = {}
    record['ID'] = soup.find('span', itemprop="sku").text
    record['Название'] = soup.find('h1', class_="rd-page-product__title").text
    record['Автор'] = soup.find('a', itemprop="brand").text
    crumbs = soup.find('div', class_="rd-page-product__breadcrumbs").find_all('span', itemprop='name')
    record['Категория'] = '; '.join(crumb.text.strip() for crumb in crumbs)
    record['Изображение'] = soup.find('meta', property="og:image").get('content')
    record['Наличие'] = soup.find('span', class_="rd-page-product__buy-text").text == 'Купить'

    # Description: join the paragraphs if there are any, otherwise take the
    # text of the description container itself.
    paragraphs = soup.find('div', class_="rd-page-product__desc-body").find_all('p')
    if not paragraphs:
        record['Описание'] = soup.find('div', class_="rd-page-product__desc-body", itemprop="description").text.strip()
    else:
        record['Описание'] = ' '.join(par.text.strip() for par in paragraphs)

    # Every characteristic row becomes its own key/value pair.
    for row in soup.find('div', class_="rd-page-product__desc-params").find_all('p'):
        record[row.find(itemprop="name").text] = row.find(itemprop="value").text

    offer = soup.find('div', itemtype="http://schema.org/Offer")
    record['Цена'] = offer.find('span', class_="num").text
    previous_price = offer.find('span', class_="prev")
    if previous_price:
        record['Цена (старая)'] = previous_price.text.strip().split()[0]

    # Rating figures are only read when an aggregate rating is present.
    stars = soup.find('div', class_="rd-rating-stars")
    if stars.find('span', itemprop="aggregateRating"):
        rating_fields = (
            ('Число отзывов', "reviewCount"),
            ('Число оценок', "ratingCount"),
            ('Оценка', "ratingValue"),
        )
        for column, prop in rating_fields:
            record[column] = stars.find('meta', itemprop=prop).get('content')

    pdf_link = soup.find('a', class_="download-pdf")
    if pdf_link:
        record['Превью'] = template_link + pdf_link.get('href')

    res_list.append(record)
    with mutex:
        n_processed_books.value += 1
        print(f"\r{n_processed_books.value} books are processed...", end='', flush=True)

In [5]:
def get_all_links(author):
    """Walk all listing pages of one author, collecting entries into ``book_list``.

    Args:
        author: Author identifier used in the ``/authors/<author>`` URL.
            Surrounding whitespace (such as the trailing newline of a line
            read from a file) is removed; a blank value is ignored.
    """
    author = author.strip()
    if not author:
        # e.g. an empty line in the authors file -- nothing to request
        return
    url = 'https://www.respublica.ru/authors/{}'.format(author)
    next_p_button = parse_page(url, book_list)
    # parse_page returns the "next page" link element, or a falsy value
    # when there is no further page.
    while next_p_button:
        next_p_url = template_link + next_p_button.get('href')
        next_p_button = parse_page(next_p_url, book_list)

In [6]:
template_link = 'https://www.respublica.ru'
book_list = []
n_processed_links = Value('i', 0)
with open('./Data_analysis/Site_parsing_hw/authors.txt', 'r') as f:
    # Iterating a file yields lines with their trailing newline; strip them
    # and drop blank lines so they never end up inside the request URLs.
    authors = [line.strip() for line in f if line.strip()]
# The file is closed before the crawl starts; 5 worker threads walk the
# authors' listing pages in parallel.
with Pool(5) as pool:
    pool.map(get_all_links, authors)

2453 links are processed...

In [7]:
# Number of listing entries collected into book_list.
len(book_list)

2453

In [8]:
# Scrape the page of every entry gathered in book_list, 8 threads at a time.
n_processed_books = Value('i', 0)
res_list = []
with Pool(8) as workers:
    workers.map(parse_book, book_list)

2453 books are processed...

In [9]:
# Number of book records accumulated in res_list.
len(res_list)

2453

In [16]:
# Build the table from the scraped records; fields a book lacks become ''.
df = pd.DataFrame(res_list).fillna('')
# 'ID' holds the scraped text, so this is a string (lexicographic) sort.
df = df.sort_values(by=['ID'])
# Let pandas open the file itself: it handles newline translation correctly
# (no doubled carriage returns on Windows) and avoids building the whole CSV
# as one string in memory first.
df.to_csv('./hw_3.csv', index=False, sep='\t', encoding='utf-8')