# Сбор данных с сайта [quotes](https://quotes.toscrape.com/)

In [20]:
import json
import requests
import uuid
from bs4 import BeautifulSoup
from tqdm import tqdm

In [21]:
def get_author_info(author_url, timeout=10):
    """Download an author page and extract the author's biography fields.

    Args:
        author_url: Absolute URL of the author's page.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the notebook forever.

    Returns:
        A dict with the keys ``"birth_date"``, ``"birth_place"`` and
        ``"description"``. A value is ``None`` when the corresponding element
        is absent from the page. Returns ``None`` (after printing the error)
        when the HTTP request itself fails or returns an error status.
    """
    try:
        response = requests.get(author_url, timeout=timeout)
        response.raise_for_status()  # Turn 4xx/5xx responses into exceptions
    except requests.RequestException as e:
        # Timeouts are a RequestException subclass, so they land here too
        print(f"Ошибка при запросе к {author_url}: {e}")
        return None

    soup = BeautifulSoup(response.text, 'html.parser')

    def _text_of(tag_name, css_class, strip=False):
        # find() returns None for a missing element; avoid calling
        # get_text() on None so an incomplete page does not crash the crawl
        node = soup.find(tag_name, class_=css_class)
        if node is None:
            return None
        return node.get_text(strip=strip)

    return {
        "birth_date": _text_of('span', 'author-born-date'),
        "birth_place": _text_of('span', 'author-born-location'),
        "description": _text_of('div', 'author-description', strip=True)
    }

# Root of the site being scraped; the crawl starts from its front page
base_url = "https://quotes.toscrape.com"
url = base_url + "/"

# Crawl state shared with the scraping loop:
authors_info = dict()    # author name -> biography dict
quotes_dict = dict()     # generated quote ID -> quote record
unique_quotes = set()    # quote texts already stored, used to skip duplicates
pgs_cnt = 1              # page counter, starts at one

# Walk the site page by page until there is no "next" link left
while url:
    try:
        # A timeout keeps a stalled connection from hanging the notebook
        response = requests.get(url, timeout=10)
        response.raise_for_status()  # Turn 4xx/5xx responses into exceptions
    except requests.RequestException as e:
        print(f"Ошибка при запросе к {url}: {e}")
        break  # Stop crawling; whatever was collected so far is kept

    soup = BeautifulSoup(response.text, 'html.parser')

    # Collect every quote on the current page
    for quote in tqdm(soup.find_all("div", class_="quote")):
        text_tag = quote.find("span", class_="text")
        author_tag = quote.find("small", class_="author")
        if text_tag is None or author_tag is None:
            continue  # Malformed quote block: nothing usable to store

        text = text_tag.get_text()
        author = author_tag.get_text()

        # Skip quotes whose text has already been stored
        if text in unique_quotes:
            continue
        unique_quotes.add(text)

        # Fetch the biography only for authors not cached yet. A failed fetch
        # is not cached, so it is retried on that author's next quote.
        if author not in authors_info:
            link_tag = author_tag.find_next_sibling("a")
            if link_tag is not None and link_tag.get("href"):
                author_url = f"{base_url}{link_tag['href']}"
                author_info = get_author_info(author_url)
                if author_info is not None:
                    authors_info[author] = author_info

        # Falls back to an empty dict when the biography is unavailable
        author_info = authors_info.get(author, {})

        # Random UUID used as the record key
        quote_id = str(uuid.uuid4())

        quotes_dict[quote_id] = {
            "text": text,
            "author": author,
            "author_info": author_info,
            "tags": [tag.get_text() for tag in quote.find_all('a', class_='tag')]
        }

    # Follow the "next" pagination link, if the page has one
    next_button = soup.find('li', class_='next')
    next_anchor = next_button.find('a') if next_button else None
    if next_anchor is not None and next_anchor.get('href'):
        url = f"{base_url}{next_anchor['href']}"
        pgs_cnt += 1
    else:
        url = None  # No further pages: this ends the loop
        print(f'\nУра! Мы спарсили {pgs_cnt} страниц!')

# Persist everything collected to disk as pretty-printed UTF-8 JSON
with open("quotes.json", mode="w", encoding="utf-8") as out_file:
    json.dump(
        quotes_dict,
        fp=out_file,
        ensure_ascii=False,  # keep non-ASCII characters readable in the file
        indent=4,
    )

print("\n\nДанные успешно собраны и сохранены в quotes.json")

100%|██████████| 10/10 [00:06<00:00,  1.53it/s]
100%|██████████| 10/10 [00:05<00:00,  1.76it/s]
100%|██████████| 10/10 [00:04<00:00,  2.45it/s]
100%|██████████| 10/10 [00:03<00:00,  3.08it/s]
100%|██████████| 10/10 [00:03<00:00,  3.11it/s]
100%|██████████| 10/10 [00:05<00:00,  1.82it/s]
100%|██████████| 10/10 [00:02<00:00,  4.06it/s]
100%|██████████| 10/10 [00:05<00:00,  1.75it/s]
100%|██████████| 10/10 [00:00<?, ?it/s]
100%|██████████| 10/10 [00:04<00:00,  2.03it/s]


Ура! Мы спарсили 10 страниц!


Данные успешно собраны и сохранены в quotes.json





Загрузим сохранённый файл и посмотрим на пример одной записи.

In [30]:
# Read the saved file back to confirm how many quotes it holds
with open('quotes.json', mode='r', encoding='utf-8') as saved_file:
    quotes = json.load(saved_file)

print(f'Number of quotes: {len(quotes)}')

# Display the second stored record as a sample
list(quotes.values())[1]

Number of quotes: 100


{'text': '“It is our choices, Harry, that show what we truly are, far more than our abilities.”',
 'author': 'J.K. Rowling',
 'author_info': {'birth_date': 'July 31, 1965',
  'birth_place': 'in Yate, South Gloucestershire, England, The United Kingdom',
  'description': 'See also: Robert GalbraithAlthough she writes under the pen name J.K. Rowling, pronounced like rolling, her name when her first Harry Potter book was published was simply Joanne Rowling. Anticipating that the target audience of young boys might not want to read a book written by a woman, her publishers demanded that she use two initials, rather than her full name. As she had no middle name, she chose K as the second initial of her pen name, from her paternal grandmother Kathleen Ada Bulgen Rowling. She calls herself Jo and has said, "No one ever called me \'Joanne\' when I was young, unless they were angry." Following her marriage, she has sometimes used the name Joanne Murray when conducting personal business. During t