<a href="https://colab.research.google.com/github/iKatePy/Study_projects/blob/master/learning_scraping.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Образ задачи

Создание системы для автоматического сбора данных о книгах с сайта.
Описание задачи

Вы — дата-саентист, занимающийся исследованием литературного рынка. Вам необходимо создать систему для автоматической выгрузки и обработки данных о книгах с веб-сайта Books to Scrape. Система должна ежедневно в 19:00 собирать данные и сохранять их в табличном формате, дополнительно выводя результаты анализа.

In [None]:
!pip install schedule

Collecting schedule
  Downloading schedule-1.2.2-py3-none-any.whl.metadata (3.8 kB)
Downloading schedule-1.2.2-py3-none-any.whl (12 kB)
Installing collected packages: schedule
Successfully installed schedule-1.2.2


In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import schedule
import time

#def correct_currency_sign(price):
    #return price.replace("Г‚ВЈ", "£")
def _fetch_soup(session, url, timeout):
    """Download *url* with *session* and return it parsed as BeautifulSoup."""
    response = session.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of silently parsing an error page.
    response.raise_for_status()
    # Force UTF-8 decoding; the price text contains a '£' sign.
    response.encoding = "utf-8"
    return BeautifulSoup(response.text, 'lxml')


def _parse_product_table(book_soup):
    """Return the header -> value pairs of the product information table."""
    info_table = book_soup.find('table', class_='table table-striped')
    if not info_table:
        return {}

    details = {}
    for row in info_table.find_all('tr'):
        header, cell = row.find('th'), row.find('td')
        # Skip malformed rows rather than crashing on a missing cell.
        if header is None or cell is None:
            continue
        details[header.text.strip()] = cell.text.strip()
    return details


def scraper(start_url='http://books.toscrape.com/catalogue/page-1.html', timeout=30):
    """Collect every book reachable from *start_url* into a DataFrame.

    Walks the paginated catalogue by following the ``li.next`` link, and for
    each ``article.product_pod`` entry also opens the book's own page to read
    its meta description and product information table.

    Args:
        start_url: Listing page to start from. Defaults to the first
            catalogue page of Books to Scrape.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        A ``pandas.DataFrame`` with one row per book. Columns are ``title``,
        ``price``, ``rating``, ``availability``, ``description`` plus one
        column per row header found in the book's product table.

    Raises:
        requests.RequestException: On a connection failure, timeout, or a
            non-success HTTP status.
    """
    from urllib.parse import urljoin

    data = []
    url = start_url

    # One session for the whole crawl so connections are reused across the
    # listing pages and the per-book detail pages.
    with requests.Session() as session:
        while url:
            soup = _fetch_soup(session, url, timeout)

            for book in soup.find_all('article', class_='product_pod'):
                link = book.h3.a
                title = link['title']
                price = book.find('p', class_='price_color').text
                # The rating word is the second CSS class of the first <p>.
                rating = book.p['class'][1]
                availability = book.find('p', class_='instock availability').text.strip()

                # Links are relative to the current listing page.
                book_soup = _fetch_soup(session, urljoin(url, link['href']), timeout)
                description_tag = book_soup.find('meta', attrs={'name': 'description'})
                if description_tag:
                    description = description_tag['content'].strip()
                else:
                    description = 'No description available'

                data.append({
                    'title': title,
                    'price': price,
                    'rating': rating,
                    'availability': availability,
                    'description': description,
                    **_parse_product_table(book_soup),
                })

            # Follow the pagination link; its absence marks the last page.
            next_button = soup.find('li', class_='next')
            url = urljoin(url, next_button.a['href']) if next_button else None

    return pd.DataFrame(data)

def preprocess_data(df):
    """Clean the scraped table in place and print a short summary.

    Missing values are replaced with the string 'N/A' and duplicated rows
    are removed. Both steps modify *df* itself, and the same object is
    returned.
    """
    # Replace missing values, then drop repeated rows (first occurrence kept).
    df.fillna(value='N/A', inplace=True)
    df.drop_duplicates(keep='first', inplace=True)

    # Report the total number of books and the main statistics.
    book_count = len(df)
    summary = df.describe(include='all')
    print(f"Total number of books: {book_count}")
    print(summary)

    return df

def task():
    """Run one collection cycle: scrape, clean, and write the CSV file."""
    print("Gathering book data...")
    cleaned = preprocess_data(scraper())

    # Persist the cleaned table without the DataFrame index column.
    cleaned.to_csv('books_data.csv', encoding="utf-8", index=False)
    print("Data successfully saved to books_data.csv")

def main():
    """Register the daily 19:00 job and poll the scheduler until interrupted."""
    schedule.every().day.at("19:00").do(task)

    try:
        while True:
            schedule.run_pending()
            # Poll once a minute; the job is scheduled to the minute.
            time.sleep(60)
    except KeyboardInterrupt:
        # Interrupting is the only way to stop the loop; exit without a traceback.
        print("Scheduler stopped.")


# Guarded so importing this module does not block in the scheduler loop.
if __name__ == "__main__":
    main()

Gathering book data...
Total number of books: 1000
                         title   price rating availability description  \
count                     1000    1000   1000         1000        1000   
unique                     999     903      5            1         999   
top     The Star-Touched Queen  £44.18    One     In stock               
freq                         2       3    226         1000           2   

                     UPC Product Type Price (excl. tax) Price (incl. tax)  \
count               1000         1000              1000              1000   
unique              1000            1               903               903   
top     a897fe39b1053632        Books            £44.18            £44.18   
freq                   1         1000                 3                 3   

          Tax            Availability Number of reviews  
count    1000                    1000              1000  
unique      1                      21                 1  
top     £0.00  In 

KeyboardInterrupt: 