# **Project : Books to scrape**

In [1]:
import pandas as pdb
import os
from bs4 import BeautifulSoup
import requests
from urllib.parse import urljoin
from unidecode import unidecode
from time import sleep, time
import pandas as pd
import math

## Scraping de l'ensemble des livres

In [2]:
def parsePages(num_pages, filename = 'books_to_scrape_{num_pages}_pages.csv'):
    
    book_data = []

    # Extraction des pages 1 à num_pages
    for page in range(1, num_pages + 1):
        html_website = requests.get(f'https://books.toscrape.com/catalogue/page-{page}.html')
        soup = BeautifulSoup(html_website.text, 'html.parser')
        product_containers = soup.find_all('article', class_='product_pod')
            
        # Récupérer des informations dans le code HTML de chaque page
        for container in product_containers:
            
            # Titre du livre
            title = container.find('h3').find('a')['title']

            # Stocks
            stock_available = container.find('p', class_='instock availability').text.replace('\n', '')
           
            # Lien du livre
            book_url = container.find('h3').find('a')['href']
            book_url = f'https://books.toscrape.com/catalogue/{book_url}'

            # Lien de l'image
            img_url = container.find('img')['src']
            img_url = f'https://books.toscrape.com{img_url}'

            # Nombre d'étoiles
            star_rating_class = container.find('p', class_='star-rating')['class'][1]
            star_rating = {'One': 1, 'Two': 2, 'Three': 3, 'Four': 4, 'Five': 5}.get(star_rating_class, 0)

            # Accéder à la page du détail du livre 
            book_detail_page = requests.get(book_url)
            detail_soup = BeautifulSoup(book_detail_page.text, 'html.parser')

            # Catégorie du livre
            category = detail_soup.find('ul', class_='breadcrumb').find_all('li')[2].text.strip()

            # Description
            description = detail_soup.find('meta', {'name': 'description'})['content']
            description = description.replace('\n    ', '').replace('\n', '')
            
             # UPC
            upc = detail_soup.find('th', string='UPC')
            upc = upc.find_next('td').get_text(strip=True)
        
            # Product type
            product_type = detail_soup.find('th', string='Product Type')
            product_type = product_type.find_next('td').get_text(strip=True) 
        
            # Price without tax
            price_excl_tax = detail_soup.find('th', string='Price (excl. tax)')
            price_excl_tax = price_excl_tax.find_next('td').get_text(strip=True).replace('Â£', '') 
        
            # Price with tax
            price_incl_tax = detail_soup.find('th', string='Price (incl. tax)')
            price_incl_tax = price_incl_tax.find_next('td').get_text(strip=True).replace('Â£', '')
        
            # Price of tax only
            tax = detail_soup.find('th', string='Tax')
            tax = tax.find_next('td').get_text(strip=True).replace('Â£', '')

            # Number of available
            number_available = detail_soup.find_all('p', class_='instock availability')
            number_available = number_available[0].get_text(strip=True).replace("In stock (", "").replace(" available)","")
            
            # number of reviews
            number_of_reviews = detail_soup.find('th', string='Number of reviews')
            number_of_reviews = number_of_reviews.find_next('td').get_text(strip=True)
            
            # Ajouter les données du livre à la liste
            book_data.append({
                'Title': title,
                'Book Link': book_url,
                'Image Link': img_url,
                'Category': category,
                'Description': description,
                'UPC': upc,
                'Product Type': product_type,
                'Price £ (excl. tax)': price_excl_tax,
                'Price £ (incl. tax)': price_incl_tax,
                'Tax': tax,
                'Stock available' : stock_available,
                'Number of available': number_available,
                'Number of reviews': number_of_reviews
            })

    # Creation of the DataFrame and the CSV file for the specified number of pages
    data = pd.DataFrame(book_data)
    data.to_csv(filename.format(num_pages=num_pages), index = False, encoding = 'utf-8')

# Demander à l'utilisateur de saisir le nombre de pages
num_pages = int(input("Entrez le nombre de pages à extraire : "))

# Appeler la fonction pour exécuter le scraping et enregistrer dans le CSV
parsePages(num_pages)

# La base de données sur le nombre de page sélectionné
books = pd.read_csv(f"books_to_scrape_{num_pages}_pages.csv")
print(f"Books to scrape for {num_pages} page(s)")
books.head(3)

Entrez le nombre de pages à extraire :  1


Books to scrape for 1 page(s)


Unnamed: 0,Title,Book Link,Image Link,Category,Description,UPC,Product Type,Price £ (excl. tax),Price £ (incl. tax),Tax,Stock available,Number of available,Number of reviews
0,A Light in the Attic,https://books.toscrape.com/catalogue/a-light-i...,https://books.toscrape.com../media/cache/2c/da...,Poetry,It's hard to imagine a world without A Light i...,a897fe39b1053632,Books,51.77,51.77,0.0,In stock,22,0
1,Tipping the Velvet,https://books.toscrape.com/catalogue/tipping-t...,https://books.toscrape.com../media/cache/26/0c...,Historical Fiction,"""Erotic and absorbing...Written with starling ...",90fa61229261140a,Books,53.74,53.74,0.0,In stock,20,0
2,Soumission,https://books.toscrape.com/catalogue/soumissio...,https://books.toscrape.com../media/cache/3e/ef...,Fiction,"Dans une France assez proche de la nÃ´tre, un ...",6957f44c3847a760,Books,50.1,50.1,0.0,In stock,20,0


### Visualisation de l'ensemble des livres du site 'Books to scrape', soit des 50 pages

In [3]:
books_50_pages = pd.read_csv("books_to_scrape_50_pages.csv")
books_50_pages.head(5)

Unnamed: 0,Title,Book Link,Image Link,Category,Description,UPC,Product Type,Price £ (excl. tax),Price £ (incl. tax),Tax,Stock available,Number of available,Number of reviews
0,A Light in the Attic,https://books.toscrape.com/catalogue/a-light-i...,https://books.toscrape.com../media/cache/2c/da...,Poetry,It's hard to imagine a world without A Light i...,a897fe39b1053632,Books,51.77,51.77,0.0,In stock,22,0
1,Tipping the Velvet,https://books.toscrape.com/catalogue/tipping-t...,https://books.toscrape.com../media/cache/26/0c...,Historical Fiction,"""Erotic and absorbing...Written with starling ...",90fa61229261140a,Books,53.74,53.74,0.0,In stock,20,0
2,Soumission,https://books.toscrape.com/catalogue/soumissio...,https://books.toscrape.com../media/cache/3e/ef...,Fiction,"Dans une France assez proche de la nÃ´tre, un ...",6957f44c3847a760,Books,50.1,50.1,0.0,In stock,20,0
3,Sharp Objects,https://books.toscrape.com/catalogue/sharp-obj...,https://books.toscrape.com../media/cache/32/51...,Mystery,"WICKED above her hipbone, GIRL across her hear...",e00eb4fd7b871a48,Books,47.82,47.82,0.0,In stock,20,0
4,Sapiens: A Brief History of Humankind,https://books.toscrape.com/catalogue/sapiens-a...,https://books.toscrape.com../media/cache/be/a5...,History,From a renowned historian comes a groundbreaki...,4165285e1663650f,Books,54.23,54.23,0.0,In stock,20,0


In [4]:
books_50_pages.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 13 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Title                1000 non-null   object 
 1   Book Link            1000 non-null   object 
 2   Image Link           1000 non-null   object 
 3   Category             1000 non-null   object 
 4   Description          998 non-null    object 
 5   UPC                  1000 non-null   object 
 6   Product Type         1000 non-null   object 
 7   Price £ (excl. tax)  1000 non-null   float64
 8   Price £ (incl. tax)  1000 non-null   float64
 9   Tax                  1000 non-null   float64
 10  Stock available      1000 non-null   object 
 11  Number of available  1000 non-null   int64  
 12  Number of reviews    1000 non-null   int64  
dtypes: float64(3), int64(2), object(8)
memory usage: 101.7+ KB


Il y a au total 1000 livres

In [5]:
# Nombre de livres total par catégorie
books_50_pages["Category"].value_counts()

Category
Default               152
Nonfiction            110
Sequential Art         75
Add a comment          67
Fiction                65
Young Adult            54
Fantasy                48
Romance                35
Mystery                32
Food and Drink         30
Childrens              29
Historical Fiction     26
Poetry                 19
Classics               19
History                18
Horror                 17
Womens Fiction         17
Science Fiction        16
Science                14
Music                  13
Business               12
Thriller               11
Travel                 11
Philosophy             11
Humor                  10
Autobiography           9
Art                     8
Psychology              7
Religion                7
Spirituality            6
Christian Fiction       6
New Adult               6
Sports and Games        5
Biography               5
Self Help               5
Health                  4
Christian               3
Politics                3
Con

Nous pouvons observer le nombre de livres au total par catégories

## Scraping sur 5 catégories

In [11]:
url = "https://books.toscrape.com/index.html"
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')

# Trouver toutes les catégories
categories = soup.select('ul.nav-list > li > ul > li > a')
categories_list = [category.text.strip() for category in categories]

category_select = []

for i in range(5):
    while True:
        user_input = input(f"Entrez le nom de la catégorie {i + 1} à extraire : ")
        if user_input in categories_list and user_input not in category_select:
            category_select.append(user_input)
            break
        else:
            print("La catégorie n'est pas valide ou a déjà été choisie. Veuillez réessayer.")

book_data = []

for category_name in category_select:
    index = categories_list.index(category_name)
    category_link = urljoin(url, categories[index]['href'])
    
    page_number = 1
    while True:
        if page_number == 1:
            category_website = requests.get(category_link)
        else:
            category_website = requests.get(urljoin(category_link, f"page-{page_number}.html"))
        

        soup = BeautifulSoup(category_website.text, 'html.parser')
        product_containers = soup.find_all('article', class_='product_pod')

        # Récupérer des informations dans le code HTML de chaque page
        for container in product_containers:
            
            # Titre du livre
            title = container.find('h3').find('a')['title']

            # Stocks
            stock_available = container.find('p', class_='instock availability').text.replace('\n', '')
           
            # Lien du livre
            book_url = container.find('h3').find('a')['href']
            book_url = f'https://books.toscrape.com/catalogue/{book_url}'
            book_url = book_url.replace('../','')

            # Lien de l'image
            img_url = container.find('img')['src']
            img_url = f'https://books.toscrape.com{img_url}'
            img_url = img_url.replace('../','')

            # Nombre d'étoiles
            star_rating_class = container.find('p', class_='star-rating')['class'][1]
            star_rating = {'One': 1, 'Two': 2, 'Three': 3, 'Four': 4, 'Five': 5}.get(star_rating_class, 0)

            # Accéder à la page du détail du livre 
            book_detail_page = requests.get(book_url)
            detail_soup = BeautifulSoup(book_detail_page.text, 'html.parser')

            # Catégorie du livre
            category = category_name

            # Description
            description = detail_soup.find('meta', {'name': 'description'})['content']
            description = description.replace('\n    ', '').replace('\n', '')
            
            # UPC
            upc = detail_soup.find('th', string='UPC')
            upc = upc.find_next('td').get_text(strip=True)
        
            # Type de produit
            product_type = detail_soup.find('th', string='Product Type')
            product_type = product_type.find_next('td').get_text(strip=True) 
        
            # Prix sans la taxe
            price_excl_tax = detail_soup.find('th', string='Price (excl. tax)')
            price_excl_tax = price_excl_tax.find_next('td').get_text(strip=True).replace('Â£', '') 
        
            # Prix avec la taxe
            price_incl_tax = detail_soup.find('th', string='Price (incl. tax)')
            price_incl_tax = price_incl_tax.find_next('td').get_text(strip=True).replace('Â£', '')
        
            # Prix de la taxe
            tax = detail_soup.find('th', string='Tax')
            tax = tax.find_next('td').get_text(strip=True).replace('Â£', '')

            # Nombre de livre disponible
            number_available = detail_soup.find_all('p', class_='instock availability')
            number_available = number_available[0].get_text(strip=True).replace("In stock (", "").replace(" available)","")
            
            # Nombre de revue
            number_of_reviews = detail_soup.find('th', string='Number of reviews')
            number_of_reviews = number_of_reviews.find_next('td').get_text(strip=True)
            
            # Ajouter les données du livre à la liste
            book_data.append({
                'Title': title,
                'Book Link': book_url,
                'Image Link': img_url,
                'Description': description,
                'UPC': upc,
                'Category' : category,
                'Product Type': product_type,
                'Price £ (excl. tax)': price_excl_tax,
                'Price £ (incl. tax)': price_incl_tax,
                'Tax': tax,
                'Stock available' : stock_available,
                'Number of available': number_available,
                'Number of reviews': number_of_reviews
            })

            page_number += 1
        else:
            break  # la boucle s'arrête quand on atteint le nombre de page possible

# Création du dataframe avec les 5 catégories
data_category = pd.DataFrame(book_data)
data_category.to_csv('books_data_category.csv', index = False)

Entrez le nom de la catégorie 1 à extraire :  Fiction
Entrez le nom de la catégorie 2 à extraire :  Horror
Entrez le nom de la catégorie 3 à extraire :  Thriller
Entrez le nom de la catégorie 4 à extraire :  Art
Entrez le nom de la catégorie 5 à extraire :  Business


In [7]:
# La base de données sur le nombre de page sélectionné
books_category = pd.read_csv('books_data_category.csv')
books_category.head(3)

Unnamed: 0,Title,Book Link,Image Link,Description,UPC,Category,Product Type,Price £ (excl. tax),Price £ (incl. tax),Tax,Stock available,Number of available,Number of reviews
0,A Light in the Attic,https://books.toscrape.com/catalogue/a-light-i...,https://books.toscrape.commedia/cache/2c/da/2c...,It's hard to imagine a world without A Light i...,a897fe39b1053632,Poetry,Books,51.77,51.77,0.0,In stock,22,0
1,The Black Maria,https://books.toscrape.com/catalogue/the-black...,https://books.toscrape.commedia/cache/58/46/58...,"Praise for Aracelis Girmay:""[Girmay's] every l...",1dfe412b8ac00530,Poetry,Books,52.15,52.15,0.0,In stock,19,0
2,Shakespeare's Sonnets,https://books.toscrape.com/catalogue/shakespea...,https://books.toscrape.commedia/cache/10/48/10...,This book is an important and complete collect...,30a7f60cd76ca58c,Poetry,Books,20.66,20.66,0.0,In stock,19,0


In [8]:
data_category['Category'].value_counts()

Category
Fiction    65
Fantasy    48
Mystery    32
Poetry     19
Horror     17
Name: count, dtype: int64