# [Python Project] Part 1 - Web Scraping

## 1. Load the libraries

In [1]:
# -*- coding: utf-8 -*-

from bs4 import BeautifulSoup
import requests 

import sys
import warnings
import time
import ssl
import random
import re
import pandas as pd

In [2]:
## Browser-like User-Agent header passed along with the requests made further down
headers = {'User-Agent' : 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36'}

## One HTTP session shared by every request in this notebook
session = requests.Session()

## SSL context with hostname checking and certificate verification switched off.
## check_hostname is assigned first (left to right), before verify_mode is relaxed.
## NOTE(review): in the visible cells `ctx` is never handed to `session` -- confirm it is used elsewhere.
ctx = ssl.create_default_context()
ctx.check_hostname, ctx.verify_mode = False, ssl.CERT_NONE

## Wait a random 2 or 3 seconds so the traffic looks less like a robot.
## This pause runs once, when this cell is executed.
rand_value = random.randint(2, 3)
time.sleep(rand_value)

## 2. Get urls of all the pages in Littérature Française in poche format

In [3]:
## Search-result URL of the category; both placeholders receive the page number.
url_template = "https://www.amazon.fr/s/ref=sr_pg_{0}?fst=as%3Aoff&rh=n%3A301061%2Cn%3A%21301130%2Cn%3A301132%2Cn%3A302038%2Cp_n_binding_browse-bin%3A492481011&page={1}&bbn=302038&ie=UTF8&qid=1540392876"

## Only page 7 is generated here; use range(1, 77) to cover pages 1-76.
pages = [url_template.format(page_no, page_no) for page_no in range(7, 8)]

print(pages)


['https://www.amazon.fr/s/ref=sr_pg_7?fst=as%3Aoff&rh=n%3A301061%2Cn%3A%21301130%2Cn%3A301132%2Cn%3A302038%2Cp_n_binding_browse-bin%3A492481011&page=7&bbn=302038&ie=UTF8&qid=1540392876']


## 3. Get urls of each product (book) from each page of the category

In [4]:
## Collect one canonical product URL (https://www.amazon.fr/dp/<ASIN>) per book link
## found on each category page.
asin_pattern = re.compile(r"(?<=/dp/)(\w{10})")  # the 10 word characters after "/dp/" (the ASIN, Amazon Standard Identification Number)
books_url = []

for page in pages:
    # Sending a browser User-Agent helps the request be treated as coming from a real user.
    resp = session.get(page, headers=headers).content
    html_amazon = BeautifulSoup(resp, "html.parser")
    # `attrs` must be a dict. The previous version passed a set literal
    # ({"class", "..."}: comma instead of colon), which is not an attribute filter.
    books = html_amazon.find_all('a', attrs={"class": "a-link-normal s-access-detail-page s-color-twister-title-link a-text-normal"})

    for book in books:
        url = book.get('href')  # None when the <a> tag has no href
        asin = asin_pattern.search(url) if url else None
        if asin is None:
            # No "/dp/<ASIN>" segment in this link: skip it instead of crashing on .group().
            continue
        product_url = "https://www.amazon.fr/dp/" + asin.group(1)  # rebuild a short address from the ASIN
        books_url.append(product_url)

    # Pause a random 2-3 seconds between page requests so as not to be considered a robot.
    time.sleep(random.randint(2, 3))

print(books_url)

['https://www.amazon.fr/dp/2013949766', 'https://www.amazon.fr/dp/2266255126', 'https://www.amazon.fr/dp/2266163744', 'https://www.amazon.fr/dp/2253108618', 'https://www.amazon.fr/dp/2350305368', 'https://www.amazon.fr/dp/2081412144', 'https://www.amazon.fr/dp/2253152846', 'https://www.amazon.fr/dp/2253004227', 'https://www.amazon.fr/dp/2253095060', 'https://www.amazon.fr/dp/225310907X', 'https://www.amazon.fr/dp/2266276298', 'https://www.amazon.fr/dp/226627628X', 'https://www.amazon.fr/dp/2266226061', 'https://www.amazon.fr/dp/2266275143', 'https://www.amazon.fr/dp/2070410854', 'https://www.amazon.fr/dp/2070368106']


## 4. Get the product details from each product page

In [5]:
## Regular expressions (and marker words) used to pull individual fields out of the product page text

page_pattern = re.compile(r'(?<=: )(\d+)')  # run of digits right after ": " -> number of pages

edition = 'Édition'  # marker word: when present in the editor line, editor_pattern1 is used
editor_pattern1 = re.compile(r'(?<=: ).+(?=;)')  # editor name: text after the first ": " up to the last ";"
editor_pattern2 = re.compile(r'(?<=: ).+(?= [(].*[)]$)')  # editor name: text after the first ": " up to a trailing " (...)"

collect = 'Collection'  # marker word: the line is parsed as a collection only when it contains it
collection_pattern = re.compile(r'(?<=: ).+')  # collection name: everything after the first ": "

# Inside a character class '|' is a literal character, so the former '[.|,]' also
# accepted inputs such as "4|5". Only '.' or ',' is allowed as the decimal separator.
stars_pattern = re.compile(r'^\d[.,]?\d*')  # leading number of stars, e.g. "4,5" or "4.5"
comments_pattern = re.compile(r'^.*(?= commentaire)')  # everything before " commentaire" -> number of comments

price_pattern = re.compile(r'EUR (\d*,\d*)$')  # trailing "EUR 7,40" -> "7,40"

In [6]:
## One independent, initially empty list per scraped field.
## Each product visited by the scraping loop adds its values to these lists.
(book_names, author_names, page_numbers,
 editor_names, publication_dates, collections,
 prices_new, stars_counts, comments_counts) = ([] for _ in range(9))

In [7]:
## Labels that identify rows of the product-detail list. The alternation is grouped
## so that '^' anchors both "Poche" and "Broché" to the beginning of the string
## (in r'^Poche|Broché' the anchor applied to "Poche" only).
binding_row_pattern = re.compile(r'^(?:Poche|Broché)')
rating_row_pattern = re.compile(r'^Moyenne')


def _search_group(pattern, text, group=0):
    """Return `group` of the first match of `pattern` in `text`, or None when nothing matches."""
    found = pattern.search(text)
    return found.group(group) if found else None


def _parse_title(html_product):
    """Return the stripped title from the productTitle span, else from ebooksProductTitle, else None."""
    for span_id in ('productTitle', 'ebooksProductTitle'):
        title_tag = html_product.find('span', attrs={"id": span_id})
        if title_tag is not None:
            return title_tag.text.strip()
    return None


def _parse_author(html_product):
    """Return the stripped author name from the contributor link or the author span, else None."""
    author_link = html_product.find('a', class_="a-link-normal contributorNameID")
    if author_link is None:
        author_span = html_product.find('span', class_="author notFaded")
        if author_span is not None:
            author_link = author_span.find('a')
    return author_link.text.strip() if author_link is not None else None


def _detail_rows(html_product):
    """Return the <li> rows of the second <div class="content">, or [] when that div is absent."""
    content_divs = html_product.find_all('div', class_='content')
    if len(content_divs) < 2:
        return []
    return content_divs[1].find_all('li')


def _parse_rating(row):
    """Return (stars, comments) read from a "Moyenne" row.

    (0, 0) is returned when the row carries no star icon (no stars and no comments);
    a value that is present but cannot be parsed is returned as None.
    """
    stars_tag = row.find('span', attrs={'class': 'a-icon-alt'})
    if stars_tag is None:
        return 0, 0

    stars_text = _search_group(stars_pattern, stars_tag.text.strip())
    # Decimal comma -> decimal point so that float() accepts the value.
    stars_count = float(stars_text.replace(',', '.')) if stars_text else None

    comments_tag = row.find('span', attrs={'class': 'a-size-small'})
    comments_text = None
    if comments_tag is not None:
        comments_text = _search_group(comments_pattern, comments_tag.text.strip())
    # Drop thousands separators ('.', ',', spaces) before converting to int.
    digits = re.sub(r'\D', '', comments_text or '')
    comments_count = int(digits) if digits else None
    return stars_count, comments_count


def _parse_detail_rows(rows):
    """Scan the detail rows and return (page_number, stars_count, comments_count).

    Defaults are None for the page number and 0 / 0 for stars / comments, so a value
    is always produced even when the corresponding row is missing. Only the first
    binding row and the first rating row are used.
    """
    page_number = None
    stars_count, comments_count = 0, 0
    binding_seen = rating_seen = False

    for row in rows:
        if row.find_all(string=binding_row_pattern):
            if not binding_seen:
                # Stays None when the row holds no page count.
                page_number = _search_group(page_pattern, row.get_text(), 1)
                binding_seen = True
        elif row.find_all(string=rating_row_pattern):
            if not rating_seen:
                stars_count, comments_count = _parse_rating(row)
                rating_seen = True

    return page_number, stars_count, comments_count


def _parse_editor(rows):
    """Return the editor name taken from the second detail row, or None."""
    if len(rows) < 2:
        return None
    editor = rows[1].text.strip()
    editor_name = _search_group(editor_pattern1, editor) if edition in editor else None
    if editor_name is None:
        editor_name = _search_group(editor_pattern2, editor)
    return editor_name


def _parse_collection(rows):
    """Return the collection name taken from the third detail row, or None."""
    if len(rows) < 3:
        return None
    collection = rows[2].text.strip()
    return _search_group(collection_pattern, collection) if collect in collection else None


def _parse_publication_date(html_product):
    """Return the text of the last matching subtitle span without the '– ' prefix, or None."""
    date_spans = html_product.find_all('span', class_='a-size-medium a-color-secondary a-text-normal')
    return date_spans[-1].text.replace('– ', '') if date_spans else None


def _parse_price(html_product):
    """Return the price as a float, or None when it is absent or unreadable."""
    price_tag = html_product.find('span', class_="a-size-base a-color-price a-color-price")
    if price_tag is None:
        return None
    price_text = _search_group(price_pattern, price_tag.text.strip(), 1)
    if not price_text:
        return None
    try:
        # ',' -> '.' so the value is a valid float and is not mistaken for the CSV delimiter.
        return float(price_text.replace(',', '.'))
    except ValueError:
        return None


## Visit every product page and append exactly one value to each result list,
## so that all lists keep the same length and row i always describes book i.
for book in books_url:
    webpage = session.get(book, headers=headers).content
    html_product = BeautifulSoup(webpage, "html.parser")

    detail_rows = _detail_rows(html_product)  # looked up once, shared by several fields
    page_number, stars_count, comments_count = _parse_detail_rows(detail_rows)

    book_names.append(_parse_title(html_product))
    author_names.append(_parse_author(html_product))
    page_numbers.append(page_number)
    stars_counts.append(stars_count)
    comments_counts.append(comments_count)
    editor_names.append(_parse_editor(detail_rows))
    collections.append(_parse_collection(detail_rows))
    publication_dates.append(_parse_publication_date(html_product))
    prices_new.append(_parse_price(html_product))

    # Pause a random 2-3 seconds between product requests so as not to be considered a robot.
    time.sleep(random.randint(2, 3))

In [8]:
## Print the length of every result list, one per line.
## All numbers must be identical, otherwise the lists cannot form the rows of one table.
for scraped_values in (book_names, author_names, page_numbers, editor_names, collections,
                       stars_counts, comments_counts, publication_dates, prices_new):
    print(len(scraped_values))

16
16
16
16
16
16
16
16
16


## 5. Make a dataframe with the acquired information

In [9]:
## Column name -> list of scraped values, in the order the columns appear in the table
columns = dict(
    book_names=book_names,
    author_names=author_names,
    editor_names=editor_names,
    collections=collections,
    publication_dates=publication_dates,
    page_numbers=page_numbers,
    stars_counts=stars_counts,
    comments_counts=comments_counts,
    prices_new=prices_new,
)

## One row per book
df = pd.DataFrame(columns)
print(df)


                                           book_names  \
0     Bibliocollège - Nouvelles réalistes, Maupassant   
1               Entre mes mains le bonheur se faufile   
2                                Bel-Ami à 1,99 euros   
3   La mort du roi Tsongor - Prix Goncourt des Lyc...   
4   Marivaux : La Dispute ; La Fausse suivante ; L...   
5                                Le Mariage de Figaro   
6                              Métaphysique des tubes   
7                                            Germinal   
8                               Et tu n'es pas revenu   
9                           Le ventre de l'Atlantique   
10                                  L'Instant présent   
11                                       Central Park   
12                                    Marie d'en haut   
13                               La Fille de Brooklyn   
14                                     Pierre et Jean   
15                     Le Ravissement de Lol V. Stein   

                             a

## 6. Write a csv file with the dataframe

In [10]:
## Save the table as a comma-separated file encoded as UTF-8 with a BOM ('utf-8-sig'),
## leaving out the DataFrame index column.
df.to_csv(
    'amazon_data_page7.csv',
    index=False,
    encoding='utf-8-sig',
    sep=',',
)
