# Projet 4

In [None]:
import requests
from bs4 import BeautifulSoup

In [None]:
# Sample pricing pages that get_prices_from_url() is expected to parse.
URL_PAGE2 = "https://kim.fspot.org/cours/page2.html"
URL_PAGE3 = "https://kim.fspot.org/cours/page3.html"

# 1) Ecrire une fonction get_prices_from_url() qui extrait des informations à partir des 2 pages ci-dessus.
# Exemple get_prices_from_url(URL_PAGE2) doit retourner :
# {'Personal': {'price': '$5', 'storage': '1GB', 'databases': 1},
#  'Small Business': {'price': '$25', 'storage': '10GB', 'databases': 5},
#  'Enterprise': {'price': '$45', 'storage': '100GB', 'databases': 25}}

def get_soup_from_url(url, timeout=30):
    """Download *url* and return its content parsed as a BeautifulSoup tree.

    Args:
        url: Address of the HTML page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A ``BeautifulSoup`` object built with the ``html.parser`` backend.

    Raises:
        requests.RequestException: On connection problems, timeouts, or an
            HTTP error status (4xx/5xx).
    """
    # Without a timeout, requests may block indefinitely on a stalled server.
    res = requests.get(url, timeout=timeout)
    # Fail loudly rather than parsing an error page as if it were the content.
    res.raise_for_status()
    return BeautifulSoup(res.content, 'html.parser')

def get_prices_from_url(url):
    """Scrape every pricing table found on the page at *url*.

    Returns:
        A dict keyed by product name (the text of the table's ``h2``). Each
        value is a dict with:
          - 'price': first whitespace-separated token of the price cell,
          - 'storage': first token of the 4th feature list item,
          - 'databases': first token of the 5th feature list item, as an int.
    """
    page = get_soup_from_url(url)
    result = {}

    for table in page.find_all(class_="pricing-table"):
        title = table.find("h2").text

        price_cell = table.find(class_="pricing-table-price")
        amount = price_cell.text.strip().split()[0]

        # Items at index 3 and 4 of the feature list carry the storage size
        # and the database count respectively.
        features = table.select(".pricing-table-list li")
        storage_item, db_item = features[3:5]
        storage_size = storage_item.text.split()[0]
        db_count = int(db_item.text.split()[0])

        result[title] = {
            'price': amount,
            'storage': storage_size,
            'databases': db_count,
        }

    return result

In [None]:
# 2) Ecrire une fonction qui extrait des informations sur une bière de Beerwulf
# Exemple URL: https://www.beerwulf.com/fr-fr/p/bieres/melusine-bio.33
# Doit retourner: {'name': 'Mélusine Bio', 'note': 70, 'price': 38.99, 'volume': 33}

def extract_beer_infos(url):
    """Scrape one beer product page and return its main characteristics.

    Example url: https://www.beerwulf.com/fr-fr/p/bieres/melusine-bio.33

    Args:
        url: Address of the product page.

    Returns:
        A dict with the keys:
          - 'name': text of the page's first ``h1``,
          - 'note': int read from the ``data-percent`` attribute of the
            ``div.stars`` element,
          - 'price': float parsed from the first ``span.price`` element,
          - 'volume': int parsed from the element following the
            ``dt`` labelled "Contenu".
    """
    soup = get_soup_from_url(url)

    # Extract name:
    name = soup.find("h1").text

    # Extract evaluation:
    note = soup.find('div', class_='stars')
    note = int(note.attrs['data-percent'])

    # Extract price:
    price = soup.select('span.price')[0].text
    # Drop the trailing two characters (separator + currency sign) and switch
    # to a decimal point: "2,29 €" => 2.29
    price = float(price[:-2].replace(',', '.', 1))

    # Extract volume:
    # `string=` is the current name of the argument formerly called `text=`,
    # which Beautiful Soup has deprecated.
    volume = soup.find('dt', string='Contenu').find_next_sibling()
    volume = int(volume.text[:-3])  # "33 cl" => 33

    infos = {
        'name': name,  # h1, text
        'note': note,  # div, class: stars
        'price': price,  # span, class: price
        'volume': volume,  # dt, text: Contenu
    }
    return infos

In [None]:
# This URL returns a JSON document containing a list of beers.
URL_BEERLIST_FRANCE = "https://www.beerwulf.com/fr-FR/api/search/searchProducts?country=France&container=Bouteille"

# 3) Ecrire une fonction qui prend l'argument "url" et retourne les informations sur une liste de bières via l'API de Beerwulf.
# Cette fonction doit retourner la liste des informations obtenues par la fonction extract_beer_infos() définie ci-dessus.
# Chercher comment optimiser cette fonction en utilisant multiprocessing.Pool pour paralléliser les accès web.
#
# Exemple de retour :
# [{'name': 'Gallia East IPA', 'note': 80, 'price': 42.99, 'volume': 33},
# {'name': 'La Lager Sans Gluten de Vézelay',   'note': 60,   'price': 38.99,   'volume': 25},
# {'name': 'Brasserie De Sutter Brin de Folie',  'note': 70,  'price': 44.99,  'volume': 33},
# {'name': 'La Cristal IPA du Mont Blanc',  'note': 70,  'price': 44.99,  'volume': 33},
# {'name': 'Mélusine Bio', 'note': 70, 'price': 38.99, 'volume': 33},
# {'name': 'La Parisienne Le Titi Parisien',  'note': 70,  'price': 38.99,  'volume': 33},
# {'name': 'Gallia Session IPA', 'note': 70, 'price': 42.99, 'volume': 33},
# {'name': 'Ninkasi Brut IPA', 'note': 70, 'price': 44.99, 'volume': 33},
# {'name': 'Pietra', 'note': 60, 'price': 38.99, 'volume': 33},
# {'name': 'Desperados', 'note': 60, 'price': 35.99, 'volume': 33},
# {'name': 'Gallia West IPA', 'note': 70, 'price': 42.99, 'volume': 33}]
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

def extract_beer_list_infos(url):
    """Return the details of every beer listed by the search API at *url*.

    The API response is expected to be a JSON object whose 'items' entries
    each carry a 'contentReference' path; every such path is turned into a
    product page URL and scraped with extract_beer_infos().

    Args:
        url: Address of the JSON search endpoint.

    Returns:
        A list of dicts as produced by extract_beer_infos(), in the same
        order as the items returned by the API.

    Raises:
        requests.RequestException: If the API call fails, times out, or
            answers with an HTTP error status.
    """
    # ThreadPool exposes the same API as multiprocessing.Pool but runs the
    # workers as threads. The work here is network-bound, so threads overlap
    # the waits just as well, and nothing has to be pickled or imported from
    # a separate module (which process pools require on Windows/Jupyter).
    from multiprocessing.pool import ThreadPool

    res = requests.get(url, timeout=30)
    res.raise_for_status()
    items = res.json()['items']
    beer_pages = ['https://www.beerwulf.com' + item['contentReference'] for item in items]

    # A pool cannot be created with zero workers; nothing to fetch anyway.
    if not beer_pages:
        return []

    # One thread per page, capped to keep the number of simultaneous
    # connections reasonable.
    with ThreadPool(processes=min(32, len(beer_pages))) as pool:
        beers = pool.map(extract_beer_infos, beer_pages)

    return beers

In [None]:
import unittest

class Lesson2Tests(unittest.TestCase):
    """Checks of the scraping helpers against the pricing pages and beer API."""

    def test_01_get_prices_from_url_page2(self):
        plans = get_prices_from_url(URL_PAGE2)
        # Page 2 is expected to list exactly three products:
        self.assertIsInstance(plans, dict)
        self.assertEqual(len(plans), 3)
        for plan_name in ('Personal', 'Small Business', 'Enterprise'):
            self.assertIn(plan_name, plans)

        # Inspect one product in detail: keys first, then values.
        entry = plans['Personal']
        for field in ('price', 'storage', 'databases'):
            self.assertIn(field, entry)
        self.assertEqual(entry['price'], '$5')
        self.assertEqual(entry['storage'], '1GB')
        self.assertEqual(entry['databases'], 1)

    def test_02_get_prices_from_url_page3(self):
        plans = get_prices_from_url(URL_PAGE3)
        # Page 3 is expected to list four products, one named 'Privilege'.
        self.assertIsInstance(plans, dict)
        self.assertEqual(len(plans), 4)
        expected_privilege = {'databases': 100, 'price': '$99', 'storage': '1TB'}
        self.assertEqual(plans['Privilege'], expected_privilege)

    def test_03_extract_beer_list_infos(self):
        beers = extract_beer_list_infos(URL_BEERLIST_FRANCE)
        # The France listing is expected to contain 11 beers:
        self.assertIsInstance(beers, list)
        self.assertEqual(len(beers), 11)
        # Every bottle must be either 25 cl or 33 cl:
        allowed_volumes = [25, 33]
        for entry in beers:
            self.assertIn(entry['volume'], allowed_volumes)

            
def run_tests():
    """Run every test method of Lesson2Tests with a verbose text runner."""
    # unittest.makeSuite() was deprecated in Python 3.11 and removed in 3.13;
    # TestLoader.loadTestsFromTestCase() is the supported equivalent.
    test_suite = unittest.defaultTestLoader.loadTestsFromTestCase(Lesson2Tests)
    runner = unittest.TextTestRunner(verbosity=2)
    runner.run(test_suite)

In [None]:
# Launch the test suite only when this code is the program entry point.
if __name__ == '__main__':
    run_tests()