# Web Scraping

## Importing libraries

In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

## Overview

|	Produktname	|	Marke	|	Preis	|	Gewicht	|	Brennwert	|	Fett	|	gesättigte Fettsäuren	|	Kohlehydrate	|	Zucker	|	Ballaststoffe	|	Eiweiß	|	Salz	|	Zutaten	|	Fotolink	|
|	:----------	|	:----------	|	:---------:	|	:---------:	|	:---------:	|	:---------:	|	:---------:	|	:---------:	|	:---------:	|	:---------:	|	:---------:	|	:---------:	|	:---------:	|	:---------:	|
|	product name	|	brand	|	price	|	weight	|		|	fat	|	saturated fat	|	carbs	|	sugar	|		|	protein	|	salt	|	ingredients	|	picture link	|
|	e.g. Snack bar	|		|	in EUR	|	in gr/kg	|	in kj/kcal	|	in gr	|	in gr	|	in gr	|	in gr	|	in gr	|	in gr	|	in gr	|		|		|

**List of websites:**
- Koro: https://www.korodrogerie.de/snacks/?p=1&o=2&n=144&f=233
- Vantastic foods: https://www.vantastic-foods.com/vegane-lebensmittel/snacks-und-suesswaren
- Kokku: https://kokku-online.de/vegane-suessigkeiten-snacks/
- Foodist: https://www.foodist.de/suesses-snacks?p=1&o=9&n=84&f=36

## Web Scraping

### Foodist

#### Getting product links

In [5]:
# storing url in a variable
url_1 = "https://www.foodist.de/suesses-snacks?p=1&o=9&n=84&f=36"

# downloading html-code with a get request
response_1 = requests.get(url_1)

response_1.status_code

200

In [6]:
# parsing html (creating the 'soup')
soup_1 = BeautifulSoup(response_1.content, "html.parser")


In [9]:
# retrieving/extracting the desired info

# general product info
product_info = soup_1.find_all("div", class_="product--title")

# product title
product_info[14]

<div class="product--title"> Fruchtpapier Apfel &amp; Erdbeere </div>

In [35]:
# general product info
product_info = soup_1.find_all("div", class_="price--unit")

# product price
product_info[0].getText()

'  3,36\xa0€ / 100 g  '

In [41]:
# general product info
product_info = soup_1.find_all("div", class_="product--box box--basic")
product_info_2 = soup_1.find_all("a", class_="product--link")

# product link
#product_info[0].find_all("a", class_="product--link") #.get("href")
product_info_2[0].get("href")

'https://www.foodist.de/foodist-bier-braukasten-set-pils-zum-selbermachen-diy-kit-7685'

In [45]:
def get_first_details(soup):
    
    link = soup.find_all("a", class_="product--link")
    name = soup.find_all("div", class_="product--title")
    brand = soup.find_all("div", class_="product--supplier")
    price = soup.find_all("div", class_="product--price")
    price_gr = soup.find_all("div", class_="price--unit")
    
    names = []
    brands = []
    prices = []
    prices_gr = []
    links = []
    
    
    for p in name:
        name = p.getText()
        names.append(name)
        
    for p in brand:
        brand = p.getText()
        brands.append(brand)
        
    for p in price:
        price = p.getText()
        prices.append(price)
    
    for p in price_gr:
        price = p.getText()
        prices_gr.append(price)
    
    for p in link:
        link = p.get("href")
        links.append(link)
    
    return [names, brands, prices, prices_gr, links]

In [46]:
url = "https://www.foodist.de/suesses-snacks?p=1&o=9&n=84&f=36"
response = requests.get(url)
soup = BeautifulSoup(response.content, "html.parser")

products = get_first_details(soup)

In [47]:
len(products[0]), len(products[1]), len(products[2]), len(products[3]), len(products[4])

(91, 91, 84, 84, 91)

In [53]:
def scraping_pages(page):
    
    names = []
    brands = []
    links = []
    
    for p in page:
        url = "https://www.foodist.de/suesses-snacks?p={p}&o=9&n=84&f=36"
        response = requests.get(url)
        soup = BeautifulSoup(response.content, "html.parser")

        link = soup.find_all("a", class_="product--link")
        name = soup.find_all("div", class_="product--title")
        brand = soup.find_all("div", class_="product--supplier")
        
        for p in name:
            name = p.getText()
            names.append(name)
        
        for p in brand:
            brand = p.getText()
            brands.append(brand)
    
        for p in link:
            link = p.get("href")
            links.append(link)


    return [names, brands, links]

In [54]:
products = scraping_pages(page=range(1,7))

In [55]:
len(products[0]), len(products[1]), len(products[2])

(546, 546, 546)

In [57]:
# Building a dataframe

product_overview = pd.DataFrame(
    {"product": products[0],
     "brand": products[1],
     "links": products[2]})

In [51]:
pd.set_option('display.max_rows', None)

In [59]:
product_overview.sample(10)

Unnamed: 0,product,brand,links
411,BIO Fruchtgummi Coala Weihnachtsbärchen,mind sweets,https://www.foodist.de/mind-sweets-bio-fruchtg...
492,Gefriergetrocknete Erdbeeren in Zartbittersch...,Foodist,https://www.foodist.de/foodist-gefriergetrockn...
538,BIO Fruchtgummi mit saurem Himbeer- und Grape...,Not Guilty,https://www.foodist.de/not-guilty-bio-fruchtgu...
434,Energy Balls mit Vanille und Cashew,N.A! Nature Addicts,https://www.foodist.de/n.a-nature-addicts-ener...
522,Veganer Adventskalender 2021,Foodist,https://www.foodist.de/veganer-adventskalender
178,Erbsen Flips mit Tomaten- und Basilikumgeschm...,N.A! Nature Addicts,https://www.foodist.de/n.a-nature-addicts-erbs...
313,Pea Chili Lime Snack,Vaya,https://www.foodist.de/vaya-pea-chili-lime-sna...
0,Bier-Braukasten-Set PILS zum Selbermachen DIY...,Foodist,https://www.foodist.de/foodist-bier-braukasten...
1,Feinschmecker Adventskalender 2021 von Thomas...,Foodist,https://www.foodist.de/foodist-feinschmecker-a...
480,"Bonbons ""Vanillezauber""",Delica,https://www.foodist.de/delica-bonbons-vanillez...


#### Getting all product infos

In [60]:
# storing url in a variable
url = "https://www.foodist.de/vista-portuguese-mandeln-mit-zimt-zucker-10854"

# downloading html-code with a get request
response = requests.get(url)
response.status_code

200

In [61]:
soup = BeautifulSoup(response.content, "html.parser")

In [63]:
# price
price = soup.find_all("div", class_="price--default")
price[0].getText()

'  7,90\xa0€ '

In [67]:
# weight
weight = soup.find_all("span", class_="product--unit")
weight[0].getText()

'  Inhalt:  125\xa0g '

In [69]:
# price/weight
price_gr = soup.find_all("span", class_="product--reference-unit-price")
price_gr[0].getText()

' 6,32\xa0€ / 100\xa0g '

In [87]:
# kcal
# nutrients
nutritients = soup.find_all("tbody")
nutritients[0].getText()

'  Brennwert 2.022 kJ(442 kcal)   Fett 26,0 g   davon gesättigte Fettsäuren 2,0 g   Kohlenhydrate 59,0 g   davon Zucker 52,0 g   Eiweiß 11,0 g   Salz 0,00 g  '

In [80]:
# photo
photo = soup.find_all("span", class_="image--media")
photo[0].canvas.get("data-image")

'https://foodist.imgix.net/media/image/5602132652727_0.jpg?auto=compress%2Cformat&h=60&q=20&w=60'

In [19]:
# price
price = soup.find_all("div", class_="price--default")

# weight
weight = soup.find_all("span", class_="product--unit")

# price/weight
price_gr = soup.find_all("span", class_="product--reference-unit-price")

# nutrients
nutritients = soup.find_all("td", class_="nutrition-value")

# ingredients
ingredients = soup.find_all("span", class_="base-info--label")

# photo
photo = soup.find_all("span", class_="image--media")

In [105]:
def scraping_details(links=[], skip_link=[]):
    
    lst_price = []
    lst_weight = []
    lst_price_gr = []
    lst_nutrients = []
    # lst_ingredients = []
    lst_photos = []

    for l in links:
        
        if l not in skip_link:
            
            try:

                url = l
                response = requests.get(url)

                soup = BeautifulSoup(response.content, "html.parser")
                
                
                # price
                price = soup.find_all("div", class_="price--default")

                # weight
                weight = soup.find_all("span", class_="product--unit")

                # price/weight
                price_gr = soup.find_all("span", class_="product--reference-unit-price")

                # nutrients
                nutritients = soup.find_all("td", class_="nutrition-value")

                # ingredients
                # ingredients = soup.find_all("span", class_="base-info--label")

                
                # getting text for all necessary features
                for i in [price, weight, price_gr]:
                    for j in range(len(i)):
                        i[j] = i[j].getText()

                
                # photo + getting photo link
                photo = soup.find_all("span", class_="image--media")

                photo_link = photo[0].canvas.get("data-image")


                # nutrients
                nutrients = soup.find_all("tbody")
                
                nutrient = nutrients[0].getText()


                lst_price.append(price[0])
                lst_weight.append(weight[0])
                lst_price_gr.append(price_gr[0])
                lst_nutrients.append(nutrient)
                lst_photos.append(photo_link)

            except IndexError: 

                print("error for: ", l)
        
        
    return [lst_price, lst_weight, lst_price_gr, lst_nutrients, lst_photos]


In [94]:
product_1 = scraping_details(links=['https://www.foodist.de/vista-portuguese-mandeln-mit-zimt-zucker-10854'])
product_1

https://www.foodist.de/vista-portuguese-mandeln-mit-zimt-zucker-10854


[['  7,90\xa0€ '],
 ['  Inhalt:  125\xa0g '],
 [' 6,32\xa0€ / 100\xa0g '],
 ['  Brennwert 2.022 kJ(442 kcal)   Fett 26,0 g   davon gesättigte Fettsäuren 2,0 g   Kohlenhydrate 59,0 g   davon Zucker 52,0 g   Eiweiß 11,0 g   Salz 0,00 g  '],
 ['https://foodist.imgix.net/media/image/5602132652727_0.jpg?auto=compress%2Cformat&h=60&q=20&w=60']]

In [96]:
two_products = scraping_details(links=['https://www.foodist.de/n.a-nature-addicts-fruchtsticks-himbeere-10895', 
                                       'https://www.foodist.de/vista-portuguese-mandeln-mit-zimt-zucker-10854'])
two_products

https://www.foodist.de/n.a-nature-addicts-fruchtsticks-himbeere-10895
https://www.foodist.de/n.a-nature-addicts-fruchtsticks-himbeere-10895


[['  0,90\xa0€ ', '  0,90\xa0€ '],
 ['  Inhalt:  16\xa0g ', '  Inhalt:  16\xa0g '],
 [' 5,63\xa0€ / 100\xa0g ', ' 5,63\xa0€ / 100\xa0g '],
 ['  Brennwert 1.415 kJ(334 kcal)   Fett 1,2 g   davon gesättigte Fettsäuren 0,3 g   Kohlenhydrate 75,0 g   davon Zucker 69,0 g   Ballaststoffe 8,1 g   Eiweiß 18,0 g   Salz 0,10 g  ',
  '  Brennwert 1.415 kJ(334 kcal)   Fett 1,2 g   davon gesättigte Fettsäuren 0,3 g   Kohlenhydrate 75,0 g   davon Zucker 69,0 g   Ballaststoffe 8,1 g   Eiweiß 18,0 g   Salz 0,10 g  '],
 ['https://foodist.imgix.net/media/image/3609200010757_0.jpg?auto=compress%2Cformat&h=60&q=20&w=60',
  'https://foodist.imgix.net/media/image/3609200010757_0.jpg?auto=compress%2Cformat&h=60&q=20&w=60']]

In [114]:
test = scraping_details(links=['https://www.foodist.de/foodist-bier-braukasten-set-pils-zum-selbermachen-diy-kit-7685',
                  'https://www.foodist.de/vista-portuguese-mandeln-mit-zimt-zucker-10854', 
                   'https://www.foodist.de/n.a-nature-addicts-fruchtsticks-himbeere-10895'], 
            skip_link=['https://www.foodist.de/foodist-bier-braukasten-set-pils-zum-selbermachen-diy-kit-7685'])
test

TypeError: scraping_details() got an unexpected keyword argument 'skip_link'

In [111]:
def creating_df(links=[], skip_link=[]):
    
    result = scraping_details(links=links, skip_link=skip_link)
    
    details = pd.DataFrame(
    {"links": links,
     "price": result[0],
     "weight": result[1],
     "price_gr": result[2],
     "nutritients": result[3],
     "photo_link": result[4]}
        )
    return details

In [108]:
creating_df(links=['https://www.foodist.de/n.a-nature-addicts-fruchtsticks-himbeere-10895'])

Unnamed: 0,links,price,weight,price_gr,nutritients,photo_link
0,https://www.foodist.de/n.a-nature-addicts-fruc...,"0,90 €",Inhalt: 16 g,"5,63 € / 100 g","Brennwert 1.415 kJ(334 kcal) Fett 1,2 g ...",https://foodist.imgix.net/media/image/36092000...


In [103]:
products_foodist = creating_df(links=product_overview['links'])

error for:  https://www.foodist.de/foodist-bier-braukasten-set-pils-zum-selbermachen-diy-kit-7685
error for:  https://www.foodist.de/foodist-feinschmecker-adventskalender-2021-von-thomas-buehner-11069
error for:  https://www.foodist.de/foodist-foodist-snacks-adventskalender-2021-11092
error for:  https://www.foodist.de/craft-beer-adventskalender
error for:  https://www.foodist.de/gaia-bienenwachstuch-starterset-jungle-3-teilig-10074
error for:  https://www.foodist.de/foodist-bio-spice-your-pumpkin-pie-gewuerzmischung-by-marry-kotter-11340
error for:  https://www.foodist.de/foodist-foodist-snack-bites-probierpaket-10-teilig-8465
error for:  https://www.foodist.de/foodist-bio-pralinen-adventskalender-2021-11249
error for:  https://www.foodist.de/foodist-bio-schokoladen-weihnachtsmann-sparset-4-teilig-11704
error for:  https://www.foodist.de/foodist-kleine-freuden-sparset-2-teilig-11674
error for:  https://www.foodist.de/the-beginnings-cookies-doppelpack-mit-orange-cranberry-11565
error f

ValueError: array length 450 does not match index length 546

In [112]:
creating_df(links=['https://www.foodist.de/foodist-bier-braukasten-set-pils-zum-selbermachen-diy-kit-7685',
                  'https://www.foodist.de/vista-portuguese-mandeln-mit-zimt-zucker-10854', 
                   'https://www.foodist.de/n.a-nature-addicts-fruchtsticks-himbeere-10895'], 
            skip_link=['https://www.foodist.de/foodist-bier-braukasten-set-pils-zum-selbermachen-diy-kit-7685'])

TypeError: scraping_details() got an unexpected keyword argument 'skip_link'

**Deleting rows with too much info missing**

In [34]:
product_overview_2 = product_overview.copy()
product_overview_2.index[product_overview['links']=='']

Int64Index([131], dtype='int64')

In [35]:
product_overview = product_overview.drop(product_overview.index[]) 

In [36]:
product_overview.index[product_overview['links']=='']

Int64Index([], dtype='int64')

**Getting product details**

In [None]:
products_foodist = creating_df(links=product_overview_2['links'])

In [None]:
products_foodist.head(10)

### Merging dataframes

In [None]:
df_foodist = pd.merge(product_overview, products_foodist, on='links', how='inner')
df_foodist.head(15)

In [42]:
df_foodist.to_csv('df_foodist.csv')