# Web Scraping

## Importing libraries

In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

## Overview

|	Produktname	|	Marke	|	Preis	|	Gewicht	|	Brennwert	|	Fett	|	gesättigte Fettsäuren	|	Kohlehydrate	|	Zucker	|	Ballaststoffe	|	Eiweiß	|	Salz	|	Zutaten	|	Fotolink	|
|	:----------	|	:----------	|	:---------:	|	:---------:	|	:---------:	|	:---------:	|	:---------:	|	:---------:	|	:---------:	|	:---------:	|	:---------:	|	:---------:	|	:---------:	|	:---------:	|
|	product name	|	brand	|	price	|	weight	|		|	fat	|	saturated fat	|	carbs	|	sugar	|		|	protein	|	salt	|	ingredients	|	picture link	|
|	e.g. Snack bar	|		|	in EUR	|	in gr/kg	|	in kj/kcal	|	in gr	|	in gr	|	in gr	|	in gr	|	in gr	|	in gr	|	in gr	|		|		|

**List of websites:**
- Koro: https://www.korodrogerie.de/snacks/?p=1&o=2&n=144&f=233
- Vantastic foods: https://www.vantastic-foods.com/vegane-lebensmittel/snacks-und-suesswaren
- Kokku: https://kokku-online.de/vegane-suessigkeiten-snacks/
- Foodist: https://www.foodist.de/suesses-snacks?p=1&o=9&n=84&f=36

## Web Scraping

### Koro

#### Getting product links

In [2]:
# storing url in a variable
url_1 = "https://www.korodrogerie.de/snacks/?p=1&o=2&n=60&f=233"
url_2 = "https://www.korodrogerie.de/snacks/?p=2&o=2&n=60&f=233"
url_3 = "https://www.korodrogerie.de/snacks/?p=3&o=2&n=60&f=233"
url_4 = "https://www.korodrogerie.de/snacks/?p=4&o=2&n=60&f=233"

# downloading html-code with a get request
response_1 = requests.get(url_1)
response_2 = requests.get(url_2)
response_3 = requests.get(url_3)
response_4 = requests.get(url_4)

response_1.status_code

200

In [3]:
# parsing html (creating the 'soup')
soup_1 = BeautifulSoup(response_1.content, "html.parser")
soup_2 = BeautifulSoup(response_2.content, "html.parser")
soup_3 = BeautifulSoup(response_3.content, "html.parser")
soup_4 = BeautifulSoup(response_4.content, "html.parser")

In [4]:
# retrieving/extracting the desired info

# general product info
product_info = soup_1.find_all("div", class_="product--info")

In [6]:
# product links
product_info[0]#.a.get("href")

<div class="product--info">
<a class="product--image" href="https://www.korodrogerie.de/soja-protein-crispies-58-mit-kakao-1-kg" title="Soja Protein Crispies 58 % mit Kakao 1 kg">
<span class="koro-image--media-second">
<img alt="" class="koro-hl-image" data-koro-hl="" data-srcset="https://koro2.imgix.net/media/image/ea/ef/04/CRIS_005_04qCPfUp9s62Etr.jpg?auto=compress%2Cformat&amp;w=200&amp;h=200 200w, https://koro2.imgix.net/media/image/ea/ef/04/CRIS_005_04qCPfUp9s62Etr.jpg?auto=compress%2Cformat&amp;w=400&amp;h=400 400w, https://koro2.imgix.net/media/image/ea/ef/04/CRIS_005_04qCPfUp9s62Etr.jpg?auto=compress%2Cformat&amp;w=900&amp;h=900 900w" srcset="" title=""/>
</span>
<span class="image--media">
<picture class="lazy" data-loader="emzPicturePlaceholderLoader">
<source data-srcset="https://koro2.imgix.net/media/image/f1/50/81/CRIS_005_01E3j9hQ8K66BPv.jpg?auto=compress%2Cformat&amp;w=200&amp;h=200&amp;dpr=2 400w, https://koro2.imgix.net/media/image/f1/50/81/CRIS_005_01E3j9hQ8K66BPv.jp

In [88]:
def get_link(soup):
    
    product_info = soup.find_all("div", class_="product--info")
    
    links = []
    for p in product_info:
        link = p.a.get("href")
        links.append(link)
    
    return links

In [89]:
# product name
product_info[0].a.get("title")

'Soja Protein Crispies 58\xa0% mit Kakao 1\xa0kg'

In [90]:
def get_name(soup):
    
    product_info = soup.find_all("div", class_="product--info")
    
    names = []
    for p in product_info:
        name = p.a.get("title")
        names.append(name)
    
    return names

In [91]:
links_1 = get_link(soup_1)
links_2 = get_link(soup_2)
links_3 = get_link(soup_3)
links_4 = get_link(soup_4)

In [92]:
names_1 = get_name(soup_1)
names_2 = get_name(soup_2)
names_3 = get_name(soup_3)
names_4 = get_name(soup_4)

In [94]:
len(names_1), len(links_1), len(names_2), len(links_2), len(names_3), len(links_3), len(names_4), len(links_4)

(60, 60, 60, 60, 60, 60, 42, 42)

In [96]:
links = links_1+links_2+links_3+links_4
len(links)

222

In [97]:
names = names_1+names_2+names_3+names_4

In [98]:
# Building a dataframe

product_overview = pd.DataFrame(
    {"product": names,
     "link": links})

In [158]:
pd.set_option('display.max_rows', None)

In [159]:
product_overview

Unnamed: 0,product,link
0,Soja Protein Crispies 58 % mit Kakao 1 kg,https://www.korodrogerie.de/soja-protein-crisp...
1,Schokodrops mit Xylit 1 kg,https://www.korodrogerie.de/schokodrops-mit-xy...
2,Schoko Protein Crunchies ohne Zuckerzusatz 1 kg,https://www.korodrogerie.de/schoko-protein-cru...
3,Soja Protein Crispies 60 % 1 kg,https://www.korodrogerie.de/soja-protein-crisp...
4,Bohnen-Erbsen-Mix geröstet & gesalzen 1 kg,https://www.korodrogerie.de/bohnen-erbsen-mix-...
5,Reisgebäck Superior Mix 750 g,https://www.korodrogerie.de/reisgebaeck-superi...
6,Amarena Kirschen mit Zartbitterschokolade 250 g,https://www.korodrogerie.de/amarena-kirschen-m...
7,Sonnengetrocknete Tomaten 1 kg,https://www.korodrogerie.de/sonnengetrocknete-...
8,Apfelchips Deutscher Elstar 250 g,https://www.korodrogerie.de/apfelchips-deutsch...
9,Bio Rote Bete Chips 200 g,https://www.korodrogerie.de/bio-rote-bete-chip...


#### Getting all product infos

In [100]:
# storing url in a variable
url = "https://www.korodrogerie.de/soja-protein-crispies-58-mit-kakao-1-kg"

# downloading html-code with a get request
response = requests.get(url)
response.status_code

200

In [101]:
soup = BeautifulSoup(response.content, "html.parser")

In [102]:
# price
price = soup.find_all("span", class_="price--content content--default")

# weight
weight = soup.find_all("td", class_="base-info--content")

# kcal
kcal = soup.find_all("span", class_="nutrition--value nutrition--energy")

# fat
fat = soup.find_all("span", class_="nutrition--value nutrition--fat")

# sat_fat
sat_fat = soup.find_all("span", class_="nutrition--value nutrition--saturates")

# carbs
carbs = soup.find_all("span", class_="nutrition--value nutrition--carbohydrate")

# sugar
sugar = soup.find_all("span", class_="nutrition--value nutrition--sugars")

# fibre
fibre = soup.find_all("span", class_="nutrition--value nutrition--fibre")

# protein
protein = soup.find_all("span", class_="nutrition--value nutrition--protein")

# salt
salt = soup.find_all("span", class_="nutrition--value nutrition--salt")

# ingredients
ingredients = soup.find_all("span", class_="base-info--label")

# photo
photo = soup.find_all("span", class_="image--media")

In [103]:
# how to get photo link
photo_link = photo[0].img.get("src")
photo_link

'https://koro2.imgix.net/media/image/f1/50/81/CRIS_005_01E3j9hQ8K66BPv.jpg?auto=compress%2Cformat&w=900&h=900'

In [104]:
url = 'https://www.korodrogerie.de/schokodrops-mit-xylit-1-kg'
response = requests.get(url)
soup = BeautifulSoup(response.content, "html.parser")

In [105]:
weight = soup.find_all("td", class_="base-info--content")  
weight[10].getText()

'Kakaomasse, 25 % Süßungsmittel: Xylit; Kakaobutter, Emulgator: Sonnenblumenlecithin. Kakao: 72 % min.'

In [153]:
def scraping_details(links=[], not_in=[]):
    
    
    lst_price = []
    lst_weight = []
    lst_kcal = []
    lst_fat = []
    lst_sat_fat = []
    lst_carbs = []
    lst_sugar = []
    lst_fibre = []
    lst_protein = []
    lst_salt = []
    lst_ingredients = []
    lst_photos = []

    for l in links:
        
        if l in not_in:
            continue
        
        else:
            
            try:

                url = l
                response = requests.get(url)

                soup = BeautifulSoup(response.content, "html.parser")

                # price
                price = soup.find_all("span", class_="price--content content--default")

                # kcal
                kcal = soup.find_all("span", class_="nutrition--value nutrition--energy")

                # fat
                fat = soup.find_all("span", class_="nutrition--value nutrition--fat")

                # sat_fat
                sat_fat = soup.find_all("span", class_="nutrition--value nutrition--saturates")

                # carbs
                carbs = soup.find_all("span", class_="nutrition--value nutrition--carbohydrate")

                # sugar
                sugar = soup.find_all("span", class_="nutrition--value nutrition--sugars")

                # fibre
                fibre = soup.find_all("span", class_="nutrition--value nutrition--fibre")

                # protein
                protein = soup.find_all("span", class_="nutrition--value nutrition--protein")

                # salt
                salt = soup.find_all("span", class_="nutrition--value nutrition--salt")


                # getting text for all necessary features
                for i in [price, kcal, fat, sat_fat, carbs, sugar, fibre, protein, salt]:
                    for j in range(len(i)):
                        i[j] = i[j].getText()

                # photo
                photo = soup.find_all("span", class_="image--media")

                # getting photo link
                photo_link = photo[0].img.get("src")


                # general info including weight & ingredients
                info = soup.find_all("td", class_="base-info--content")  

                # weight
                weight = info[2].getText()

                # ingredients
                ingredients = info[10].getText()


                lst_price.append(price[0])
                lst_weight.append(weight[0])
                lst_kcal.append(kcal[0])
                lst_fat.append(fat[0])
                lst_sat_fat.append(sat_fat[0])
                lst_carbs.append(carbs[0])
                lst_sugar.append(sugar[0])
                lst_fibre.append(fibre[0])
                lst_protein.append(protein[0])
                lst_salt.append(salt[0])
                lst_ingredients.append(ingredients[0])
                lst_photos.append(photo_link)

                print(l)

            except IndexError: 

                print("error for: ", l)
        
        
    return [lst_price, lst_weight, lst_kcal, lst_fat, lst_sat_fat, lst_carbs, lst_sugar, lst_fibre, lst_protein, lst_salt, lst_ingredients, lst_photos]


In [128]:
product_1 = scraping_details(links=['https://www.korodrogerie.de/soja-protein-crispies-58-mit-kakao-1-kg'])
product_1

https://www.korodrogerie.de/soja-protein-crispies-58-mit-kakao-1-kg


[['\n\n14,00\xa0€\n'],
 ['1'],
 ['1535                                   / 362'],
 ['1,9 g'],
 ['0,4 g'],
 ['28,2 g'],
 ['9,1 g'],
 ['1,8 g'],
 ['58 g'],
 ['2,7 g'],
 ['7'],
 ['https://koro2.imgix.net/media/image/f1/50/81/CRIS_005_01E3j9hQ8K66BPv.jpg?auto=compress%2Cformat&w=900&h=900']]

In [108]:
product_2 = scraping_details(links=['https://www.korodrogerie.de/schokodrops-mit-xylit-1-kg'])
product_2

[['\n\n21,00\xa0€\n'],
 ['1'],
 ['2290                                   / 555'],
 ['46 g'],
 ['28 g'],
 ['31 g'],
 ['1,0 g'],
 ['10 g'],
 ['9,2 g'],
 ['0,07 g'],
 ['K'],
 ['https://koro2.imgix.net/media/image/2c/01/f8/SCHOKO_001_01c6BzVK50WZCzf.jpg?auto=compress%2Cformat&w=900&h=900']]

In [109]:
two_products = scraping_details(links=['https://www.korodrogerie.de/soja-protein-crispies-58-mit-kakao-1-kg', 'https://www.korodrogerie.de/schokodrops-mit-xylit-1-kg'])
two_products

[['\n\n14,00\xa0€\n', '\n\n21,00\xa0€\n'],
 ['1', '1'],
 ['1535                                   / 362',
  '2290                                   / 555'],
 ['1,9 g', '46 g'],
 ['0,4 g', '28 g'],
 ['28,2 g', '31 g'],
 ['9,1 g', '1,0 g'],
 ['1,8 g', '10 g'],
 ['58 g', '9,2 g'],
 ['2,7 g', '0,07 g'],
 ['7', 'K'],
 ['https://koro2.imgix.net/media/image/f1/50/81/CRIS_005_01E3j9hQ8K66BPv.jpg?auto=compress%2Cformat&w=900&h=900',
  'https://koro2.imgix.net/media/image/2c/01/f8/SCHOKO_001_01c6BzVK50WZCzf.jpg?auto=compress%2Cformat&w=900&h=900']]

In [149]:
def creating_df(links=[], not_in=[]):
    
    result = scraping_details(links=links)
    
    details = pd.DataFrame(
    {"links": links,
     "price": result[0],
     "weight": result[1],
     "kcal": result[2],
     "fat": result[3],
     "sat_fat": result[4],
     "carbs": result[5],
     "sugar": result[6],
     "fibre": result[7],
     "protein": result[8],
     "salt": result[9],
     "ingredients": result[10],
     "photo_link": result[11]}
        )
    return details

In [111]:
creating_df(links=['https://www.korodrogerie.de/soja-protein-crispies-58-mit-kakao-1-kg'])

Unnamed: 0,links,price,weight,kcal,fat,sat_fat,carbs,sugar,fibre,protein,salt,ingredients,photo_link
0,https://www.korodrogerie.de/soja-protein-crisp...,"\n\n14,00 €\n",1,1535 / 362,"1,9 g","0,4 g","28,2 g","9,1 g","1,8 g",58 g,"2,7 g",7,https://koro2.imgix.net/media/image/f1/50/81/C...


In [112]:
creating_df(links=['https://www.korodrogerie.de/schokodrops-mit-xylit-1-kg'])

Unnamed: 0,links,price,weight,kcal,fat,sat_fat,carbs,sugar,fibre,protein,salt,ingredients,photo_link
0,https://www.korodrogerie.de/schokodrops-mit-xy...,"\n\n21,00 €\n",1,2290 / 555,46 g,28 g,31 g,"1,0 g",10 g,"9,2 g","0,07 g",K,https://koro2.imgix.net/media/image/2c/01/f8/S...


**Deleting rows with too much info missing**

In [160]:
product_overview = product_overview.drop(product_overview.index[130]) #https://www.korodrogerie.de/bio-energy-ball-cashew-zimt-30-g 

In [161]:
products_koro = creating_df(links=product_overview['link'])

https://www.korodrogerie.de/soja-protein-crispies-58-mit-kakao-1-kg
https://www.korodrogerie.de/schokodrops-mit-xylit-1-kg
https://www.korodrogerie.de/schoko-protein-crunchies-ohne-zuckerzusatz-1-kg
https://www.korodrogerie.de/soja-protein-crispies-60-1-kg
https://www.korodrogerie.de/bohnen-erbsen-mix-geroestet-gesalzen-1-kg
https://www.korodrogerie.de/reisgebaeck-superior-mix-750-g
https://www.korodrogerie.de/amarena-kirschen-mit-zartbitterschokolade-250-g
https://www.korodrogerie.de/sonnengetrocknete-tomaten-1-kg
https://www.korodrogerie.de/apfelchips-deutscher-elstar-250-g
https://www.korodrogerie.de/bio-rote-bete-chips-200-g
https://www.korodrogerie.de/apfelstuecke-mit-zimt-zucker-1-kg
https://www.korodrogerie.de/reiscracker-chili-1-kg
https://www.korodrogerie.de/edamame-bohnen-geroestet-und-gesalzen-750-g
https://www.korodrogerie.de/schwarze-sojabohnen-geroestet-und-gesalzen-750-g
https://www.korodrogerie.de/saubohnen-geroestet-und-gesalzen-750-g
https://www.korodrogerie.de/bio-vo

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


https://www.korodrogerie.de/zimtwaffel-100-g
https://www.korodrogerie.de/bio-tomatenchips-13-x-500-g
https://www.korodrogerie.de/mandeln-mit-veganer-kakao-glasur-1-kg
https://www.korodrogerie.de/cracker-dinkel-chia-500-g
https://www.korodrogerie.de/cracker-hanf-mohnsamen-500-g
https://www.korodrogerie.de/cracker-dinkel-chia-6-x-500-g
https://www.korodrogerie.de/cracker-hanf-mohnsamen-6-x-500-g
https://www.korodrogerie.de/bio-gepuffter-apfel-16-x-500-g
https://www.korodrogerie.de/bio-gepuffte-physalis-16-x-500-g
https://www.korodrogerie.de/bio-gepuffte-mango-16-x-500-g
https://www.korodrogerie.de/bio-gepuffte-banane-16-x-500-g
https://www.korodrogerie.de/protein-ball-brownie-15-x-30-g
https://www.korodrogerie.de/bio-flapjack-kakao-15-x-60-g
https://www.korodrogerie.de/bio-veganer-proteinriegel-haselnuss-12-x-60-g
https://www.korodrogerie.de/protein-ball-cookie-dough-15-x-30-g
https://www.korodrogerie.de/bio-geile-schnitte-schokolade-10-x-30-g
https://www.korodrogerie.de/bio-nut-butter-b

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


https://www.korodrogerie.de/zimtwaffel-10-x-100-g
https://www.korodrogerie.de/bio-rohkostriegel-cashew-12-x-50-g
https://www.korodrogerie.de/bio-rohkostriegel-kakao-12-x-50-g
https://www.korodrogerie.de/bio-rohkostriegel-himbeere-12-x-50-g
https://www.korodrogerie.de/haferkeks-mit-schokodrops-200-g
https://www.korodrogerie.de/haferkekse-mit-schokodrops-12-x-200-g
https://www.korodrogerie.de/gebrannte-haselnuesse-salted-caramel-1-kg
https://www.korodrogerie.de/mandeln-mit-veganer-kakao-glasur-3-x-1-kg
https://www.korodrogerie.de/gebrannte-cashewkerne-salted-caramel-1-kg
https://www.korodrogerie.de/gebrannte-pekannuesse-1-kg
https://www.korodrogerie.de/vegane-schokobrezeln-ohne-zuckerzusatz-1-kg
https://www.korodrogerie.de/vegane-schokobrezeln-ohne-zuckerzusatz-12-x-1-kg
https://www.korodrogerie.de/gebrannte-erdnuesse-salted-caramel-1-kg
https://www.korodrogerie.de/gebrannte-erdnuesse-salted-caramel-12-x-1-kg
https://www.korodrogerie.de/gebrannte-haselnuesse-salted-caramel-12-x-1-kg
http

In [162]:
products_koro

Unnamed: 0,links,price,weight,kcal,fat,sat_fat,carbs,sugar,fibre,protein,salt,ingredients,photo_link
0,https://www.korodrogerie.de/soja-protein-crisp...,"\n\n14,00 €\n",1,1535 / 362,"1,9 g","0,4 g","28,2 g","9,1 g","1,8 g",58 g,"2,7 g",7,https://koro2.imgix.net/media/image/f1/50/81/C...
1,https://www.korodrogerie.de/schokodrops-mit-xy...,"\n\n21,00 €\n",1,2290 / 555,46 g,28 g,31 g,"1,0 g",10 g,"9,2 g","0,07 g",K,https://koro2.imgix.net/media/image/2c/01/f8/S...
2,https://www.korodrogerie.de/schoko-protein-cru...,"\n\n20,00 €\n",1,1875 / 448,28 g,12 g,37 g,"1,4 g","8,3 g",21 g,"0,36 g",S,https://koro2.imgix.net/media/image/30/78/62/P...
3,https://www.korodrogerie.de/soja-protein-crisp...,"\n\n13,00 €\n",1,1541 / 363,"1,8 g","0,2 g",26 g,"0,8 g","0,8 g",60 g,"3,2 g",6,https://koro2.imgix.net/media/image/84/55/ca/C...
4,https://www.korodrogerie.de/bohnen-erbsen-mix-...,"\n\n11,50 €\n",1,1766 / 421,14 g,"2,3 g",35 g,"6,7 g",10 g,33 g,"1,0 g",1,https://koro2.imgix.net/media/image/a9/37/79/B...
5,https://www.korodrogerie.de/reisgebaeck-superi...,"\n\n11,50 €\n",0,1710 / 405,"6,3 g","1,2 g",78 g,13 g,"2,7 g","7,6 g","1,2 g",4,https://koro2.imgix.net/media/image/68/7c/90/R...
6,https://www.korodrogerie.de/amarena-kirschen-m...,"\n\n8,50 €\n",0,1715 / 409,17 g,11 g,58 g,45 g,"5,4 g","3,5 g","0,08 g",5,https://koro2.imgix.net/media/image/31/8f/d6/A...
7,https://www.korodrogerie.de/sonnengetrocknete-...,"\n\n12,00 €\n",1,1078 / 258,"3,0 g",0 g,44 g,38 g,12 g,14 g,"5,3 g",8,https://koro2.imgix.net/media/image/5b/aa/28/T...
8,https://www.korodrogerie.de/apfelchips-deutsch...,"\n\n7,00 €\n",0,1507 / 356,0 g,0 g,77 g,72 g,14 g,"2,2 g","0,03 g",1,https://koro2.imgix.net/media/image/95/50/b3/A...
9,https://www.korodrogerie.de/bio-rote-bete-chip...,"\n\n11,00 €\n",2,1415 / 335,0 g,0 g,63 g,54 g,17 g,11 g,"0,5 g",K,https://koro2.imgix.net/media/image/13/f2/e2/R...


In [163]:
products_koro.to_csv('products_koro.csv')