# Web Scraping

## Importing libraries

In [17]:
!pip install selenium

Collecting selenium
  Downloading selenium-4.0.0-py3-none-any.whl (954 kB)
Collecting trio-websocket~=0.9
  Downloading trio_websocket-0.9.2-py3-none-any.whl (16 kB)
Collecting trio~=0.17
  Downloading trio-0.19.0-py3-none-any.whl (356 kB)
Collecting sortedcontainers
  Downloading sortedcontainers-2.4.0-py2.py3-none-any.whl (29 kB)
Collecting outcome
  Downloading outcome-1.1.0-py2.py3-none-any.whl (9.7 kB)
Collecting wsproto>=0.14
  Downloading wsproto-1.0.0-py3-none-any.whl (24 kB)


You should consider upgrading via the 'c:\users\katha\appdata\local\programs\python\python39\python.exe -m pip install --upgrade pip' command.


Collecting pyOpenSSL>=0.14
  Downloading pyOpenSSL-21.0.0-py2.py3-none-any.whl (55 kB)
Collecting h11<1,>=0.9.0
  Downloading h11-0.12.0-py3-none-any.whl (54 kB)
Installing collected packages: sortedcontainers, outcome, h11, wsproto, trio, pyOpenSSL, trio-websocket, selenium
Successfully installed h11-0.12.0 outcome-1.1.0 pyOpenSSL-21.0.0 selenium-4.0.0 sortedcontainers-2.4.0 trio-0.19.0 trio-websocket-0.9.2 wsproto-1.0.0


In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

## Overview

|	Produktname	|	Marke	|	Preis	|	Gewicht	|	Brennwert	|	Fett	|	gesättigte Fettsäuren	|	Kohlehydrate	|	Zucker	|	Ballaststoffe	|	Eiweiß	|	Salz	|	Zutaten	|	Fotolink	|
|	:----------	|	:----------	|	:---------:	|	:---------:	|	:---------:	|	:---------:	|	:---------:	|	:---------:	|	:---------:	|	:---------:	|	:---------:	|	:---------:	|	:---------:	|	:---------:	|
|	product name	|	brand	|	price	|	weight	|		|	fat	|	saturated fat	|	carbs	|	sugar	|		|	protein	|	salt	|	ingredients	|	picture link	|
|	e.g. Snack bar	|		|	in EUR	|	in gr/kg	|	in kj/kcal	|	in gr	|	in gr	|	in gr	|	in gr	|	in gr	|	in gr	|	in gr	|		|		|

**List of websites:**
- Koro: https://www.korodrogerie.de/snacks/?p=1&o=2&n=144&f=233
- Vantastic foods: https://www.vantastic-foods.com/vegane-lebensmittel/snacks-und-suesswaren
- Kokku: https://kokku-online.de/vegane-suessigkeiten-snacks/
- Foodist: https://www.foodist.de/suesses-snacks?p=1&o=9&n=84&f=36

## Web Scraping

### Koro

#### Getting product links

In [2]:
# storing url in a variable
url = "https://www.korodrogerie.de/snacks/?p=1&o=2&n=144&f=233"

# downloading html-code with a get request
response = requests.get(url)
response.status_code

200

In [3]:
# parsing html (creating the 'soup')
soup = BeautifulSoup(response.content, "html.parser")

In [41]:
# retrieving/extracting the desired info

# general product info
product_info = soup.find_all("div", class_="product--info")

In [42]:
# product links
product_info[0].a.get("href")

'https://www.korodrogerie.de/soja-protein-crispies-58-mit-kakao-1-kg'

In [43]:
links = []
for p in product_info:
    link = p.a.get("href")
    links.append(link)

In [53]:
# product name
product_info[0].a.get("title")

'Soja Protein Crispies 58\xa0% mit Kakao 1\xa0kg'

In [55]:
names = []
for p in product_info:
    name = p.a.get("title")
    names.append(name)

In [57]:
len(product_link), len(names), len(links)

(100, 100, 100)

In [64]:
# Building a dataframe

product_details = pd.DataFrame(
    {"Produkt": names,
     "Link": links})

In [65]:
product_details

Unnamed: 0,Produkt,Link
0,Soja Protein Crispies 58 % mit Kakao 1 kg,https://www.korodrogerie.de/soja-protein-crisp...
1,Schokodrops mit Xylit 1 kg,https://www.korodrogerie.de/schokodrops-mit-xy...
2,Schoko Protein Crunchies ohne Zuckerzusatz 1 kg,https://www.korodrogerie.de/schoko-protein-cru...
3,Soja Protein Crispies 60 % 1 kg,https://www.korodrogerie.de/soja-protein-crisp...
4,Bohnen-Erbsen-Mix geröstet & gesalzen 1 kg,https://www.korodrogerie.de/bohnen-erbsen-mix-...
...,...,...
95,Bio Geile Schnitte Schokolade 10 x 30 g,https://www.korodrogerie.de/bio-geile-schnitte...
96,Bio Nut Butter Bar Peanut 12 x 30 g,https://www.korodrogerie.de/bio-nut-butter-bar...
97,Bio Nut Butter Bar Hazelnut 12 x 30 g,https://www.korodrogerie.de/bio-nut-butter-bar...
98,Bio Energy Ball Haselnuss 15 x 30 g,https://www.korodrogerie.de/bio-energy-ball-ha...


#### Getting all product infos

In [66]:
# storing url in a variable
url = "https://www.korodrogerie.de/soja-protein-crispies-58-mit-kakao-1-kg"

# downloading html-code with a get request
response = requests.get(url)
response.status_code

200

In [67]:
soup = BeautifulSoup(response.content, "html.parser")

In [97]:
# price
price = soup.find_all("span", class_="price--content content--default")

# weight
weight = soup.find_all("td", class_="base-info--content")

# kcal
kcal = soup.find_all("span", class_="nutrition--value nutrition--energy")

# fat
fat = soup.find_all("span", class_="nutrition--value nutrition--fat")

# sat_fat
sat_fat = soup.find_all("span", class_="nutrition--value nutrition--saturates")

# carbs
carbs = soup.find_all("span", class_="nutrition--value nutrition--carbohydrate")

# sugar
sugar = soup.find_all("span", class_="nutrition--value nutrition--sugars")

# fibre
fibre = soup.find_all("span", class_="nutrition--value nutrition--fibre")

# protein
protein = soup.find_all("span", class_="nutrition--value nutrition--protein")

# salt
salt = soup.find_all("span", class_="nutrition--value nutrition--salt")

# ingredients
ingredients = soup.find_all("span", class_="base-info--label")

# photo
photo = soup.find_all("span", class_="image--media")

In [110]:
# how to get photo link
photo_link = photo[0].img.get("src")
photo_link

'https://koro2.imgix.net/media/image/f1/50/81/CRIS_005_01E3j9hQ8K66BPv.jpg?auto=compress%2Cformat&w=900&h=900'

In [101]:
def scraping_details(links=[]):
    for link in links:
        response = requests.get(link)
        
        soup = BeautifulSoup(response.content, "html.parser")
        
        # price
        price = soup.find_all("span", class_="price--content content--default")

        # kcal
        kcal = soup.find_all("span", class_="nutrition--value nutrition--energy")

        # fat
        fat = soup.find_all("span", class_="nutrition--value nutrition--fat")

        # sat_fat
        sat_fat = soup.find_all("span", class_="nutrition--value nutrition--saturates")

        # carbs
        carbs = soup.find_all("span", class_="nutrition--value nutrition--carbohydrate")

        # sugar
        sugar = soup.find_all("span", class_="nutrition--value nutrition--sugars")

        # fibre
        fibre = soup.find_all("span", class_="nutrition--value nutrition--fibre")

        # protein
        protein = soup.find_all("span", class_="nutrition--value nutrition--protein")

        # salt
        salt = soup.find_all("span", class_="nutrition--value nutrition--salt")

        
        for i in [price, kcal, fat, sat_fat, carbs, sugar, fibre, protein, salt]:
            for j in range(len(i)):
                i[j] = i[j].getText()
                
        
        # weight
        weight = soup.find_all("td", class_="base-info--content")      
                
        # ingredients
        ingredients = soup.find_all("span", class_="base-info--label")
        
        
        # photo
        photo = soup.find_all("span", class_="image--media")
        
        photo_link = photo[0].img.get("scr")
        
        return price, kcal, fat, sat_fat, carbs, sugar, fibre, protein, salt, photo_link
    
        # still need to work on weight + ingredients!!

In [106]:
scraping_details(links=['https://www.korodrogerie.de/soja-protein-crispies-58-mit-kakao-1-kg'])

(['\n\n14,00\xa0€\n'],
 ['1535                                   / 362'],
 ['1,9 g'],
 ['0,4 g'],
 ['28,2 g'],
 ['9,1 g'],
 ['1,8 g'],
 ['58 g'],
 ['2,7 g'],
 None)

In [None]:
def creating_lists()