# Web Scraping

## Importing libraries

In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

## Overview

|	Produktname	|	Marke	|	Preis	|	Gewicht	|	Brennwert	|	Fett	|	gesättigte Fettsäuren	|	Kohlehydrate	|	Zucker	|	Ballaststoffe	|	Eiweiß	|	Salz	|	Zutaten	|	Fotolink	|
|	:----------	|	:----------	|	:---------:	|	:---------:	|	:---------:	|	:---------:	|	:---------:	|	:---------:	|	:---------:	|	:---------:	|	:---------:	|	:---------:	|	:---------:	|	:---------:	|
|	product name	|	brand	|	price	|	weight	|		|	fat	|	saturated fat	|	carbs	|	sugar	|		|	protein	|	salt	|	ingredients	|	picture link	|
|	e.g. Snack bar	|		|	in EUR	|	in gr/kg	|	in kj/kcal	|	in gr	|	in gr	|	in gr	|	in gr	|	in gr	|	in gr	|	in gr	|		|		|

**List of websites:**
- Koro: https://www.korodrogerie.de/snacks/?p=1&o=2&n=144&f=233
- Vantastic foods: https://www.vantastic-foods.com/vegane-lebensmittel/snacks-und-suesswaren
- Kokku: https://kokku-online.de/vegane-suessigkeiten-snacks/
- Foodist: https://www.foodist.de/suesses-snacks?p=1&o=9&n=84&f=36

## Web Scraping

### Koro

#### Getting product links

In [2]:
# storing url in a variable
url = "https://www.korodrogerie.de/snacks/?p=1&o=2&n=144&f=233"

# downloading html-code with a get request
response = requests.get(url)
response.status_code

200

In [3]:
# parsing html (creating the 'soup')
soup = BeautifulSoup(response.content, "html.parser")

In [41]:
# retrieving/extracting the desired info

# general product info
product_info = soup.find_all("div", class_="product--info")

In [42]:
# product links
product_info[0].a.get("href")

'https://www.korodrogerie.de/soja-protein-crispies-58-mit-kakao-1-kg'

In [43]:
links = []
for p in product_info:
    link = p.a.get("href")
    links.append(link)

In [53]:
# product name
product_info[0].a.get("title")

'Soja Protein Crispies 58\xa0% mit Kakao 1\xa0kg'

In [55]:
names = []
for p in product_info:
    name = p.a.get("title")
    names.append(name)

In [57]:
len(product_link), len(names), len(links)

(100, 100, 100)

In [116]:
# Building a dataframe

product_overview = pd.DataFrame(
    {"product": names,
     "link": links})

In [117]:
product_overview

Unnamed: 0,product,link
0,Soja Protein Crispies 58 % mit Kakao 1 kg,https://www.korodrogerie.de/soja-protein-crisp...
1,Schokodrops mit Xylit 1 kg,https://www.korodrogerie.de/schokodrops-mit-xy...
2,Schoko Protein Crunchies ohne Zuckerzusatz 1 kg,https://www.korodrogerie.de/schoko-protein-cru...
3,Soja Protein Crispies 60 % 1 kg,https://www.korodrogerie.de/soja-protein-crisp...
4,Bohnen-Erbsen-Mix geröstet & gesalzen 1 kg,https://www.korodrogerie.de/bohnen-erbsen-mix-...
...,...,...
95,Bio Geile Schnitte Schokolade 10 x 30 g,https://www.korodrogerie.de/bio-geile-schnitte...
96,Bio Nut Butter Bar Peanut 12 x 30 g,https://www.korodrogerie.de/bio-nut-butter-bar...
97,Bio Nut Butter Bar Hazelnut 12 x 30 g,https://www.korodrogerie.de/bio-nut-butter-bar...
98,Bio Energy Ball Haselnuss 15 x 30 g,https://www.korodrogerie.de/bio-energy-ball-ha...


#### Getting all product infos

In [66]:
# storing url in a variable
url = "https://www.korodrogerie.de/soja-protein-crispies-58-mit-kakao-1-kg"

# downloading html-code with a get request
response = requests.get(url)
response.status_code

200

In [67]:
soup = BeautifulSoup(response.content, "html.parser")

In [97]:
# price
price = soup.find_all("span", class_="price--content content--default")

# weight
weight = soup.find_all("td", class_="base-info--content")

# kcal
kcal = soup.find_all("span", class_="nutrition--value nutrition--energy")

# fat
fat = soup.find_all("span", class_="nutrition--value nutrition--fat")

# sat_fat
sat_fat = soup.find_all("span", class_="nutrition--value nutrition--saturates")

# carbs
carbs = soup.find_all("span", class_="nutrition--value nutrition--carbohydrate")

# sugar
sugar = soup.find_all("span", class_="nutrition--value nutrition--sugars")

# fibre
fibre = soup.find_all("span", class_="nutrition--value nutrition--fibre")

# protein
protein = soup.find_all("span", class_="nutrition--value nutrition--protein")

# salt
salt = soup.find_all("span", class_="nutrition--value nutrition--salt")

# ingredients
ingredients = soup.find_all("span", class_="base-info--label")

# photo
photo = soup.find_all("span", class_="image--media")

In [110]:
# how to get photo link
photo_link = photo[0].img.get("src")
photo_link

'https://koro2.imgix.net/media/image/f1/50/81/CRIS_005_01E3j9hQ8K66BPv.jpg?auto=compress%2Cformat&w=900&h=900'

In [166]:
def scraping_details(links=[]):
    
    lst_price = []
    lst_weight = []
    lst_kcal = []
    lst_fat = []
    lst_sat_fat = []
    lst_carbs = []
    lst_sugar = []
    lst_fibre = []
    lst_protein = []
    lst_salt = []
    lst_ingredients = []
    lst_photos = []
    
    for l in links:
        
        url = l
        response = requests.get(url)
        
        soup = BeautifulSoup(response.content, "html.parser")
        
        # price
        price = soup.find_all("span", class_="price--content content--default")

        # kcal
        kcal = soup.find_all("span", class_="nutrition--value nutrition--energy")

        # fat
        fat = soup.find_all("span", class_="nutrition--value nutrition--fat")

        # sat_fat
        sat_fat = soup.find_all("span", class_="nutrition--value nutrition--saturates")

        # carbs
        carbs = soup.find_all("span", class_="nutrition--value nutrition--carbohydrate")

        # sugar
        sugar = soup.find_all("span", class_="nutrition--value nutrition--sugars")

        # fibre
        fibre = soup.find_all("span", class_="nutrition--value nutrition--fibre")

        # protein
        protein = soup.find_all("span", class_="nutrition--value nutrition--protein")

        # salt
        salt = soup.find_all("span", class_="nutrition--value nutrition--salt")
        
        # photo
        photo = soup.find_all("span", class_="image--media")


        # getting text for all necessary features
        for i in [price, kcal, fat, sat_fat, carbs, sugar, fibre, protein, salt]:
            for j in range(len(i)):
                i[j] = i[j].getText()
         
        # getting photo link
        photo_link = photo[0].img.get("scr")
        
        
        lst_price.append(price)
        # lst_weight.append(price)
        lst_kcal.append(kcal)
        lst_fat.append(fat)
        lst_sat_fat.append(sat_fat)
        lst_carbs.append(carbs)
        lst_sugar.append(sugar)
        lst_fibre.append(fibre)
        lst_protein.append(protein)
        lst_salt.append(salt)
        # lst_ingredients.append(price)
        lst_photos.append(photo_link)
        
    
        '''
    
        # still need to work on weight + ingredients!!
        
        # weight
        weight = soup.find_all("td", class_="base-info--content")      
                
        # ingredients
        ingredients = soup.find_all("span", class_="base-info--label")
        
        '''
        
    return lst_price, lst_kcal, lst_fat, lst_sat_fat, lst_carbs, lst_sugar, lst_fibre, lst_protein, lst_salt, lst_photos


In [167]:
product_1 = scraping_details(links=['https://www.korodrogerie.de/soja-protein-crispies-58-mit-kakao-1-kg'])
product_1

([['\n\n14,00\xa0€\n']],
 [['1535                                   / 362']],
 [['1,9 g']],
 [['0,4 g']],
 [['28,2 g']],
 [['9,1 g']],
 [['1,8 g']],
 [['58 g']],
 [['2,7 g']],
 [None])

In [168]:
product_2 = scraping_details(links=['https://www.korodrogerie.de/schokodrops-mit-xylit-1-kg'])
product_2

([['\n\n21,00\xa0€\n']],
 [['2290                                   / 555']],
 [['46 g']],
 [['28 g']],
 [['31 g']],
 [['1,0 g', '25 g']],
 [['10 g']],
 [['9,2 g']],
 [['0,07 g']],
 [None])

In [169]:
two_products = scraping_details(links=['https://www.korodrogerie.de/soja-protein-crispies-58-mit-kakao-1-kg', 'https://www.korodrogerie.de/schokodrops-mit-xylit-1-kg'])
two_products

([['\n\n14,00\xa0€\n'], ['\n\n21,00\xa0€\n']],
 [['1535                                   / 362'],
  ['2290                                   / 555']],
 [['1,9 g'], ['46 g']],
 [['0,4 g'], ['28 g']],
 [['28,2 g'], ['31 g']],
 [['9,1 g'], ['1,0 g', '25 g']],
 [['1,8 g'], ['10 g']],
 [['58 g'], ['9,2 g']],
 [['2,7 g'], ['0,07 g']],
 [None, None])

In [178]:
def creating_df(links=[]):
    
    scraping_details(links=links)
    
    details = pd.DataFrame(
    {"links": links,
     "price": lst_price,
     "kcal": lst_kcal,
     "fat": lst_fat,
     "sat_fat": lst_sat_fat,
     "carbs": lst_carbs,
     "sugar": lst_sugar,
     "fibre": lst_fibre,
     "protein": lst_protein,
     "salt": lst_salt,
     "photo_link": lst_photos}
        )
    return details

In [179]:
creating_df(links=['https://www.korodrogerie.de/soja-protein-crispies-58-mit-kakao-1-kg'])

NameError: name 'lst_price' is not defined

In [172]:
creating_df(links=['https://www.korodrogerie.de/schokodrops-mit-xylit-1-kg'])

Unnamed: 0,links,price,kcal,fat,sat_fat,carbs,sugar,fibre,protein,salt,photo_link
0,https://www.korodrogerie.de/schokodrops-mit-xy...,"[\n, [], \n14,00 €\n]",[1535 / 362],"[1,9 g]","[0,4 g]","[28,2 g]","[9,1 g]","[1,8 g]",[58 g],"[2,7 g]",https://koro2.imgix.net/media/image/f1/50/81/C...


In [173]:
creating_df(links=product_overview['link'])

ValueError: array length 1 does not match index length 100