# Web Scraping

## Importing libraries

In [28]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

## Overview

|	Produktname	|	Marke	|	Preis	|	Gewicht	|	Brennwert	|	Fett	|	gesättigte Fettsäuren	|	Kohlehydrate	|	Zucker	|	Ballaststoffe	|	Eiweiß	|	Salz	|	Zutaten	|	Fotolink	|
|	:----------	|	:----------	|	:---------:	|	:---------:	|	:---------:	|	:---------:	|	:---------:	|	:---------:	|	:---------:	|	:---------:	|	:---------:	|	:---------:	|	:---------:	|	:---------:	|
|	product name	|	brand	|	price	|	weight	|		|	fat	|	saturated fat	|	carbs	|	sugar	|		|	protein	|	salt	|	ingredients	|	picture link	|
|	e.g. Snack bar	|		|	in EUR	|	in gr/kg	|	in kj/kcal	|	in gr	|	in gr	|	in gr	|	in gr	|	in gr	|	in gr	|	in gr	|		|		|

**List of websites:**
- Koro: https://www.korodrogerie.de/snacks/?p=1&o=2&n=144&f=233
- Vantastic foods: https://www.vantastic-foods.com/vegane-lebensmittel/snacks-und-suesswaren
- Kokku: https://kokku-online.de/vegane-suessigkeiten-snacks/
- Foodist: https://www.foodist.de/suesses-snacks?p=1&o=9&n=84&f=36

## Web Scraping

### Kokku

#### Getting product links

In [100]:
# storing url in a variable
url = "https://kokku-online.de/vegane-suessigkeiten-snacks/"

# downloading html-code with a get request
response = requests.get(url)
response.status_code

200

In [101]:
# parsing html (creating the 'soup')
soup = BeautifulSoup(response.content, "html.parser")

In [102]:
# retrieving/extracting the desired info

# general product info
product_info = soup.find_all("div", class_="p")

In [103]:
# product links
product_info[0].a.get("href")

'https://kokku-online.de/bio4you-eisbonbon/'

In [106]:
# brand
brand = soup.find_all("div", class_="pd")
brand[0].span.getText()

'Bio4You'

In [42]:
name = soup.find_all("span", class_="name")
name[0]#.getText()

<span class="name">Eisbonbon <span class="size">- 75g</span></span>

In [107]:
def get_first_details(soup):
    
    link = soup.find_all("div", class_="p")
    name = soup.find_all("span", class_="name")
    brand = soup.find_all("div", class_="pd")
    weight = soup.find_all("span", class_="size")
    
    names = []
    brands = []
    weights = []
    links = []
    
    
    for p in name:
        name = p.getText()
        names.append(name)
        
    for p in brand:
        brand = p.span.getText()
        brands.append(brand)
        
    for p in weight:
        weight = p.getText()
        weights.append(weight)
    
    for p in link:
        link = p.a.get("href")
        links.append(link)
    
    return [names, brands, weights, links]

In [108]:
details = get_first_details(soup)

In [109]:
len(details[0]), len(details[1]), len(details[2]), len(details[3])

(555, 555, 555, 555)

In [23]:
links = details[0]
names = details[1]

In [110]:
# Building a dataframe

product_overview = pd.DataFrame(
    {"product": details[0],
     "brand": details[1],
     "weight": details[2],
     "link": details[3]})

In [111]:
product_overview

Unnamed: 0,product,brand,weight,link
0,Eisbonbon - 75g,Bio4You,- 75g,https://kokku-online.de/bio4you-eisbonbon/
1,Stollenkonfekt - 100g,Bäckerei Sachse,- 100g,https://kokku-online.de/sachse-stollen-stollen...
2,Veganer °Schokodrops° Dinkelstollen mit Puderz...,Bäckerei Sachse,- 1kg,https://kokku-online.de/sachse-stollen-veganer...
3,3 Stollenscheiben °Schokodrops° (ohne Rosinen)...,Bäckerei Sachse,- 250g,https://kokku-online.de/sachse-stollen-3-stoll...
4,Veganer °Rosinen° Dinkelstollen - 1kg,Bäckerei Sachse,- 1kg,https://kokku-online.de/sachse-stollen-dinkels...
...,...,...,...,...
550,Lutscher °Himbeere° - 12g,Candy Tree,- 12g,https://kokku-online.de/candy-tree-lutscher-hi...
551,Lakritz Schlangen - extra lang - 56g,Terrasana,- 56g,https://kokku-online.de/terrasana-lakritz-schl...
552,MINI VEGO Haselnuss Schokoriegel - 65g,Vego Chocolate,- 65g,https://kokku-online.de/vego-chocolate-vego-wh...
553,Soy Jerky Original - 70g,Vantastic Foods,- 70g,https://kokku-online.de/vantastic-foods-soy-je...


#### Getting all product infos

In [117]:
# storing url in a variable
url = "https://kokku-online.de/bio4you-eisbonbon/"

# downloading html-code with a get request
response = requests.get(url)
response.status_code

200

In [118]:
soup = BeautifulSoup(response.content, "html.parser")

In [119]:
# price
price = soup.find_all("div", class_="preis")
price[0].span.getText()

'1.49 €'

In [120]:
# price gramms
price_gr = soup.find_all("div", class_="st")
price_gr[0].span.getText()

'1.99€/100g'

In [123]:
# nutrions
nutritions = soup.find_all("tbody")
nutritions[0].getText()

'Brennwert1.632 kJ / 384 kcalFett< 0,1g- davon gesättigte Fettsäuren< 0,1gKohlenhydrate94g- davon Zucker69gEiweiß< 0,1gSalz< 0,01g'

In [None]:
# ingredients
#ingredients = soup.find_all("div")

In [124]:
# photo
photo = soup.find_all("div", class_="pics")
photo[0].meta.get("content")

'https://kokku-online.de//bilder/350x350/19266/bio4you-eisbonbon.auto'

In [73]:
url = 'https://kokku-online.de/bio4you-eisbonbon/'
response = requests.get(url)
soup = BeautifulSoup(response.content, "html.parser")

In [130]:
def scraping_details(links=[], not_in=[]):
    
    
    lst_price = []
    lst_price_gr = []
    lst_nutritions = []
    lst_ingredients = []
    lst_photos = []

    for l in links:
        
        if l in not_in:
            continue
        
        else:
            
            try:

                url = l
                response = requests.get(url)

                soup = BeautifulSoup(response.content, "html.parser")

                # price
                price = soup.find_all("div", class_="preis")
                pr = price[0].span.getText()

                # price gramms
                price_gr = soup.find_all("div", class_="st")
                pr_gr = price_gr[0].span.getText()

                # nutrions
                nutritions = soup.find_all("tbody")
                nutr = nutritions[0].getText()

                # ingredients
                #ingredients = soup.find_all("div")
                
                # photo
                photo = soup.find_all("div", class_="pics")
                photo_link = photo[0].meta.get("content")


                lst_price.append(pr)
                lst_price_gr.append(pr_gr)
                lst_nutritions.append(nutr)
                #lst_ingredients.append(ingredients[0])
                lst_photos.append(photo_link)

                print(l)

            except IndexError: 

                print("error for: ", l)
        
        
    return [lst_price, lst_price_gr, lst_nutritions, lst_photos]


In [131]:
test = scraping_details(links=['https://kokku-online.de/bio4you-eisbonbon/', 'https://kokku-online.de/sachse-stollen-veganer-dinkelstollen-mit-schokodrops-ohne-/'])
test

https://kokku-online.de/bio4you-eisbonbon/
https://kokku-online.de/sachse-stollen-veganer-dinkelstollen-mit-schokodrops-ohne-/


[['1.49 €', '19.99 €'],
 ['1.99€/100g', '19.99€/kg'],
 ['Brennwert1.632 kJ / 384 kcalFett< 0,1g- davon gesättigte Fettsäuren< 0,1gKohlenhydrate94g- davon Zucker69gEiweiß< 0,1gSalz< 0,01g',
  'Brennwert1763 kJ / 421 kcalFett20,3g- davon gesättigte Fettsäuren10,0gKohlenhydrate52,5g- davon Zucker30,2gEiweiß6,0gSalz0,2g'],
 ['https://kokku-online.de//bilder/350x350/19266/bio4you-eisbonbon.auto',
  'https://kokku-online.de//bilder/350x350/7806/sachse-stollen-veganer-dinkelstollen-mit-schokodrops-ohne-.auto']]

In [132]:
def creating_df(links=[], not_in=[]):
    
    result = scraping_details(links=links)
    
    details = pd.DataFrame(
    {"links": links,
     "price": result[0],
     "price_gr": result[1],
     "nutritions": result[2],
     "photo_link": result[3]}
        )
    return details

In [133]:
creating_df(links=['https://kokku-online.de/bio4you-eisbonbon/'])

https://kokku-online.de/bio4you-eisbonbon/


Unnamed: 0,links,price,price_gr,nutritions,photo_link
0,https://kokku-online.de/bio4you-eisbonbon/,1.49 €,1.99€/100g,"Brennwert1.632 kJ / 384 kcalFett< 0,1g- davon ...",https://kokku-online.de//bilder/350x350/19266/...


In [134]:
creating_df(links=['https://kokku-online.de/sachse-stollen-veganer-dinkelstollen-mit-schokodrops-ohne-/'])

https://kokku-online.de/sachse-stollen-veganer-dinkelstollen-mit-schokodrops-ohne-/


Unnamed: 0,links,price,price_gr,nutritions,photo_link
0,https://kokku-online.de/sachse-stollen-veganer...,19.99 €,19.99€/kg,"Brennwert1763 kJ / 421 kcalFett20,3g- davon ge...",https://kokku-online.de//bilder/350x350/7806/s...


**Deleting rows with too much info missing**

In [144]:
product_overview.index[product_overview['link']=='https://kokku-online.de/vegane-suessigkeiten-snacks/']

Int64Index([547], dtype='int64')

In [143]:
product_overview = product_overview.drop(product_overview.index[547]) #https://kokku-online.de/vegane-suessigkeiten-snacks/ 

In [None]:
len(product_overview)

**Creating data frame**

In [141]:
products_kokku = creating_df(links=product_overview['link'], not_in=['https://kokku-online.de/vegane-suessigkeiten-snacks/'])

https://kokku-online.de/bio4you-eisbonbon/
https://kokku-online.de/sachse-stollen-stollenkonfekt/
https://kokku-online.de/sachse-stollen-veganer-dinkelstollen-mit-schokodrops-ohne-/
https://kokku-online.de/sachse-stollen-3-stollenscheiben-dinkel-mit-schokodrops-ohne-ros/
https://kokku-online.de/sachse-stollen-dinkelstollen-rosine/
https://kokku-online.de/sachse-stollen-3-stollenscheiben-dinkel-rosine/
https://kokku-online.de/sachse-stollen-dinkelstollen-schoko-mailaender-art-mit-schokodro/
https://kokku-online.de/sachse-stollen-3-stollenscheiben-dinkel-mailaender-art-mit-schok/
https://kokku-online.de/bernsteinzimmer-12er-konfekt-winterwonne/
https://kokku-online.de/bernsteinzimmer-konfekt-adventskalender/
https://kokku-online.de/rosengarten-zartbitter-ingwer/
https://kokku-online.de/moo-free-olivia-der-baer-weisse-schokolade/
https://kokku-online.de/moo-free-snowball-weisse-schokolade-mit-marshmallow-und-honeycom/
https://kokku-online.de/moo-free-oscar-der-baer-vegane-schokolade/
http

ValueError: array length 552 does not match index length 553

In [162]:
products_kokku

Unnamed: 0,links,price,weight,kcal,fat,sat_fat,carbs,sugar,fibre,protein,salt,ingredients,photo_link
0,https://www.korodrogerie.de/soja-protein-crisp...,"\n\n14,00 €\n",1,1535 / 362,"1,9 g","0,4 g","28,2 g","9,1 g","1,8 g",58 g,"2,7 g",7,https://koro2.imgix.net/media/image/f1/50/81/C...
1,https://www.korodrogerie.de/schokodrops-mit-xy...,"\n\n21,00 €\n",1,2290 / 555,46 g,28 g,31 g,"1,0 g",10 g,"9,2 g","0,07 g",K,https://koro2.imgix.net/media/image/2c/01/f8/S...
2,https://www.korodrogerie.de/schoko-protein-cru...,"\n\n20,00 €\n",1,1875 / 448,28 g,12 g,37 g,"1,4 g","8,3 g",21 g,"0,36 g",S,https://koro2.imgix.net/media/image/30/78/62/P...
3,https://www.korodrogerie.de/soja-protein-crisp...,"\n\n13,00 €\n",1,1541 / 363,"1,8 g","0,2 g",26 g,"0,8 g","0,8 g",60 g,"3,2 g",6,https://koro2.imgix.net/media/image/84/55/ca/C...
4,https://www.korodrogerie.de/bohnen-erbsen-mix-...,"\n\n11,50 €\n",1,1766 / 421,14 g,"2,3 g",35 g,"6,7 g",10 g,33 g,"1,0 g",1,https://koro2.imgix.net/media/image/a9/37/79/B...
5,https://www.korodrogerie.de/reisgebaeck-superi...,"\n\n11,50 €\n",0,1710 / 405,"6,3 g","1,2 g",78 g,13 g,"2,7 g","7,6 g","1,2 g",4,https://koro2.imgix.net/media/image/68/7c/90/R...
6,https://www.korodrogerie.de/amarena-kirschen-m...,"\n\n8,50 €\n",0,1715 / 409,17 g,11 g,58 g,45 g,"5,4 g","3,5 g","0,08 g",5,https://koro2.imgix.net/media/image/31/8f/d6/A...
7,https://www.korodrogerie.de/sonnengetrocknete-...,"\n\n12,00 €\n",1,1078 / 258,"3,0 g",0 g,44 g,38 g,12 g,14 g,"5,3 g",8,https://koro2.imgix.net/media/image/5b/aa/28/T...
8,https://www.korodrogerie.de/apfelchips-deutsch...,"\n\n7,00 €\n",0,1507 / 356,0 g,0 g,77 g,72 g,14 g,"2,2 g","0,03 g",1,https://koro2.imgix.net/media/image/95/50/b3/A...
9,https://www.korodrogerie.de/bio-rote-bete-chip...,"\n\n11,00 €\n",2,1415 / 335,0 g,0 g,63 g,54 g,17 g,11 g,"0,5 g",K,https://koro2.imgix.net/media/image/13/f2/e2/R...


In [163]:
products_kokku.to_csv('products_kokku.csv')