# Web Scraping

## Importing libraries

In [28]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

## Overview 

|	Produktname	|	Marke	|	Preis	|	Gewicht	|	Brennwert	|	Fett	|	gesättigte Fettsäuren	|	Kohlehydrate	|	Zucker	|	Ballaststoffe	|	Eiweiß	|	Salz	|	Zutaten	|	Fotolink	|
|	:----------	|	:----------	|	:---------:	|	:---------:	|	:---------:	|	:---------:	|	:---------:	|	:---------:	|	:---------:	|	:---------:	|	:---------:	|	:---------:	|	:---------:	|	:---------:	|
|	product name	|	brand	|	price	|	weight	|		|	fat	|	saturated fat	|	carbs	|	sugar	|		|	protein	|	salt	|	ingredients	|	picture link	|
|	e.g. Snack bar	|		|	in EUR	|	in gr/kg	|	in kj/kcal	|	in gr	|	in gr	|	in gr	|	in gr	|	in gr	|	in gr	|	in gr	|		|		|

**List of websites:**
- Koro: https://www.korodrogerie.de/snacks/?p=1&o=2&n=144&f=233
- Vantastic foods: https://www.vantastic-foods.com/vegane-lebensmittel/snacks-und-suesswaren
- Kokku: https://kokku-online.de/vegane-suessigkeiten-snacks/
- Foodist: https://www.foodist.de/suesses-snacks?p=1&o=9&n=84&f=36

## Web Scraping

### Getting started

In [100]:
# storing url in a variable
url = "https://kokku-online.de/vegane-suessigkeiten-snacks/"

# downloading html-code with a get request
response = requests.get(url)
response.status_code

200

In [101]:
# parsing html (creating the 'soup')
soup = BeautifulSoup(response.content, "html.parser")

In [102]:
# retrieving/extracting the desired info

# general product info
product_info = soup.find_all("div", class_="p")

In [103]:
# product links
product_info[0].a.get("href")

'https://kokku-online.de/bio4you-eisbonbon/'

In [106]:
# brand
brand = soup.find_all("div", class_="pd")
brand[0].span.getText()

'Bio4You'

In [42]:
name = soup.find_all("span", class_="name")
name[0]#.getText()

<span class="name">Eisbonbon <span class="size">- 75g</span></span>

### Product links + first info

In [107]:
def get_first_details(soup):
    
    link = soup.find_all("div", class_="p")
    name = soup.find_all("span", class_="name")
    brand = soup.find_all("div", class_="pd")
    weight = soup.find_all("span", class_="size")
    
    names = []
    brands = []
    weights = []
    links = []
    
    
    for p in name:
        name = p.getText()
        names.append(name)
        
    for p in brand:
        brand = p.span.getText()
        brands.append(brand)
        
    for p in weight:
        weight = p.getText()
        weights.append(weight)
    
    for p in link:
        link = p.a.get("href")
        links.append(link)
    
    return [names, brands, weights, links]

In [108]:
details = get_first_details(soup)

In [109]:
len(details[0]), len(details[1]), len(details[2]), len(details[3])

(555, 555, 555, 555)

In [23]:
links = details[0]
names = details[1]

In [157]:
# Building a dataframe

product_overview = pd.DataFrame(
    {"product": details[0],
     "brand": details[1],
     "weight": details[2],
     "links": details[3]})

In [158]:
product_overview

Unnamed: 0,product,brand,weight,links
0,Eisbonbon - 75g,Bio4You,- 75g,https://kokku-online.de/bio4you-eisbonbon/
1,Stollenkonfekt - 100g,Bäckerei Sachse,- 100g,https://kokku-online.de/sachse-stollen-stollen...
2,Veganer °Schokodrops° Dinkelstollen mit Puderz...,Bäckerei Sachse,- 1kg,https://kokku-online.de/sachse-stollen-veganer...
3,3 Stollenscheiben °Schokodrops° (ohne Rosinen)...,Bäckerei Sachse,- 250g,https://kokku-online.de/sachse-stollen-3-stoll...
4,Veganer °Rosinen° Dinkelstollen - 1kg,Bäckerei Sachse,- 1kg,https://kokku-online.de/sachse-stollen-dinkels...
...,...,...,...,...
550,Lutscher °Himbeere° - 12g,Candy Tree,- 12g,https://kokku-online.de/candy-tree-lutscher-hi...
551,Lakritz Schlangen - extra lang - 56g,Terrasana,- 56g,https://kokku-online.de/terrasana-lakritz-schl...
552,MINI VEGO Haselnuss Schokoriegel - 65g,Vego Chocolate,- 65g,https://kokku-online.de/vego-chocolate-vego-wh...
553,Soy Jerky Original - 70g,Vantastic Foods,- 70g,https://kokku-online.de/vantastic-foods-soy-je...


### Product details

In [117]:
# storing url in a variable
url = "https://kokku-online.de/bio4you-eisbonbon/"

# downloading html-code with a get request
response = requests.get(url)
response.status_code

200

In [118]:
soup = BeautifulSoup(response.content, "html.parser")

In [119]:
# price
price = soup.find_all("div", class_="preis")
price[0].span.getText()

'1.49 €'

In [120]:
# price gramms
price_gr = soup.find_all("div", class_="st")
price_gr[0].span.getText()

'1.99€/100g'

In [123]:
# nutrions
nutritions = soup.find_all("tbody")
nutritions[0].getText()

'Brennwert1.632 kJ / 384 kcalFett< 0,1g- davon gesättigte Fettsäuren< 0,1gKohlenhydrate94g- davon Zucker69gEiweiß< 0,1gSalz< 0,01g'

In [None]:
# ingredients
#ingredients = soup.find_all("div")

In [124]:
# photo
photo = soup.find_all("div", class_="pics")
photo[0].meta.get("content")

'https://kokku-online.de//bilder/350x350/19266/bio4you-eisbonbon.auto'

In [73]:
url = 'https://kokku-online.de/bio4you-eisbonbon/'
response = requests.get(url)
soup = BeautifulSoup(response.content, "html.parser")

#### Function to scrape details

In [130]:
def scraping_details(links=[], not_in=[]):
    
    
    lst_price = []
    lst_price_gr = []
    lst_nutritions = []
    lst_ingredients = []
    lst_photos = []

    for l in links:
        
        if l in not_in:
            continue
        
        else:
            
            try:

                url = l
                response = requests.get(url)

                soup = BeautifulSoup(response.content, "html.parser")

                # price
                price = soup.find_all("div", class_="preis")
                pr = price[0].span.getText()

                # price gramms
                price_gr = soup.find_all("div", class_="st")
                pr_gr = price_gr[0].span.getText()

                # nutrions
                nutritions = soup.find_all("tbody")
                nutr = nutritions[0].getText()

                # ingredients
                #ingredients = soup.find_all("div")
                
                # photo
                photo = soup.find_all("div", class_="pics")
                photo_link = photo[0].meta.get("content")


                lst_price.append(pr)
                lst_price_gr.append(pr_gr)
                lst_nutritions.append(nutr)
                #lst_ingredients.append(ingredients[0])
                lst_photos.append(photo_link)

                print(l)

            except IndexError: 

                print("error for: ", l)
        
        
    return [lst_price, lst_price_gr, lst_nutritions, lst_photos]


In [131]:
test = scraping_details(links=['https://kokku-online.de/bio4you-eisbonbon/', 'https://kokku-online.de/sachse-stollen-veganer-dinkelstollen-mit-schokodrops-ohne-/'])
test

https://kokku-online.de/bio4you-eisbonbon/
https://kokku-online.de/sachse-stollen-veganer-dinkelstollen-mit-schokodrops-ohne-/


[['1.49 €', '19.99 €'],
 ['1.99€/100g', '19.99€/kg'],
 ['Brennwert1.632 kJ / 384 kcalFett< 0,1g- davon gesättigte Fettsäuren< 0,1gKohlenhydrate94g- davon Zucker69gEiweiß< 0,1gSalz< 0,01g',
  'Brennwert1763 kJ / 421 kcalFett20,3g- davon gesättigte Fettsäuren10,0gKohlenhydrate52,5g- davon Zucker30,2gEiweiß6,0gSalz0,2g'],
 ['https://kokku-online.de//bilder/350x350/19266/bio4you-eisbonbon.auto',
  'https://kokku-online.de//bilder/350x350/7806/sachse-stollen-veganer-dinkelstollen-mit-schokodrops-ohne-.auto']]

#### Function to create dataframe

In [132]:
def creating_df(links=[], not_in=[]):
    
    result = scraping_details(links=links)
    
    details = pd.DataFrame(
    {"links": links,
     "price": result[0],
     "price_gr": result[1],
     "nutritions": result[2],
     "photo_link": result[3]}
        )
    return details

In [133]:
creating_df(links=['https://kokku-online.de/bio4you-eisbonbon/'])

https://kokku-online.de/bio4you-eisbonbon/


Unnamed: 0,links,price,price_gr,nutritions,photo_link
0,https://kokku-online.de/bio4you-eisbonbon/,1.49 €,1.99€/100g,"Brennwert1.632 kJ / 384 kcalFett< 0,1g- davon ...",https://kokku-online.de//bilder/350x350/19266/...


In [134]:
creating_df(links=['https://kokku-online.de/sachse-stollen-veganer-dinkelstollen-mit-schokodrops-ohne-/'])

https://kokku-online.de/sachse-stollen-veganer-dinkelstollen-mit-schokodrops-ohne-/


Unnamed: 0,links,price,price_gr,nutritions,photo_link
0,https://kokku-online.de/sachse-stollen-veganer...,19.99 €,19.99€/kg,"Brennwert1763 kJ / 421 kcalFett20,3g- davon ge...",https://kokku-online.de//bilder/350x350/7806/s...


**Deleting rows with too much info missing**

In [160]:
product_overview.index[product_overview['links']=='https://kokku-online.de/vegane-suessigkeiten-snacks/']

Int64Index([508, 547], dtype='int64')

In [161]:
product_overview = product_overview.drop(product_overview.index[508]) #https://kokku-online.de/vegane-suessigkeiten-snacks/ 

In [162]:
len(product_overview)

554

In [163]:
product_overview.index[product_overview['links']=='https://kokku-online.de/vegane-suessigkeiten-snacks/']

Int64Index([547], dtype='int64')

In [164]:
product_overview = product_overview.drop(product_overview.index[547]) #https://kokku-online.de/vegane-suessigkeiten-snacks/

In [165]:
product_overview.index[product_overview['links']=='https://kokku-online.de/vegane-suessigkeiten-snacks/']

Int64Index([547], dtype='int64')

**Creating data frame**

In [150]:
products_kokku = creating_df(links=product_overview['links'][0:546], not_in=['https://kokku-online.de/vegane-suessigkeiten-snacks/'])

https://kokku-online.de/bio4you-eisbonbon/
https://kokku-online.de/sachse-stollen-stollenkonfekt/
https://kokku-online.de/sachse-stollen-veganer-dinkelstollen-mit-schokodrops-ohne-/
https://kokku-online.de/sachse-stollen-3-stollenscheiben-dinkel-mit-schokodrops-ohne-ros/
https://kokku-online.de/sachse-stollen-dinkelstollen-rosine/
https://kokku-online.de/sachse-stollen-3-stollenscheiben-dinkel-rosine/
https://kokku-online.de/sachse-stollen-dinkelstollen-schoko-mailaender-art-mit-schokodro/
https://kokku-online.de/sachse-stollen-3-stollenscheiben-dinkel-mailaender-art-mit-schok/
https://kokku-online.de/bernsteinzimmer-12er-konfekt-winterwonne/
https://kokku-online.de/bernsteinzimmer-konfekt-adventskalender/
https://kokku-online.de/rosengarten-zartbitter-ingwer/
https://kokku-online.de/moo-free-olivia-der-baer-weisse-schokolade/
https://kokku-online.de/moo-free-snowball-weisse-schokolade-mit-marshmallow-und-honeycom/
https://kokku-online.de/moo-free-oscar-der-baer-vegane-schokolade/
http

In [151]:
products_kokku

Unnamed: 0,links,price,price_gr,nutritions,photo_link
0,https://kokku-online.de/bio4you-eisbonbon/,1.49 €,1.99€/100g,"Brennwert1.632 kJ / 384 kcalFett< 0,1g- davon ...",https://kokku-online.de//bilder/350x350/19266/...
1,https://kokku-online.de/sachse-stollen-stollen...,2.99 €,2.99€/100g,"Brennwert1912 kJ / 457 kcalFett26,7g- davon ge...",https://kokku-online.de//bilder/350x350/12280/...
2,https://kokku-online.de/sachse-stollen-veganer...,19.99 €,19.99€/kg,"Brennwert1763 kJ / 421 kcalFett20,3g- davon ge...",https://kokku-online.de//bilder/350x350/7806/s...
3,https://kokku-online.de/sachse-stollen-3-stoll...,4.99 €,2.00€/100g,"Brennwert1763 kJ / 421 kcalFett20,3g- davon ge...",https://kokku-online.de//bilder/350x350/6140/s...
4,https://kokku-online.de/sachse-stollen-dinkels...,19.99 €,19.99€/kg,"Brennwert1572 kJ / 375 kcalFett14,915,1g- davo...",https://kokku-online.de//bilder/350x350/6167/s...
...,...,...,...,...,...
542,https://kokku-online.de/rapunzel-sesamini-schoko/,0.59 €,2.19€/100g,"Brennwert2065 kJ / 495 kcalFett28,8g- davon ge...",https://kokku-online.de//bilder/350x350/7609/r...
543,https://kokku-online.de/rapunzel-rumba-puffrei...,1.29 €,2.58€/100g,Brennwert2208 kJ / 530 kcalFett33g- davon gesä...,https://kokku-online.de//bilder/350x350/7593/r...
544,https://kokku-online.de/zotter-kakao-nibs-natur/,4.19 €,4.19€/100g,Brennwert2546 kJ / 617 kcalFett54g- davon gesä...,https://kokku-online.de//bilder/350x350/16299/...
545,https://kokku-online.de/xyli-gum-kaugummi-icem...,1.29 €,7.59€/100g,Brennwert724 kJ / 173 kcalFett0g- davon gesätt...,https://kokku-online.de//bilder/350x350/7382/x...


In [152]:
products_kokku.to_csv('products_kokku.csv')

In [153]:
products_kokku_2 = creating_df(links=product_overview['links'][548:])

https://kokku-online.de/vego-chocolate-vego-whole-hazelnut-chocolate-bar-mini/
https://kokku-online.de/vantastic-foods-soy-jerky-original/
https://kokku-online.de/vantastic-foods-schakalode-monsters-schokolinsen/


### Merging data frames

In [156]:
kokku = pd.concat([products_kokku, products_kokku_2], axis=0)
kokku

Unnamed: 0,links,price,price_gr,nutritions,photo_link
0,https://kokku-online.de/bio4you-eisbonbon/,1.49 €,1.99€/100g,"Brennwert1.632 kJ / 384 kcalFett< 0,1g- davon ...",https://kokku-online.de//bilder/350x350/19266/...
1,https://kokku-online.de/sachse-stollen-stollen...,2.99 €,2.99€/100g,"Brennwert1912 kJ / 457 kcalFett26,7g- davon ge...",https://kokku-online.de//bilder/350x350/12280/...
2,https://kokku-online.de/sachse-stollen-veganer...,19.99 €,19.99€/kg,"Brennwert1763 kJ / 421 kcalFett20,3g- davon ge...",https://kokku-online.de//bilder/350x350/7806/s...
3,https://kokku-online.de/sachse-stollen-3-stoll...,4.99 €,2.00€/100g,"Brennwert1763 kJ / 421 kcalFett20,3g- davon ge...",https://kokku-online.de//bilder/350x350/6140/s...
4,https://kokku-online.de/sachse-stollen-dinkels...,19.99 €,19.99€/kg,"Brennwert1572 kJ / 375 kcalFett14,915,1g- davo...",https://kokku-online.de//bilder/350x350/6167/s...
...,...,...,...,...,...
545,https://kokku-online.de/xyli-gum-kaugummi-icem...,1.29 €,7.59€/100g,Brennwert724 kJ / 173 kcalFett0g- davon gesätt...,https://kokku-online.de//bilder/350x350/7382/x...
546,https://kokku-online.de/xyli-gum-kaugummi-pfef...,1.29 €,7.59€/100g,Brennwert724 kJ / 173 kcalFett0g- davon gesätt...,https://kokku-online.de//bilder/350x350/7381/x...
552,https://kokku-online.de/vego-chocolate-vego-wh...,1.99 €,3.06€/100g,"Brennwert2456 kJ / 591 kcalFett40,8g- davon ge...",https://kokku-online.de//bilder/350x350/2454/v...
553,https://kokku-online.de/vantastic-foods-soy-je...,2.99 €,4.27€/100g,"Brennwert1123 kJ / 267 kcalFett7,1g- davon ges...",https://kokku-online.de//bilder/350x350/10296/...


In [166]:
df_kokku = pd.merge(product_overview, kokku, on='links', how='inner')
df_kokku

Unnamed: 0,product,brand,weight,links,price,price_gr,nutritions,photo_link
0,Eisbonbon - 75g,Bio4You,- 75g,https://kokku-online.de/bio4you-eisbonbon/,1.49 €,1.99€/100g,"Brennwert1.632 kJ / 384 kcalFett< 0,1g- davon ...",https://kokku-online.de//bilder/350x350/19266/...
1,Stollenkonfekt - 100g,Bäckerei Sachse,- 100g,https://kokku-online.de/sachse-stollen-stollen...,2.99 €,2.99€/100g,"Brennwert1912 kJ / 457 kcalFett26,7g- davon ge...",https://kokku-online.de//bilder/350x350/12280/...
2,Veganer °Schokodrops° Dinkelstollen mit Puderz...,Bäckerei Sachse,- 1kg,https://kokku-online.de/sachse-stollen-veganer...,19.99 €,19.99€/kg,"Brennwert1763 kJ / 421 kcalFett20,3g- davon ge...",https://kokku-online.de//bilder/350x350/7806/s...
3,3 Stollenscheiben °Schokodrops° (ohne Rosinen)...,Bäckerei Sachse,- 250g,https://kokku-online.de/sachse-stollen-3-stoll...,4.99 €,2.00€/100g,"Brennwert1763 kJ / 421 kcalFett20,3g- davon ge...",https://kokku-online.de//bilder/350x350/6140/s...
4,Veganer °Rosinen° Dinkelstollen - 1kg,Bäckerei Sachse,- 1kg,https://kokku-online.de/sachse-stollen-dinkels...,19.99 €,19.99€/kg,"Brennwert1572 kJ / 375 kcalFett14,915,1g- davo...",https://kokku-online.de//bilder/350x350/6167/s...
...,...,...,...,...,...,...,...,...
544,Kaugummi Icemint - 17g,XyliGum,- 17g,https://kokku-online.de/xyli-gum-kaugummi-icem...,1.29 €,7.59€/100g,Brennwert724 kJ / 173 kcalFett0g- davon gesätt...,https://kokku-online.de//bilder/350x350/7382/x...
545,Kaugummi Peppermint - 17g,XyliGum,- 17g,https://kokku-online.de/xyli-gum-kaugummi-pfef...,1.29 €,7.59€/100g,Brennwert724 kJ / 173 kcalFett0g- davon gesätt...,https://kokku-online.de//bilder/350x350/7381/x...
546,MINI VEGO Haselnuss Schokoriegel - 65g,Vego Chocolate,- 65g,https://kokku-online.de/vego-chocolate-vego-wh...,1.99 €,3.06€/100g,"Brennwert2456 kJ / 591 kcalFett40,8g- davon ge...",https://kokku-online.de//bilder/350x350/2454/v...
547,Soy Jerky Original - 70g,Vantastic Foods,- 70g,https://kokku-online.de/vantastic-foods-soy-je...,2.99 €,4.27€/100g,"Brennwert1123 kJ / 267 kcalFett7,1g- davon ges...",https://kokku-online.de//bilder/350x350/10296/...


In [167]:
df_kokku.to_csv('df_kokku.csv')