# Cleaning

In [1]:
import pandas as pd
import re

## Koro

In [81]:
koro = pd.read_csv('df_koro.csv')

In [60]:
koro.shape

(221, 15)

In [61]:
koro.head()

Unnamed: 0.1,Unnamed: 0,product,links,price,weight,kcal,fat,sat_fat,carbs,sugar,fibre,protein,salt,ingredients,photo_link
0,0,Soja Protein Crispies 58 % mit Kakao 1 kg,https://www.korodrogerie.de/soja-protein-crisp...,"\n\n14,00 €\n",1 kg,1535 / 362,"1,9 g","0,4 g","28,2 g","9,1 g","1,8 g",58 g,"2,7 g","74 % SOJAPROTEIN, Reisgrieß, Zucker, 4 % Kakao...",https://koro2.imgix.net/media/image/f1/50/81/C...
1,1,Schokodrops mit Xylit 1 kg,https://www.korodrogerie.de/schokodrops-mit-xy...,"\n\n21,00 €\n",1 kg,2290 / 555,46 g,28 g,31 g,"1,0 g",10 g,"9,2 g","0,07 g","Kakaomasse, 25 % Süßungsmittel: Xylit; Kakaobu...",https://koro2.imgix.net/media/image/2c/01/f8/S...
2,2,Schoko Protein Crunchies ohne Zuckerzusatz 1 kg,https://www.korodrogerie.de/schoko-protein-cru...,"\n\n20,00 €\n",1 kg,1875 / 448,28 g,12 g,37 g,"1,4 g","8,3 g",21 g,"0,36 g","Süßungsmittel: Maltit, ERDNUSSKERNE, 20 % Prot...",https://koro2.imgix.net/media/image/30/78/62/P...
3,3,Soja Protein Crispies 60 % 1 kg,https://www.korodrogerie.de/soja-protein-crisp...,"\n\n13,00 €\n",1 kg,1541 / 363,"1,8 g","0,2 g",26 g,"0,8 g","0,8 g",60 g,"3,2 g","68 % SOJAPROTEIN, Reisgriess, SOJAMEHL, Salz\t...",https://koro2.imgix.net/media/image/84/55/ca/C...
4,4,Bohnen-Erbsen-Mix geröstet & gesalzen 1 kg,https://www.korodrogerie.de/bohnen-erbsen-mix-...,"\n\n11,50 €\n",1 kg,1766 / 421,14 g,"2,3 g",35 g,"6,7 g",10 g,33 g,"1,0 g","19 % SCHWARZE SOJABOHNEN, 19 % EDAMAME-BOHNEN,...",https://koro2.imgix.net/media/image/a9/37/79/B...


### Cleaning overview

- removing duplicate index column
- new column with brand: "value == Koro" for all rows
- removing extra characters from price
- cleaning weight column / getting info about weight from product name
- removing 'g' from fat, sat_fat, carbs, sugar, fibre, protein, salt
- split kcal-column in kJ and kcal
- ingredients: use first ingredient?

Maybe:
- removing weight from product name?
- new column price/gr?

**Dropping duplicate index column**

In [62]:
koro = koro.drop('Unnamed: 0', axis=1)
koro.columns

Index(['product', 'links', 'price', 'weight', 'kcal', 'fat', 'sat_fat',
       'carbs', 'sugar', 'fibre', 'protein', 'salt', 'ingredients',
       'photo_link'],
      dtype='object')

**Clean product name**

In [142]:
def clean_name(x):
    return re.sub(r'\d{2,3}\s\w+|\d{1}\s\w{2}|\d{1}\sx', '', x)

In [143]:
koro["cleaned_name"] = koro["product"].apply(clean_name)
koro["cleaned_name"]

0            Soja Protein Crispies 58 % mit Kakao 
1                           Schokodrops mit Xylit 
2      Schoko Protein Crunchies ohne Zuckerzusatz 
3                      Soja Protein Crispies 60 % 
4           Bohnen-Erbsen-Mix geröstet & gesalzen 
                          ...                     
216                     Bio gepuffte Ananasstücke 
217          Bio gepuffte schwarze Johannisbeeren 
218                          Bio gepuffte Kirsche 
219                             Bio Cracker Pizza 
220                            Bio Cracker Pizza  
Name: cleaned_name, Length: 221, dtype: object

**Adding brand column**

In [63]:
koro['brand'] = 'Koro'
koro.head()

Unnamed: 0,product,links,price,weight,kcal,fat,sat_fat,carbs,sugar,fibre,protein,salt,ingredients,photo_link,brand
0,Soja Protein Crispies 58 % mit Kakao 1 kg,https://www.korodrogerie.de/soja-protein-crisp...,"\n\n14,00 €\n",1 kg,1535 / 362,"1,9 g","0,4 g","28,2 g","9,1 g","1,8 g",58 g,"2,7 g","74 % SOJAPROTEIN, Reisgrieß, Zucker, 4 % Kakao...",https://koro2.imgix.net/media/image/f1/50/81/C...,Koro
1,Schokodrops mit Xylit 1 kg,https://www.korodrogerie.de/schokodrops-mit-xy...,"\n\n21,00 €\n",1 kg,2290 / 555,46 g,28 g,31 g,"1,0 g",10 g,"9,2 g","0,07 g","Kakaomasse, 25 % Süßungsmittel: Xylit; Kakaobu...",https://koro2.imgix.net/media/image/2c/01/f8/S...,Koro
2,Schoko Protein Crunchies ohne Zuckerzusatz 1 kg,https://www.korodrogerie.de/schoko-protein-cru...,"\n\n20,00 €\n",1 kg,1875 / 448,28 g,12 g,37 g,"1,4 g","8,3 g",21 g,"0,36 g","Süßungsmittel: Maltit, ERDNUSSKERNE, 20 % Prot...",https://koro2.imgix.net/media/image/30/78/62/P...,Koro
3,Soja Protein Crispies 60 % 1 kg,https://www.korodrogerie.de/soja-protein-crisp...,"\n\n13,00 €\n",1 kg,1541 / 363,"1,8 g","0,2 g",26 g,"0,8 g","0,8 g",60 g,"3,2 g","68 % SOJAPROTEIN, Reisgriess, SOJAMEHL, Salz\t...",https://koro2.imgix.net/media/image/84/55/ca/C...,Koro
4,Bohnen-Erbsen-Mix geröstet & gesalzen 1 kg,https://www.korodrogerie.de/bohnen-erbsen-mix-...,"\n\n11,50 €\n",1 kg,1766 / 421,14 g,"2,3 g",35 g,"6,7 g",10 g,33 g,"1,0 g","19 % SCHWARZE SOJABOHNEN, 19 % EDAMAME-BOHNEN,...",https://koro2.imgix.net/media/image/a9/37/79/B...,Koro


**Cleaning price column**

In [122]:
def clean_price(col):
    
    cleaned = []
    
    for p in col:
        price = str(p).replace('\n', '').replace('€', '').replace(',', '.')
        cleaned.append(float(price))   
        
    return cleaned

In [124]:
koro['cleaned_price'] = clean_price(koro['price'])

In [126]:
koro['cleaned_price']

0      14.0
1      21.0
2      20.0
3      13.0
4      11.5
       ... 
216    18.5
217    24.0
218    20.0
219     5.0
220    27.5
Name: cleaned_price, Length: 221, dtype: float64

**Cleaning weight column**

In [49]:
koro['weight'].value_counts

<bound method IndexOpsMixin.value_counts of 0         1 kg
1         1 kg
2         1 kg
3         1 kg
4         1 kg
        ...   
216      Polen
217      Polen
218      Polen
219    Italien
220    Italien
Name: weight, Length: 221, dtype: object>

In [140]:
def clean_weight(x):
    return re.findall(r'\d{2,3}\s\w+|\d{1}\s\w{2}|\d{1}\sx', '', x)

In [145]:
weights = []
for p in koro['product']:
    weight = clean_weight(p)
    weights.append(weight)

TypeError: unsupported operand type(s) for &: 'str' and 'int'

**Cleaning nutrient columns**

In [133]:
def nutrient_cleaner(col):
    
    cleaned = []
    
    for n in col:
        n = str(n).replace('g', '').replace(',', '')
        cleaned.append(float(n))
    
    return cleaned

In [134]:
koro['cleaned_fat'] = nutrient_cleaner(koro['fat'])
koro['cleaned_fat']

0      19.0
1      46.0
2      28.0
3      18.0
4      14.0
       ... 
216     0.0
217    23.0
218     5.0
219    13.0
220    13.0
Name: cleaned_fat, Length: 221, dtype: float64

In [135]:
def batch_cleaning_nutrients(df, in_columns=[]):
    
    for col in df.columns:
        if col in in_columns:
            
            df[col] = nutrient_cleaner(df[col])
            
    return df

In [137]:
batch_cleaning_nutrients(koro, in_columns=['fat','sat_fat', 'carbs', 'sugar', 'fibre', 'protein', 'salt'])

Unnamed: 0.1,Unnamed: 0,product,links,price,weight,kcal,fat,sat_fat,carbs,sugar,fibre,protein,salt,ingredients,photo_link,price_cleaned,cleaned_price,cleaned_fat
0,0.0,Soja Protein Crispies 58 % mit Kakao 1 kg,https://www.korodrogerie.de/soja-protein-crisp...,"\n\n14,00 €\n",1 kg,1535 / 362,19.0,4.0,282.0,91.0,18.0,58.0,27.0,"74 % SOJAPROTEIN, Reisgrieß, Zucker, 4 % Kakao...",https://koro2.imgix.net/media/image/f1/50/81/C...,"\n\n14,00 €\n",14.0,19.0
1,1.0,Schokodrops mit Xylit 1 kg,https://www.korodrogerie.de/schokodrops-mit-xy...,"\n\n21,00 €\n",1 kg,2290 / 555,46.0,28.0,31.0,10.0,10.0,92.0,7.0,"Kakaomasse, 25 % Süßungsmittel: Xylit; Kakaobu...",https://koro2.imgix.net/media/image/2c/01/f8/S...,"\n\n21,00 €\n",21.0,46.0
2,2.0,Schoko Protein Crunchies ohne Zuckerzusatz 1 kg,https://www.korodrogerie.de/schoko-protein-cru...,"\n\n20,00 €\n",1 kg,1875 / 448,28.0,12.0,37.0,14.0,83.0,21.0,36.0,"Süßungsmittel: Maltit, ERDNUSSKERNE, 20 % Prot...",https://koro2.imgix.net/media/image/30/78/62/P...,"\n\n20,00 €\n",20.0,28.0
3,3.0,Soja Protein Crispies 60 % 1 kg,https://www.korodrogerie.de/soja-protein-crisp...,"\n\n13,00 €\n",1 kg,1541 / 363,18.0,2.0,26.0,8.0,8.0,60.0,32.0,"68 % SOJAPROTEIN, Reisgriess, SOJAMEHL, Salz\t...",https://koro2.imgix.net/media/image/84/55/ca/C...,"\n\n13,00 €\n",13.0,18.0
4,4.0,Bohnen-Erbsen-Mix geröstet & gesalzen 1 kg,https://www.korodrogerie.de/bohnen-erbsen-mix-...,"\n\n11,50 €\n",1 kg,1766 / 421,14.0,23.0,35.0,67.0,10.0,33.0,10.0,"19 % SCHWARZE SOJABOHNEN, 19 % EDAMAME-BOHNEN,...",https://koro2.imgix.net/media/image/a9/37/79/B...,"\n\n11,50 €\n",11.5,14.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
216,216.0,Bio gepuffte Ananasstücke 500 g,https://www.korodrogerie.de/bio-gepuffte-anana...,"\n\n18,50 €\n",Polen,1612 / 380,0.0,0.0,87.0,85.0,79.0,30.0,0.0,"KoRo Handels GmbHKoppenplatz 9, 10115 Berlin",https://koro2.imgix.net/media/image/b8/cc/8c/B...,"\n\n18,50 €\n",18.5,0.0
217,217.0,Bio gepuffte schwarze Johannisbeeren 500 g,https://www.korodrogerie.de/bio-gepuffte-schwa...,"\n\n24,00 €\n",Polen,1336 / 319,23.0,4.0,54.0,44.0,31.0,50.0,2.0,"KoRo Handels GmbHKoppenplatz 9, 10115 Berlin",https://koro2.imgix.net/media/image/47/fa/41/B...,"\n\n24,00 €\n",24.0,23.0
218,218.0,Bio gepuffte Kirsche 500 g,https://www.korodrogerie.de/bio-gepuffte-kirsc...,"\n\n20,00 €\n",Polen,1472 / 348,5.0,2.0,75.0,41.0,95.0,60.0,2.0,"KoRo Handels GmbHKoppenplatz 9, 10115 Berlin",https://koro2.imgix.net/media/image/38/f1/16/P...,"\n\n20,00 €\n",20.0,5.0
219,219.0,Bio Cracker Pizza 500 g,https://www.korodrogerie.de/bio-cracker-pizza-...,"\n\n5,00 €\n",Italien,1830 / 435,13.0,11.0,67.0,23.0,29.0,10.0,29.0,"KoRo Handels GmbHKoppenplatz 9, 10115 Berlin",https://koro2.imgix.net/media/image/7e/83/e0/C...,"\n\n5,00 €\n",5.0,13.0


**Splitting kcal column**

## Kokku