# Cleaning

In [1]:
import pandas as pd
import re

## Koro

In [2]:
koro = pd.read_csv('df_koro.csv')

In [3]:
koro.shape

(221, 15)

In [4]:
koro.head()

Unnamed: 0.1,Unnamed: 0,product,links,price,weight,kcal,fat,sat_fat,carbs,sugar,fibre,protein,salt,ingredients,photo_link
0,0,Soja Protein Crispies 58 % mit Kakao 1 kg,https://www.korodrogerie.de/soja-protein-crisp...,"\n\n14,00 €\n",1 kg,1535 / 362,"1,9 g","0,4 g","28,2 g","9,1 g","1,8 g",58 g,"2,7 g","74 % SOJAPROTEIN, Reisgrieß, Zucker, 4 % Kakao...",https://koro2.imgix.net/media/image/f1/50/81/C...
1,1,Schokodrops mit Xylit 1 kg,https://www.korodrogerie.de/schokodrops-mit-xy...,"\n\n21,00 €\n",1 kg,2290 / 555,46 g,28 g,31 g,"1,0 g",10 g,"9,2 g","0,07 g","Kakaomasse, 25 % Süßungsmittel: Xylit; Kakaobu...",https://koro2.imgix.net/media/image/2c/01/f8/S...
2,2,Schoko Protein Crunchies ohne Zuckerzusatz 1 kg,https://www.korodrogerie.de/schoko-protein-cru...,"\n\n20,00 €\n",1 kg,1875 / 448,28 g,12 g,37 g,"1,4 g","8,3 g",21 g,"0,36 g","Süßungsmittel: Maltit, ERDNUSSKERNE, 20 % Prot...",https://koro2.imgix.net/media/image/30/78/62/P...
3,3,Soja Protein Crispies 60 % 1 kg,https://www.korodrogerie.de/soja-protein-crisp...,"\n\n13,00 €\n",1 kg,1541 / 363,"1,8 g","0,2 g",26 g,"0,8 g","0,8 g",60 g,"3,2 g","68 % SOJAPROTEIN, Reisgriess, SOJAMEHL, Salz\t...",https://koro2.imgix.net/media/image/84/55/ca/C...
4,4,Bohnen-Erbsen-Mix geröstet & gesalzen 1 kg,https://www.korodrogerie.de/bohnen-erbsen-mix-...,"\n\n11,50 €\n",1 kg,1766 / 421,14 g,"2,3 g",35 g,"6,7 g",10 g,33 g,"1,0 g","19 % SCHWARZE SOJABOHNEN, 19 % EDAMAME-BOHNEN,...",https://koro2.imgix.net/media/image/a9/37/79/B...


### Cleaning overview

- [x] removing duplicate index column
- [x] removing weight from product name
- [x] new column with brand: "value == Koro" for all rows
- [x] removing extra characters from price
- [ ] cleaning weight column / getting info about weight from product name
- [x] removing 'g' from fat, sat_fat, carbs, sugar, fibre, protein, salt
- [x] split kcal-column in kJ and kcal

Maybe:

- [ ] new column price/gr?
- [ ] ingredients: use first ingredient?

**Dropping duplicate index column**

In [5]:
koro = koro.drop('Unnamed: 0', axis=1)
koro.columns

Index(['product', 'links', 'price', 'weight', 'kcal', 'fat', 'sat_fat',
       'carbs', 'sugar', 'fibre', 'protein', 'salt', 'ingredients',
       'photo_link'],
      dtype='object')

**Clean product name**

In [6]:
def clean_name(x):
    return re.sub(r'\d{2,3}\s\w+|\d{1}\s\w{2}|\d{1}\sx', '', x)

In [7]:
koro["product_clean"] = koro["product"].apply(clean_name)
koro["product_clean"]

0            Soja Protein Crispies 58 % mit Kakao 
1                           Schokodrops mit Xylit 
2      Schoko Protein Crunchies ohne Zuckerzusatz 
3                      Soja Protein Crispies 60 % 
4           Bohnen-Erbsen-Mix geröstet & gesalzen 
                          ...                     
216                     Bio gepuffte Ananasstücke 
217          Bio gepuffte schwarze Johannisbeeren 
218                          Bio gepuffte Kirsche 
219                             Bio Cracker Pizza 
220                            Bio Cracker Pizza  
Name: product_clean, Length: 221, dtype: object

**Adding brand column**

In [8]:
koro['brand'] = 'Koro'
koro.head()

Unnamed: 0,product,links,price,weight,kcal,fat,sat_fat,carbs,sugar,fibre,protein,salt,ingredients,photo_link,product_clean,brand
0,Soja Protein Crispies 58 % mit Kakao 1 kg,https://www.korodrogerie.de/soja-protein-crisp...,"\n\n14,00 €\n",1 kg,1535 / 362,"1,9 g","0,4 g","28,2 g","9,1 g","1,8 g",58 g,"2,7 g","74 % SOJAPROTEIN, Reisgrieß, Zucker, 4 % Kakao...",https://koro2.imgix.net/media/image/f1/50/81/C...,Soja Protein Crispies 58 % mit Kakao,Koro
1,Schokodrops mit Xylit 1 kg,https://www.korodrogerie.de/schokodrops-mit-xy...,"\n\n21,00 €\n",1 kg,2290 / 555,46 g,28 g,31 g,"1,0 g",10 g,"9,2 g","0,07 g","Kakaomasse, 25 % Süßungsmittel: Xylit; Kakaobu...",https://koro2.imgix.net/media/image/2c/01/f8/S...,Schokodrops mit Xylit,Koro
2,Schoko Protein Crunchies ohne Zuckerzusatz 1 kg,https://www.korodrogerie.de/schoko-protein-cru...,"\n\n20,00 €\n",1 kg,1875 / 448,28 g,12 g,37 g,"1,4 g","8,3 g",21 g,"0,36 g","Süßungsmittel: Maltit, ERDNUSSKERNE, 20 % Prot...",https://koro2.imgix.net/media/image/30/78/62/P...,Schoko Protein Crunchies ohne Zuckerzusatz,Koro
3,Soja Protein Crispies 60 % 1 kg,https://www.korodrogerie.de/soja-protein-crisp...,"\n\n13,00 €\n",1 kg,1541 / 363,"1,8 g","0,2 g",26 g,"0,8 g","0,8 g",60 g,"3,2 g","68 % SOJAPROTEIN, Reisgriess, SOJAMEHL, Salz\t...",https://koro2.imgix.net/media/image/84/55/ca/C...,Soja Protein Crispies 60 %,Koro
4,Bohnen-Erbsen-Mix geröstet & gesalzen 1 kg,https://www.korodrogerie.de/bohnen-erbsen-mix-...,"\n\n11,50 €\n",1 kg,1766 / 421,14 g,"2,3 g",35 g,"6,7 g",10 g,33 g,"1,0 g","19 % SCHWARZE SOJABOHNEN, 19 % EDAMAME-BOHNEN,...",https://koro2.imgix.net/media/image/a9/37/79/B...,Bohnen-Erbsen-Mix geröstet & gesalzen,Koro


**Cleaning price column**

In [9]:
def clean_price(col):
    
    cleaned = []
    
    for p in col:
        price = str(p).replace('\n', '').replace('€', '').replace(',', '.')
        cleaned.append(float(price))   
        
    return cleaned

In [10]:
koro['price_clean'] = clean_price(koro['price'])

In [11]:
koro['price_clean']

0      14.0
1      21.0
2      20.0
3      13.0
4      11.5
       ... 
216    18.5
217    24.0
218    20.0
219     5.0
220    27.5
Name: price_clean, Length: 221, dtype: float64

**Cleaning weight column**

In [12]:
koro['weight'].value_counts

<bound method IndexOpsMixin.value_counts of 0         1 kg
1         1 kg
2         1 kg
3         1 kg
4         1 kg
        ...   
216      Polen
217      Polen
218      Polen
219    Italien
220    Italien
Name: weight, Length: 221, dtype: object>

In [13]:
def clean_weight(x):
    return re.findall(r'\d{2,3}\s\w+|\d{1}\s\w{2}|\d{1}\sx', '', x)

In [14]:
weights = []
for p in koro['product']:
    weight = clean_weight(p)
    weights.append(weight)

TypeError: unsupported operand type(s) for &: 'str' and 'int'

**Cleaning nutrient columns**

In [15]:
def nutrient_cleaner(col):
    
    cleaned = []
    
    for n in col:
        n = str(n).replace('g', '').replace(',', '')
        cleaned.append(float(n))
    
    return cleaned

In [16]:
koro['fat_clean'] = nutrient_cleaner(koro['fat'])
koro['fat_clean']

0      19.0
1      46.0
2      28.0
3      18.0
4      14.0
       ... 
216     0.0
217    23.0
218     5.0
219    13.0
220    13.0
Name: fat_clean, Length: 221, dtype: float64

In [17]:
def batch_cleaning_nutrients(df, in_columns=[]):
    
    for col in df.columns:
        if col in in_columns:
            
            df[col] = nutrient_cleaner(df[col])
            
    return df

In [18]:
batch_cleaning_nutrients(koro, in_columns=['fat','sat_fat', 'carbs', 'sugar', 'fibre', 'protein', 'salt'])

Unnamed: 0,product,links,price,weight,kcal,fat,sat_fat,carbs,sugar,fibre,protein,salt,ingredients,photo_link,product_clean,brand,price_clean,fat_clean
0,Soja Protein Crispies 58 % mit Kakao 1 kg,https://www.korodrogerie.de/soja-protein-crisp...,"\n\n14,00 €\n",1 kg,1535 / 362,19.0,4.0,282.0,91.0,18.0,58.0,27.0,"74 % SOJAPROTEIN, Reisgrieß, Zucker, 4 % Kakao...",https://koro2.imgix.net/media/image/f1/50/81/C...,Soja Protein Crispies 58 % mit Kakao,Koro,14.0,19.0
1,Schokodrops mit Xylit 1 kg,https://www.korodrogerie.de/schokodrops-mit-xy...,"\n\n21,00 €\n",1 kg,2290 / 555,46.0,28.0,31.0,10.0,10.0,92.0,7.0,"Kakaomasse, 25 % Süßungsmittel: Xylit; Kakaobu...",https://koro2.imgix.net/media/image/2c/01/f8/S...,Schokodrops mit Xylit,Koro,21.0,46.0
2,Schoko Protein Crunchies ohne Zuckerzusatz 1 kg,https://www.korodrogerie.de/schoko-protein-cru...,"\n\n20,00 €\n",1 kg,1875 / 448,28.0,12.0,37.0,14.0,83.0,21.0,36.0,"Süßungsmittel: Maltit, ERDNUSSKERNE, 20 % Prot...",https://koro2.imgix.net/media/image/30/78/62/P...,Schoko Protein Crunchies ohne Zuckerzusatz,Koro,20.0,28.0
3,Soja Protein Crispies 60 % 1 kg,https://www.korodrogerie.de/soja-protein-crisp...,"\n\n13,00 €\n",1 kg,1541 / 363,18.0,2.0,26.0,8.0,8.0,60.0,32.0,"68 % SOJAPROTEIN, Reisgriess, SOJAMEHL, Salz\t...",https://koro2.imgix.net/media/image/84/55/ca/C...,Soja Protein Crispies 60 %,Koro,13.0,18.0
4,Bohnen-Erbsen-Mix geröstet & gesalzen 1 kg,https://www.korodrogerie.de/bohnen-erbsen-mix-...,"\n\n11,50 €\n",1 kg,1766 / 421,14.0,23.0,35.0,67.0,10.0,33.0,10.0,"19 % SCHWARZE SOJABOHNEN, 19 % EDAMAME-BOHNEN,...",https://koro2.imgix.net/media/image/a9/37/79/B...,Bohnen-Erbsen-Mix geröstet & gesalzen,Koro,11.5,14.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
216,Bio gepuffte Ananasstücke 500 g,https://www.korodrogerie.de/bio-gepuffte-anana...,"\n\n18,50 €\n",Polen,1612 / 380,0.0,0.0,87.0,85.0,79.0,30.0,0.0,"KoRo Handels GmbHKoppenplatz 9, 10115 Berlin",https://koro2.imgix.net/media/image/b8/cc/8c/B...,Bio gepuffte Ananasstücke,Koro,18.5,0.0
217,Bio gepuffte schwarze Johannisbeeren 500 g,https://www.korodrogerie.de/bio-gepuffte-schwa...,"\n\n24,00 €\n",Polen,1336 / 319,23.0,4.0,54.0,44.0,31.0,50.0,2.0,"KoRo Handels GmbHKoppenplatz 9, 10115 Berlin",https://koro2.imgix.net/media/image/47/fa/41/B...,Bio gepuffte schwarze Johannisbeeren,Koro,24.0,23.0
218,Bio gepuffte Kirsche 500 g,https://www.korodrogerie.de/bio-gepuffte-kirsc...,"\n\n20,00 €\n",Polen,1472 / 348,5.0,2.0,75.0,41.0,95.0,60.0,2.0,"KoRo Handels GmbHKoppenplatz 9, 10115 Berlin",https://koro2.imgix.net/media/image/38/f1/16/P...,Bio gepuffte Kirsche,Koro,20.0,5.0
219,Bio Cracker Pizza 500 g,https://www.korodrogerie.de/bio-cracker-pizza-...,"\n\n5,00 €\n",Italien,1830 / 435,13.0,11.0,67.0,23.0,29.0,10.0,29.0,"KoRo Handels GmbHKoppenplatz 9, 10115 Berlin",https://koro2.imgix.net/media/image/7e/83/e0/C...,Bio Cracker Pizza,Koro,5.0,13.0


**Splitting kcal column**

In [19]:
koro[['kj_clean','kcal_clean']] = koro['kcal'].str.split('/', expand=True)

In [20]:
koro['kj_clean'] = koro['kj_clean'].astype('float')
koro['kcal_clean'] = koro['kcal_clean'].astype('float')

In [21]:
koro.head(5)

Unnamed: 0,product,links,price,weight,kcal,fat,sat_fat,carbs,sugar,fibre,protein,salt,ingredients,photo_link,product_clean,brand,price_clean,fat_clean,kj_clean,kcal_clean
0,Soja Protein Crispies 58 % mit Kakao 1 kg,https://www.korodrogerie.de/soja-protein-crisp...,"\n\n14,00 €\n",1 kg,1535 / 362,19.0,4.0,282.0,91.0,18.0,58.0,27.0,"74 % SOJAPROTEIN, Reisgrieß, Zucker, 4 % Kakao...",https://koro2.imgix.net/media/image/f1/50/81/C...,Soja Protein Crispies 58 % mit Kakao,Koro,14.0,19.0,1535.0,362.0
1,Schokodrops mit Xylit 1 kg,https://www.korodrogerie.de/schokodrops-mit-xy...,"\n\n21,00 €\n",1 kg,2290 / 555,46.0,28.0,31.0,10.0,10.0,92.0,7.0,"Kakaomasse, 25 % Süßungsmittel: Xylit; Kakaobu...",https://koro2.imgix.net/media/image/2c/01/f8/S...,Schokodrops mit Xylit,Koro,21.0,46.0,2290.0,555.0
2,Schoko Protein Crunchies ohne Zuckerzusatz 1 kg,https://www.korodrogerie.de/schoko-protein-cru...,"\n\n20,00 €\n",1 kg,1875 / 448,28.0,12.0,37.0,14.0,83.0,21.0,36.0,"Süßungsmittel: Maltit, ERDNUSSKERNE, 20 % Prot...",https://koro2.imgix.net/media/image/30/78/62/P...,Schoko Protein Crunchies ohne Zuckerzusatz,Koro,20.0,28.0,1875.0,448.0
3,Soja Protein Crispies 60 % 1 kg,https://www.korodrogerie.de/soja-protein-crisp...,"\n\n13,00 €\n",1 kg,1541 / 363,18.0,2.0,26.0,8.0,8.0,60.0,32.0,"68 % SOJAPROTEIN, Reisgriess, SOJAMEHL, Salz\t...",https://koro2.imgix.net/media/image/84/55/ca/C...,Soja Protein Crispies 60 %,Koro,13.0,18.0,1541.0,363.0
4,Bohnen-Erbsen-Mix geröstet & gesalzen 1 kg,https://www.korodrogerie.de/bohnen-erbsen-mix-...,"\n\n11,50 €\n",1 kg,1766 / 421,14.0,23.0,35.0,67.0,10.0,33.0,10.0,"19 % SCHWARZE SOJABOHNEN, 19 % EDAMAME-BOHNEN,...",https://koro2.imgix.net/media/image/a9/37/79/B...,Bohnen-Erbsen-Mix geröstet & gesalzen,Koro,11.5,14.0,1766.0,421.0


**Cleaning final dataframe**

In [22]:
koro.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 221 entries, 0 to 220
Data columns (total 20 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   product        221 non-null    object 
 1   links          221 non-null    object 
 2   price          221 non-null    object 
 3   weight         221 non-null    object 
 4   kcal           221 non-null    object 
 5   fat            221 non-null    float64
 6   sat_fat        220 non-null    float64
 7   carbs          220 non-null    float64
 8   sugar          220 non-null    float64
 9   fibre          219 non-null    float64
 10  protein        221 non-null    float64
 11  salt           221 non-null    float64
 12  ingredients    221 non-null    object 
 13  photo_link     221 non-null    object 
 14  product_clean  221 non-null    object 
 15  brand          221 non-null    object 
 16  price_clean    221 non-null    float64
 17  fat_clean      221 non-null    float64
 18  kj_clean  

In [23]:
drop_list = ['price', 'weight', 'kcal', 'fat_clean','ingredients']
koro = koro.drop(drop_list, axis=1)

In [24]:
koro = koro.reindex(columns=['product_clean','brand','price_clean', 'kj_clean', 'kcal_clean', 'fat', 'sat_fat', 'carbs', 'sugar', 'fibre', 'protein', 'salt',
                            'links', 'photo_link'])

In [25]:
koro

Unnamed: 0,product_clean,brand,price_clean,kj_clean,kcal_clean,fat,sat_fat,carbs,sugar,fibre,protein,salt,links,photo_link
0,Soja Protein Crispies 58 % mit Kakao,Koro,14.0,1535.0,362.0,19.0,4.0,282.0,91.0,18.0,58.0,27.0,https://www.korodrogerie.de/soja-protein-crisp...,https://koro2.imgix.net/media/image/f1/50/81/C...
1,Schokodrops mit Xylit,Koro,21.0,2290.0,555.0,46.0,28.0,31.0,10.0,10.0,92.0,7.0,https://www.korodrogerie.de/schokodrops-mit-xy...,https://koro2.imgix.net/media/image/2c/01/f8/S...
2,Schoko Protein Crunchies ohne Zuckerzusatz,Koro,20.0,1875.0,448.0,28.0,12.0,37.0,14.0,83.0,21.0,36.0,https://www.korodrogerie.de/schoko-protein-cru...,https://koro2.imgix.net/media/image/30/78/62/P...
3,Soja Protein Crispies 60 %,Koro,13.0,1541.0,363.0,18.0,2.0,26.0,8.0,8.0,60.0,32.0,https://www.korodrogerie.de/soja-protein-crisp...,https://koro2.imgix.net/media/image/84/55/ca/C...
4,Bohnen-Erbsen-Mix geröstet & gesalzen,Koro,11.5,1766.0,421.0,14.0,23.0,35.0,67.0,10.0,33.0,10.0,https://www.korodrogerie.de/bohnen-erbsen-mix-...,https://koro2.imgix.net/media/image/a9/37/79/B...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
216,Bio gepuffte Ananasstücke,Koro,18.5,1612.0,380.0,0.0,0.0,87.0,85.0,79.0,30.0,0.0,https://www.korodrogerie.de/bio-gepuffte-anana...,https://koro2.imgix.net/media/image/b8/cc/8c/B...
217,Bio gepuffte schwarze Johannisbeeren,Koro,24.0,1336.0,319.0,23.0,4.0,54.0,44.0,31.0,50.0,2.0,https://www.korodrogerie.de/bio-gepuffte-schwa...,https://koro2.imgix.net/media/image/47/fa/41/B...
218,Bio gepuffte Kirsche,Koro,20.0,1472.0,348.0,5.0,2.0,75.0,41.0,95.0,60.0,2.0,https://www.korodrogerie.de/bio-gepuffte-kirsc...,https://koro2.imgix.net/media/image/38/f1/16/P...
219,Bio Cracker Pizza,Koro,5.0,1830.0,435.0,13.0,11.0,67.0,23.0,29.0,10.0,29.0,https://www.korodrogerie.de/bio-cracker-pizza-...,https://koro2.imgix.net/media/image/7e/83/e0/C...


## Kokku

In [63]:
kokku = pd.read_csv('df_kokku.csv')

In [64]:
kokku.shape

(549, 9)

In [65]:
kokku.head()

Unnamed: 0.1,Unnamed: 0,product,brand,weight,links,price,price_gr,nutritions,photo_link
0,0,Eisbonbon - 75g,Bio4You,- 75g,https://kokku-online.de/bio4you-eisbonbon/,1.49 €,1.99€/100g,"Brennwert1.632 kJ / 384 kcalFett< 0,1g- davon ...",https://kokku-online.de//bilder/350x350/19266/...
1,1,Stollenkonfekt - 100g,Bäckerei Sachse,- 100g,https://kokku-online.de/sachse-stollen-stollen...,2.99 €,2.99€/100g,"Brennwert1912 kJ / 457 kcalFett26,7g- davon ge...",https://kokku-online.de//bilder/350x350/12280/...
2,2,Veganer °Schokodrops° Dinkelstollen mit Puderz...,Bäckerei Sachse,- 1kg,https://kokku-online.de/sachse-stollen-veganer...,19.99 €,19.99€/kg,"Brennwert1763 kJ / 421 kcalFett20,3g- davon ge...",https://kokku-online.de//bilder/350x350/7806/s...
3,3,3 Stollenscheiben °Schokodrops° (ohne Rosinen)...,Bäckerei Sachse,- 250g,https://kokku-online.de/sachse-stollen-3-stoll...,4.99 €,2.00€/100g,"Brennwert1763 kJ / 421 kcalFett20,3g- davon ge...",https://kokku-online.de//bilder/350x350/6140/s...
4,4,Veganer °Rosinen° Dinkelstollen - 1kg,Bäckerei Sachse,- 1kg,https://kokku-online.de/sachse-stollen-dinkels...,19.99 €,19.99€/kg,"Brennwert1572 kJ / 375 kcalFett14,915,1g- davo...",https://kokku-online.de//bilder/350x350/6167/s...


### Cleaning overview

- [x] removing duplicate index column
- [x] removing weight from product name
- [x] removing extra charcters from weight + changing column type
- [x] removing extra characters from price + changing column type
- [ ] splitting nutrition column: kcal, fat, sat_fat, carbs, sugar, fibre, protein, salt

Maybe:

- [ ] cleaning price_gr column?


**Dropping duplicate index column**

In [66]:
kokku= kokku.drop('Unnamed: 0', axis=1)
kokku.columns

Index(['product', 'brand', 'weight', 'links', 'price', 'price_gr',
       'nutritions', 'photo_link'],
      dtype='object')

**Cleaning product name**

In [67]:
kokku['product'].value_counts()

Kakao Nibs Natur - 100g                           2
Schokotäfelchen 3er-Set - 120g                    2
Kakaobohnen Peru geröstet - 100g                  2
°Glücksstücke° - 70g                              1
Veggie Vine Gums - 100g                           1
                                                 ..
Lateinamerika Edelbitter Schokolade 100% - 80g    1
Nougat Lebkuchen Taler - 150g                     1
Laugenstangen 2x80g - 160g                        1
Erdbeer Rhabarber Fruchtkonfekt - 80g             1
Veggie Hearts - 100g                              1
Name: product, Length: 546, dtype: int64

In [68]:
def clean_name_kokku(x):
    return re.sub(r'[-]\s\d{2,3}\w+|[-]\s\d{1}\w+', '', x)

In [69]:
kokku["product_clean"] = kokku["product"].apply(clean_name_kokku)
kokku["product_clean"].sample(10)

58                             Edel Bitter Chili 
101                      Tafel °Mandel & Baobab° 
451               So Free Hanfsamen °zuckerfrei° 
536           Helle Schokolade MaplePekan & Salz 
494                          Lutscher °Erdbeere° 
468                        Riegel °Mandel Feige° 
446    °White Chocolate Macadamia° Energieriegel 
289                     Kinderriegel Starry Inca 
447       °Chocolate Almond Fudge° Energieriegel 
297                      Mini Mandel Spekulatius 
Name: product_clean, dtype: object

**Cleaning weight**

In [70]:
kokku['weight'] = kokku['weight'].str.replace('- ', '')

In [73]:
#removing kg

def clean_weight(col):
    
    clean_weight = []
    
    for w in col:
        if w.endswith('kg'):
            w = w.replace('kg', '')
            w = float(w)
            clean_weight.append(w)
            
        elif w.endswith('g'):
            w = w.replace('g', '')
            w = float(w)
            w = w/1000
            clean_weight.append(w)
        
        else:
            clean_weight.append(w)
    
    return clean_weight
            

In [74]:
kokku['clean_weight'] = clean_weight(kokku['weight'])
kokku['clean_weight']

0      0.075
1        0.1
2        1.0
3       0.25
4        1.0
       ...  
544    0.017
545    0.017
546    0.065
547     0.07
548    0.045
Name: clean_weight, Length: 549, dtype: object

**Cleaning price**

In [76]:
kokku['price'].value_counts()

2.49 €           78
1.99 €           43
2.99 €           39
0.99 €           32
2.59 €           23
                 ..
7.39 €            1
14.99 €           1
6.99 €            1
4.39 €            1
bisher 4.19 €     1
Name: price, Length: 77, dtype: int64

In [78]:
kokku['clean_price'] = kokku['price'].str.replace(' €', '').str.replace('bisher ', '').astype('float')
kokku['clean_price']

0       1.49
1       2.99
2      19.99
3       4.99
4      19.99
       ...  
544     1.29
545     1.29
546     1.99
547     2.99
548     1.99
Name: clean_price, Length: 549, dtype: float64

**Cleaning nutrients**

In [85]:
kokku['nutritions'].value_counts()

Brennwert2715 kJ / 655 kcalFett46g- davon gesättigte Fettsäuren28gKohlenhydrate31g- davon Zucker1gEiweiß9,2gSalz0,07g                6
Brennwert2458 kJ / 591 kcalFett41g- davon gesättigte Fettsäuren24gKohlenhydrate51g- davon Zucker42,5gEiweiß2,8gSalz0,07g             4
Brennwert2458 kJ / 591 kcalFett40,9g- davon gesättigte Fettsäuren24gKohlenhydrate51,1g- davon Zucker42,5gEiweiß2,8gSalz0,07g         4
Brennwert1372 kJ / 323 kcalFett0,1g- davon gesättigte Fettsäuren0,1gKohlenhydrate78g- davon Zucker58gEiweiß<0,2gSalz0,75g            4
Brennwert2425 kJ / 585 kcalFett44.9g- davon gesättigte Fettsäuren28.3gKohlenhydrate32.5g- davon Zucker27gEiweiß7gSalz< 0.02g         4
                                                                                                                                    ..
Brennwert2310 kJ / 551 kcalFett36g- davon gesättigte Fettsäuren21gKohlenhydrate56g- davon Zucker42gEiweiß2,5gSalz0,25g               1
Brennwert2429 kJ / 584 kcalFett42g- davon gesättigte Fe

In [None]:
def clean_nutrients(col):
    
    kcals = []
    fats = []
    sat_fats = []
    carbs = []
    sugars = []
    proteins = []
    salts = []
    
    for n in col:
        
        kcal = re.findall()
        kcals.append(kcal)
        
        fat = re.findall()
        fats.append(fat)
        
        sat_fat = re.findall()
        kcals.append(sat_fat)
        
        carb = re.findall()
        carbs.append(carb)
        
        sugar = re.findall()
        sugars.append(sugar)
        
        protein = re.findall()
        proteins.append(protein)
        
        salt = re.findall()
        salts.append(salt)
        
    return [kcals, fats, sat_fats, carbs, sugars, proteins, salts]
        

In [83]:
kokku[['kcal', 'rest']] = kokku['nutritions'].str.split('Fett', expand=True)

ValueError: Columns must be same length as key