In [231]:
import pandas as pd
import numpy as np
import sys

### Reading of the train set

In [232]:
# Column layout of the raw CSV, assembled group by group. The two trailing
# columns ('endline', 'extra') are removed again later in the notebook.
identifier_columns = ['id', 'name', 'barcode', 'category']
macro_nutrient_columns = [
    'Énergie (kJ)',
    'Sel (g)',
    'Protéines (g)',
    'Sucres (g)',
    'Glucides (g)',
    'Matières grasses (g)',
    'Acides gras saturés (g)',
    'Fibres alimentaires (g)',
]
micro_nutrient_columns = [
    'Biotine (µg)',
    'Vitamine B1 (Thiamine) (mg)',
    'Vitamine E (Tocopherol) (mg)',
    'Acide folique (µg)',
    'Vitamine A (µg)',
    'Vitamine D (Cholacalciferol) (µg)',
    'Vitamine C (Acide ascorbique) (mg)',
    'Vitamine B6 (Pyridoxine) (mg)',
    'Vitamine B5 (acide pantothénique) (mg)',
    'Vitamine B3 (Niacine) (mg)',
    'Vitamine B2 (Riboflavine) (mg)',
    'Vitamine B12 (Cobalamine) (µg)',
]
trailing_columns = ['endline', 'extra']
column_names = (
    identifier_columns
    + macro_nutrient_columns
    + micro_nutrient_columns
    + trailing_columns
)

# Explicit names are supplied, so the file's own header line is read as data.
train_set = pd.read_csv('../clustering/raw_cluster_train_set.csv', names=column_names)

### Dropping products that were not found in OpenFood

In [233]:
# The row labelled 0 is the file's header line repeated as data: remove it,
# then keep only the rows that actually have a product name.
train_set = train_set.drop(index=0)
train_set = train_set.loc[train_set['name'].notna()]

### Processing of product names containing ','

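For train-set rows whose product name contains a ',', the CSV parsing split the name into two fields, which shifted every following value one column to the right and filled the `extra` column. The cell below merges the two name fragments and moves every value back to its correct column.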

In [234]:
# Rows with a value in 'extra' are the ones whose fields are shifted.
shifted_mask = train_set['extra'].notnull()
elem_to_shift = train_set[shifted_mask].copy()

for row_position in range(len(elem_to_shift)):
    shifted_row = elem_to_shift.iloc[row_position].values
    # Positions 1 and 2 ('name' and 'barcode' slots) hold the two name fragments.
    merged_name = shifted_row[1] + shifted_row[2]
    # Removing slot 2 moves every later value one column to the left
    # (np.delete returns a new array, so the frame is not touched here).
    realigned_row = np.delete(shifted_row, 2)
    realigned_row[1] = merged_name
    # Pad the freed last slot ('extra') with NaN to keep the row length.
    realigned_row = np.append(realigned_row, [np.nan])
    elem_to_shift.iloc[row_position] = realigned_row

# Write the realigned rows back over the shifted ones.
train_set[shifted_mask] = elem_to_shift


### Truncating categories that have more than 2 levels to their first 2 levels

In [235]:
def _keep_first_two_levels(category):
    """Return `category` cut just before its third '/'; unchanged if it has at most two."""
    if category.count('/') <= 2:
        return category
    # The first three '/'-separated pieces end exactly before the third slash.
    return '/'.join(category.split('/')[:3])


train_set['category'] = [
    _keep_first_two_levels(category)
    for category in train_set['category'].values
]

### Dropping useless columns

In [236]:
# Both helper columns have served their purpose once the shifted rows are realigned.
train_set = train_set.drop(columns=['extra', 'endline'])

In [237]:
# Show the column labels that remain in the frame at this point.
train_set.columns

Index(['id', 'name', 'barcode', 'category', 'Énergie (kJ)', 'Sel (g)',
       'Protéines (g)', 'Sucres (g)', 'Glucides (g)', 'Matières grasses (g)',
       'Acides gras saturés (g)', 'Fibres alimentaires (g)', 'Biotine (µg)',
       'Vitamine B1 (Thiamine) (mg)', 'Vitamine E (Tocopherol) (mg)',
       'Acide folique (µg)', 'Vitamine A (µg)',
       'Vitamine D (Cholacalciferol) (µg)',
       'Vitamine C (Acide ascorbique) (mg)', 'Vitamine B6 (Pyridoxine) (mg)',
       'Vitamine B5 (acide pantothénique) (mg)', 'Vitamine B3 (Niacine) (mg)',
       'Vitamine B2 (Riboflavine) (mg)', 'Vitamine B12 (Cobalamine) (µg)'],
      dtype='object')

### Dropping products without a nutrient list and products with a -1 nutrient value

In [238]:
# Nutrient columns start at position 4 (after id, name, barcode, category).
nutrient_matrix = train_set.iloc[:, 4:].values.astype(float)
row_totals = nutrient_matrix.sum(axis=1)
# Keep a product only when the magnitude of its nutrient total exceeds 1e-3.
train_set = train_set[np.abs(row_totals) > 1e-3]

In [239]:
# Renumber the rows 0..n-1 so that a position in the value matrix is also
# the index label of the corresponding row.
train_set.index = pd.RangeIndex(len(train_set))
train_set_values = train_set[train_set.columns[4:]].values.astype(float)

# Flag every row in which at least one nutrient value equals -1.
has_minus_one = (train_set_values == -1).any(axis=1)
indexes_to_drop = np.flatnonzero(has_minus_one).tolist()
print(indexes_to_drop)

train_set = train_set.drop(indexes_to_drop)

[10, 45, 55, 74, 78, 270, 340, 465, 593, 817, 917, 929, 1300, 1480, 1733, 1754, 1757, 1783, 1819, 2045, 2183, 2207, 2617, 2896, 2937, 2938, 3213, 3261, 3269, 3456, 3496, 3657, 3671, 3940, 3969, 4005, 4194, 4339, 4560, 4565]


In [240]:
# Preview only the first rows: the frame holds several thousand products,
# so displaying it whole would bury the narrative under output.
train_set.head(10)

Unnamed: 0,id,name,barcode,category,Énergie (kJ),Sel (g),Protéines (g),Sucres (g),Glucides (g),Matières grasses (g),...,Vitamine E (Tocopherol) (mg),Acide folique (µg),Vitamine A (µg),Vitamine D (Cholacalciferol) (µg),Vitamine C (Acide ascorbique) (mg),Vitamine B6 (Pyridoxine) (mg),Vitamine B5 (acide pantothénique) (mg),Vitamine B3 (Niacine) (mg),Vitamine B2 (Riboflavine) (mg),Vitamine B12 (Cobalamine) (µg)
0,0,V6 White Spearmint,5700626503125,/chocolat/bonbons-chewing-gum,707.0,0.18,0.4,0.0,67.0,0.6,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,M-Quick Boisson au cacao,7613312049297,/petit-dejeuner/cacao-chocolats-en-poudre,1640.0,0.06,5.0,80.0,82.0,3.0,...,6.0,300.0,400.0,2.5,40.0,0.7,3.0,8.0,0.7,1.25
2,6,Skai (Crunchy mint),7616500651555,/chocolat/bonbons-chewing-gum,650.0,0.02,0.2,0.1,64.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,8,RedBull : Energy Drink,9002490206789,/boissons-chaudes-froides/boissons-energetiques,194.0,0.1,0.0,11.0,11.0,0.0,...,0.0,0.0,0.0,0.0,0.0,2.0,2.0,8.0,0.0,2.0
4,10,Frey Milch Extra Chocolat au lait extra fin sa...,7616500912472,/chocolat/chocolat,2040.0,0.25,8.0,11.0,51.0,35.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,13,MIGROS BIO Choco Cookies aux pépites de chocolat,7617400037784,/chocolat/sain-bio,2142.0,0.59,7.0,27.0,61.0,26.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,15,LONGOBARDI : Pomodori pelati,8002510010007,/garnitures-ingredients/tomates-en-conserve,100.0,0.0,1.5,4.5,4.5,0.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,23,Kinder Riegel,4008400221021,/chocolat/chocolat,2360.0,0.31,8.7,53.3,53.5,35.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,25,Namaste India PALAK PANEER Epinards et fromage...,7613312065945,/garnitures-ingredients/du-monde-entier,424.0,1.6,4.0,2.5,5.0,7.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,29,MINDOR petit berre,7617400032796,/chocolat/biscuits-gaufres,2150.0,0.4,6.0,34.0,64.0,25.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [241]:
# (rows, columns) left after the nutrient-based filtering.
train_set.shape

(4614, 24)

### Renaming categories with only one level to "To classify"

In [242]:
# A category with fewer than two '/' has no second level: relabel it.
train_set['category'] = [
    "To classify" if category.count('/') < 2 else category
    for category in train_set['category'].values
]

### Saving preprocessed train set before computing clusters

In [243]:
# (rows, columns) of the frame that is about to be written to disk.
train_set.shape

(4614, 24)

In [244]:
# Persist the cleaned products; the row index is left out of the file.
path = '../clustering/preprocessed_cluster_train_set.csv'
train_set.to_csv(path_or_buf=path, index=False)

### Re-reading the preprocessed train set

In [245]:
# Reload the products from the preprocessed file stored at `path`.
train_set = pd.read_csv(filepath_or_buffer=path)

### Dropping columns not useful for clustering

In [246]:
# Only 'category' and the nutrient columns are used from here on.
train_set = train_set.drop(columns=['id', 'name', 'barcode'])

### Computing cluster centers

In [247]:
# One cluster per distinct category, in order of first appearance.
unique_categories = train_set['category'].unique()
# Every column other than 'category' is a nutrient.
nutrient_columns = train_set.columns.drop('category')
cluster_centers = np.zeros((unique_categories.shape[0], len(nutrient_columns)))

# The center of a category is the per-nutrient mean over all of its products.
for index, category in enumerate(unique_categories):
    category_train_set = train_set[train_set['category'] == category]
    category_train_set_values = category_train_set[nutrient_columns].values.astype(float)
    cluster_centers[index] = category_train_set_values.mean(axis=0)

# Put the category label in front of each center (this yields an object array,
# because labels are strings and centers are floats).
cluster_centers_values = np.concatenate(
    [np.expand_dims(unique_categories, axis=1), cluster_centers],
    axis=1,
)

# Column labels come from the frame itself, so they always match the
# 'category' + nutrients layout of `cluster_centers_values`.
cluster_centers_df = pd.DataFrame(
    cluster_centers_values,
    columns=['category'] + list(nutrient_columns),
)
    

In [248]:
# Export the centers; the row index is written as the first CSV column
# (pandas' default, spelled out here).
cluster_centers_df.to_csv('../clustering/cluster_centers.csv', index=True)

### /!\ Remember to update the handling of getParsedNutrients in master
### /!\ Add Acides gras monoinsaturés (g) and Acides gras poly-insaturés (g)