In [51]:
import pandas as pd

# Import LabelEncoder for encoding categorical variables
from sklearn.preprocessing import LabelEncoder, RobustScaler

import pickle

In [52]:
# df = pd.read_csv('../raw_data/openfoodfacts.org.products-full_data.csv', sep='\t', on_bad_lines='warn')
df = pd.read_csv('../processed_data/openfoodfacts_unknown_nutriscore.csv', on_bad_lines='warn')

In [53]:
df["nutriscore_grade"].value_counts()

nutriscore_grade
unknown    286788
Name: count, dtype: int64

In [54]:
categorical_columns = ["energy-kcal_100g", "sugars_100g", "saturated-fat_100g", "salt_100g", "fat_100g", "fiber_100g", "proteins_100g"]
numerical_columns = ["countries_en", "food_groups_en", "nutriscore_grade"]

df_simple = df[categorical_columns + numerical_columns]
df_simple.head()

Unnamed: 0,energy-kcal_100g,sugars_100g,saturated-fat_100g,salt_100g,fat_100g,fiber_100g,proteins_100g,countries_en,food_groups_en,nutriscore_grade
0,6.0,0.2,0.2,0.8,0.3,0.1,0.3,France,,unknown
1,6.0,0.2,0.2,0.8,0.3,0.1,0.3,Germany,,unknown
2,278.0,23.0,13.5,0.0,16.700001,1.0,3.3,France,,unknown
3,174.0,28.5,3.6,0.1,5.3,0.6,2.1,Germany,,unknown
4,174.0,28.5,3.6,0.1,5.3,0.6,2.1,United States,,unknown


In [55]:
df_simple_dropna = df_simple.dropna()

In [56]:
df_simple_dropna_unknown_grade = df_simple_dropna[df_simple_dropna["nutriscore_grade"] == "unknown"]

In [57]:
df_to_predict = df_simple_dropna_unknown_grade.drop("nutriscore_grade", axis=1)

### Processing dataframe before prediction

In [58]:
def scale_nuumerical_columns(df, columns):
    scaler = RobustScaler()

    # Robust scalling
    robust_df = scaler.fit_transform(df[columns])
    robust_df = pd.DataFrame(robust_df, columns=columns)

    # Drop numerical columns
    df = df.drop(columns=columns)

    # Merge the two dataframes
    df = pd.concat([df, robust_df], axis=1)
    return df

In [59]:
df_to_predict = scale_nuumerical_columns(df_to_predict, ["energy-kcal_100g", "sugars_100g", "saturated-fat_100g", "salt_100g", "fat_100g", "fiber_100g", "proteins_100g"])
df_to_predict.head()

Unnamed: 0,countries_en,food_groups_en,energy-kcal_100g,sugars_100g,saturated-fat_100g,salt_100g,fat_100g,fiber_100g,proteins_100g
244,United States,"Sugary snacks,Biscuits and cakes",-0.070103,2.035579,-0.2,-0.617389,-0.323529,-0.333333,-0.488381
544,Mexico,"Cereals and potatoes,Cereals",0.390137,0.344686,0.152,0.936911,0.192647,0.962963,0.893265
778,United States,"Sugary snacks,Biscuits and cakes",0.420616,0.477675,-0.2,1.264595,0.279412,0.259259,1.061835
1017,United States,"Sugary snacks,Biscuits and cakes",-0.551678,-0.198098,0.132,0.198682,-0.077941,-0.333333,-0.225286
1039,United States,"Sugary snacks,Biscuits and cakes",0.362705,0.61582,-0.2,1.234463,-0.323529,0.148148,0.754628


In [60]:
categorical_columns = ["countries_en", "food_groups_en"]
label_encoders = {}
for column in categorical_columns:
    print(len(df_to_predict[column].unique()), column)
    le = LabelEncoder()
    df_to_predict[column] = le.fit_transform(df_to_predict[column])
    label_encoders[column] = le

df_to_predict.head()

77 countries_en
28 food_groups_en


Unnamed: 0,countries_en,food_groups_en,energy-kcal_100g,sugars_100g,saturated-fat_100g,salt_100g,fat_100g,fiber_100g,proteins_100g
244,73,23,-0.070103,2.035579,-0.2,-0.617389,-0.323529,-0.333333,-0.488381
544,42,8,0.390137,0.344686,0.152,0.936911,0.192647,0.962963,0.893265
778,73,23,0.420616,0.477675,-0.2,1.264595,0.279412,0.259259,1.061835
1017,73,23,-0.551678,-0.198098,0.132,0.198682,-0.077941,-0.333333,-0.225286
1039,73,23,0.362705,0.61582,-0.2,1.234463,-0.323529,0.148148,0.754628


In [61]:
df_to_predict

Unnamed: 0,countries_en,food_groups_en,energy-kcal_100g,sugars_100g,saturated-fat_100g,salt_100g,fat_100g,fiber_100g,proteins_100g
244,73,23,-0.070103,2.035579,-0.200000,-0.617389,-0.323529,-0.333333,-0.488381
544,42,8,0.390137,0.344686,0.152000,0.936911,0.192647,0.962963,0.893265
778,73,23,0.420616,0.477675,-0.200000,1.264595,0.279412,0.259259,1.061835
1017,73,23,-0.551678,-0.198098,0.132000,0.198682,-0.077941,-0.333333,-0.225286
1039,73,23,0.362705,0.615820,-0.200000,1.234463,-0.323529,0.148148,0.754628
...,...,...,...,...,...,...,...,...,...
4853,76,27,-0.406500,-0.177751,0.421053,-0.197790,0.164087,-0.148148,-0.177442
4854,76,27,-0.015240,-0.141397,1.400000,-0.347458,0.867647,0.333333,0.315085
4855,76,27,0.505959,0.947263,-0.200000,-0.648776,-0.323529,-0.333333,-0.488381
4856,76,27,-0.006096,0.120974,-0.111600,0.059322,-0.063235,1.303704,0.348169


In [62]:
# Step 1: Load the pickle file containing the saved model
with open('../model/nutriscore_grade_prediction_model.pkl', 'rb') as file:
    model = pickle.load(file)

In [63]:
X = df_to_predict

# Step 4: Run the prediction
predictions = model.predict(X)

# Step 5: Display or save the predictions
print("Predictions:", predictions)

# Optional: Save predictions to a file
output = pd.DataFrame({'predicted_nutriscore_grade': predictions})

Predictions: [4 4 4 ... 4 4 4]


In [64]:
output["predicted_nutriscore_grade"].value_counts()

predicted_nutriscore_grade
4    9424
3     181
2       3
Name: count, dtype: int64

In [65]:
full_output = pd.concat([df.drop(columns=['nutriscore_grade']), output], axis=1)

In [66]:
full_output.to_csv('predictions_output.csv', index=False)