# Calorie Count Prediction Model

**Name(s)**: Hillary Chang, Paige Pagaduan

**Website Link**: https://hillarychang.github.io/Calorie-Count-Prediction-Model/

## Code

### Framing the Problem

In [2]:
import pandas as pd
import numpy as np
import os

import plotly.express as px
pd.options.plotting.backend = 'plotly'

from pathlib import Path
import ast

In [4]:
fp_interactions = Path('data') /'RAW_interactions.csv'
interactions = pd.read_csv(fp_interactions)

fp = Path('data') /'RAW_recipes.csv'
recipes = pd.read_csv(fp)

In [5]:
#preprocessing

#merging DataFrames
df = recipes.merge(right=interactions, how='left', left_on='id', right_on='recipe_id')

#filling 0.0 values with np.nan in rating column
df['rating'] = df['rating'].replace(0.0, np.nan)

#finding mean of recipes and adding it to the DataFrame in the avg_rating column
grouped = pd.DataFrame(df.groupby('name').mean()['rating'])
(df.merge(grouped, 
          left_on = 'name', 
          right_on = 'name')
 .rename(columns={'rating_y':'avg_rating', 'rating_x':'rating'}))

Unnamed: 0,name,id,minutes,contributor_id,submitted,tags,nutrition,n_steps,steps,description,ingredients,n_ingredients,user_id,recipe_id,date,rating,review,avg_rating
0,1 brownies in the world best ever,333281,40,985201,2008-10-27,"['60-minutes-or-less', 'time-to-make', 'course...","[138.4, 10.0, 50.0, 3.0, 3.0, 19.0, 6.0]",10,['heat the oven to 350f and arrange the rack i...,"these are the most; chocolatey, moist, rich, d...","['bittersweet chocolate', 'unsalted butter', '...",9,3.865850e+05,333281.0,2008-11-19,4.0,"These were pretty good, but took forever to ba...",4.0
1,1 in canada chocolate chip cookies,453467,45,1848091,2011-04-11,"['60-minutes-or-less', 'time-to-make', 'cuisin...","[595.1, 46.0, 211.0, 22.0, 13.0, 51.0, 26.0]",12,"['pre-heat oven the 350 degrees f', 'in a mixi...",this is the recipe that we use at my school ca...,"['white sugar', 'brown sugar', 'salt', 'margar...",11,4.246800e+05,453467.0,2012-01-26,5.0,Originally I was gonna cut the recipe in half ...,5.0
2,412 broccoli casserole,306168,40,50969,2008-05-30,"['60-minutes-or-less', 'time-to-make', 'course...","[194.8, 20.0, 6.0, 32.0, 22.0, 36.0, 3.0]",6,"['preheat oven to 350 degrees', 'spray a 2 qua...",since there are already 411 recipes for brocco...,"['frozen broccoli cuts', 'cream of chicken sou...",9,2.978200e+04,306168.0,2008-12-31,5.0,This was one of the best broccoli casseroles t...,5.0
3,412 broccoli casserole,306168,40,50969,2008-05-30,"['60-minutes-or-less', 'time-to-make', 'course...","[194.8, 20.0, 6.0, 32.0, 22.0, 36.0, 3.0]",6,"['preheat oven to 350 degrees', 'spray a 2 qua...",since there are already 411 recipes for brocco...,"['frozen broccoli cuts', 'cream of chicken sou...",9,1.196280e+06,306168.0,2009-04-13,5.0,I made this for my son's first birthday party ...,5.0
4,412 broccoli casserole,306168,40,50969,2008-05-30,"['60-minutes-or-less', 'time-to-make', 'course...","[194.8, 20.0, 6.0, 32.0, 22.0, 36.0, 3.0]",6,"['preheat oven to 350 degrees', 'spray a 2 qua...",since there are already 411 recipes for brocco...,"['frozen broccoli cuts', 'cream of chicken sou...",9,7.688280e+05,306168.0,2013-08-02,5.0,Loved this. Be sure to completely thaw the br...,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
234423,zydeco ya ya deviled eggs,308080,40,37779,2008-06-07,"['60-minutes-or-less', 'time-to-make', 'course...","[59.2, 6.0, 2.0, 3.0, 6.0, 5.0, 0.0]",7,"['in a bowl , combine the mashed yolks and may...","deviled eggs, cajun-style","['hard-cooked eggs', 'mayonnaise', 'dijon must...",8,8.445540e+05,308080.0,2009-10-14,5.0,These were very good. I meant to add some jala...,5.0
234424,cookies by design cookies on a stick,298512,29,506822,2008-04-15,"['30-minutes-or-less', 'time-to-make', 'course...","[188.0, 11.0, 57.0, 11.0, 7.0, 21.0, 9.0]",9,['place melted butter in a large mixing bowl a...,"i've heard of the 'cookies by design' company,...","['butter', 'eagle brand condensed milk', 'ligh...",10,8.042340e+05,298512.0,2008-05-02,1.0,I would rate this a zero if I could. I followe...,1.0
234425,cookies by design sugar shortbread cookies,298509,20,506822,2008-04-15,"['30-minutes-or-less', 'time-to-make', 'course...","[174.9, 14.0, 33.0, 4.0, 4.0, 11.0, 6.0]",5,"['whip sugar and shortening in a large bowl , ...","i've heard of the 'cookies by design' company,...","['granulated sugar', 'shortening', 'eggs', 'fl...",7,8.666510e+05,298509.0,2008-06-19,1.0,This recipe tastes nothing like the Cookies by...,3.0
234426,cookies by design sugar shortbread cookies,298509,20,506822,2008-04-15,"['30-minutes-or-less', 'time-to-make', 'course...","[174.9, 14.0, 33.0, 4.0, 4.0, 11.0, 6.0]",5,"['whip sugar and shortening in a large bowl , ...","i've heard of the 'cookies by design' company,...","['granulated sugar', 'shortening', 'eggs', 'fl...",7,1.546277e+06,298509.0,2010-02-08,5.0,"yummy cookies, i love this recipe me and my sm...",3.0


In [8]:
#get different nutrient values from nutrition column
vals = df['nutrition'].apply(ast.literal_eval)

df = df.assign(calories=vals.str[0],
         total_fat=vals.str[1],
         sugar=vals.str[2],
         sodium=vals.str[3],
         protein=vals.str[4],
         saturated_fat=vals.str[5],
         carbs=vals.str[6])

In [5]:
df

Unnamed: 0,name,id,minutes,contributor_id,submitted,tags,nutrition,n_steps,steps,description,...,date,rating,review,calories,total_fat,sugar,sodium,protein,saturated_fat,carbs
0,1 brownies in the world best ever,333281,40,985201,2008-10-27,"['60-minutes-or-less', 'time-to-make', 'course...","[138.4, 10.0, 50.0, 3.0, 3.0, 19.0, 6.0]",10,['heat the oven to 350f and arrange the rack i...,"these are the most; chocolatey, moist, rich, d...",...,2008-11-19,4.0,"These were pretty good, but took forever to ba...",138.4,10.0,50.0,3.0,3.0,19.0,6.0
1,1 in canada chocolate chip cookies,453467,45,1848091,2011-04-11,"['60-minutes-or-less', 'time-to-make', 'cuisin...","[595.1, 46.0, 211.0, 22.0, 13.0, 51.0, 26.0]",12,"['pre-heat oven the 350 degrees f', 'in a mixi...",this is the recipe that we use at my school ca...,...,2012-01-26,5.0,Originally I was gonna cut the recipe in half ...,595.1,46.0,211.0,22.0,13.0,51.0,26.0
2,412 broccoli casserole,306168,40,50969,2008-05-30,"['60-minutes-or-less', 'time-to-make', 'course...","[194.8, 20.0, 6.0, 32.0, 22.0, 36.0, 3.0]",6,"['preheat oven to 350 degrees', 'spray a 2 qua...",since there are already 411 recipes for brocco...,...,2008-12-31,5.0,This was one of the best broccoli casseroles t...,194.8,20.0,6.0,32.0,22.0,36.0,3.0
3,412 broccoli casserole,306168,40,50969,2008-05-30,"['60-minutes-or-less', 'time-to-make', 'course...","[194.8, 20.0, 6.0, 32.0, 22.0, 36.0, 3.0]",6,"['preheat oven to 350 degrees', 'spray a 2 qua...",since there are already 411 recipes for brocco...,...,2009-04-13,5.0,I made this for my son's first birthday party ...,194.8,20.0,6.0,32.0,22.0,36.0,3.0
4,412 broccoli casserole,306168,40,50969,2008-05-30,"['60-minutes-or-less', 'time-to-make', 'course...","[194.8, 20.0, 6.0, 32.0, 22.0, 36.0, 3.0]",6,"['preheat oven to 350 degrees', 'spray a 2 qua...",since there are already 411 recipes for brocco...,...,2013-08-02,5.0,Loved this. Be sure to completely thaw the br...,194.8,20.0,6.0,32.0,22.0,36.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
234424,zydeco ya ya deviled eggs,308080,40,37779,2008-06-07,"['60-minutes-or-less', 'time-to-make', 'course...","[59.2, 6.0, 2.0, 3.0, 6.0, 5.0, 0.0]",7,"['in a bowl , combine the mashed yolks and may...","deviled eggs, cajun-style",...,2009-10-14,5.0,These were very good. I meant to add some jala...,59.2,6.0,2.0,3.0,6.0,5.0,0.0
234425,cookies by design cookies on a stick,298512,29,506822,2008-04-15,"['30-minutes-or-less', 'time-to-make', 'course...","[188.0, 11.0, 57.0, 11.0, 7.0, 21.0, 9.0]",9,['place melted butter in a large mixing bowl a...,"i've heard of the 'cookies by design' company,...",...,2008-05-02,1.0,I would rate this a zero if I could. I followe...,188.0,11.0,57.0,11.0,7.0,21.0,9.0
234426,cookies by design sugar shortbread cookies,298509,20,506822,2008-04-15,"['30-minutes-or-less', 'time-to-make', 'course...","[174.9, 14.0, 33.0, 4.0, 4.0, 11.0, 6.0]",5,"['whip sugar and shortening in a large bowl , ...","i've heard of the 'cookies by design' company,...",...,2008-06-19,1.0,This recipe tastes nothing like the Cookies by...,174.9,14.0,33.0,4.0,4.0,11.0,6.0
234427,cookies by design sugar shortbread cookies,298509,20,506822,2008-04-15,"['30-minutes-or-less', 'time-to-make', 'course...","[174.9, 14.0, 33.0, 4.0, 4.0, 11.0, 6.0]",5,"['whip sugar and shortening in a large bowl , ...","i've heard of the 'cookies by design' company,...",...,2010-02-08,5.0,"yummy cookies, i love this recipe me and my sm...",174.9,14.0,33.0,4.0,4.0,11.0,6.0


In [6]:
df.corr()

Unnamed: 0,id,minutes,contributor_id,n_steps,n_ingredients,user_id,recipe_id,rating,calories,total_fat,sugar,sodium,protein,saturated_fat,carbs
id,1.0,0.001719,0.174941,0.083079,0.055759,0.150081,1.0,0.024654,0.021727,0.025629,0.005289,0.010359,0.009852,0.016987,0.013265
minutes,0.001719,1.0,0.000253,0.011695,-0.006963,0.001843,0.001718,0.00144,0.005562,0.001066,0.00453,0.004382,0.003112,0.002345,0.006786
contributor_id,0.174941,0.000253,1.0,0.050131,0.008876,0.172145,0.174941,0.002215,0.019978,0.020717,0.014018,0.031788,0.005361,0.013757,0.015679
n_steps,0.083079,0.011695,0.050131,1.0,0.40889,0.08139,0.083077,-0.001101,0.152484,0.132764,0.045285,0.022865,0.152408,0.143201,0.104727
n_ingredients,0.055759,-0.006963,0.008876,0.40889,1.0,0.013214,0.055756,-0.005005,0.132949,0.116539,0.004624,0.041202,0.188083,0.092708,0.0759
user_id,0.150081,0.001843,0.172145,0.08139,0.013214,1.0,0.150081,-0.116926,0.047905,0.038758,0.035878,0.023525,0.034467,0.042853,0.040174
recipe_id,1.0,0.001718,0.174941,0.083077,0.055756,0.150081,1.0,0.024654,0.02173,0.025633,0.005288,0.01036,0.009857,0.016991,0.013265
rating,0.024654,0.00144,0.002215,-0.001101,-0.005005,-0.116926,0.024654,1.0,-0.009316,-0.001542,-0.006584,-0.004567,-0.011787,-0.003704,-0.012925
calories,0.021727,0.005562,0.019978,0.152484,0.132949,0.047905,0.02173,-0.009316,1.0,0.869702,0.681099,0.216373,0.593704,0.805539,0.812777
total_fat,0.025629,0.001066,0.020717,0.132764,0.116539,0.038758,0.025633,-0.001542,0.869702,1.0,0.403276,0.123399,0.510125,0.86235,0.459807


### Baseline Model

In [9]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, Binarizer
from sklearn.ensemble import RandomForestRegressor

from sklearn.model_selection import train_test_split

from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

X = df[['total_fat', 'sugar']]
y = df['calories']

#split data
X_train, X_test, y_train, y_test = (
    train_test_split(X, y, random_state=1, test_size=0.25)
)

#make pipeline for baseline model
pl = Pipeline([
    ('transfrom cols', ColumnTransformer(
        transformers=[
            ('binarize', Binarizer(threshold=31.9), ['total_fat']),
            ('stdscaler', StandardScaler(), ['sugar'])
        ],
        remainder='passthrough'
    )),
    ('random forest', RandomForestRegressor(random_state=29))
])

pl.fit(X_train, y_train)
y_pred = pl.predict(X_test)
r_squared = pl.score(X_test, y_test)

print(f'R^2: {r_squared}')
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f'RMSE: {rmse}')


R^2: 0.6556359621651207
RMSE: 345.72485986758636


### Final Model

In [11]:
df

Unnamed: 0,name,id,minutes,contributor_id,submitted,tags,nutrition,n_steps,steps,description,...,date,rating,review,calories,total_fat,sugar,sodium,protein,saturated_fat,carbs
0,1 brownies in the world best ever,333281,40,985201,2008-10-27,"['60-minutes-or-less', 'time-to-make', 'course...","[138.4, 10.0, 50.0, 3.0, 3.0, 19.0, 6.0]",10,['heat the oven to 350f and arrange the rack i...,"these are the most; chocolatey, moist, rich, d...",...,2008-11-19,4.0,"These were pretty good, but took forever to ba...",138.4,10.0,50.0,3.0,3.0,19.0,6.0
1,1 in canada chocolate chip cookies,453467,45,1848091,2011-04-11,"['60-minutes-or-less', 'time-to-make', 'cuisin...","[595.1, 46.0, 211.0, 22.0, 13.0, 51.0, 26.0]",12,"['pre-heat oven the 350 degrees f', 'in a mixi...",this is the recipe that we use at my school ca...,...,2012-01-26,5.0,Originally I was gonna cut the recipe in half ...,595.1,46.0,211.0,22.0,13.0,51.0,26.0
2,412 broccoli casserole,306168,40,50969,2008-05-30,"['60-minutes-or-less', 'time-to-make', 'course...","[194.8, 20.0, 6.0, 32.0, 22.0, 36.0, 3.0]",6,"['preheat oven to 350 degrees', 'spray a 2 qua...",since there are already 411 recipes for brocco...,...,2008-12-31,5.0,This was one of the best broccoli casseroles t...,194.8,20.0,6.0,32.0,22.0,36.0,3.0
3,412 broccoli casserole,306168,40,50969,2008-05-30,"['60-minutes-or-less', 'time-to-make', 'course...","[194.8, 20.0, 6.0, 32.0, 22.0, 36.0, 3.0]",6,"['preheat oven to 350 degrees', 'spray a 2 qua...",since there are already 411 recipes for brocco...,...,2009-04-13,5.0,I made this for my son's first birthday party ...,194.8,20.0,6.0,32.0,22.0,36.0,3.0
4,412 broccoli casserole,306168,40,50969,2008-05-30,"['60-minutes-or-less', 'time-to-make', 'course...","[194.8, 20.0, 6.0, 32.0, 22.0, 36.0, 3.0]",6,"['preheat oven to 350 degrees', 'spray a 2 qua...",since there are already 411 recipes for brocco...,...,2013-08-02,5.0,Loved this. Be sure to completely thaw the br...,194.8,20.0,6.0,32.0,22.0,36.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
234424,zydeco ya ya deviled eggs,308080,40,37779,2008-06-07,"['60-minutes-or-less', 'time-to-make', 'course...","[59.2, 6.0, 2.0, 3.0, 6.0, 5.0, 0.0]",7,"['in a bowl , combine the mashed yolks and may...","deviled eggs, cajun-style",...,2009-10-14,5.0,These were very good. I meant to add some jala...,59.2,6.0,2.0,3.0,6.0,5.0,0.0
234425,cookies by design cookies on a stick,298512,29,506822,2008-04-15,"['30-minutes-or-less', 'time-to-make', 'course...","[188.0, 11.0, 57.0, 11.0, 7.0, 21.0, 9.0]",9,['place melted butter in a large mixing bowl a...,"i've heard of the 'cookies by design' company,...",...,2008-05-02,1.0,I would rate this a zero if I could. I followe...,188.0,11.0,57.0,11.0,7.0,21.0,9.0
234426,cookies by design sugar shortbread cookies,298509,20,506822,2008-04-15,"['30-minutes-or-less', 'time-to-make', 'course...","[174.9, 14.0, 33.0, 4.0, 4.0, 11.0, 6.0]",5,"['whip sugar and shortening in a large bowl , ...","i've heard of the 'cookies by design' company,...",...,2008-06-19,1.0,This recipe tastes nothing like the Cookies by...,174.9,14.0,33.0,4.0,4.0,11.0,6.0
234427,cookies by design sugar shortbread cookies,298509,20,506822,2008-04-15,"['30-minutes-or-less', 'time-to-make', 'course...","[174.9, 14.0, 33.0, 4.0, 4.0, 11.0, 6.0]",5,"['whip sugar and shortening in a large bowl , ...","i've heard of the 'cookies by design' company,...",...,2010-02-08,5.0,"yummy cookies, i love this recipe me and my sm...",174.9,14.0,33.0,4.0,4.0,11.0,6.0


In [12]:
from sklearn.preprocessing import PolynomialFeatures, QuantileTransformer, StandardScaler
from sklearn.linear_model import Lasso
from sklearn.svm import SVC
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

X = df[['total_fat', 'carbs','sugar', 'sodium', 'protein', 'saturated_fat', 'n_steps', 'minutes']]
y = df['calories']

X_train, X_test, y_train, y_test = (
    train_test_split(X, y, random_state=1, test_size=0.25)
)

pl = Pipeline([
    ('transfrom cols', ColumnTransformer(
        transformers=[
            ('stdscaler1', StandardScaler(), ['total_fat', 'carbs', 'sugar', 'sodium', 'protein', 'saturated_fat','n_steps']),
            # ('polynomialfeatures', PolynomialFeatures(), ['n_steps']),
            ('quantile_trans', QuantileTransformer(), ['minutes'])
        ],
        remainder='passthrough'
    )),
    ('random forest', RandomForestRegressor(max_depth=10, n_estimators=100,random_state=29))
])

pl.fit(X_train, y_train)
y_pred = pl.predict(X_test)

r_squared = pl.score(X_test, y_test)
print(f'R^2: {r_squared}')
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f'RMSE: {rmse}')


R^2: 0.9933687791149496
RMSE: 47.97538413056438


In [13]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'random forest__n_estimators': [50, 100, 200],
    'random forest__max_depth': [None, 10, 20],
}

grid_search = GridSearchCV(pl, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs = -1)
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

Best Hyperparameters: {'random forest__max_depth': None, 'random forest__n_estimators': 100}


In [13]:
from sklearn.preprocessing import PolynomialFeatures, QuantileTransformer, StandardScaler
from sklearn.linear_model import Lasso
from sklearn.svm import SVC
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

X = df[['total_fat', 'carbs','sugar', 'sodium', 'protein', 'saturated_fat', 'n_steps', 'minutes']]
y = df['calories']

X_train, X_test, y_train, y_test = (
    train_test_split(X, y, random_state=1, test_size=0.25)
)

pl = Pipeline([
    ('transfrom cols', ColumnTransformer(
        transformers=[
            ('stdscaler1', StandardScaler(), ['total_fat', 'carbs', 'sugar', 'sodium', 'protein', 'saturated_fat','n_steps']),
            # ('polynomialfeatures', PolynomialFeatures(), ['n_steps']),
            ('quantile_trans', QuantileTransformer(), ['minutes'])
        ],
        remainder='passthrough'
    )),
    ('random forest', RandomForestRegressor(max_depth=None, n_estimators=100,random_state=29))
])

pl.fit(X_train, y_train)
y_pred = pl.predict(X_test)

r_squared = pl.score(X_test, y_test)
print(f'R^2: {r_squared}')
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f'RMSE: {rmse}')

R^2: 0.9952528783488641
RMSE: 40.59166765798909


### Fairness Analysis

In [15]:
from sklearn.preprocessing import PolynomialFeatures, QuantileTransformer, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np

X = df[['total_fat', 'carbs', 'sugar', 'sodium', 'protein', 'saturated_fat', 'n_steps', 'minutes']]
y = df['calories']

# Split the data into two groups based on 'n_steps'
group_X = df[df['n_steps'] <= 10]
group_Y = df[df['n_steps'] > 10]

# Train-test split for each group
X_train_X, X_test_X, y_train_X, y_test_X = train_test_split(group_X[['total_fat', 'carbs', 'sugar', 'sodium', 'protein', 'saturated_fat', 'n_steps', 'minutes']],
                                                            group_X['calories'], random_state=1, test_size=0.25)

X_train_Y, X_test_Y, y_train_Y, y_test_Y = train_test_split(group_Y[['total_fat', 'carbs', 'sugar', 'sodium', 'protein', 'saturated_fat', 'n_steps', 'minutes']],
                                                            group_Y['calories'], random_state=1, test_size=0.25)

# Create pipeline
pl = Pipeline([
    ('transform cols', ColumnTransformer(
        transformers=[
            ('stdscaler1', StandardScaler(), ['total_fat', 'carbs', 'sugar', 'sodium', 'protein', 'saturated_fat', 'n_steps']),
            ('quantile_trans', QuantileTransformer(), ['minutes'])
        ],
        remainder='passthrough'
    )),
    ('random forest', RandomForestRegressor(max_depth=None, n_estimators=100, random_state=29))
])

# Fit model for Group X
pl.fit(X_train_X, y_train_X)
y_pred_X = pl.predict(X_test_X)
rmse_X = np.sqrt(mean_squared_error(y_test_X, y_pred_X))

# Fit model for Group Y
pl.fit(X_train_Y, y_train_Y)
y_pred_Y = pl.predict(X_test_Y)
rmse_Y = np.sqrt(mean_squared_error(y_test_Y, y_pred_Y))

# Calculate observed RMSE difference
observed_rmse_diff = np.abs(rmse_X - rmse_Y)

# Run permutation test
num_repetitions = 100
rmse_diffs = np.array([])

for i in range(num_repetitions):
    # Permute 'n_steps' column
    df['shuffled_n_steps'] = np.random.permutation(df['n_steps'])
    
    # Split data into two groups based on the shuffled 'n_steps'
    shuffled_group_X = df[df['shuffled_n_steps'] <= 10]
    shuffled_group_Y = df[df['shuffled_n_steps'] > 10]
    
    # Fit model
    pl.fit(shuffled_group_X[['total_fat', 'carbs', 'sugar', 'sodium', 'protein', 'saturated_fat', 'n_steps', 'minutes']],
           shuffled_group_X['calories'])
    y_pred_X_shuffled = pl.predict(X_test_X)
    rmse_X_shuffled = np.sqrt(mean_squared_error(y_test_X, y_pred_X_shuffled))
    
    y_pred_Y_shuffled = pl.predict(X_test_Y)
    rmse_Y_shuffled = np.sqrt(mean_squared_error(y_test_Y, y_pred_Y_shuffled))
    
    # RMSE difference for shuffled groups
    shuffled_rmse_diff = np.abs(rmse_X_shuffled - rmse_Y_shuffled)
    rmse_diffs = np.append(rmse_diffs, shuffled_rmse_diff)

# P-value
p_value = (rmse_diffs >= observed_rmse_diff).mean()

print(f'Observed RMSE Difference: {observed_rmse_diff}')
print(f'P-value: {p_value}')

0.0
