In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
#import sweetviz as sv

In [2]:
seed = 2024

# pandas, statsmodels, matplotlib and y_data_profiling rely on numpy's random generator, and thus, we need to set the seed in numpy
np.random.seed(seed)

In [3]:
# load the data
diet = pd.read_csv('diet.csv', sep=",")
recipes = pd.read_csv('recipes.csv', sep=",")
requests = pd.read_csv('requests.csv', sep=",")
reviews = pd.read_csv('reviews.csv', sep=",")

  reviews = pd.read_csv('reviews.csv', sep=",")


In [4]:
# handling with missing values
diet = diet.dropna()
recipes = recipes.drop(['RecipeServings', 'RecipeYield'], axis=1)
reviews = reviews.drop('Rating', axis=1)
print(diet.info())
print(recipes.info())
print(reviews.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 271906 entries, 0 to 271906
Data columns (total 3 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   AuthorId  271906 non-null  object
 1   Diet      271906 non-null  object
 2   Age       271906 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 8.3+ MB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 75604 entries, 0 to 75603
Data columns (total 16 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   RecipeId                    75604 non-null  int64  
 1   Name                        75604 non-null  object 
 2   CookTime                    75604 non-null  int64  
 3   PrepTime                    75604 non-null  int64  
 4   RecipeCategory              75604 non-null  object 
 5   RecipeIngredientQuantities  75604 non-null  object 
 6   RecipeIngredientParts       75604 non-null  object 
 7   Calories     

In [5]:
#unify the labels for request table

requests['HighCalories'] = requests['HighCalories'].map( 
                   {1.0:True , 0.0:False, 'Indifferent': 'Indifferent'})
requests['HighProtein'] = requests['HighProtein'].map( 
                   {'Yes':True ,'Indifferent':'Indifferent', 'No':False})
requests['LowFat'] = requests['LowFat'].map( 
                   {1:True , 0:False, 'Indifferent':'Indifferent'})
requests['LowSugar'] = requests['LowSugar'].map( 
                   {'1':True ,'0':False, 'Indifferent':'Indifferent'})
requests['HighFiber'] = requests['HighFiber'].map( 
                   {1:True , 0:False, 'Indifferent':'Indifferent'})
requests

Unnamed: 0,AuthorId,RecipeId,Time,HighCalories,HighProtein,LowFat,LowSugar,HighFiber
0,2001012259B,73440,1799.950949,False,Indifferent,False,False,False
1,437641B,365718,4201.820980,False,True,False,Indifferent,True
2,1803340263D,141757,6299.861496,False,Indifferent,True,Indifferent,False
3,854048B,280351,19801.365796,False,True,True,False,True
4,2277685E,180505,5400.093457,False,Indifferent,False,False,False
...,...,...,...,...,...,...,...,...
140190,163793B,78171,1560.649725,False,Indifferent,False,False,True
140191,33888B,333262,1502.011466,True,Indifferent,True,False,False
140192,401942C,49200,5999.274269,False,Indifferent,False,False,True
140193,346866B,214815,899.523513,False,True,True,Indifferent,True


In [6]:
# arrange inappropriate data types
diet.dtypes
diet['Diet'] = diet['Diet'].astype('category')
diet.dtypes

AuthorId      object
Diet        category
Age            int64
dtype: object

In [7]:
# arrange inappropriate data types
recipes.dtypes
recipes['RecipeCategory'] = recipes['RecipeCategory'].astype('category')
recipes.dtypes

RecipeId                         int64
Name                            object
CookTime                         int64
PrepTime                         int64
RecipeCategory                category
RecipeIngredientQuantities      object
RecipeIngredientParts           object
Calories                       float64
FatContent                     float64
SaturatedFatContent            float64
CholesterolContent             float64
SodiumContent                  float64
CarbohydrateContent            float64
FiberContent                   float64
SugarContent                   float64
ProteinContent                 float64
dtype: object

In [8]:
# arrange inappropriate data types
reviews.dtypes
reviews['Like'] = reviews['Like'].apply(lambda x: pd.NA if pd.isna(x) else bool(x))
reviews.dtypes

AuthorId      object
RecipeId       int64
Like          object
TestSetId    float64
dtype: object

In [9]:
# arrange inappropriate data types
requests.dtypes
requests['HighCalories'] = requests['HighCalories'].astype('category')
requests['HighProtein'] = requests['HighProtein'].astype('category')
requests['LowFat'] = requests['LowFat'].astype('category')
requests['LowSugar'] = requests['LowSugar'].astype('category')
requests['HighFiber'] = requests['HighFiber'].astype('category')
requests.dtypes

AuthorId          object
RecipeId           int64
Time             float64
HighCalories    category
HighProtein     category
LowFat          category
LowSugar        category
HighFiber       category
dtype: object

In [10]:
# creating key attribute for request table
requests['AuthorId_RecipeId'] = requests['AuthorId'] + '_' + requests['RecipeId'].astype(str)
requests = requests.iloc[:, [8,0,1,2,3,4,5,6,7]]
requests

Unnamed: 0,AuthorId_RecipeId,AuthorId,RecipeId,Time,HighCalories,HighProtein,LowFat,LowSugar,HighFiber
0,2001012259B_73440,2001012259B,73440,1799.950949,False,Indifferent,False,False,False
1,437641B_365718,437641B,365718,4201.820980,False,True,False,Indifferent,True
2,1803340263D_141757,1803340263D,141757,6299.861496,False,Indifferent,True,Indifferent,False
3,854048B_280351,854048B,280351,19801.365796,False,True,True,False,True
4,2277685E_180505,2277685E,180505,5400.093457,False,Indifferent,False,False,False
...,...,...,...,...,...,...,...,...,...
140190,163793B_78171,163793B,78171,1560.649725,False,Indifferent,False,False,True
140191,33888B_333262,33888B,333262,1502.011466,True,Indifferent,True,False,False
140192,401942C_49200,401942C,49200,5999.274269,False,Indifferent,False,False,True
140193,346866B_214815,346866B,214815,899.523513,False,True,True,Indifferent,True


In [11]:
# creating key attribute for reviews table
reviews['AuthorId_RecipeId'] = reviews['AuthorId'] + '_' + reviews['RecipeId'].astype(str)
reviews = reviews.iloc[:, [4,0,1,2,3]]
reviews

Unnamed: 0,AuthorId_RecipeId,AuthorId,RecipeId,Like,TestSetId
0,2492191A_33671,2492191A,33671,,1.0
1,2002019979A_92647,2002019979A,92647,,2.0
2,408594E_161770,408594E,161770,,3.0
3,2001625557E_108231,2001625557E,108231,,4.0
4,2001427116E_71109,2001427116E,71109,,5.0
...,...,...,...,...,...
140190,999595E_338070,999595E,338070,False,
140191,999774A_29002,999774A,29002,False,
140192,999774A_159252,999774A,159252,False,
140193,999774A_1171,999774A,1171,True,


In [12]:
# flag recipe as vegan, vegetarian, omnivore

recipe_ingredients = recipes['RecipeIngredientParts']
recipe_names = recipes['Name']

# Define keywords that indicate non-vegetarian or non-vegan ingredients
non_vegetarian_keywords = ['chicken', 'beef', 'pork', 'fish', 'shrimp', 'bacon', 'lamb', 'turkey', 'ham']
non_vegan_keywords = ['milk', 'cheese', 'butter', 'cream', 'yogurt', 'honey', 'gelatin ', 'egg']

# Define vegetarian and vegan keywords
vegetarian_keywords = ['vegetarian']
vegan_keywords = vegetarian_keywords + ['vegan cheese', 'tofu', 'plant-based milk', 'soymilk', 'vegan']

# Create new columns indicating if the recipe is vegetarian, vegan, or omnivore
recipes['IsVegetarian'] = (~recipe_ingredients.str.contains('|'.join(non_vegetarian_keywords), case=False)) | recipe_ingredients.str.contains('|'.join(vegetarian_keywords), case=False)
recipes['IsVegan'] = (~recipe_ingredients.str.contains('|'.join(non_vegan_keywords), case=False)) | recipe_ingredients.str.contains('|'.join(vegan_keywords), case=False)

# checking also with using name of recipe
recipes['IsVegetarian'] |= recipe_names.str.contains('|'.join(vegetarian_keywords), case=False)
recipes['IsVegan'] |= recipe_names.str.contains('|'.join(vegan_keywords), case=False)

# Create a 'DietType' column
recipes['DietType'] = 'Omnivore'  # Assume omnivore by default

# Update the 'DietType' column based on 'IsVegetarian' and 'IsVegan' columns
recipes.loc[recipes['IsVegan'], 'DietType'] = 'Vegan'
recipes.loc[recipes['IsVegetarian'] & ~recipes['IsVegan'], 'DietType'] = 'Vegetarian'

recipes


Unnamed: 0,RecipeId,Name,CookTime,PrepTime,RecipeCategory,RecipeIngredientQuantities,RecipeIngredientParts,Calories,FatContent,SaturatedFatContent,CholesterolContent,SodiumContent,CarbohydrateContent,FiberContent,SugarContent,ProteinContent,IsVegetarian,IsVegan,DietType
0,73440,Bow Ties With Broccoli Pesto,0,1800,Other,"c(""\""6\"""", ""\""2\"""", ""\""1 1/2\"""", ""\""1/4\"""", ""\...","c(""\""hazelnuts\"""", ""\""broccoli florets\"""", ""\""...",241.3,10.1,1.2,0.0,13.1,31.8,2.3,1.4,6.7,True,True,Vegan
1,365718,Cashew-chutney Rice,3600,600,Other,"c(""\""1\"""", ""\""3/4\"""", ""\""6\"""", ""\""5\"""", ""\""2\""...","c(""\""celery\"""", ""\""onion\"""", ""\""butter\"""", ""\""...",370.8,17.5,7.2,22.9,553.3,44.3,1.6,2.2,9.4,False,False,Omnivore
2,141757,Copycat Taco Bell Nacho Fries BellGrande,3600,2700,Other,"c(""\""3\"""", ""\""1/2\"""", ""\""1\"""", ""\""1\"""", ""\""3\""...","c(""\""Copycat Taco Bell Seasoned Beef\"""", ""\""ye...",377.6,20.9,10.5,45.7,1501.8,36.6,3.8,6.1,12.9,False,False,Omnivore
3,280351,Slow Cooker Jalapeno Cheddar Cheese Soup,18000,1800,Other,"c(""\""2\"""", ""\""1\"""", ""\""2\"""", ""\""2\"""", ""\""1\"""",...","c(""\""unsalted butter\"""", ""\""yellow onion\"""", ""...",282.8,16.5,10.3,50.5,630.2,22.8,2.3,2.7,11.7,False,False,Omnivore
4,180505,Cool & Crisp Citrus Chiffon Pie,3600,1800,Other,"c(""\""1\"""", ""\""1/4\"""", ""\""1/2\"""", ""\""1/2\"""", ""\...","c(""\""unflavored gelatin\"""", ""\""water\"""", ""\""su...",257.5,8.6,2.4,110.7,160.9,39.8,0.4,30.2,6.3,True,True,Vegan
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75599,253577,Frijoles Negros- Crock Pot Mexican Black Beans,43200,28800,Other,"c(""\""2\"""", ""\""6 -8\"""", ""\""5\"""", ""\""1/2\"""", ""\""...","c(""\""black beans\"""", ""\""water\"""", ""\""bay leave...",121.5,0.5,0.1,0.0,1175.1,22.2,7.8,0.6,7.9,True,True,Vegan
75600,267827,Moose Moussaka,3600,2700,Other,"c(""\""1\"""", ""\""6\"""", ""\""2\"""", ""\""2\"""", ""\""1/2\""...","c(""\""onion\"""", ""\""garlic cloves\"""", ""\""olive o...",652.2,25.8,10.7,197.9,435.5,51.9,7.5,7.2,50.1,True,False,Vegetarian
75601,266983,Cantonese Pepper Steak for Two (Or More),1800,900,Other,"c(""\""1/2\"""", ""\""1\"""", ""\""1/8\"""", ""\""1/8\"""", ""\...","c(""\""top round steak\"""", ""\""cornstarch\"""", ""\""...",223.9,9.2,3.6,78.3,725.9,7.3,1.1,1.7,26.7,True,True,Vegan
75602,253739,Coconut Cream Cooler,300,120,Other,"c(""\""1\"""", ""\""1\"""", ""\""1\"""", ""\""7 1/2\"""", ""\""1...","c(""\""cream of coconut\"""", ""\""water\"""")",2229.8,80.3,69.3,0.0,294.7,369.0,15.7,317.9,26.7,True,False,Vegetarian


In [13]:
# merge all tables
final_df = reviews.merge(requests, left_on='AuthorId_RecipeId', right_on='AuthorId_RecipeId')
final_df = final_df.merge(diet, left_on='AuthorId_x', right_on='AuthorId')
final_df = final_df.merge(recipes, left_on='RecipeId_x', right_on='RecipeId')
final_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 140195 entries, 0 to 140194
Data columns (total 35 columns):
 #   Column                      Non-Null Count   Dtype   
---  ------                      --------------   -----   
 0   AuthorId_RecipeId           140195 non-null  object  
 1   AuthorId_x                  140195 non-null  object  
 2   RecipeId_x                  140195 non-null  int64   
 3   Like                        97381 non-null   object  
 4   TestSetId                   42814 non-null   float64 
 5   AuthorId_y                  140195 non-null  object  
 6   RecipeId_y                  140195 non-null  int64   
 7   Time                        140195 non-null  float64 
 8   HighCalories                140195 non-null  category
 9   HighProtein                 140195 non-null  category
 10  LowFat                      140195 non-null  category
 11  LowSugar                    140195 non-null  category
 12  HighFiber                   140195 non-null  category
 13 

In [14]:
# Creating new columns to measure customer needs are met or not

# Total time of recipe is sum of cook time and prep time
final_df['Cook_Prep_Time'] = final_df['CookTime'] + final_df['PrepTime']

average_calorie = final_df['Calories'].mean()
final_df['Calorie_of_Recipe'] = final_df['Calories'] > average_calorie
average_fat_content = final_df['FatContent'].mean()
final_df['Fat_Content_of_Recipe'] = final_df['FatContent'] > average_fat_content
median_fiber_content = final_df['FiberContent'].median()
final_df['FiberContent_of_Recipe'] = final_df['FiberContent'] > median_fiber_content
average_protein_content = final_df['ProteinContent'].mean()
final_df['Protein_Content_of_Recipe'] = final_df['ProteinContent'] > average_protein_content
median_sugar_content = final_df['SugarContent'].median()
final_df['SugarContent_of_Recipe'] = final_df['SugarContent'] > median_sugar_content

# droping some columns to make our model simpler
final_df = final_df.drop(['AuthorId_y', 'RecipeId_y', 'AuthorId','RecipeId', 'RecipeIngredientQuantities', 'Name', 'IsVegetarian', 'IsVegan', 'RecipeIngredientParts'], axis=1)
final_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 140195 entries, 0 to 140194
Data columns (total 32 columns):
 #   Column                     Non-Null Count   Dtype   
---  ------                     --------------   -----   
 0   AuthorId_RecipeId          140195 non-null  object  
 1   AuthorId_x                 140195 non-null  object  
 2   RecipeId_x                 140195 non-null  int64   
 3   Like                       97381 non-null   object  
 4   TestSetId                  42814 non-null   float64 
 5   Time                       140195 non-null  float64 
 6   HighCalories               140195 non-null  category
 7   HighProtein                140195 non-null  category
 8   LowFat                     140195 non-null  category
 9   LowSugar                   140195 non-null  category
 10  HighFiber                  140195 non-null  category
 11  Diet                       140195 non-null  category
 12  Age                        140195 non-null  int64   
 13  CookTime      

In [15]:
# desire time of recipe should be equal or greater than the time of recipe
final_df['Time_Check'] = final_df['Time'] >= final_df['Cook_Prep_Time']

# if customer prefer high calories, recipe should be flaged as high calorie
final_df.loc[final_df['HighCalories'] == 'TRUE', 'Calorie_Check'] = 1
final_df.loc[final_df['HighCalories'] == 'FALSE', 'Calorie_Check'] = 0
final_df['Calorie_Check'] = final_df['Calorie_Check'] == final_df['Calorie_of_Recipe']

final_df.loc[final_df['HighProtein'] == 'TRUE', 'Protein_Check'] = 1
final_df.loc[final_df['HighProtein'] == 'FALSE', 'Protein_Check'] = 0
final_df['Protein_Check'] = final_df['Protein_Check'] == final_df['Protein_Content_of_Recipe']

# since I flagged fat content of recipe 1 if it has high fat content, here I changed 0 and 1.
final_df.loc[final_df['LowFat'] == 'TRUE', 'Fat_Check'] = 0
final_df.loc[final_df['LowFat'] == 'FALSE', 'Fat_Check'] = 1
final_df['Fat_Check'] = final_df['Fat_Check'] == final_df['Fat_Content_of_Recipe']

final_df.loc[final_df['HighFiber'] == 'TRUE', 'Fiber_Check'] = 1
final_df.loc[final_df['HighFiber'] == 'FALSE', 'Fiber_Check'] = 0
final_df['Fiber_Check'] = final_df['Fiber_Check'] == final_df['FiberContent_of_Recipe']

# since I flagged sugar content of recipe 1 if it has high sugar content, here I changed 0 and 1.
final_df.loc[final_df['LowSugar'] == 'TRUE', 'Sugar_Check'] = 0
final_df.loc[final_df['LowSugar'] == 'FALSE', 'Sugar_Check'] = 1
final_df['Sugar_Check'] = final_df['Sugar_Check'] == final_df['SugarContent_of_Recipe']

# diet type of recipe should be same with the customer diet
final_df['Diet_Check'] = final_df['Diet'] == final_df['DietType']

# droping some columns to make our model simpler
check_df = final_df
check_df


  output = repr(obj)
  return method()


Unnamed: 0,AuthorId_RecipeId,AuthorId_x,RecipeId_x,Like,TestSetId,Time,HighCalories,HighProtein,LowFat,LowSugar,...,FiberContent_of_Recipe,Protein_Content_of_Recipe,SugarContent_of_Recipe,Time_Check,Calorie_Check,Protein_Check,Fat_Check,Fiber_Check,Sugar_Check,Diet_Check
0,2492191A_33671,2492191A,33671,,1.0,2698.714376,False,Indifferent,True,False,...,True,True,True,False,False,False,False,False,False,False
1,1054649A_33671,1054649A,33671,,518.0,2698.953218,True,True,False,False,...,True,True,True,False,False,False,False,False,False,False
2,1332021D_33671,1332021D,33671,,614.0,2699.449076,True,Indifferent,True,False,...,True,True,True,False,False,False,False,False,False,False
3,527011D_33671,527011D,33671,,635.0,2700.372697,False,Indifferent,False,Indifferent,...,True,True,True,True,False,False,False,False,False,False
4,945540E_33671,945540E,33671,,712.0,2697.940249,False,Indifferent,False,False,...,True,True,True,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
140190,998666C_215821,998666C,215821,False,,1198.343216,False,True,False,False,...,False,False,False,False,False,False,False,False,False,True
140191,999325C_41275,999325C,41275,False,,7200.236450,False,Indifferent,False,False,...,True,True,True,True,False,False,False,False,False,True
140192,999521E_55333,999521E,55333,True,,4801.128176,True,True,False,False,...,True,True,True,True,False,False,False,False,False,True
140193,999595E_338070,999595E,338070,False,,3899.421310,False,Indifferent,True,Indifferent,...,False,False,True,False,False,False,False,False,False,True


In [16]:
# just renaming
check_df.rename(columns={
'AuthorId_x': 'AuthorId',
'RecipeId_x': 'RecipeId',
}, inplace=True)

check_df

  output = repr(obj)
  return method()


Unnamed: 0,AuthorId_RecipeId,AuthorId,RecipeId,Like,TestSetId,Time,HighCalories,HighProtein,LowFat,LowSugar,...,FiberContent_of_Recipe,Protein_Content_of_Recipe,SugarContent_of_Recipe,Time_Check,Calorie_Check,Protein_Check,Fat_Check,Fiber_Check,Sugar_Check,Diet_Check
0,2492191A_33671,2492191A,33671,,1.0,2698.714376,False,Indifferent,True,False,...,True,True,True,False,False,False,False,False,False,False
1,1054649A_33671,1054649A,33671,,518.0,2698.953218,True,True,False,False,...,True,True,True,False,False,False,False,False,False,False
2,1332021D_33671,1332021D,33671,,614.0,2699.449076,True,Indifferent,True,False,...,True,True,True,False,False,False,False,False,False,False
3,527011D_33671,527011D,33671,,635.0,2700.372697,False,Indifferent,False,Indifferent,...,True,True,True,True,False,False,False,False,False,False
4,945540E_33671,945540E,33671,,712.0,2697.940249,False,Indifferent,False,False,...,True,True,True,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
140190,998666C_215821,998666C,215821,False,,1198.343216,False,True,False,False,...,False,False,False,False,False,False,False,False,False,True
140191,999325C_41275,999325C,41275,False,,7200.236450,False,Indifferent,False,False,...,True,True,True,True,False,False,False,False,False,True
140192,999521E_55333,999521E,55333,True,,4801.128176,True,True,False,False,...,True,True,True,True,False,False,False,False,False,True
140193,999595E_338070,999595E,338070,False,,3899.421310,False,Indifferent,True,Indifferent,...,False,False,True,False,False,False,False,False,False,True


In [17]:
from sklearn.model_selection import train_test_split

# if like=NA, splitting them as test set
test_set = check_df[check_df['Like'].isna()]
train_val_set = check_df.dropna(subset=['Like'])

# Split data into features (X) and target variable (y)
X_train_val = train_val_set.drop(columns=['Like'])
Y_train_val = train_val_set['Like']

# splitting X_train_val as train and validation
X_train, X_val, Y_train, Y_val = train_test_split(X_train_val, Y_train_val, test_size=0.2, random_state=seed)

In [18]:
exclude_columns = ['AuthorId_RecipeId', 'AuthorId', 'RecipeId', 'TestSetId']

# Remove excluded columns from features
X_train = X_train.drop(columns=exclude_columns)
X_val = X_val.drop(columns=exclude_columns)
test_set = test_set.drop(columns=exclude_columns)

X_train = pd.get_dummies(X_train, columns=['HighCalories', 'HighProtein', 'LowFat', 'LowSugar', 'HighFiber', 'Diet', 'RecipeCategory', 'DietType'])
X_val = pd.get_dummies(X_val, columns=['HighCalories', 'HighProtein', 'LowFat', 'LowSugar', 'HighFiber', 'Diet', 'RecipeCategory', 'DietType'])
test_set = pd.get_dummies(test_set, columns=['HighCalories', 'HighProtein', 'LowFat', 'LowSugar', 'HighFiber', 'Diet', 'RecipeCategory', 'DietType'])

# Convert boolean features to numeric (it is required for logistic regression, it only accepts numeric values)
bool_columns = ['Time_Check', 'Calorie_Check', 'Protein_Check', 'Fat_Check', 'Fiber_Check', 'Sugar_Check', 'Diet_Check', 
'Calorie_of_Recipe', 'Fat_Content_of_Recipe', 'FiberContent_of_Recipe', 'Protein_Content_of_Recipe', 'SugarContent_of_Recipe',
'HighCalories_False', 'HighCalories_True', 'HighProtein_True', 'HighProtein_Indifferent', 'LowFat_False', 'LowFat_True', 'LowSugar_False', 'LowSugar_Indifferent',
'HighFiber_False', 'HighFiber_True', 'Diet_Omnivore', 'Diet_Vegan', 'Diet_Vegetarian', 'RecipeCategory_Beverages', 'RecipeCategory_Bread','RecipeCategory_Breakfast','RecipeCategory_Lunch',
'RecipeCategory_One dish meal', 'RecipeCategory_Other','RecipeCategory_Soup','DietType_Omnivore', 'DietType_Vegan', 'DietType_Vegetarian']

X_train[bool_columns] = X_train[bool_columns].astype(int)
X_val[bool_columns] = X_val[bool_columns].astype(int)

test_set[bool_columns] = test_set[bool_columns].astype(int)

Y_train = Y_train.astype(int)
Y_val = Y_val.astype(int)


# checking data types => all numeric, no problem
print("Data Types:")
print(X_train.dtypes)

# Checkin missing values => no NA, no problem
print("\nMissing Values:")
print(X_train.isnull().sum())



Data Types:
Time                            float64
Age                               int64
CookTime                          int64
PrepTime                          int64
Calories                        float64
FatContent                      float64
SaturatedFatContent             float64
CholesterolContent              float64
SodiumContent                   float64
CarbohydrateContent             float64
FiberContent                    float64
SugarContent                    float64
ProteinContent                  float64
Cook_Prep_Time                    int64
Calorie_of_Recipe                 int32
Fat_Content_of_Recipe             int32
FiberContent_of_Recipe            int32
Protein_Content_of_Recipe         int32
SugarContent_of_Recipe            int32
Time_Check                        int32
Calorie_Check                     int32
Protein_Check                     int32
Fat_Check                         int32
Fiber_Check                       int32
Sugar_Check                 

In [19]:
len(X_val)

19477

In [20]:

# this data will train our model
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 77904 entries, 96382 to 12631
Data columns (total 49 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Time                          77904 non-null  float64
 1   Age                           77904 non-null  int64  
 2   CookTime                      77904 non-null  int64  
 3   PrepTime                      77904 non-null  int64  
 4   Calories                      77904 non-null  float64
 5   FatContent                    77904 non-null  float64
 6   SaturatedFatContent           77904 non-null  float64
 7   CholesterolContent            77904 non-null  float64
 8   SodiumContent                 77904 non-null  float64
 9   CarbohydrateContent           77904 non-null  float64
 10  FiberContent                  77904 non-null  float64
 11  SugarContent                  77904 non-null  float64
 12  ProteinContent                77904 non-null  float64
 1

In [21]:
# using example forom example_crips_dm_pipeline
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

# Define models
model_logistic_regression = LogisticRegression(max_iter=30, random_state=seed)
model_random_forest = RandomForestClassifier(random_state=seed)
model_gradient_boosting = GradientBoostingClassifier(random_state=seed)

# Define preprocessing steps
transform_scaler = StandardScaler()
transform_pca = PCA()

# Create a pipeline with preprocessing steps and a placeholder for the model
pipeline = Pipeline(steps=[("scaler", transform_scaler), 
                             ("pca", transform_pca),
                             ("model", None)])

# Define parameter grid for preprocessing
parameter_grid_preprocessing = {
    "pca__n_components": [1, 2, 3, 4]
}

# Define parameter grids for each model
parameter_grid_logistic_regression = {
    "model": [model_logistic_regression],
    "model__C": [0.1, 1, 10]  # inverse regularization strength
}

parameter_grid_random_forest = {
    "model": [model_random_forest],
    "model__n_estimators": [10, 20, 50],  # number of max trees in the forest
    "model__max_depth": [2, 3, 4]
}

parameter_grid_gradient_boosting = {
    "model": [model_gradient_boosting],
    "model__n_estimators": [10, 20, 30]
}
meta_parameter_grid = [parameter_grid_logistic_regression,
                       parameter_grid_random_forest,
                       parameter_grid_gradient_boosting]

meta_parameter_grid = [{**parameter_grid_preprocessing, **model_grid}
                       for model_grid in meta_parameter_grid]
search = GridSearchCV(pipeline,
                      meta_parameter_grid,
                      scoring="balanced_accuracy",
                      n_jobs=2,
                      cv=5,
                      error_score="raise")

# Train models and search for best parameters
search.fit(X_train, Y_train)

print("Best parameters:", search.best_params_, "(CV score=%0.3f)" % search.best_score_)


Best parameters: {'model': LogisticRegression(C=0.1, max_iter=30, random_state=2024), 'model__C': 0.1, 'pca__n_components': 4} (CV score=0.501)


In [22]:
from sklearn.metrics import accuracy_score
# Get the best model from the search
best_model = search.best_estimator_

# Use the best model to make predictions on the validation set
y_pred_best_model = best_model.predict(X_val)

# Evaluate the performance of the best model
accuracy_best_model = accuracy_score(Y_val, y_pred_best_model)
print("Best Model Accuracy:", accuracy_best_model)

Best Model Accuracy: 0.8671766699183653


In [23]:
# Split test data into features (X_test) 
X_test = test_set.drop(columns=['Like'])
Y_test = test_set['Like']

# Use the best model to make predictions on the test set
y_pred_test = best_model.predict(X_test)

# evaluate performance of model on test set
print("Score on test set:", search.score(X_test, y_pred_test))

# contingency table
ct = pd.crosstab(search.best_estimator_.predict(X_test), y_pred_test,
                 rownames=["pred"], colnames=["true"])
print(ct)

Score on test set: 1.0
true      0   1
pred           
0     42729   0
1         0  85


In [24]:
output = pd.DataFrame(y_pred_test, columns=['predictions'])

output['id'] = range(1, len(y_pred_test) + 1)
output['predictions'] = output['predictions'].astype(int)

output = output[['id', 'predictions']]
output = output.reindex(columns=["id", "predictions"])
# Concatenate all values into a single column separated by commas
output["id,predictions"] = output['id'].astype(str) +', '+output['predictions'].astype(str)

# Drop the original columns if needed
output = output.drop(['id', 'predictions'], axis=1)
output.to_csv('predictions_Good_Morning_02.csv', index=False)

output

Unnamed: 0,"id,predictions"
0,"1, 0"
1,"2, 0"
2,"3, 0"
3,"4, 0"
4,"5, 0"
...,...
42809,"42810, 0"
42810,"42811, 0"
42811,"42812, 0"
42812,"42813, 0"
