In [None]:
# Import and install packages
%pip install pyfood
%pip install imblearn

import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, balanced_accuracy_score
from pyfood.utils import Shelf
import numpy as np
from scipy.stats import zscore
from imblearn.over_sampling import RandomOverSampler
from sklearn.feature_selection import SelectFromModel
from scipy.stats import zscore

In [None]:
# Set seed value
# One seed for the whole notebook: it seeds NumPy's global RNG here and is also
# passed as random_state to the train/validation split, the random oversampler
# and the random forest in later cells.
seed = 2024
np.random.seed(seed)

In [None]:
# Read and preprocess diets
diets = pd.read_csv("diet.csv")

# Impute missing diets with the most frequent observed diet.
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on the selection diets['Diet']: with pandas
# Copy-on-Write such an in-place call only touches a temporary object and the
# NaNs would silently stay in `diets`.
# Series.mode() ignores NaN, so the imputed value is always a real diet label.
most_common_diet = diets['Diet'].mode().iloc[0]
diets['Diet'] = diets['Diet'].fillna(most_common_diet)

# Ordinal encoding of the 'Diet' column (kept alongside the one-hot columns)
mapping = {'Omnivore': 0, 'Vegetarian': 1, 'Vegan': 2}
diets['Diet2'] = diets['Diet'].map(mapping)

# One-hot encode the diet type; this replaces 'Diet' with Diet_<label> columns
diets = pd.get_dummies(diets, columns=['Diet'], dtype='int')

diets

In [None]:
# Load the reviews with explicit column dtypes.
# 'Like' and 'TestSetId' are read as float so that missing entries can be
# represented as NaN (the split cell relies on TestSetId being NaN or not).
review_dtypes = {
    'AuthorId': str,
    'RecipeId': int,
    'Rating': float,
    'Like': float,
    'TestSetId': float,
}
reviews = pd.read_csv("reviews.csv", dtype=review_dtypes)

# 'Rating' is discarded entirely (an earlier variant zero-filled it instead).
reviews = reviews.drop(columns=['Rating'])
reviews

In [None]:
# Read and preprocess recipes
recipes = pd.read_csv("recipes.csv")

# Turn the raw ingredient strings into lists of lower-case ingredient names.
# Entries starting with "c" lose their first 5 and last 4 characters, all other
# entries lose one character on each side; backslashes are removed and the rest
# is split on the literal separator `"", ""`.
# NOTE(review): presumably the "c" case is an R-style c("...") vector and the
# other case a single quoted string -- confirm against recipes.csv.
recipes['RecipeIngredientParts'] = [ingredient[5:-4].lower() if ingredient.startswith("c") else ingredient[1:-1].lower() for ingredient in recipes['RecipeIngredientParts']]
recipes['RecipeIngredientParts'] = recipes['RecipeIngredientParts'].str.replace('\\', "", regex=False)
recipes['RecipeIngredientParts'] = recipes['RecipeIngredientParts'].str.split(r'"", ""')

# Add new column: TotalTime = CookTime + PrepTime
total_time = recipes['CookTime'] + recipes['PrepTime']
recipes.insert(4,'TotalTime', total_time)

# Collect all the ingredients and put it into a list
all_ingredients_list = [ingredient for recipe in recipes['RecipeIngredientParts'] for ingredient in recipe if len(ingredient) != 0]
all_ingredients_set = set(all_ingredients_list)

# Create list of food that should be avoided by vegan and vegetarians
shelf = Shelf(lang_source='en')

vegetarian_avoid = []
vegan_avoid = []

# Each distinct ingredient is classified once; ingredients for which the shelf
# returns None are treated as unproblematic.
for ingredient in all_ingredients_set:
    label = shelf.process_ingredients([ingredient])
    if label is not None:
        if not label["labels"]["vegan"]:
            vegan_avoid.append(ingredient)
        if not label["labels"]["vege"]:
            vegetarian_avoid.append(ingredient)

# Build the lookup sets ONCE. Previously set(vegetarian_avoid) and
# set(vegan_avoid) were rebuilt for every recipe inside the loop, which made
# this step scale with (number of recipes x number of avoided ingredients).
vegetarian_avoid_set = set(vegetarian_avoid)
vegan_avoid_set = set(vegan_avoid)

# Add 2 columns to identify whether the recipe is vegan or vegetarian:
# 1 when the recipe shares no ingredient with the avoid set, 0 otherwise.
vegetarian_labels = [int(vegetarian_avoid_set.isdisjoint(ingredients))
                     for ingredients in recipes['RecipeIngredientParts']]
vegan_labels = [int(vegan_avoid_set.isdisjoint(ingredients))
                for ingredients in recipes['RecipeIngredientParts']]

recipes['IsRecipeVegan'] = vegan_labels
recipes['IsRecipeVegetarian'] = vegetarian_labels


# Drop the raw text columns and the features that are not used downstream
recipes = recipes.drop(["RecipeIngredientQuantities", "RecipeIngredientParts", "Name", "RecipeYield", "RecipeServings",
                       "CookTime", "PrepTime", "CholesterolContent", "SodiumContent", "CarbohydrateContent"], axis=1)

recipes

In [None]:
# Load the requests table and encode its categorical preference columns
requests = pd.read_csv("requests.csv")

# Floor the requested time at zero (negative values become 0)
requests['Time'] = requests['Time'].apply(lambda requested: max(0, requested))

# Label -> integer encodings; labels not listed here map to NaN
protein_mapping = {'Indifferent': 0, 'Yes': 1}
sugar_mapping = {'0': 0, 'Indifferent': 1}

for column, encoding in (('HighProtein', protein_mapping), ('LowSugar', sugar_mapping)):
    requests[column] = requests[column].map(encoding)

requests

In [None]:
# Build the modelling table by joining all sources (inner joins throughout)
# requests x diets on AuthorId
merged1 = requests.merge(diets, on='AuthorId')

# (requests x diets) x recipes on RecipeId
merged2 = merged1.merge(recipes, on="RecipeId")

# Diet compatibility: 0 when a vegan customer is paired with a non-vegan recipe
# or a vegetarian customer with a non-vegetarian recipe, 1 otherwise.
vegan_conflict = (merged2['Diet_Vegan'] == 1) & (merged2['IsRecipeVegan'] == 0)
vegetarian_conflict = (merged2['Diet_Vegetarian'] == 1) & (merged2['IsRecipeVegetarian'] == 0)
merged2['Compatibility'] = (~(vegan_conflict | vegetarian_conflict)).astype('int64')

# Disabled experiment: a binary "time difference" feature
# merged2['TimeDiff'] = merged2['Time'] - merged2['TotalTime']
# merged2['TimeDiff'] = merged2['TimeDiff'].abs() < merged2['TimeDiff'].mean()
# merged2['TimeDiff'] = merged2['TimeDiff'].astype(int)

merged2 = merged2.drop(columns=['Time', 'TotalTime'])

# reviews x (requests x diets x recipes) on AuthorId and RecipeId
mergedAll = reviews.merge(merged2, on=["AuthorId", "RecipeId"])

# The diet / recipe flags are summarised by 'Compatibility', and the id columns
# are join keys only, so none of them is kept as a feature.
unused_columns = [
    "Diet_Omnivore", "Diet_Vegan", "IsRecipeVegan", "AuthorId", "RecipeId",
    "Diet_Vegetarian", "IsRecipeVegetarian", "SaturatedFatContent",
]
mergedAll = mergedAll.drop(columns=unused_columns)

# Integer-encode the recipe category; categories not listed map to NaN
mapping = {
    'Other': 0,
    'Lunch': 1,
    'One dish meal': 2,
    'Bread': 3,
    'Breakfast': 4,
    'Beverages': 5,
    'Soup': 6,
}
mergedAll["RecipeCategory"] = mergedAll["RecipeCategory"].map(mapping)

mergedAll

In [None]:
# Separate labelled rows from the rows to be predicted.
# A row belongs to the test set exactly when it carries a TestSetId.
is_test_row = mergedAll['TestSetId'].notna()

trainData = mergedAll[~is_test_row].drop(columns="TestSetId")
trainData["Like"] = trainData["Like"].astype("int")

# Outlier filter: keep only rows whose z-score is within the cutoff in EVERY
# numeric column (the target 'Like' is numeric here and is included too).
# Despite its name, `outlier_mask` is True for the values that are kept.
numeric_columns = trainData.select_dtypes(include=[np.number]).columns
z_scores = zscore(trainData[numeric_columns])
# NOTE(review): the original annotation here read "30 - 83.50"; presumably the
# score obtained with this cutoff -- confirm.
threshold = 30
outlier_mask = np.abs(z_scores) <= threshold
trainData = trainData[outlier_mask.all(axis=1)]

testData = mergedAll[is_test_row]

# Features / target
Y = trainData["Like"]
X = trainData.drop(columns="Like")

# Hold out 20% of the labelled rows for validation
X_train, X_val, y_train, y_val = train_test_split(X, Y, test_size=0.2, random_state=seed)

trainData

In [None]:
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, classification_report

# Random forest configuration
rf_params = dict(
    random_state=seed,
    n_estimators=1000,
    max_depth=20,
    min_samples_split=10,
    min_samples_leaf=20,
    class_weight='balanced',
    bootstrap=False,
    criterion='gini',
    n_jobs=-1,
)
rf = RandomForestClassifier(**rf_params)

# Oversample the training split only; the validation split stays untouched
ros = RandomOverSampler(random_state=seed)
X_resampled_train, y_resampled_train = ros.fit_resample(X_train, y_train)

rf.fit(X_resampled_train, y_resampled_train)

# Probability of the positive class (column 1) for each validation row
y_pred_proba = rf.predict_proba(X_val)[:, 1]

# Scan candidate decision thresholds and keep the first one that achieves the
# highest balanced accuracy on the validation split.
# NOTE(review): the threshold is chosen and scored on the same validation data,
# so the figure printed below is an optimistic estimate.
best_threshold = 0
best_balanced_accuracy = 0

for candidate in np.arange(0, 1.01, 0.001):
    candidate_score = balanced_accuracy_score(y_val, (y_pred_proba > candidate).astype(int))
    if candidate_score > best_balanced_accuracy:
        best_threshold, best_balanced_accuracy = candidate, candidate_score

# Final validation predictions at the selected threshold
y_pred_final = (y_pred_proba > best_threshold).astype(int)

# Report
print("Best Balanced Accuracy: {:.2f}%".format(best_balanced_accuracy * 100))
print("Best Threshold:", best_threshold)

conf_matrix = confusion_matrix(y_val, y_pred_final)
classification_rep = classification_report(y_val, y_pred_final)
print(f"Confusion Matrix:\n{conf_matrix}")
print(f"Classification Report:\n{classification_rep}")


In [None]:
# Write predictions into csv
# Sort the test rows once and derive BOTH the ids and the feature matrix from
# that same sorted frame. Previously the ids came from the sorted frame while
# the features were rebuilt from the unsorted `testData`, so predictions were
# attached to ids purely by position and could end up on the wrong rows.
testSorted = testData.sort_values("TestSetId")

TestSetId = testSorted["TestSetId"].astype("int")
X = testSorted.drop(["Like", "TestSetId"], axis=1)

# Positive-class probability, binarised with the threshold tuned on validation.
# (The earlier rf.predict(X) call was removed: its result was overwritten.)
result_y = rf.predict_proba(X)[:, 1]
result_y = (result_y > best_threshold).astype(int)

# Both columns are plain arrays in the same (sorted) row order
output = pd.DataFrame({"id": TestSetId.to_numpy(), "prediction": result_y})

output.to_csv('prediction_PinguHunters_9.csv', index=False)

# Class balance of the predictions; .get(..., 0) avoids a KeyError when one of
# the classes is never predicted.
prediction_counts = output['prediction'].value_counts()
for label in (0, 1):
    count = prediction_counts.get(label, 0)
    print(f"{label}: ", count, "making up ", count / len(output['prediction']))

result_y