# Portfolio Project: Recipes and Ratings

**Name(s)**: Megan Bowen and Jade Vanhaitsma

In [None]:
import pandas as pd
import numpy as np

import plotly.express as px
pd.options.plotting.backend = 'plotly'

from lec_utils import * # Feel free to uncomment and use this. It'll make your plotly graphs look like ours in lecture!

## Step 1: Introduction

In [None]:
# Load the two raw CSV tables used throughout the notebook.
recipes = pd.read_csv('food_data/RAW_recipes.csv')
interactions = pd.read_csv('food_data/RAW_interactions.csv')

In [None]:
# Left join keeps every recipe row, including recipes with no matching
# interaction (their interaction columns come back as NaN).
rr = recipes.merge(interactions, how='left', left_on='id', right_on='recipe_id')
rr

In [None]:
# Histogram of ratings before cleaning; saved as standalone HTML and shown inline.
fig = px.histogram(data_frame=rr, x='rating', title='Rating Distribution')
fig.write_html('ratingRaw.html', include_plotlyjs='cdn')
fig.show()

In [None]:
# Treat a rating of 0 as missing so it is excluded from later averages.
rr['rating'] = rr['rating'].replace(0, np.nan)

In [None]:
# Same histogram after zero ratings were replaced by NaN.
fig = px.histogram(data_frame=rr, x='rating', title='Rating Distribution')
fig.write_html('ratingClean.html', include_plotlyjs='cdn')

fig.show()

In [None]:
# Mean rating per recipe id (NaN ratings are skipped by the mean).
avg_r = rr.groupby('id')[['rating']].mean().reset_index()
# Both frames have a `rating` column, so the merge suffixes them as
# rating_x (per-review rating) and rating_y (per-recipe mean).
rr = rr.merge(avg_r, on='id')

In [None]:
# Give the merge-suffixed rating columns meaningful names.
rating_names = {
    "rating_x": "rating",
    "rating_y": "avg_rating",
}
rr = rr.rename(columns=rating_names)
rr

## Step 2: Data Cleaning and Exploratory Data Analysis

In [None]:
# Summarise the merged frame (columns, dtypes, non-null counts).
rr.info()

In [None]:
# Keep only the columns used in the analysis.
# `.copy()` makes rr_filtered an independent frame, so the column
# assignments in later cells do not raise SettingWithCopyWarning.
relevant_cols = ['name', 'id', 'minutes', 'submitted', 'tags', 'nutrition', 'n_steps',
                 'ingredients', 'n_ingredients', 'user_id', 'rating', 'avg_rating']
rr_filtered = rr[relevant_cols].copy()
rr_filtered.head()

In [None]:
# Add n_rating: the number of actual ratings each recipe received.
# Counting the `rating` column (not `id`) skips NaN entries, so reviews
# without a rating are not counted as ratings.
# `assign` returns a new frame, which avoids a chained-assignment warning.
rr_filtered = rr_filtered.assign(
    n_rating=rr_filtered.groupby('id')['rating'].transform('count')
)
rr_filtered.head()

In [None]:
# Collapse to one row per recipe id, taking the max of each kept column
# within the recipe's rows. The result is indexed by id.
recipe_level_cols = [
    'name', 'submitted', 'minutes', 'tags', 'nutrition', 'n_steps',
    'ingredients', 'n_ingredients', 'n_rating', 'avg_rating',
]
rr_filtered = rr_filtered.groupby('id')[recipe_level_cols].max()

In [None]:
# Drop recipes with no average rating: there is nothing to compare against.
# `.copy()` detaches the result from the original frame so the column
# assignments in later cells do not raise SettingWithCopyWarning.
rr_filtered = rr_filtered[rr_filtered['avg_rating'].notna()].copy()

In [None]:
# Number of tags per recipe, computed as comma count + 1 on the tags string.
# NOTE(review): this yields 1 for an empty tag list — confirm none exist.
rr_filtered['n_tags'] = rr_filtered['tags'].str.count(',') + 1

In [None]:
# Split the bracketed, comma-separated `nutrition` string into seven float columns.
# NOTE(review): assumes the values appear in exactly this order — confirm
# against the dataset documentation.
nutrition_cols = ['calories', 'total fat', 'sugar', 'sodium', 'protein', 'saturated fat', 'carbohydrates']
rr_filtered[nutrition_cols] = (
    rr_filtered['nutrition']
    .str.strip('[]')              # literal strip; avoids str.replace's version-dependent regex default
    .str.split(',', expand=True)
    .astype(float)
)
rr_filtered

In [None]:
# Submission year: the first four characters of `submitted`, stored as float.
rr_filtered['year'] = rr_filtered['submitted'].str[:4].astype(float)
# 1 if the text 'healthy' appears anywhere in the tags string, else 0.
# NOTE(review): this is a substring match, so any tag containing 'healthy' counts.
rr_filtered['healthy_tag'] = (
    rr_filtered['tags'].str.contains('healthy', regex=False).astype('int64')
)
rr_filtered.head()

In [None]:
# The raw text columns have been turned into features above; remove them.
rr_filtered = rr_filtered.drop(columns=['tags', 'ingredients', 'nutrition'])
rr_filtered.head()

In [None]:
# Counts of recipes with and without the healthy tag.
fig = px.histogram(data_frame=rr_filtered, x='healthy_tag', title='Healthy Tag Distribution')

fig.show()
fig.write_html('healthytag_dist.html', include_plotlyjs='cdn')

In [None]:
# Total-fat histogram with the x-axis limited to 0-200 and the median
# marked by a dashed red vertical line.
median_value = np.median(rr_filtered["total fat"])

fig = px.histogram(data_frame=rr_filtered, x='total fat', title='Total Fat Distribution Cropped')
fig.add_vline(x=median_value, line_width=3, line_dash="dash", line_color="red")
fig.update_layout(xaxis_range=[0, 200])

fig.write_html('total_fatCrop.html', include_plotlyjs='cdn')
fig.show()

In [None]:
# Medians of saturated fat and sugar, displayed as a pair.
tuple(np.median(rr_filtered[col]) for col in ("saturated fat", "sugar"))

In [None]:
# Total-fat histogram over the full range, with the median marked by a
# dashed red vertical line.
median_value = np.median(rr_filtered["total fat"])

fig = px.histogram(data_frame=rr_filtered, x='total fat', title='Total Fat Distribution')
fig.add_vline(x=median_value, line_width=3, line_dash="dash", line_color="red")
fig.write_html('total_fatReg.html', include_plotlyjs='cdn')

fig.show()

In [None]:
# Distribution of the sugar column.
fig = px.histogram(data_frame=rr_filtered, x='sugar')

fig.show()

In [None]:
# Calories by healthy tag, full y-range.
fig = px.box(
    data_frame=rr_filtered,
    x='healthy_tag',
    y='calories',
    title='Healthy vs Unhealthy Calorie Distributions (Full Range)',
)
fig.show()
fig.write_html('calorie_allrange.html', include_plotlyjs='cdn')

In [None]:
# Calories by healthy tag, y-axis limited to 0-2000 so the boxes are readable.
fig = px.box(
    data_frame=rr_filtered,
    x='healthy_tag',
    y='calories',
    title='Healthy vs Unhealthy Calorie Distributions (Cropped Range)',
)
fig.update_layout(yaxis_range=[0, 2000])
fig.show()
fig.write_html('calorie_cropped.html', include_plotlyjs='cdn')

In [None]:
# Total fat by healthy tag. The plotted quantity is total fat, so the title
# says so, and the 0-500 crop is applied to the value (y) axis rather than
# the categorical healthy_tag (x) axis.
fig = px.box(
    rr_filtered,
    x='healthy_tag',
    y='total fat',
    title='Healthy vs Unhealthy Total Fat Distributions (Concentrated Range)',
)
fig.update_layout(yaxis=dict(range=[0, 500]))
fig.show()

In [None]:
# Protein against calories.
fig = px.scatter(data_frame=rr_filtered, x='calories', y='protein')
fig.show()

In [None]:
# Total fat against calories.
fig = px.scatter(data_frame=rr_filtered, x='calories', y='total fat')
fig.show()

In [None]:
# Carbohydrates against calories.
fig = px.scatter(data_frame=rr_filtered, x='calories', y='carbohydrates')
fig.show()

In [None]:
# Calories against average rating.
fig = px.scatter(data_frame=rr_filtered, x='avg_rating', y='calories')
fig.show()

In [None]:
# Average rating by healthy tag.
fig = px.box(data_frame=rr_filtered, x='healthy_tag', y='avg_rating')
fig.show()

In [None]:
# Median of each nutrition column for recipes with and without the healthy tag.
nutrition_value_cols = [
    'calories', 'total fat', 'sugar', 'sodium',
    'protein', 'saturated fat', 'carbohydrates',
]
rr_filtered_grouped = rr_filtered.groupby('healthy_tag')[nutrition_value_cols].median()
rr_filtered_grouped

In [None]:
# Median of each nutrition column per submission year.
by_year = rr_filtered.groupby('year')
by_year[['calories', 'total fat', 'sugar', 'sodium', 'protein', 'saturated fat', 'carbohydrates']].median()

## Step 3: Framing a Prediction Problem

In [None]:
#Predict if recipe will be tagged as healthy or not

## Step 4: Baseline Model

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [None]:
# Hold out a test set shared by every model below. Predictors are the
# nutrition values plus recipe metadata; the target is the binary healthy_tag.
# Ideas still to explore:
#   - take the most common tags rather than looking solely at the unique tags
#   - look for tags that could indicate healthy / unhealthy recipes and use them
#   - logistic regression with calories and saturated fat as predictors?
feature_cols = ['calories', 'total fat', 'sugar', 'sodium', 'protein', 'saturated fat',
                'carbohydrates', 'minutes', 'year', 'avg_rating', 'n_ingredients',
                'n_steps', 'n_rating']
X_train, X_test, y_train, y_test = train_test_split(
    rr_filtered[feature_cols],
    rr_filtered['healthy_tag'],
    random_state=42,  # fixed seed so the split and every score below are reproducible
)

In [None]:
from sklearn.pipeline import FunctionTransformer, make_pipeline
from sklearn.preprocessing import StandardScaler

In [None]:
# Baseline: logistic regression on three nutrition columns only.
baseline_features = ['sugar', 'sodium', 'saturated fat']
base_model = make_pipeline(LogisticRegression(max_iter=1000))
base_model.fit(X_train[baseline_features], y_train)
base_model

In [None]:
# Accuracy of the baseline on the held-out split (same three features as training).
X_test_base = X_test[['sugar', 'sodium', 'saturated fat']]
base_model.score(X_test_base, y_test)

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score

# Idea: also look at total fat, protein and carbs.
# Predict on the held-out split with the same three features used for training.
y_pred = base_model.predict(X_test[['sugar', 'sodium', 'saturated fat']])

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

# Report the three metrics in the same order and wording as before.
for metric_label, metric_value in (
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("Recall:", recall),
):
    print(metric_label, metric_value)

In [None]:
# base_model.predict_proba(pd.DataFrame([{
#     'calories': 200,
#     'total fat': 8
# }]))

In [None]:
# base_model.predict(pd.DataFrame([{
#     'calories': 200,
#     'total fat': 8
# }]))

In [None]:
# Intercept and the per-feature coefficients of the fitted baseline.
logreg = base_model['logisticregression']
logreg.intercept_[0], list(logreg.coef_[0])

In [None]:
# w0_star = base_model['logisticregression'].intercept_[0]
# w1_star = base_model['logisticregression'].coef_[0][0]
# T = 0.5
# cal_threshold = (np.log(T / (1 - T)) - w0_star) / w1_star
# cal_threshold

In [None]:
#notes-- probably not super generalizable; using accuracy/precision/recall to assess
#honestly seems like it's just classifying everything as not having the healthy tag
#also only uses nutritional content of recipes; there could be other features of the data that are related to being tagged healthy/not

## Step 5: Final Model

In [None]:
# TODO
# use cross-validation to determine which hyperparameters are best, then fit the model and assess precision/accuracy/recall

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Final model 1: logistic regression on the seven nutrition columns, with
# 5-fold cross-validation over regularisation strength C and penalty type.
nutrition_features = ['calories', 'total fat', 'sugar', 'sodium', 'protein', 'saturated fat', 'carbohydrates']

final_model1 = make_pipeline(LogisticRegression(max_iter=1000, solver='liblinear'))

param_grid = {
    'logisticregression__C': [0.1, 1, 10, 100],
    'logisticregression__penalty': ['l1', 'l2'],
}

grid_search = GridSearchCV(final_model1, param_grid, cv=5)
grid_search.fit(X_train[nutrition_features], y_train)

# Report the winning hyperparameters and their cross-validated score.
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Depth-3 decision tree on all training features; the cell displays its test accuracy.
model_tree2 = DecisionTreeClassifier(max_depth=3).fit(X_train, y_train)
model_tree2.score(X_test, y_test)

In [None]:
import matplotlib.pyplot as plt  # imported here so the cell does not rely on a star import for `plt`
from sklearn.tree import plot_tree

# Draw the fitted depth-3 tree. feature_names is passed as a plain list
# because some scikit-learn releases reject a pandas Index here.
plt.figure(figsize=(13, 5))
plot_tree(model_tree2, feature_names=list(X_train.columns), class_names=['not healthy', 'yes healthy'],
          filled=True, fontsize=10, impurity=False);

In [None]:
# Final model 2: the same logistic-regression grid search, fitted on every
# column of X_train instead of the nutrition columns only.
final_model2 = make_pipeline(LogisticRegression(max_iter=1000, solver='liblinear'))

param_grid = {
    'logisticregression__C': [0.1, 1, 10, 100],
    'logisticregression__penalty': ['l1', 'l2'],
}

grid_search = GridSearchCV(final_model2, param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Report the winning hyperparameters and their cross-validated score.
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

In [None]:
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import QuantileTransformer

In [None]:
# Final model 3: logistic regression on preprocessed features.
# Nutrition columns and `minutes` go through a QuantileTransformer; the
# remaining columns are standardised.
col_transformer = make_column_transformer(
        (QuantileTransformer(), ['minutes', 'calories', 'total fat', 'sugar', 'sodium', 'protein', 'saturated fat', 'carbohydrates']),
        (StandardScaler(), ['year', 'avg_rating', 'n_ingredients', 'n_steps', 'n_rating']),
    )

final_model3 = make_pipeline(col_transformer, LogisticRegression(max_iter=1000, solver='liblinear'))

param_grid = {
    'logisticregression__C': [0.1, 1, 10, 100],
    'logisticregression__penalty': ['l1', 'l2']
}

# Search over final_model3 (the pipeline with the column transformer).
# The earlier code passed final_model2 here, so the preprocessing was never
# evaluated.
grid_search = GridSearchCV(final_model3, param_grid, cv=5)

# Fit the model
grid_search.fit(X_train, y_train)

# Print best parameters and best score
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

In [None]:

from sklearn.tree import DecisionTreeClassifier

# Depth-3 decision tree fitted on all training features; `fit` returns the
# estimator, so the cell still displays it.
tree_params = {'max_depth': 3}
model_tree = DecisionTreeClassifier(**tree_params)
model_tree.fit(X_train, y_train)

In [None]:
# Accuracy of the decision tree on the held-out split.
tree_test_accuracy = model_tree.score(X_test, y_test)
tree_test_accuracy

In [None]:
import matplotlib.pyplot as plt  # imported here so the cell does not rely on a star import for `plt`
from sklearn.tree import plot_tree

# Draw the fitted depth-3 tree. feature_names is passed as a plain list
# because some scikit-learn releases reject a pandas Index here.
plt.figure(figsize=(13, 5))
plot_tree(model_tree, feature_names=list(X_train.columns), class_names=['not healthy', 'yes healthy'],
          filled=True, fontsize=10, impurity=False);

In [None]:
def show_decision_boundary(model, X_train, y_train, title='', class_labels=('not healthy', 'healthy')):
    """Plot a fitted classifier's decision regions over a two-feature training set.

    The regions predicted by ``model`` are shaded, and the training points are
    overlaid: class 0 in orange, class 1 in blue.

    Parameters
    ----------
    model : fitted estimator
        Classifier trained on exactly the two columns of ``X_train``.
    X_train : pandas.DataFrame
        Training features; must have exactly two columns. The first is drawn
        on the x-axis and the second on the y-axis.
    y_train : pandas.Series
        Binary labels (0 / 1) aligned with ``X_train``.
    title : str, optional
        Figure title.
    class_labels : tuple of str, optional
        Legend labels for class 0 and class 1, in that order.

    Raises
    ------
    ValueError
        If ``X_train`` does not have exactly two columns.
    """
    # Check the shape first so the caller gets a clear message instead of a
    # failure deep inside the plotting code.
    if X_train.shape[1] != 2:
        raise ValueError(
            f"show_decision_boundary needs exactly 2 feature columns, got {X_train.shape[1]}"
        )

    import matplotlib.colors
    import matplotlib.pyplot as plt
    from sklearn.inspection import DecisionBoundaryDisplay

    point_colors = ("orange", "blue")
    cmap = matplotlib.colors.ListedColormap(list(point_colors))

    disp = DecisionBoundaryDisplay.from_estimator(
        model, X_train, response_method='predict', cmap=cmap, grid_resolution=400,
        alpha=0.5,
    )

    # Scatter the columns the model was actually trained on, rather than
    # hard-coded names that do not exist in this dataset.
    x_col, y_col = X_train.columns
    for cls, color, label in zip((0, 1), point_colors, class_labels):
        in_class = y_train == cls
        disp.ax_.scatter(
            X_train.loc[in_class, x_col], X_train.loc[in_class, y_col],
            color=color, s=25, label=label,
        )
    plt.title(title, fontsize=20)
    plt.legend()
