# Recipe Star Prediction 

**Name(s)**: Sophia Papadopoulos and Leo Udell

**Website Link**: (your website link)

In [None]:
import pandas as pd
import numpy as np
import requests
import re
from bs4 import BeautifulSoup

import plotly.express as px
pd.options.plotting.backend = 'plotly'

from lec_utils import * # Feel free to uncomment and use this. It'll make your plotly graphs look like ours in lecture!

## Step 1: Introduction

In [None]:
# Read RAW_recipes.csv into a DataFrame; the trailing expression displays
# its (rows, columns) shape.
recipes = pd.read_csv('RAW_recipes.csv')
recipes.shape

In [None]:
# Read RAW_interactions.csv into a DataFrame; the trailing expression
# displays its (rows, columns) shape.
ratings = pd.read_csv('RAW_interactions.csv')
ratings.shape

## Step 2: Data Cleaning and Exploratory Data Analysis

In [None]:
# Join every recipe row (keyed by `id`) to its matching interaction rows
# (keyed by `recipe_id`); pandas' default join type is an inner join.
merged_unclean = pd.merge(
    recipes,
    ratings,
    left_on='id',
    right_on='recipe_id',
)

In [None]:
# Narrow the merged table to the identifier/name columns plus the columns
# used for analysis and modeling, then preview five random rows.
merged = merged_unclean.loc[
    :,
    ['id', 'contributor_id', 'name', 'minutes', 'n_steps', 'n_ingredients', 'rating'],
]
merged.sample(n=5)

In [None]:
# Count how many rows carry each distinct rating value.
merged['rating'].value_counts()

In [None]:
# Show the rows with the largest `minutes` values (head() defaults to 5 rows).
merged.sort_values('minutes', ascending=False).head()

**Univariate Analysis**

In [None]:
# Bar chart of how often each rating value occurs.
merged['rating'].value_counts().plot.bar(title='Frequency of Ratings')

In [None]:
# Keep every row whose rating is not exactly 0.
merged_non_zero = merged.loc[merged['rating'].ne(0)]

**Bivariate Analysis**

In [None]:
# Scatter of cooking time against step count, coloured by rating.
# Assuming `rating` is numeric (it is used as a regression target later in
# this notebook), plotly express draws a continuous colorbar rather than a
# discrete legend, so a layout-level `legend_title` never reaches it.
# `labels` renames the axis titles, the colour title, and the hover fields
# in one place, and works for both a colorbar and a legend.
fig = px.scatter(
    merged_non_zero,
    x='minutes',
    y='n_steps',
    hover_name='name',
    color='rating',
    title='Minutes vs. Number of Steps',
    labels={
        'minutes': 'Minutes',
        'n_steps': 'Number of Steps',
        'rating': 'Rating',
    },
)
fig.show()

**Interesting Aggregates**

## Step 3: Framing a Prediction Problem

Given the "complexity" of a recipe, what is the average rating for that recipe? 
We define a recipe's "complexity" using three features: the number of ingredients, the number of minutes 
the recipe takes to complete, and the number of steps in the recipe.

## Step 4: Baseline Model

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.metrics import mean_squared_error

In [None]:
# Baseline: plain linear regression on the three raw "complexity" columns.
# NOTE: the MSE below is computed on the same rows the model was fit on.
model_simple = LinearRegression().fit(
    merged_non_zero[['n_ingredients', 'minutes', 'n_steps']],
    merged_non_zero['rating'],
)
simple_mse = mean_squared_error(
    y_true=merged_non_zero['rating'],
    y_pred=model_simple.predict(merged_non_zero[['n_ingredients', 'minutes', 'n_steps']]),
)
simple_mse

## Step 5: Final Model

In [None]:
# TODO

In [None]:
def create_model_standardized(df):
    """Fit a standardize-then-regress pipeline that predicts ``rating``.

    Args:
        df: DataFrame containing the ``n_ingredients``, ``minutes``,
            ``n_steps`` and ``rating`` columns.

    Returns:
        The fitted pipeline: a ``StandardScaler`` step followed by a
        ``LinearRegression`` step.
    """
    feature_columns = ['n_ingredients', 'minutes', 'n_steps']

    # The pipeline fits its scaler step as part of ``fit``, so the scaler is
    # handed over unfitted. Fitting and transforming it beforehand would only
    # repeat work and discard the transformed result.
    pipeline_model = make_pipeline(StandardScaler(), LinearRegression())
    pipeline_model.fit(df[feature_columns], df['rating'])
    return pipeline_model

# Fit the standardized pipeline on the non-zero-rating rows, then display the
# coefficients of its LinearRegression step ('linearregression' is the step
# name make_pipeline derives from the class name).
model = create_model_standardized(merged_non_zero)
model.named_steps['linearregression'].coef_