# Rate The Review

**Name(s)**: Chris Chen, Ishaan Tibdewal

**Website Link**: https://chrisc1124.github.io/Rate-The-Review/

In [None]:
import pandas as pd
import numpy as np
import ast
from pathlib import Path

import plotly.express as px
pd.options.plotting.backend = 'plotly'

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report, mean_squared_error
from math import sqrt

from dsc80_utils import *  # Feel free to uncomment and use this.


## Step 1: Introduction

In [None]:
# Read both raw CSV files from the local ``data`` directory.
data_dir = Path('data')
recipes, interactions = (
    pd.read_csv(data_dir / csv_name)
    for csv_name in ('RAW_recipes.csv', 'RAW_interactions.csv')
)


In [None]:
# Display the raw recipes DataFrame.
recipes

In [None]:
# List the column names of the recipes DataFrame.
recipes.columns

In [None]:
# Display the raw interactions DataFrame.
interactions

In [None]:
# Left-join interactions onto recipes so every recipe row is kept,
# including recipes that have no matching interaction.
recipes_interactions = pd.merge(
    recipes,
    interactions,
    how='left',
    left_on='id',
    right_on='recipe_id',
)
# A rating of 0 is converted to NaN so it is not counted as a real score.
rating_without_zeros = recipes_interactions['rating'].replace(0, np.nan)
recipes_interactions['rating'] = rating_without_zeros
recipes_interactions


In [None]:
# Mean rating per recipe id.
avg_rating = (
    recipes_interactions['rating']
    .groupby(recipes_interactions['id'])
    .mean()
)
avg_rating.head()


In [None]:
# Attach each recipe's mean rating to all of its rows by looking up the
# `id` column in the index of the per-recipe averages.
recipes_interactions = recipes_interactions.join(
    avg_rating.rename("avg_rating"),
    on="id",
    how="left",
)

Ratings of 0 in the raw interactions denote missing feedback. Replacing them with `NaN` prevents them from pulling recipe averages downward and keeps the mean focused on true ratings.

## Step 2: Data Cleaning and Exploratory Data Analysis

In [None]:
# Preview the first rows of the merged recipes/interactions DataFrame.
recipes_interactions.head()

In [None]:
def process_nutrition(nutr):
    """Parse a bracketed, comma-separated string of numbers into a list of floats.

    Example: ``'[1.0, 2, 3.5]'`` -> ``[1.0, 2.0, 3.5]``.
    """
    inner = nutr.strip('[]')
    return list(map(float, inner.split(',')))

In [None]:
# Convert every raw nutrition string into a list of floats.
parsed_nutrition = recipes_interactions['nutrition'].map(process_nutrition)
recipes_interactions['nutrition'] = parsed_nutrition

In [None]:
# Expand the per-row nutrition lists into one named column per nutrition value
# so each one can be analysed on its own.
nutrition_names = [
    'calories(#)',
    'total fat(pdv)',
    'sugar(pdv)',
    'sodium(pdv)',
    'protein(pdv)',
    'saturated fat(pdv)',
    'carbohydrates(pdv)',
]
nutrition_df = pd.DataFrame(
    recipes_interactions['nutrition'].tolist(),
    columns=nutrition_names,
)
recipes_interactions = recipes_interactions.join(nutrition_df)
recipes_interactions

In [None]:
# Inspect the values of the `date` column.
recipes_interactions['date']

In [None]:
# Frequency of each distinct value in the `minutes` column.
recipes_interactions['minutes'].value_counts()

In [None]:
# Sort `minutes` from largest to smallest to surface the most extreme values first.
recipes_interactions['minutes'].sort_values(ascending=False)

In [None]:
# Count how many rows carry each rating value.
recipes_interactions['rating'].value_counts()

In [None]:
# Mean total fat (PDV) for each rating value. The variable is named after the
# column it summarises so it is not mistaken for the sodium series.
avg_total_fat = recipes_interactions.groupby("rating")["total fat(pdv)"].mean()
avg_total_fat

In [None]:
# Line chart of the mean sodium (PDV) observed at each rating value.
avg_sodium = (
    recipes_interactions["sodium(pdv)"]
    .groupby(recipes_interactions["rating"])
    .mean()
)

sodium_fig, sodium_ax = plt.subplots(figsize=(10, 6))
sodium_ax.plot(avg_sodium.index, avg_sodium.values, marker='o', linewidth=2)

sodium_ax.set_xlabel("Rating")
sodium_ax.set_ylabel("Average Sodium (PDV)")
sodium_ax.set_title("Average Sodium Content vs. Rating")
sodium_ax.grid(True)
plt.show()

In [None]:
# Mean sodium (PDV) at each integer rating, shown as a bar chart.
sodium_by_rating = (
    recipes_interactions
    .dropna(subset=["rating"])
    .assign(rating_int=lambda df: df["rating"].astype(int))  # ratings are discrete
    .groupby("rating_int")["sodium(pdv)"]
    .mean()
    .reset_index()
)

fig = px.bar(
    sodium_by_rating,
    x="rating_int",
    y="sodium(pdv)",
    # Keys must match the DataFrame column names exactly for the axis labels to apply.
    labels={"rating_int": "Rating", "sodium(pdv)": "Average sodium (% DV)"},
    title="Average sodium by rating",
)
fig.show()

In [None]:
# Mean carbohydrates (PDV) at each integer rating, shown as a bar chart.
rated_rows = recipes_interactions.dropna(subset=["rating"])
carbs_rating_key = rated_rows["rating"].astype(int).rename("rating_int")  # ratings are discrete
carbs_by_rating = (
    rated_rows
    .groupby(carbs_rating_key)["carbohydrates(pdv)"]
    .mean()
    .reset_index()
)

fig = px.bar(
    carbs_by_rating,
    title="Average carbs by rating",
    x="rating_int",
    y="carbohydrates(pdv)",
)
fig.show()

In [None]:
# Mean sugar (PDV) at each integer rating, shown as a bar chart.
sugar_source = recipes_interactions.dropna(subset=["rating"])
sugar_source = sugar_source.assign(rating_int=sugar_source["rating"].astype(int))  # ratings are discrete
sugar_means = sugar_source.groupby("rating_int")["sugar(pdv)"].mean()
sugar_by_rating = sugar_means.reset_index()

fig = px.bar(
    sugar_by_rating,
    title="Average sugar by rating",
    x="rating_int",
    y="sugar(pdv)",
)
fig.show()

In [None]:
# Mean protein (PDV) at each integer rating, shown as a bar chart.
protein_source = recipes_interactions.dropna(subset=["rating"])
protein_rating_key = protein_source["rating"].astype(int).rename("rating_int")  # ratings are discrete
protein_by_rating = (
    protein_source["protein(pdv)"]
    .groupby(protein_rating_key)
    .mean()
    .reset_index()
)

fig = px.bar(
    protein_by_rating,
    title="Average protein by rating",
    x="rating_int",
    y="protein(pdv)",
)
fig.show()

In [None]:
# Sort total fat (PDV) in ascending order to inspect the range of values.
recipes_interactions['total fat(pdv)'].sort_values()

In [None]:
# Inspect the single row at positional index 56057.
recipes_interactions.iloc[56057]

In [None]:
# Keep only rows with at most 600 minutes so extreme preparation times do not
# dominate the per-rating means; the averages are computed on this filtered frame.
time_filtered = recipes_interactions[recipes_interactions['minutes'] <= 600]
minutes_by_rating = (
    time_filtered
    .dropna(subset=["rating"])
    .assign(rating_int=lambda df: df["rating"].astype(int))  # ratings are discrete
    .groupby("rating_int")["minutes"]
    .mean()
    .reset_index()
)

fig = px.bar(
    minutes_by_rating,
    x="rating_int",
    y="minutes",
    title="Average minutes by rating"
)
fig.show()

In [None]:
# Scatter of total fat against calories with an OLS trendline,
# followed by the 2x2 correlation matrix of the two columns.
x_col, y_col = "total fat(pdv)", "calories(#)"
fig_fat = px.scatter(
    recipes_interactions,
    x=x_col,
    y=y_col,
    title="Total Fat vs Calories",
    range_x=[0, 500],
    range_y=[0, 5000],
    opacity=0.4,
    trendline='ols',
    trendline_color_override='red',
)
fig_fat.show()
fat_calorie_corr = np.corrcoef(recipes_interactions[x_col], recipes_interactions[y_col])
print(fat_calorie_corr)

In [None]:
# Scatter of protein against calories with an OLS trendline,
# followed by the 2x2 correlation matrix of the two columns.
fig_prot = px.scatter(
    recipes_interactions,
    x="protein(pdv)",
    y="calories(#)",
    title="Protein vs Calories",  # title matches the plotted axes
    opacity=0.4,
    range_x=[0, 500],
    range_y=[0, 5000],
    trendline='ols',
    trendline_color_override='red'
)
fig_prot.show()
print(np.corrcoef(recipes_interactions['protein(pdv)'], recipes_interactions['calories(#)']))

In [None]:
# Scatter of sugar against calories with an OLS trendline,
# followed by the 2x2 correlation matrix of the two columns.
sugar_scatter_kwargs = dict(
    x="sugar(pdv)",
    y="calories(#)",
    title="Sugar vs Calories",
    opacity=0.4,
    range_x=[0, 500],
    range_y=[0, 5000],
    trendline='ols',
    trendline_color_override='red',
)
fig_sug = px.scatter(recipes_interactions, **sugar_scatter_kwargs)
fig_sug.show()
sugar_values = recipes_interactions['sugar(pdv)']
calorie_values = recipes_interactions['calories(#)']
print(np.corrcoef(sugar_values, calorie_values))

In [None]:
# Scatter of carbohydrates against calories with an OLS trendline,
# followed by the 2x2 correlation matrix of the two columns.
fig_fat = px.scatter(
    recipes_interactions,
    x="carbohydrates(pdv)",
    y="calories(#)",
    title="Carbohydrates vs Calories",  # title matches the plotted axes
    opacity=0.4,
    range_x=[0, 500],
    range_y=[0, 5000],
    trendline='ols',
    trendline_color_override='red'
)
fig_fat.show()
print(np.corrcoef(recipes_interactions['carbohydrates(pdv)'], recipes_interactions['calories(#)']))

In [None]:
# Scatter of protein against calories with an OLS trendline,
# followed by the 2x2 correlation matrix of the two columns.
protein_axis, calorie_axis = "protein(pdv)", "calories(#)"
fig_fat = px.scatter(
    recipes_interactions,
    title="Protein vs Calories",
    x=protein_axis,
    y=calorie_axis,
    range_x=[0, 500],
    range_y=[0, 5000],
    opacity=0.4,
    trendline='ols',
    trendline_color_override='red',
)
fig_fat.show()
print(np.corrcoef(recipes_interactions[protein_axis], recipes_interactions[calorie_axis]))

In [None]:
# Scatter of sugar against total fat with an OLS trendline,
# followed by the 2x2 correlation matrix of the two columns.
pdv_axis_range = [0, 500]
fig_sug = px.scatter(
    recipes_interactions,
    title="Sugar vs fat",
    x="sugar(pdv)",
    y="total fat(pdv)",
    range_x=list(pdv_axis_range),
    range_y=list(pdv_axis_range),
    opacity=0.4,
    trendline='ols',
    trendline_color_override='red',
)
fig_sug.show()
sugar_fat_corr = np.corrcoef(
    recipes_interactions['sugar(pdv)'],
    recipes_interactions['total fat(pdv)'],
)
print(sugar_fat_corr)

In [None]:
# Scatter of sodium against total fat with an OLS trendline,
# followed by the 2x2 correlation matrix of the two columns.
fig_sug = px.scatter(
    recipes_interactions,
    x="sodium(pdv)",
    y="total fat(pdv)",
    title="Sodium vs Fat",  # title matches the plotted axes
    opacity=0.4,
    range_x=[0, 500],
    range_y=[0, 500],
    trendline='ols',
    trendline_color_override='red'
)
fig_sug.show()
print(np.corrcoef(recipes_interactions['sodium(pdv)'], recipes_interactions['total fat(pdv)']))

In [None]:
# Character count of each review's text, stored as a new column.
review_text = recipes_interactions['review']
recipes_interactions['review_length'] = review_text.str.len()
recipes_interactions.head()

In [None]:
# Print the first rows of a few key columns as a Markdown table.
preview_cols = ['name', 'id', 'minutes', 'rating', 'calories(#)', 'review_length']
preview_rows = recipes_interactions[preview_cols].head()
print(preview_rows.to_markdown(index=False))

In [None]:
# List every column name of the merged DataFrame as a plain Python list.
recipes_interactions.columns.to_list()

In [None]:
# Univariate analysis: histogram of review lengths with mean and median markers.
review_lengths = recipes_interactions['review_length'].dropna()

fig = px.histogram(
    x=review_lengths,
    nbins=100,
    title='Distribution of Review Lengths (Characters)',
    labels={'x': 'Review Length (characters)', 'count': 'Frequency'},
    opacity=0.7
)

review_mean = review_lengths.mean()
review_median = review_lengths.median()

fig.add_vline(
    x=review_mean,
    line_dash="dash",
    line_color="red",
    line_width=2,
    annotation_text=f"Mean: {review_mean:.0f}",
    annotation_position="top right"
)

fig.add_vline(
    x=review_median,
    line_dash="dash",
    line_color="blue",
    line_width=2,
    annotation_text=f"Median: {review_median:.0f}",
    annotation_position="top left"
)

# Limit the visible x-axis to 1.3x the 99th percentile so the long right tail
# does not squash the bulk of the distribution (no rows are removed).
upper_bound = review_lengths.quantile(0.99)
fig.update_xaxes(range=[0, upper_bound * 1.3])
fig.show()
fig.write_html('rate-the-review/assets/review_len_dist.html', include_plotlyjs='cdn')

print("Review length statistics:")
print(f"  Mean: {review_mean:.2f} characters")
print(f"  Median: {review_median:.2f} characters")
print(f"  Min: {review_lengths.min():.0f} characters")
print(f"  Max: {review_lengths.max():.0f} characters")
# upper_bound is the 0.99 quantile, so it is reported as the 99th percentile.
print(f"  99th percentile: {upper_bound:.0f} characters")



In [None]:
# Bivariate analysis: how review length is distributed at each rating.
review_rating_data = recipes_interactions.dropna(subset=['review_length', 'rating']).copy()
review_rating_data['rating_int'] = review_rating_data['rating'].astype(int)

# The violin plot only uses reviews at or below the 99th percentile of length.
upper_bound = review_rating_data['review_length'].quantile(0.99)
within_bound = review_rating_data['review_length'] <= upper_bound
review_rating_data_filtered = review_rating_data[within_bound]

violin_labels = {'rating_int': 'Rating', 'review_length': 'Review Length (characters)'}
fig = px.violin(
    review_rating_data_filtered,
    x='rating_int',
    y='review_length',
    color='rating_int',
    title='Distribution of Review Lengths by Rating',
    labels=violin_labels,
    color_discrete_sequence=px.colors.qualitative.Set2,
    box=True,
    points=False,
)
fig.update_layout(showlegend=False, width=800, height=500)
fig.show()
fig.write_html('rate-the-review/assets/review_length_by_rating_violin.html', include_plotlyjs='cdn')

# Per-rating summary computed on the unfiltered data.
length_per_rating = review_rating_data.groupby('rating_int')['review_length']
avg_review_length_by_rating = length_per_rating.agg(['mean', 'median', 'count']).reset_index()

bar_labels = {'rating_int': 'Rating', 'mean': 'Average Review Length (characters)'}
fig_bar = px.bar(
    avg_review_length_by_rating,
    x='rating_int',
    y='mean',
    text='mean',
    color='rating_int',
    title='Average Review Length by Rating',
    labels=bar_labels,
    color_discrete_sequence=px.colors.qualitative.Set2,
)
fig_bar.update_traces(texttemplate='%{text:.0f}', textposition='outside')
fig_bar.update_layout(showlegend=False, width=700, height=500)
fig_bar.show()
fig_bar.write_html('rate-the-review/assets/avg_review_length_by_rating.html', include_plotlyjs='cdn')

print("\nAverage review length by rating:")
print(avg_review_length_by_rating)


In [None]:
# Aggregate analysis: recipe and rating averages per review-length bucket.
agg_columns = ['review_length', 'rating', 'n_steps', 'n_ingredients', 'calories(#)', 'minutes']
review_agg_data = recipes_interactions.dropna(subset=agg_columns).copy()

# Bucket reviews by character count using the edges 0 / 100 / 250 / 500 / infinity.
length_edges = [0, 100, 250, 500, float('inf')]
length_names = ['Short', 'Medium', 'Long', 'Very Long']
review_agg_data['review_category'] = pd.cut(
    review_agg_data['review_length'],
    bins=length_edges,
    labels=length_names,
)

per_category = review_agg_data.groupby('review_category').agg({
    'rating': ['mean', 'count'],
    'n_steps': 'mean',
    'n_ingredients': 'mean',
    'calories(#)': 'mean',
    'minutes': 'mean'
})
review_aggregate = per_category.round(2)

# Flatten the two-level column header into readable names.
review_aggregate.columns = ['Average Rating', 'Count', 'Avg Steps', 'Avg Ingredients', 'Avg Calories', 'Avg Cooking Time (min)']
review_aggregate = review_aggregate.reset_index()

print("Average Rating, Steps, Ingredients, Calories, and Cooking Time by Review Length:")
print(review_aggregate.to_markdown(index=False))


In [None]:
# Summary statistics of review length within each integer rating.
review_rating_analysis = recipes_interactions.dropna(subset=['review_length', 'rating']).copy()
review_rating_analysis['rating_int'] = review_rating_analysis['rating'].astype(int)

summary_funcs = ['count', 'mean', 'median', 'min', 'max']
length_grouped = review_rating_analysis.groupby('rating_int')['review_length']
rating_grouped_stats = length_grouped.agg(summary_funcs).round(2)
rating_grouped_stats.columns = ['Count', 'Mean Review Length', 'Median Review Length', 'Min Review Length', 'Max Review Length']
rating_grouped_stats = rating_grouped_stats.reset_index()

print("Review Length Statistics by Rating:")
rating_grouped_stats

## Step 3: Assessment of Missingness

## NMAR in our Dataset

In [None]:
# Among rows whose rating is missing, read the review text of the row at position 11.
recipes_interactions[recipes_interactions['rating'].isna()][['rating', 'review']].iloc[11].loc['review']

In [None]:
# Number of missing values per column, most-missing first.
recipes_interactions.isna().sum().sort_values(ascending=False)

In [None]:
# Inspect the `description` column.
recipes_interactions['description']

In [None]:
# Distribution of `minutes` among rows whose description is missing.
recipes_interactions[recipes_interactions['description'].isna()]['minutes'].value_counts()

In [None]:
# Distribution of `n_steps` over all rows.
recipes_interactions['n_steps'].value_counts()

In [None]:
# Distribution of `n_ingredients` over all rows.
recipes_interactions['n_ingredients'].value_counts()

In [None]:
# Distribution of `n_steps` among rows whose description is missing.
recipes_interactions[recipes_interactions['description'].isna()]['n_steps'].value_counts()

In [None]:
# Distribution of `n_ingredients` among rows whose description is missing.
recipes_interactions[recipes_interactions['description'].isna()]['n_ingredients'].value_counts()

##### We believe that the missingness of the `description` column in our recipes + interactions dataframe is most likely NMAR, because only authors who actually care about the recipe and about making it look better on the site will leave a description. Authors who care less about their recipe, or whose recipe is simple and straightforward, may decide that a description is not a priority and leave it out.

## MAR Dependency test

##### In our missingness dependency test, we examine the missingness of the `description` column by testing whether it depends on other columns in our dataframe. Specifically, we look at whether the missingness of `description` depends on `n_steps`, the number of steps in the recipe, and on `n_ingredients`, the number of ingredients in the recipe.

In [None]:
# Collapse the review-level frame to one row per recipe id so each recipe is
# counted once in the missingness tests.
# NOTE(review): GroupBy.first() skips NA by default, so each column holds the
# first non-null value within the recipe rather than strictly the first row's
# value — confirm this is the intended behavior for `description`.
recipe_info = recipes_interactions.groupby('id').first()
recipe_info

##### MAR test for missingness of `description` dependent on `n_steps`

- Null Hypothesis: Missingness of description does not depend on n_steps
- Alternate Hypothesis: Missingness of description does depend on n_steps
- Test Statistic: difference of mean n_steps with missing desc and mean n_steps without missing desc
- Significance Level: 0.05

In [None]:
# Permutation test: does the missingness of `description` depend on `n_steps`?
mask_desc = recipe_info["description"].isna()
df = recipe_info.copy()


def diff_mean_steps(mask):
    """Return mean n_steps where `mask` is True minus mean n_steps where it is False.

    `mask` is a boolean Series or array with one entry per row of `df`.
    """
    g = df.groupby(mask)["n_steps"].mean()
    return g.loc[True] - g.loc[False]


# Old name kept as an alias so any code still calling it keeps working;
# the statistic has always been a difference of means, not medians.
diff_median_steps = diff_mean_steps

T_obs = diff_mean_steps(mask_desc)

# Null distribution: shuffle the missingness labels and recompute the statistic.
# Stored as an array so the vectorised comparison below is explicit.
stats = np.array([
    diff_mean_steps(np.random.permutation(mask_desc.to_numpy()))
    for _ in range(2000)
])

# The alternative hypothesis is two-sided ("does depend"), so count simulated
# differences that are at least as extreme as the observed one in either direction.
p_val = (np.abs(stats) >= np.abs(T_obs)).mean()
print(f'T_obs: {T_obs}, p_val: {p_val}')
# State the conclusion that the p-value supports at the 0.05 significance level.
if p_val < 0.05:
    print('we reject the null hypothesis')
else:
    print('we fail to reject the null hypothesis')

In [None]:
# KDE of n_steps for recipes with a missing description vs. a present one.
df = recipe_info.copy()
description_missing = df["description"].isna()
df["missing_desc"] = description_missing.map({True: "Missing", False: "Present"})

group_labels = ["Missing", "Present"]
curves = [
    df.loc[df["missing_desc"] == group_name, "n_steps"].dropna()
    for group_name in group_labels
]

fig_kde = ff.create_distplot(
    curves,
    group_labels=group_labels,
    curve_type="kde",
    show_hist=False,
    show_rug=False,
)
fig_kde.update_layout(
    title="n_steps by Description Missingness",
    xaxis_title="# Steps",
    yaxis_title="density",
)
fig_kde.show()
fig_kde.write_html('rate-the-review/assets/n_steps_missingness_kde.html', include_plotlyjs='cdn')

In [None]:
# Histogram of the simulated statistics with the observed statistic marked.
fig_perm = px.histogram(
    stats,
    nbins=40,
    histnorm="probability",
    title="Null Distribution of mean n_steps difference (Missing - Present)",
    labels={"value": " Mean Difference", "count": "probability"},
)
# Draw the observed statistic on the same scale as the simulated ones, so the
# marker sits where T_obs actually falls in this distribution.
fig_perm.add_vline(x=T_obs, line_color="red", line_width=2, annotation_text="observed", annotation_position="top right")
fig_perm.show()
fig_perm.write_html('rate-the-review/assets/n_steps_permutation_test.html', include_plotlyjs='cdn')

##### MAR test for missingness of `description` dependent on `n_ingredients`
- Null Hypothesis: missingness of description does not depend on n_ingredients
- Alternate Hypothesis: missingness of description does depend on n_ingredients
- Test Statistic: absolute difference of mean n_ingredients (missing desc) and mean n_ingredients (not missing desc)
- Significance Level: 0.05

In [None]:
# Permutation test: does the missingness of `description` depend on `n_ingredients`?
mask_desc = recipe_info["description"].isna()


def diff_mean_ingredients(mask):
    """Return |mean n_ingredients where `mask` is True - mean where it is False|.

    `mask` is a boolean Series or array with one entry per row of `recipe_info`.
    """
    g = recipe_info.groupby(mask)["n_ingredients"].mean()
    return np.abs(g.loc[True] - g.loc[False])


T_obs_ing = diff_mean_ingredients(mask_desc)

# Null distribution: shuffle the missingness labels and recompute the statistic.
stats_ing = np.array([
    diff_mean_ingredients(np.random.permutation(mask_desc.to_numpy()))
    for _ in range(2000)
])
p_val_ing = (stats_ing >= T_obs_ing).mean()
print(f'T_obs: {T_obs_ing}, p_val: {p_val_ing}')
# State the conclusion that the p-value supports at the 0.05 significance level.
if p_val_ing < 0.05:
    print('we reject the null hypothesis')
else:
    print('we fail to reject the null hypothesis')


In [None]:
# KDE of n_ingredients for recipes with a missing description vs. a present one.
ingredients_when_missing = recipe_info.loc[mask_desc, "n_ingredients"].dropna()
ingredients_when_present = recipe_info.loc[~mask_desc, "n_ingredients"].dropna()
curves = [ingredients_when_missing, ingredients_when_present]

fig_kde_ing = ff.create_distplot(
    curves,
    group_labels=["Missing", "Present"],
    curve_type="kde",
    show_hist=False,
    show_rug=False,
)
fig_kde_ing.update_layout(
    title="n_ingredients by Description Missingness",
    xaxis_title="# Ingredients",
    yaxis_title="density",
)
fig_kde_ing.show()
fig_kde_ing.write_html('rate-the-review/assets/n_ingredients_missingness_kde.html', include_plotlyjs='cdn')


In [None]:
# Histogram of the simulated absolute mean differences with the observed value marked.
fig_perm_ing = px.histogram(
    stats_ing,
    nbins=40,
    histnorm="probability",
    title="Distribution of |mean difference| of n_ingredients (missing - present)",
    # The test statistic is a difference of means, so the axis is labelled accordingly.
    labels={"value": "Absolute Mean Difference", "count": "probability"},
)
fig_perm_ing.add_vline(
    x=T_obs_ing,
    line_color="red",
    line_width=2,
    annotation_text="observed",
    annotation_position="top right",
)
fig_perm_ing.show()
fig_perm_ing.write_html('rate-the-review/assets/n_ingredients_permutation_test.html', include_plotlyjs='cdn')


## Step 4: Hypothesis Testing

In [None]:
# Preview the `review` column for the first rows.
recipes_interactions.head()[['review']]

- Null Hypothesis: Long and short reviews receive the same ratings on average
- Alternate Hypothesis: Long reviews receive lower ratings on average compared to short reviews
- Test Statistic: Difference in means of long reviews and short reviews
- Significance Level: 0.05

In [None]:
import plotly.graph_objects as go

# Permutation test: do long reviews receive lower ratings than short reviews?
recipes_interactions['review_length'] = recipes_interactions['review'].str.len()
review_analysis = recipes_interactions.dropna(subset=['review_length', 'rating']).copy()

# Classify reviews as long or short: "long" means strictly above the median length.
review_length_median = review_analysis['review_length'].median()
review_analysis['is_long_review'] = review_analysis['review_length'] > review_length_median
review_analysis['review_category'] = review_analysis['is_long_review'].map({True: 'Long Review', False: 'Short Review'})

# Observed statistic: mean rating of long reviews minus mean rating of short reviews.
long_review_ratings = review_analysis.loc[review_analysis['is_long_review'], 'rating']
short_review_ratings = review_analysis.loc[~review_analysis['is_long_review'], 'rating']
observed_diff = long_review_ratings.mean() - short_review_ratings.mean()

n_repetitions = 10000
np.random.seed(42)
ratings = review_analysis['rating'].values
# Extract the labels once instead of re-reading the Series on every iteration.
is_long = review_analysis['is_long_review'].to_numpy()
differences = []

for _ in range(n_repetitions):
    shuffled_labels = np.random.permutation(is_long)
    long_mean = ratings[shuffled_labels].mean()
    short_mean = ratings[~shuffled_labels].mean()
    differences.append(long_mean - short_mean)

differences = np.array(differences)
# One-sided test (long reviews rated lower): count simulated differences
# at or below the observed one.
p_value = np.mean(differences <= observed_diff)

# permutation test visualization
fig = go.Figure()
fig.add_trace(go.Histogram(
    x=differences,
    nbinsx=50,
    marker_color='mediumpurple',
    opacity=0.7,
    histnorm='probability density'
))
fig.add_vline(x=observed_diff, line_dash="dash", line_color="red", line_width=3,
              annotation_text=f"Observed: {observed_diff:.4f}", annotation_position="top right")
fig.add_vline(x=0, line_dash="dot", line_color="black", line_width=1)
fig.update_layout(
    title='Distribution of mean difference in ratings for Long Review vs Short Review',
    xaxis_title='Difference in Means (Long - Short)',
    yaxis_title='Density',
    width=800,
    height=500,
    showlegend=False
)
fig.show()
fig.write_html('rate-the-review/assets/hypothesis_test.html', include_plotlyjs='cdn')

print(f"P-value: {p_value:.4f}")
# State the conclusion that the p-value supports at the 0.05 significance level.
if p_value < 0.05:
    print('we reject the null hypothesis')
else:
    print('we fail to reject the null hypothesis')

## Step 5: Framing a Prediction Problem

Our prediction problem is to identify whether an individual user's **review will be highly rated** (rating >= 4) or **low rated** (rating < 4) based on the review text they wrote and recipe characteristics. This framing helps platforms understand how review content relates to user satisfaction and can assist in identifying patterns in how users express their experiences.

- **Target (response)**: `highly_rated`, where we label each individual review as 1 if the user's rating is at least 4.0 and 0 otherwise.
- **Prediction type**: Binary classification.
- **Features considered**: 
  - Primary feature: Review text (`review`) converted to TF-IDF vectors - captures what the user wrote in their review
  - Secondary feature: Recipe metadata (`n_steps`) - number of steps in the recipe
- **Unit of analysis**: Each row corresponds to an individual review (one user's rating and review text for a recipe). We only keep reviews that have both review text and a rating.
- **Practical value**: Understanding the relationship between review text content and ratings can help platforms identify helpful review patterns, improve recommendation systems, and better understand user sentiment expression.


In [None]:

# Modeling frame: only rows that have both review text and a rating.
required_columns = ['review', 'rating']
model_data = recipes_interactions.dropna(subset=required_columns).copy()

# Binary target: 1 when the individual rating is at least 4, otherwise 0.
model_data['highly_rated'] = (model_data['rating'] >= 4.0).astype(int)

target_counts = model_data['highly_rated'].value_counts()
target_shares = model_data['highly_rated'].value_counts(normalize=True)
rating_counts = model_data['rating'].value_counts().sort_index()

print(f"Total reviews with both review text and rating: {len(model_data):,}")
print(f"Unique recipes: {model_data['id'].nunique():,}")
print("\nClass distribution (1 = highly rated review, rating >= 4):")
print(target_counts)
print("\nClass proportions:")
print(target_shares)
print("\nRating distribution:")
print(rating_counts)


In [None]:
# Bar chart of how many reviews fall in each target class.
class_counts = model_data['highly_rated'].value_counts()
fig = px.bar(
    x=class_counts.index.astype(str),
    y=class_counts.values,
    title='Distribution of Target Variable',
    labels={'x': 'Highly Rated (1=Yes, 0=No)', 'y': 'Count'},
)
fig.show()
fig.write_html('rate-the-review/assets/target_distribution.html', include_plotlyjs='cdn')


In [None]:
# Stratified 80/20 train/test split. The index labels of each part are kept
# so later steps can rebuild exactly the same split.
target_column = 'highly_rated'
X = model_data.drop(columns=[target_column, 'avg_rating', 'rating'])  # features only: target and ratings removed
y = model_data[target_column]

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,  # keep the class proportions the same in both parts
    random_state=42,
)

train_indices, test_indices = X_train.index, X_test.index

print(f"Training set size: {len(X_train):,} reviews")
print(f"Test set size: {len(X_test):,} reviews")
print(f"Unique recipes in training: {X_train['id'].nunique():,}")
print(f"Unique recipes in test: {X_test['id'].nunique():,}")
print("\nTraining set class distribution:")
print(y_train.value_counts())
print("\nTest set class distribution:")
print(y_test.value_counts())


**Step 5 summary**

- Reviews missing either `review` text or `rating` are removed (only a small fraction).
- The binary target `highly_rated` equals 1 when the individual user's `rating >= 4.0` and 0 otherwise.
- A stratified 80/20 train/test split is created and the indices are saved for reuse in Steps 6 and 7 so the evaluation remains consistent.
- The resulting dataset is class-imbalanced (roughly 94% of reviews are highly rated), so later models will use techniques such as `class_weight='balanced'` to compensate.


## Step 6: Baseline Model

In [None]:
# Re-create the train/test split using the saved indices so every model uses identical data.
# `rating` is dropped together with the target and `avg_rating`: the target is derived
# directly from it, so it must never be available as a feature.
non_feature_columns = ['highly_rated', 'avg_rating', 'rating']
X_train = model_data.loc[train_indices].drop(columns=non_feature_columns)
X_test = model_data.loc[test_indices].drop(columns=non_feature_columns)
y_train = model_data.loc[train_indices, 'highly_rated']
y_test = model_data.loc[test_indices, 'highly_rated']

print(f"Training samples: {len(X_train):,} reviews")
print(f"Test samples: {len(X_test):,} reviews")
print("Features:")
print(" - review (text, TF-IDF) - what the user wrote in their review")
print(" - n_steps (numeric) - number of steps in the recipe")

In [None]:
# Baseline preprocessing: TF-IDF on the review text plus n_steps passed through as-is.
text_step = ('text', TfidfVectorizer(stop_words='english'), 'review')  # no max_features limit
numeric_step = ('numeric', 'passthrough', ['n_steps'])
baseline_preprocessor = ColumnTransformer([text_step, numeric_step])

# Logistic regression with balanced class weights, bundled with the preprocessing.
baseline_classifier = LogisticRegression(
    class_weight='balanced',
    max_iter=1000,
    random_state=42,
)
baseline_pipeline = Pipeline(
    steps=[
        ('preprocessor', baseline_preprocessor),
        ('classifier', baseline_classifier),
    ]
)

baseline_pipeline


In [None]:
# Fit the preprocessing steps and the classifier on the training split only.
print("Training baseline model...")
baseline_pipeline.fit(X_train, y_train)
print("Training complete!")


In [None]:
# Predict on both splits, then compute accuracy, F1 (weighted and macro) and RMSE.
y_train_pred = baseline_pipeline.predict(X_train)
y_test_pred = baseline_pipeline.predict(X_test)

# Training-split metrics.
train_accuracy = accuracy_score(y_train, y_train_pred)
train_f1_weighted = f1_score(y_train, y_train_pred, average='weighted')
train_f1_macro = f1_score(y_train, y_train_pred, average='macro')
train_rmse = sqrt(mean_squared_error(y_train, y_train_pred))

# Test-split metrics.
test_accuracy = accuracy_score(y_test, y_test_pred)
test_f1_weighted = f1_score(y_test, y_test_pred, average='weighted')
test_f1_macro = f1_score(y_test, y_test_pred, average='macro')
test_rmse = sqrt(mean_squared_error(y_test, y_test_pred))

report_lines = [
    "Baseline Model Performance",
    f"Train accuracy: {train_accuracy:.4f}",
    f"Test accuracy: {test_accuracy:.4f}",
    f"\nTrain F1 (weighted): {train_f1_weighted:.4f}",
    f"Test F1 (weighted): {test_f1_weighted:.4f}",
    f"Train F1 (macro): {train_f1_macro:.4f}",
    f"Test F1 (macro): {test_f1_macro:.4f}",
    f"\nTrain RMSE: {train_rmse:.4f}",
    f"Test RMSE: {test_rmse:.4f}",
]
print("\n".join(report_lines))


In [None]:
# Confusion matrix and per-class report for the baseline on the test split.
cm_test = confusion_matrix(y_test, y_test_pred)
print("Test confusion matrix:\n", cm_test)
class_names = ['Low-rated (0)', 'Highly-rated (1)']
baseline_report = classification_report(y_test, y_test_pred, target_names=class_names)
print(baseline_report)


In [None]:
# Heatmap of the test-set confusion matrix, with the counts written in each cell.
predicted_axis = ['Predicted low-rated', 'Predicted highly-rated']
actual_axis = ['Actual low-rated', 'Actual highly-rated']
confusion_heatmap = go.Heatmap(
    z=cm_test,
    x=predicted_axis,
    y=actual_axis,
    text=cm_test,
    texttemplate='%{text}',
    colorscale='Blues',
    colorbar={'title': 'Count'},
)
fig = go.Figure(data=confusion_heatmap)
fig.update_layout(title='Baseline Model Confusion Matrix (Test Set)', width=600, height=400)
fig.show()
fig.write_html('rate-the-review/assets/baseline_confusion_matrix.html', include_plotlyjs='cdn')


**Baseline model recap**

- **Features**: 
  - TF-IDF representations of `review` text (no max_features limit - uses all words) - what the user wrote in their review
  - Numeric `n_steps` column passed through unchanged - number of steps in the recipe
- **Target**: Binary classification - `highly_rated = 1` if user's rating >= 4, else 0
- **Estimator**: Logistic regression with `class_weight='balanced'` and `max_iter=1000` to cope with class imbalance.
- **Pipeline**: Uses a single `ColumnTransformer` + `Pipeline` so preprocessing and modeling are bundled together.
- **Performance**: This provides a concrete baseline for the final model to beat.



## Step 7: Final Model

In [None]:
# Re-create the train/test split using the saved indices so every model uses identical data.
# `rating` is dropped together with the target and `avg_rating`: the target is derived
# directly from it, so it must never be available as a feature.
non_feature_columns = ['highly_rated', 'avg_rating', 'rating']
X_train = model_data.loc[train_indices].drop(columns=non_feature_columns)
X_test = model_data.loc[test_indices].drop(columns=non_feature_columns)
y_train = model_data.loc[train_indices, 'highly_rated']
y_test = model_data.loc[test_indices, 'highly_rated']

print(f"Training samples: {len(X_train):,} reviews")
print(f"Test samples: {len(X_test):,} reviews")
print("\nFinal Model Features:")
print(" - review (text, TF-IDF) - what the user wrote in their review")
print(" - n_steps (numeric, passthrough) - number of steps in the recipe (from baseline)")
print(" - calories(#) (numeric, StandardScaler) - NEW: calories scaled")
print(" - minutes (numeric, QuantileTransformer) - NEW: cooking time transformed")

### Hyperparameters to Tune

Before tuning, we document which hyperparameters we plan to tune and why:

1. **LogisticRegression C (regularization strength)**
   - Why: Controls the trade-off between fitting the training data and preventing overfitting.
     - Smaller C (e.g., 0.01, 0.1) = stronger regularization = simpler model, less overfitting
     - Larger C (e.g., 10, 100) = weaker regularization = more complex model, risk of overfitting
2. **LogisticRegression penalty (regularization type)**
   - Why: Different regularization types (L1 vs L2) can lead to different model behaviors.
     - L1 (Lasso): Creates sparsity by zeroing out coefficients, effectively doing feature selection.
       Useful when many features might be irrelevant.
     - L2 (Ridge): Shrinks coefficients but keeps all features. Generally more stable and 
       standard for text classification.
   - Options: ['l1', 'l2'] - we'll test both to see which works better for our text classification task



In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import QuantileTransformer

# Column-wise preprocessing for the final model. Each entry is
# (name, transformer, column selection); the order is kept fixed because it
# determines the order of the transformed feature columns.
final_preprocessor = ColumnTransformer(
    transformers=[
        # Review text -> TF-IDF weights with English stop words removed.
        ('text', TfidfVectorizer(stop_words='english'), 'review'),
        # Feature kept from the baseline, forwarded unchanged.
        ('numeric_passthrough', 'passthrough', ['n_steps']),
        # New feature 1: standardized calories.
        ('calories_scaled', StandardScaler(), ['calories(#)']),
        # New feature 2: cooking time mapped by quantile to a normal output distribution.
        (
            'minutes_quantile',
            QuantileTransformer(random_state=42, output_distribution='normal'),
            ['minutes'],
        ),
    ],
)

# Preprocessing and the classifier are bundled so both are fit together.
final_pipeline = Pipeline([
    ('preprocessor', final_preprocessor),
    (
        'classifier',
        LogisticRegression(max_iter=1000, class_weight='balanced', random_state=42),
    ),
])

# Bare expression: shows the pipeline as the notebook cell's output.
final_pipeline


In [None]:
# One grid per penalty. Each penalty is paired with its own solver, while C and
# max_iter sweep the same values in both grids. The `classifier__` prefix
# routes each setting to the pipeline's 'classifier' step.
param_grid_l2 = dict(
    classifier__C=[1, 2, 3, 4, 5, 6],
    classifier__penalty=['l2'],
    classifier__solver=['lbfgs'],  # solver used for the L2 search
    classifier__max_iter=[1500, 1600, 1700],
)

param_grid_l1 = dict(
    classifier__C=[1, 2, 3, 4, 5, 6],
    classifier__penalty=['l1'],
    classifier__solver=['liblinear'],  # solver used for the L1 search
    classifier__max_iter=[1500, 1600, 1700],
)

# L2 search first: 5-fold cross-validation scored by macro-averaged F1.
grid_search_l2 = GridSearchCV(
    estimator=final_pipeline,
    param_grid=param_grid_l2,
    scoring='f1_macro',
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_search_l2.fit(X_train, y_train)

# L1 search with the identical cross-validation and scoring setup.
grid_search_l1 = GridSearchCV(
    estimator=final_pipeline,
    param_grid=param_grid_l1,
    scoring='f1_macro',
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_search_l1.fit(X_train, y_train)


In [None]:
# Best mean cross-validated F1-macro found by each search.
best_l2_score = grid_search_l2.best_score_
best_l1_score = grid_search_l1.best_score_

print(
    "Grid Search Results Comparison",
    f"Best L2 cross-validation F1-macro: {best_l2_score:.4f}",
    f"Best L1 cross-validation F1-macro: {best_l1_score:.4f}",
    "",
    sep="\n",
)

# L2 is chosen only when it is strictly better; a tie falls through to L1.
best_grid = grid_search_l2 if best_l2_score > best_l1_score else grid_search_l1
best_model = best_grid.best_estimator_
print(
    "L2 model performed better"
    if best_grid is grid_search_l2
    else "L1 model performed better"
)

# Show the winning search's settings and score.
print("Best Hyperparameters:")
for name, setting in best_grid.best_params_.items():
    print(f"  {name}: {setting}")
print(f"\nBest cross-validation F1-macro score: {best_grid.best_score_:.4f}")

In [None]:
# Predict with the selected model on both partitions.
y_train_pred = best_model.predict(X_train)
y_test_pred = best_model.predict(X_test)

# Each metric is computed for train first, then test.
train_accuracy, test_accuracy = (
    accuracy_score(truth, guess)
    for truth, guess in ((y_train, y_train_pred), (y_test, y_test_pred))
)
train_f1_weighted, test_f1_weighted = (
    f1_score(truth, guess, average='weighted')
    for truth, guess in ((y_train, y_train_pred), (y_test, y_test_pred))
)
train_f1_macro, test_f1_macro = (
    f1_score(truth, guess, average='macro')
    for truth, guess in ((y_train, y_train_pred), (y_test, y_test_pred))
)
train_rmse, test_rmse = (
    sqrt(mean_squared_error(truth, guess))
    for truth, guess in ((y_train, y_train_pred), (y_test, y_test_pred))
)

print(
    "Final Model Performance (Best Hyperparameters)",
    f"Train accuracy: {train_accuracy:.4f}",
    f"Test accuracy: {test_accuracy:.4f}",
    f"\nTrain F1 (weighted): {train_f1_weighted:.4f}",
    f"Test F1 (weighted): {test_f1_weighted:.4f}",
    f"Train F1 (macro): {train_f1_macro:.4f}",
    f"Test F1 (macro): {test_f1_macro:.4f}",
    f"\nTrain RMSE: {train_rmse:.4f}",
    f"Test RMSE: {test_rmse:.4f}",
    sep="\n",
)

# Baseline comparison. The baseline figures (0.9039 accuracy, 0.7216 F1-macro)
# are hard-coded here rather than read from a variable.
print(
    "Comparison to Baseline:",
    "Baseline test accuracy: 0.9039",
    f"Final test accuracy: {test_accuracy:.4f}",
    f"Improvement: {test_accuracy - 0.9039:+.4f}",
    "\nBaseline test F1 (macro): 0.7216",
    f"Final test F1 (macro): {test_f1_macro:.4f}",
    f"Improvement: {test_f1_macro - 0.7216:+.4f}",
    sep="\n",
)

In [None]:
# Test-set confusion matrix for the final model, followed by the per-class
# precision/recall/F1 report with readable class names.
cm_test_final = confusion_matrix(y_test, y_test_pred)
print("Final Model Test Confusion Matrix:", cm_test_final, sep="\n")
print(
    "\nClassification Report:",
    classification_report(
        y_test,
        y_test_pred,
        target_names=['Low-rated (0)', 'Highly-rated (1)'],
    ),
    sep="\n",
)

In [None]:
# Draw the test confusion matrix as an annotated heatmap.
fig = go.Figure()
fig.add_trace(
    go.Heatmap(
        z=cm_test_final,
        x=['Predicted low-rated', 'Predicted highly-rated'],
        y=['Actual low-rated', 'Actual highly-rated'],
        colorscale='Blues',
        # Print each cell's count on top of its colour.
        text=cm_test_final,
        texttemplate='%{text}',
        colorbar={'title': 'Count'},
    )
)
fig.update_layout(
    title='Final Model Confusion Matrix (Test Set)',
    width=600,
    height=400,
)
fig.show()

# Export a standalone HTML copy; plotly.js is loaded from the CDN rather than embedded.
fig.write_html(
    'rate-the-review/assets/final_model_confusion_matrix.html',
    include_plotlyjs='cdn',
)

## Step 8: Fairness Analysis

In [None]:
# Fairness question: is the model equally good on quick and slow recipes?
# Metric under study: precision on the positive class (highly_rated = 1).

# Recompute the held-out features, labels and predictions for this section.
X_test = model_data.drop(columns=['highly_rated', 'avg_rating']).loc[test_indices]
y_test = model_data['highly_rated'].loc[test_indices]
y_test_pred = best_model.predict(X_test)

# Work on a copy of the test rows so the predictions can sit next to 'minutes'
# and the true label without modifying model_data.
test_data_with_minutes = model_data.loc[test_indices].copy()
test_data_with_minutes['predicted'] = y_test_pred

# The test-set median cooking time is the cut-off between quick and slow.
median_minutes = test_data_with_minutes['minutes'].median()
print(
    f"Median cooking time: {median_minutes:.1f} minutes",
    f"Quick recipes (≤ {median_minutes:.1f} min): {(test_data_with_minutes['minutes'] <= median_minutes).sum():,} reviews",
    f"Slow recipes (> {median_minutes:.1f} min): {(test_data_with_minutes['minutes'] > median_minutes).sum():,} reviews",
    sep="\n",
)

In [None]:
# Group label: True when the cooking time is at or below the median, else False.
test_data_with_minutes['is_quick'] = test_data_with_minutes['minutes'].le(median_minutes)

# Calculate precision for each group
# Precision = TP / (TP + FP) for the positive class (highly_rated = 1)

def calculate_precision(y_true, y_pred, positive_class=1):
    """Return the precision of ``y_pred`` for ``positive_class``.

    Precision is TP / (TP + FP): of all the items predicted as
    ``positive_class``, the fraction whose true label is ``positive_class``.

    Parameters
    ----------
    y_true : array-like
        True labels. Any sequence NumPy can convert (list, tuple, ndarray,
        pandas Series); compared position by position with ``y_pred``.
    y_pred : array-like
        Predicted labels, same length as ``y_true``.
    positive_class : scalar, default 1
        Label treated as the positive class.

    Returns
    -------
    float
        The precision, or 0.0 when nothing was predicted as
        ``positive_class`` (TP + FP == 0).
    """
    # Coerce to arrays first: comparing a plain list with ``==`` yields a
    # single bool instead of an element-wise mask.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    predicted_positive = y_pred == positive_class
    # Every predicted positive is either a TP or an FP, so this is TP + FP.
    n_predicted_positive = predicted_positive.sum()
    if n_predicted_positive == 0:
        return 0.0

    tp = (predicted_positive & (y_true == positive_class)).sum()
    return float(tp / n_predicted_positive)

# Boolean row selectors for the two groups being compared.
quick_mask = test_data_with_minutes['is_quick']
slow_mask = ~quick_mask

# Precision on the positive class within the quick group...
quick_precision = calculate_precision(
    y_true=test_data_with_minutes.loc[quick_mask, 'highly_rated'],
    y_pred=test_data_with_minutes.loc[quick_mask, 'predicted'],
)

# ...and within the slow group.
slow_precision = calculate_precision(
    y_true=test_data_with_minutes.loc[slow_mask, 'highly_rated'],
    y_pred=test_data_with_minutes.loc[slow_mask, 'predicted'],
)

# Signed gap (quick minus slow) between the two group precisions.
observed_diff = quick_precision - slow_precision

print(
    "Observed Precision by Group",
    f"Quick recipes (≤ {median_minutes:.1f} min): {quick_precision:.4f}",
    f"Slow recipes (> {median_minutes:.1f} min): {slow_precision:.4f}",
    f"\nDifference (Quick - Slow): {observed_diff:.4f}",
    sep="\n",
)


### Hypotheses

- **Null Hypothesis**: Our model is fair. Its precision for quick recipes and its precision for slow recipes are roughly the same, and any difference is due to random chance.

- **Alternative Hypothesis**: Our model is unfair. Its precision for quick recipes is different from its precision for slow recipes (two-sided test).

### Permutation Test



In [None]:
# Number of label shuffles used to build the null distribution.
n_repetitions = 10_000
# Seed NumPy's global RNG so the shuffles are reproducible.
np.random.seed(42)

# Pull the columns out as plain arrays once; only the group labels get
# shuffled, while the true labels and predictions stay paired.
y_true_values = test_data_with_minutes['highly_rated'].values
y_pred_values = test_data_with_minutes['predicted'].values
group_labels = test_data_with_minutes['is_quick'].values

def precision_diff_permuted(shuffled_groups):
    """Return the quick-minus-slow precision gap for a relabelled test set.

    ``shuffled_groups`` is used as a boolean mask over the module-level
    ``y_true_values`` / ``y_pred_values`` arrays: True entries form the
    "quick" group and the negated mask forms the "slow" group.
    """
    precision_by_group = [
        calculate_precision(y_true_values[members], y_pred_values[members])
        for members in (shuffled_groups, ~shuffled_groups)
    ]
    return precision_by_group[0] - precision_by_group[1]

# Null distribution: shuffle the group labels, recompute the precision gap,
# and repeat n_repetitions times.
permuted_diffs = np.array([
    precision_diff_permuted(np.random.permutation(group_labels))
    for _ in range(n_repetitions)
])

# Two-sided p-value: share of shuffles whose gap is at least as large in
# magnitude as the observed one.
p_value = (np.abs(permuted_diffs) >= np.abs(observed_diff)).mean()

print(
    "Permutation Test Results",
    f"Observed difference: {observed_diff:.4f}",
    f"P-value (two-sided): {p_value:.4f}",
    "Significance level: 0.05",
    sep="\n",
)


In [None]:
# Histogram of the shuffled precision gaps (the null distribution).
fig = px.histogram(
    x=permuted_diffs,
    nbins=50,
    histnorm='probability density',
    opacity=0.7,
    title='Null Distribution of Precision Difference (Quick - Slow)',
    labels=dict(x='Precision Difference (Quick - Slow)', y='Density'),
)

# Mark the observed gap and its mirror image; the test is two-sided, so both
# tails beyond these dashed lines count toward the p-value.
for position, thickness, caption, corner in (
    (observed_diff, 3, f"Observed: {observed_diff:.4f}", "top right"),
    (-observed_diff, 2, f"-Observed: {-observed_diff:.4f}", "top left"),
):
    fig.add_vline(
        x=position,
        line_dash="dash",
        line_color="red",
        line_width=thickness,
        annotation_text=caption,
        annotation_position=corner,
    )

fig.update_layout(width=800, height=500, showlegend=False)
fig.show()

# Export a standalone HTML copy; plotly.js is loaded from the CDN rather than embedded.
fig.write_html(
    'rate-the-review/assets/fairness_test.html',
    include_plotlyjs='cdn',
)


In [None]:
# Recap of the fairness analysis: groups, metric, observed gap, test outcome.
print(
    "Groups Compared:",
    f"Quick recipes: ≤ {median_minutes:.1f} minutes ({quick_mask.sum():,} reviews)",
    f"Slow recipes: > {median_minutes:.1f} minutes ({slow_mask.sum():,} reviews)",
    "\nEvaluation Metric: Precision for highly_rated = 1",
    "\nObserved Performance:",
    f"Quick recipes precision: {quick_precision:.4f}",
    f"Slow recipes precision: {slow_precision:.4f}",
    f"Difference (Quick - Slow): {observed_diff:.4f}",
    "\nPermutation Test Results:",
    f"P-value (two-sided): {p_value:.4f}",
    "Significance level: 0.05",
    sep="\n",
)

