Imports

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, accuracy_score, f1_score
from sklearn.preprocessing import MultiLabelBinarizer, LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [2]:
data = pd.read_csv("epi_r.csv")
data.head()

Unnamed: 0,title,rating,calories,protein,fat,sodium,#cakeweek,#wasteless,22-minute meals,3-ingredient recipes,...,yellow squash,yogurt,yonkers,yuca,zucchini,cookbooks,leftovers,snack,snack week,turkey
0,"Lentil, Apple, and Turkey Wrap",2.5,426.0,30.0,7.0,559.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,Boudin Blanc Terrine with Red Onion Confit,4.375,403.0,18.0,23.0,1439.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Potato and Fennel Soup Hodge,3.75,165.0,6.0,7.0,165.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Mahi-Mahi in Tomato Olive Sauce,5.0,,,,,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Spinach Noodle Casserole,3.125,547.0,20.0,32.0,452.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [3]:
data.columns

Index(['title', 'rating', 'calories', 'protein', 'fat', 'sodium', '#cakeweek',
       '#wasteless', '22-minute meals', '3-ingredient recipes',
       ...
       'yellow squash', 'yogurt', 'yonkers', 'yuca', 'zucchini', 'cookbooks',
       'leftovers', 'snack', 'snack week', 'turkey'],
      dtype='object', length=680)

## Data preparation

In [4]:
def load_dataset(file_path):
    return pd.read_csv(file_path)

I am filtering the relevant columns

In [5]:
def filter_columns(data):
   ingredient_columns = [col for col in data.columns if col not in ['title', 'calories', 'protein', 'fat', 'sodium', 'rating']]
   return data[['rating'] + ingredient_columns]

And now handling the missing values

In [6]:
def handle_missing_values(data):
    data['rating'] = data['rating'].fillna(data['rating'].median())
    ingredient_cols = data.columns[1:]
    data = data[data[ingredient_cols].sum(axis=1) > 0]
    return data

and now I am preparing target variables

In [7]:
def prepare_target(data, classification=False):
    if classification:
        data['rating'] = data['rating'].apply(lambda x: 'bad' if x <= 1 else 'so-so' if x <= 3 else 'great')
    return data

I am going to save the cleaned dataset

In [8]:
def save_cleaned_data(data, output_path):
    data.to_csv(output_path, index=False)
    print(f"Cleaned dataset saved to {output_path}")

Main script

In [9]:
if __name__ == "__main__":
    # Replace 'epicurious.csv' with your dataset file path
    file_path = "epi_r.csv"
    output_path = "recipes.csv"

    # Load and process the dataset
    data = load_dataset(file_path)
    data = filter_columns(data)
    data = handle_missing_values(data)
    data = prepare_target(data, classification=True)
    save_cleaned_data(data, output_path)


Cleaned dataset saved to recipes.csv


In [10]:
data = pd.read_csv("recipes.csv")
data.head()

Unnamed: 0,rating,#cakeweek,#wasteless,22-minute meals,3-ingredient recipes,30 days of groceries,advance prep required,alabama,alaska,alcoholic,...,yellow squash,yogurt,yonkers,yuca,zucchini,cookbooks,leftovers,snack,snack week,turkey
0,so-so,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,great,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,great,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,great,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,great,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


So now I should separate the features and target

In [11]:
X = data.drop('rating', axis=1)
y = data['rating']

I am converting the target into categories if classification

In [12]:
y = pd.to_numeric(y.str.replace('bad', '0').str.replace('so-so', '1').str.replace('great', '2'), errors='coerce')
y = y.apply(lambda x: 'bad' if x <= 1 else 'so-so' if x <= 3 else 'great')

Splitting into training and testing sets

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Now classification and regression models

### Regression

Regression Models

In [14]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
models = {
    'Linear Regression': (LinearRegression(), {}),
    'Ridge': (Ridge(), {'alpha': [0.1, 1, 10]}),
    'Lasso': (Lasso(), {'alpha': [0.1, 1, 10]}),
    'Decision Tree': (DecisionTreeRegressor(), {'max_depth': [3, 5, 7]}),
    'Random Forest': (RandomForestRegressor(), {'n_estimators': [50, 100, 200], 'max_depth': [3, 5, 7]}),
    'Gradient Boosting': (GradientBoostingRegressor(), {'n_estimators': [50, 100, 200], 'learning_rate': [0.01, 0.1, 1]}),
}

Converting

In [15]:
y_train_numeric = y_train.map({'bad': 0, 'so-so': 1, 'great': 2})
y_test_numeric = y_test.map({'bad': 0, 'so-so': 1, 'great': 2})

best_model = None
best_rmse = float('inf')


Loop through models to see which model works best

In [16]:
from sklearn.model_selection import GridSearchCV
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy='mean')
X_train = imputer.fit_transform(X_train)
X_test = imputer.transform(X_test)

for name, (model, param_grid) in models.items():
    grid_search = GridSearchCV(model, param_grid, scoring='neg_mean_squared_error', cv=5)
    grid_search.fit(X_train, y_train_numeric)

    y_pred = grid_search.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test_numeric, y_pred))

    print(f"{name} RMSE: {rmse:.2f}")

    if rmse < best_rmse:
        best_rmse = rmse
        best_model = grid_search.best_estimator_

print(f"Best Model: {best_model.__class__.__name__}, RMSE: {best_rmse:.2f}")

Linear Regression RMSE: 0.31
Ridge RMSE: 0.31
Lasso RMSE: 0.33
Decision Tree RMSE: 0.31
Random Forest RMSE: 0.31
Gradient Boosting RMSE: 0.31
Best Model: Ridge, RMSE: 0.31


Naive Regressor

In [17]:
# Calculating the average rating
average_rating = np.mean(y_train_numeric)
# Predicting the average rating for all test instances
y_pred_naive = np.full_like(y_test_numeric, average_rating, dtype=float)

# Calculating RMSE for the naive regressor
rmse_naive = np.sqrt(mean_squared_error(y_test_numeric, y_pred_naive))
print(f"Naive Regressor RMSE: {rmse_naive:.2f}")

Naive Regressor RMSE: 0.33


### Classification

Binarizing the target column

In [18]:
y_train_bin = y_train_numeric.round().astype(int)  # Round and convert to integers
y_test_bin = y_test_numeric.round().astype(int)


Trying classification models to find the best one

In [19]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC

models = {
    'Logistic Regression': (LogisticRegression(), {'C': [0.1, 1, 10]}),
    'Decision Tree': (DecisionTreeClassifier(), {'max_depth': [3, 5, 7]}),
    'Random Forest': (RandomForestClassifier(), {'n_estimators': [50, 100, 200], 'max_depth': [3, 5, 7]}),
    'Gradient Boosting': (GradientBoostingClassifier(), {'n_estimators': [50, 100, 200], 'learning_rate': [0.01, 0.1, 1]}),
}

In [20]:
best_model = None
best_accuracy = 0


Now again a loop to find the best model

In [21]:
for name, (model, param_grid) in models.items():
    grid_search = GridSearchCV(model, param_grid, scoring='accuracy', cv=5)
    grid_search.fit(X_train, y_train_bin)  # Use binarized target for classification

    y_pred = grid_search.predict(X_test)
    accuracy = accuracy_score(y_test_bin, y_pred)

    print(f"{name} Accuracy: {accuracy:.2f}")

    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_model = grid_search.best_estimator_

print(f"Best Model: {best_model.__class__.__name__}, Accuracy: {best_accuracy:.2f}")

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Logistic Regression Accuracy: 0.88
Decision Tree Accuracy: 0.88
Random Forest Accuracy: 0.88
Gradient Boosting Accuracy: 0.88
Best Model: DecisionTreeClassifier, Accuracy: 0.88


Calculating Accuracy of Naive Classifier


In [22]:
most_frequent_class = max(set(y_train_bin), key=list(y_train_bin).count)  # Use binarized y_train for most frequent class calculation

# Predicting the most frequent class for all test instances
y_pred_naive = np.full_like(y_test_bin, most_frequent_class, dtype=int)

# Calculating accuracy for the naive classifier
accuracy_naive = accuracy_score(y_test_bin, y_pred_naive)
print(f"Naive Classifier Accuracy: {accuracy_naive:.2f}")

Naive Classifier Accuracy: 0.87


Now I am saving it

In [23]:
import joblib

joblib.dump(best_model, 'best_model.pkl')

['best_model.pkl']

##Nutrition Facts

###Collecting Nutrition Facts from USDA API

In [24]:
import pandas as pd
import requests
import time

# 1. Load recipes
recipes_df = pd.read_csv("recipes.csv")

# 2. Add 'ingredients_list' column
recipes_df['ingredients_list'] = recipes_df.drop('rating', axis=1).apply(lambda row: ', '.join(row.index[row == 1]), axis=1)

# 3. Collect all unique ingredients
all_ingredients = set()
for ingredients in recipes_df['ingredients_list']:
    all_ingredients.update(ingredients.split(', '))
unique_ingredients = list(all_ingredients)

print(f"Total unique ingredients: {len(unique_ingredients)}")

# 4. Prepare the API key
api_key = 'NlGP9zCpXfkf8bS5TXd5gXzfbhzE13wDrImUqNAf'

# 5. Functions
def get_nutrition_data(ingredient):
    """Fetches nutrition data from the USDA API."""
    url = f"https://api.nal.usda.gov/fdc/v1/foods/search?api_key={api_key}&query={ingredient}"
    try:
        response = requests.get(url)
        response.raise_for_status()
        data = response.json()
        if 'foods' in data and data['foods']:
            return data['foods'][0]
        else:
            print(f"No data found for ingredient: {ingredient}")
            return None
    except requests.exceptions.RequestException as e:
        print(f"Error fetching data for {ingredient}: {e}")
        return None

def transform_to_daily_values(nutrition_data, daily_values_df):
    """Transforms nutrition values to % daily value."""
    nutrients = {}
    for nutrient_info in nutrition_data.get('foodNutrients', []):
        nutrient_name = nutrient_info.get('nutrientName')
        nutrient_value = nutrient_info.get('value')

        if nutrient_name and nutrient_value is not None:
            if nutrient_name.lower() in daily_values_df['nutrient'].str.lower().values:
                rda = daily_values_df.loc[daily_values_df['nutrient'].str.lower() == nutrient_name.lower(), 'daily_value'].values[0]
                if rda != 0:
                    daily_value_percentage = (nutrient_value / rda) * 100
                    nutrients[nutrient_name] = round(daily_value_percentage, 2)
    return nutrients

def sanitize_ingredient(ingredient):
    """Sanitizes ingredient names."""
    return ingredient.split('/')[0].strip()

# 6. Daily Values Table
daily_values_data = [
    {'nutrient': 'Vitamin A', 'daily_value': 900},
    {'nutrient': 'Vitamin C', 'daily_value': 90},
    {'nutrient': 'Calcium', 'daily_value': 1300},
    {'nutrient': 'Iron', 'daily_value': 18},
    {'nutrient': 'Vitamin D', 'daily_value': 20},
    {'nutrient': 'Vitamin E', 'daily_value': 15},
    {'nutrient': 'Vitamin K', 'daily_value': 120},
    {'nutrient': 'Thiamin', 'daily_value': 1.2},
    {'nutrient': 'Riboflavin', 'daily_value': 1.3},
    {'nutrient': 'Niacin', 'daily_value': 16},
    {'nutrient': 'Vitamin B6', 'daily_value': 1.7},
    {'nutrient': 'Folate', 'daily_value': 400},
    {'nutrient': 'Vitamin B12', 'daily_value': 2.4},
    {'nutrient': 'Biotin', 'daily_value': 30},
    {'nutrient': 'Pantothenic Acid', 'daily_value': 5},
    {'nutrient': 'Phosphorus', 'daily_value': 1250},
    {'nutrient': 'Iodine', 'daily_value': 150},
    {'nutrient': 'Magnesium', 'daily_value': 420},
    {'nutrient': 'Zinc', 'daily_value': 11},
    {'nutrient': 'Selenium', 'daily_value': 55},
    {'nutrient': 'Copper', 'daily_value': 0.9},
    {'nutrient': 'Manganese', 'daily_value': 2.3},
    {'nutrient': 'Chromium', 'daily_value': 35},
    {'nutrient': 'Molybdenum', 'daily_value': 45},
    {'nutrient': 'Chloride', 'daily_value': 2300},
    {'nutrient': 'Potassium', 'daily_value': 4700},
    {'nutrient': 'Choline', 'daily_value': 550},
    {'nutrient': 'Total Lipid (Fat)', 'daily_value': 65},
    {'nutrient': 'Saturated Fat', 'daily_value': 20},
    {'nutrient': 'Cholesterol', 'daily_value': 300},
    {'nutrient': 'Carbohydrate', 'daily_value': 300},
    {'nutrient': 'Sodium', 'daily_value': 2400},
    {'nutrient': 'Fiber', 'daily_value': 28},
    {'nutrient': 'Protein', 'daily_value': 50},
    {'nutrient': 'Sugars', 'daily_value': 50},
]
daily_values_df = pd.DataFrame(daily_values_data)




Total unique ingredients: 675


 Fetch and save nutrition data


In [25]:
nutrition_data_list = []
processed_ingredients = [sanitize_ingredient(ing) for ing in unique_ingredients]

for ingredient in processed_ingredients:
    food_data = get_nutrition_data(ingredient)
    if food_data:
        daily_values = transform_to_daily_values(food_data, daily_values_df)
        if daily_values:  # Only add if there is at least some nutrient information
            daily_values['Ingredient'] = ingredient
            nutrition_data_list.append(daily_values)
    time.sleep(1)  # Avoid API rate limits

# 8. Save nutrition facts to CSV
nutrition_df = pd.DataFrame(nutrition_data_list)
nutrition_df.to_csv("nutrition_facts.csv", index=False)

print("✅ Nutrition facts collected and saved to 'nutrition_facts.csv'.")

No data found for ingredient: harpercollins
Error fetching data for kentucky: HTTPSConnectionPool(host='api.nal.usda.gov', port=443): Max retries exceeded with url: /fdc/v1/foods/search?api_key=NlGP9zCpXfkf8bS5TXd5gXzfbhzE13wDrImUqNAf&query=kentucky (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x7fdd9dedec00>: Failed to resolve 'api.nal.usda.gov' ([Errno -3] Temporary failure in name resolution)"))
No data found for ingredient: sauté
No data found for ingredient: ramadan
Error fetching data for beef tenderloin: HTTPSConnectionPool(host='api.nal.usda.gov', port=443): Max retries exceeded with url: /fdc/v1/foods/search?api_key=NlGP9zCpXfkf8bS5TXd5gXzfbhzE13wDrImUqNAf&query=beef%20tenderloin (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x7fdd9de60380>: Failed to resolve 'api.nal.usda.gov' ([Errno -3] Temporary failure in name resolution)"))
No data found for ingredient: windsor
Error fetching data for grand marnier: HTTPSCo

In [26]:
data = pd.read_csv("nutrition_facts.csv")
data.head()


Unnamed: 0,Protein,Total lipid (fat),Cholesterol,Ingredient,Thiamin,Riboflavin,Niacin,Pantothenic acid,Biotin
0,11.32,5.8,1.67,tuna,,,,,
1,47.6,1.83,25.67,buffet,,,,,
2,0.0,0.0,0.0,iced coffee,,,,,
3,0.14,0.0,0.0,white wine,0.42,1.15,0.68,,
4,26.48,1.88,39.67,cocktail,1.75,1.92,11.29,,


## Similar Recipes

In [27]:
!pip install beautifulsoup4

Collecting beautifulsoup4
  Using cached beautifulsoup4-4.13.4-py3-none-any.whl.metadata (3.8 kB)
Collecting soupsieve>1.2 (from beautifulsoup4)
  Using cached soupsieve-2.7-py3-none-any.whl.metadata (4.6 kB)
Using cached beautifulsoup4-4.13.4-py3-none-any.whl (187 kB)
Using cached soupsieve-2.7-py3-none-any.whl (36 kB)
Installing collected packages: soupsieve, beautifulsoup4
Successfully installed beautifulsoup4-4.13.4 soupsieve-2.7


In [3]:
# import requests
# from requests.exceptions import RequestException
# from bs4 import BeautifulSoup
# import pandas as pd
# import time
# from concurrent.futures import ThreadPoolExecutor
# import joblib  # For caching

# # Load recipes
# recipes_df = pd.read_csv("recipes.csv")

# # Create a "title" column from ingredients:
# recipes_df["title"] = recipes_df.apply(lambda row: ', '.join(row.index[row == 1]), axis=1)

# # Load or initialize URL cache
# try:
#     url_cache = joblib.load("url_cache.pkl")
# except FileNotFoundError:
#     url_cache = {}

# def get_epicurious_url(recipe_title):
#     """Searches Epicurious for the recipe and returns the URL."""
#     if recipe_title in url_cache:
#         return url_cache[recipe_title]

#     search_url = f"https://www.epicurious.com/search?content=recipe&query={recipe_title}"
#     headers = {
#         'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.0.0 Safari/537.36'
#     }
#     retries = 3  # Number of retries
#     for _ in range(retries):
#         try:
#             response = requests.get(search_url, headers=headers, timeout=10)  # Timeout added
#             response.raise_for_status()
#             soup = BeautifulSoup(response.content, "html.parser")
#             # Try to find a more specific selector if possible
#             first_result = soup.select_one("a.result-heading-link")  # Example
#             if first_result:
#                 url = first_result["href"]
#                 url_cache[recipe_title] = url  # Cache the URL
#                 return url
#             else:
#                 return None  # No results found
#         except RequestException as e:
#             print(f"Error searching for {recipe_title}: {e}")
#             if isinstance(e, requests.exceptions.SSLError):
#                 print(f"SSL Error (ignoring verification failed): {e}")
#             time.sleep(2)  # Wait before retrying
#     return None  # Return None if all retries fail

# # Parallelization
# with ThreadPoolExecutor(max_workers=10) as executor:  # Adjust max_workers as needed
#     recipes_df["epicurious_url"] = list(executor.map(get_epicurious_url, recipes_df["title"]))

# # Filtering out recipes without URLs
# recipes_with_urls = recipes_df.dropna(subset=["epicurious_url"])

# # Save the results and the cache
# recipes_with_urls.to_csv("recipes_with_urls.csv", index=False)
# joblib.dump(url_cache, "url_cache.pkl")  # Save the cache

# print("✅ Recipes with URLs saved to 'recipes_with_urls.csv'.")


import requests
from requests.exceptions import RequestException
from bs4 import BeautifulSoup
import pandas as pd
import time
from concurrent.futures import ThreadPoolExecutor
import joblib  # For caching

# Load recipes
try:
    recipes_df = pd.read_csv("recipes.csv")
except FileNotFoundError:
    print("Error: 'recipes.csv' not found.")
    exit(1)

# Create a "title" column from ingredients:
recipes_df["title"] = recipes_df.apply(lambda row: ', '.join(row.index[row == 1]), axis=1)

# Load or initialize URL cache
try:
    url_cache = joblib.load("url_cache.pkl")
except FileNotFoundError:
    url_cache = {}

def get_epicurious_url(recipe_title):
    """Searches Epicurious for the recipe and returns the URL."""
    if recipe_title in url_cache:
        return url_cache[recipe_title]

    search_url = f"https://www.epicurious.com/search?content=recipe&query={recipe_title}"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.0.0 Safari/537.36'
    }
    retries = 3  # Number of retries
    for _ in range(retries):
        try:
            response = requests.get(search_url, headers=headers, timeout=10)  # Timeout added
            response.raise_for_status()
            soup = BeautifulSoup(response.content, "html.parser")
            # Try to find a more specific selector if possible
            first_result = soup.select_one("a.result-heading-link")  # Example
            if first_result:
                url = first_result["href"]
                url_cache[recipe_title] = url  # Cache the URL
                return url
            else:
                return None  # No results found
        except RequestException as e:
            print(f"Error searching for {recipe_title}: {e}")
            if isinstance(e, requests.exceptions.SSLError):
                print(f"SSL Error (ignoring verification failed): {e}")
            time.sleep(2)  # Wait before retrying
    return None  # Return None if all retries fail

# Parallelization
with ThreadPoolExecutor(max_workers=10) as executor:  # Adjust max_workers as needed
    recipes_df["epicurious_url"] = list(executor.map(get_epicurious_url, recipes_df["title"]))

# Filtering out recipes without URLs
recipes_with_urls = recipes_df.dropna(subset=["epicurious_url"])

# Save the results and the cache
recipes_with_urls.to_csv("recipes_with_urls.csv", index=False)
joblib.dump(url_cache, "url_cache.pkl")  # Save the cache

print("✅ Recipes with URLs saved to 'recipes_with_urls.csv'.")

Error searching for berry, bon appétit, condiment/spread, mint, no-cook, sauce, spring, strawberry, summer: HTTPSConnectionPool(host='www.epicurious.com', port=443): Read timed out. (read timeout=10)
Error searching for bon appétit, braise, cinnamon, dairy free, dinner, garlic, lamb shank, low carb, nutmeg, peanut free, red wine, soy free, sugar conscious, tree nut free, vegetable, wheat/gluten-free, winter: HTTPSConnectionPool(host='www.epicurious.com', port=443): Read timed out. (read timeout=10)
Error searching for broccoli, noodle, peanut butter, quick & easy, soy, stir-fry, tofu, vegetable: HTTPSConnectionPool(host='www.epicurious.com', port=443): Read timed out. (read timeout=10)
Error searching for cheese, dinner, eggplant, fish, lunch, noodle, pasta, tomato, vegetable, winter: HTTPSConnectionPool(host='www.epicurious.com', port=443): Read timed out. (read timeout=10)
Error searching for beef, carrot, gourmet, onion, potato, soup/stew, soy sauce, stew: HTTPSConnectionPool(host='

KeyboardInterrupt: 

filtering out recipes without urls

In [29]:
recipes_with_urls = recipes_df.dropna(subset=["epicurious_url"])

In [30]:
recipes_with_urls.to_csv("recipes_with_urls.csv", index=False)
print("✅ Recipes with URLs saved to 'recipes_with_urls.csv'.")

✅ Recipes with URLs saved to 'recipes_with_urls.csv'.
