# Classify recipe cuisines

Label dish cuisines from backend/recipes_data.csv according to the dish title.

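Training data is loaded from `labels/cuisine_to_titles.json` (path relative to the notebook's working directory), a JSON object that maps each cuisine to a list of dish titles.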

Training data source: https://cosylab.iiitd.edu.in/culinarydb/

In [1]:
import json
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split

# Train ML model

In [None]:
# Train a title -> cuisine text classifier and persist it for the
# classification step further down.

# Load training data from JSON. JSON is UTF-8 by specification, so decode it
# explicitly instead of relying on the platform's default encoding (titles may
# contain non-ASCII characters).
with open("labels/cuisine_to_titles.json", "r", encoding="utf-8") as f:
    cuisine_mapping = json.load(f)

# Flatten the {cuisine: [title, ...]} mapping into one record per title
train_data = [
    {"Title": title, "Cuisine": cuisine}
    for cuisine, titles in cuisine_mapping.items()
    for title in titles
]

df_train = pd.DataFrame(train_data)

# Preprocess: Convert to lowercase
df_train["Title"] = df_train["Title"].str.lower()

# Train-test split: hold out 20% for evaluation; the fixed seed keeps the
# split (and therefore the reported metrics) reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    df_train["Title"], df_train["Cuisine"], test_size=0.2, random_state=42
)

# Convert titles to numerical features using TF-IDF. The vocabulary is fitted
# on the training split only and then re-used to transform the test split.
vectorizer = TfidfVectorizer(max_features=5000, stop_words="english")
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Train a Logistic Regression model
model = LogisticRegression(max_iter=1000)
model.fit(X_train_tfidf, y_train)

# Predict on test data
y_pred = model.predict(X_test_tfidf)

# Evaluate the model
print("Model Accuracy:", accuracy_score(y_test, y_pred))
# zero_division=0 reports 0.0 for labels that were never predicted (the same
# value the default produces) without emitting an UndefinedMetricWarning for
# each affected metric.
print(
    "Classification Report:\n",
    classification_report(y_test, y_pred, zero_division=0),
)

# Save model and vectorizer for future classification. Create the output
# directory first so the cell also works on a fresh checkout.
Path("models").mkdir(parents=True, exist_ok=True)
joblib.dump(model, "models/cuisine_model.pkl")
joblib.dump(vectorizer, "models/tfidf_vectorizer.pkl")
print("Model and vectorizer saved!")

Model Accuracy: 0.7152375750955762
Classification Report:
                      precision    recall  f1-score   support

             Africa       0.83      0.48      0.61       141
     Australia & NZ       0.83      0.05      0.10        97
      British Isles       0.87      0.42      0.57       222
             Canada       0.82      0.07      0.14       240
          Caribbean       0.82      0.54      0.65       228
              China       0.78      0.57      0.66       173
     DACH Countries       0.88      0.40      0.55        92
     Eastern Europe       0.89      0.49      0.63       132
             France       0.77      0.51      0.61       535
             Greece       0.81      0.61      0.69       170
Indian Subcontinent       0.86      0.78      0.82       776
              Italy       0.83      0.79      0.81      1521
              Japan       0.84      0.45      0.59       119
              Korea       1.00      0.62      0.76        60
             Mexico      

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# Classify new data

In [7]:
# Read a 5000-row sample from the top of recipes_data.csv; only these rows
# are classified below.
df_new = pd.read_csv(filepath_or_buffer="recipes_data.csv", nrows=5000)
# Show which columns the file provides
print(df_new.columns)

Index(['title', 'ingredients', 'directions', 'link', 'source', 'NER', 'site'], dtype='object')


In [10]:
# Predict a cuisine for every loaded recipe title and write the result to CSV.

# Preprocess: Convert title column to lowercase (the training titles were
# lowercased the same way before the vectorizer was fitted)
df_new["title"] = df_new["title"].str.lower()

# Load the saved model & vectorizer.
# NOTE: joblib files are pickles and can execute arbitrary code when loaded;
# only load artifacts that were written by the training cell of this notebook.
model = joblib.load("models/cuisine_model.pkl")
vectorizer = joblib.load("models/tfidf_vectorizer.pkl")

# Transform new data. A missing title is NaN after the CSV load, and
# TfidfVectorizer raises ValueError on NaN documents, so substitute an empty
# string for feature extraction only; the title column itself is not changed.
X_new_tfidf = vectorizer.transform(df_new["title"].fillna(""))

# Predict cuisines
df_new["Predicted_Cuisine"] = model.predict(X_new_tfidf)

# Save results
df_new.to_csv("recipes_with_cuisine.csv", index=False)
print("Classified data saved to recipes_with_cuisine.csv!")

Classified data saved to backend/recipes_with_cuisine.csv!
