# Classify recipes into cuisines from their ingredients

Dataset from https://www.kaggle.com/competitions/whats-cooking/

Solution based on https://www.kaggle.com/code/dipayan/whatscooking-python

In [1]:
from pandas import Series, DataFrame
import pandas as pd
import numpy as np
import nltk
import re
from nltk.stem import WordNetLemmatizer
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report
import sklearn.metrics
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
import ast

In [14]:
# WordNetLemmatizer (used when building `ingredients_string`) needs the WordNet
# corpus in the local nltk_data directory. `nltk` itself is already imported in
# the notebook's import cell, so it is not re-imported here.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\nikit\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [15]:
# Load the train/test recipe files and derive two text columns per recipe from
# its `ingredients` list.
# NOTE: the Kaggle solution this is based on paired word lemmatization with a
# LinearSVC model; this notebook fits LogisticRegression on the same features.

_LEMMATIZER = WordNetLemmatizer()       # one shared instance instead of one per ingredient
_NON_LETTERS = re.compile('[^A-Za-z]')  # anything that is not an ASCII letter


def _load_recipes(path):
    """Read a recipe JSON file and add the derived ingredient text columns.

    Parameters
    ----------
    path : str
        Location of a JSON file readable by ``pd.read_json`` whose rows have an
        ``ingredients`` entry holding a list of strings.

    Returns
    -------
    pandas.DataFrame
        The loaded frame plus two columns:

        * ``ingredients_clean_string`` - the raw ingredients joined with ' , '.
        * ``ingredients_string`` - every ingredient with non-letter characters
          replaced by spaces and passed through the WordNet lemmatizer, then
          joined with single spaces.
    """
    recipes = pd.read_json(path)
    recipes['ingredients_clean_string'] = [
        ' , '.join(items).strip() for items in recipes['ingredients']
    ]
    # NOTE(review): lemmatize() receives the whole ingredient phrase, not its
    # individual words - confirm multi-word ingredients are lemmatised as intended.
    recipes['ingredients_string'] = [
        ' '.join(
            _LEMMATIZER.lemmatize(_NON_LETTERS.sub(' ', item)) for item in items
        ).strip()
        for items in recipes['ingredients']
    ]
    return recipes


traindf = _load_recipes("../labels/train.json")
testdf = _load_recipes("../labels/test.json")

In [None]:
# --- TF-IDF features ---------------------------------------------------------
# Unigram word features; terms appearing in more than 57% of the training
# recipes are dropped (max_df). The vectorizer is fitted on the training
# corpus only and then reused, unchanged, for the test corpus.
vectorizertr = TfidfVectorizer(
    analyzer="word",
    token_pattern=r'\w+',
    ngram_range=(1, 1),
    stop_words='english',
    max_df=0.57,
    binary=False,
    sublinear_tf=False,
)

corpustr = traindf['ingredients_string']
corpusts = testdf['ingredients_string']

# Both results are left as the matrices the vectorizer returns (no densifying).
tfidftr = vectorizertr.fit_transform(corpustr)
tfidfts = vectorizertr.transform(corpusts)

# Model inputs: TF-IDF matrices as predictors, the cuisine label as target.
predictors_tr, predictors_ts = tfidftr, tfidfts
targets_tr = traindf['cuisine']

# --- Classifier --------------------------------------------------------------
# max_iter is raised to 1000; features are fed in without any extra scaling.
# fit() returns the estimator itself, so `clf` is the trained model.
clf = LogisticRegression(max_iter=1000).fit(predictors_tr, targets_tr)

# Cuisine predictions for the test recipes.
predictions = clf.predict(predictors_ts)

In [None]:
# Disabled export (presumably the Kaggle submission file). If re-enabled it
# would attach `predictions` to testdf, sort by id, and write the id,
# ingredients_clean_string and cuisine columns to submission.csv.
# testdf['cuisine'] = predictions
# testdf = testdf.sort_values('id' , ascending=True)

# testdf[['id' , 'ingredients_clean_string' , 'cuisine' ]].to_csv("submission.csv")

# Classify Recipes dataset

In [26]:
# Load the recipes to label. This cell relies on `vectorizertr` and `clf`
# having been fitted by the training cell earlier in the notebook.
df = pd.read_csv("../recipes_with_labels.csv")

# The NER column is expected to hold the string form of a Python list in the
# CSV; parse it back into real lists.
df['NER'] = df['NER'].apply(ast.literal_eval)

# Build the text the same way the training corpus was built (non-letters
# replaced by spaces, WordNet lemmatisation per ingredient, space-joined) so
# the TF-IDF features line up with what the classifier was trained on.
# A plain ' '.join of the raw ingredients would skip that cleaning.
lemmatizer = WordNetLemmatizer()
ingredients_text = df['NER'].apply(
    lambda items: ' '.join(
        lemmatizer.lemmatize(re.sub('[^A-Za-z]', ' ', item)) for item in items
    ).strip()
)

# Reuse the vectorizer fitted on the training data (transform only, no refit)
tfidfts_new = vectorizertr.transform(ingredients_text)

# Predict cuisines; this creates the `cuisine` column, or overwrites it if the
# CSV already had one
df['cuisine'] = clf.predict(tfidfts_new)

# Write the frame, including the predicted `cuisine` column, to a separate CSV
# (the input file is left untouched)
df.to_csv("recipes_with_predicted_cuisines.csv", index=False)

print("Updated dataset saved as recipes_with_predicted_cuisines.csv")

Updated dataset saved as recipes_with_predicted_cuisines.csv
