# Classify recipes into cuisines from their ingredients

Dataset from https://www.kaggle.com/competitions/whats-cooking/

Solution based on https://www.kaggle.com/code/dipayan/whatscooking-python

In [1]:
from pandas import Series, DataFrame
import pandas as pd
import numpy as np
import nltk
import re
from nltk.stem import WordNetLemmatizer
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report
import sklearn.metrics
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
import ast

In [2]:
# Download the WordNet corpus through NLTK; the WordNetLemmatizer used in the
# preprocessing cell relies on it. Re-running is harmless: the recorded output
# shows an already up-to-date package is simply reported and the call returns True.
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\nikit\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
# Load the labelled training recipes and the unlabelled test recipes, then derive
# two text columns from each recipe's ingredient list.
# NOTE: the comment originally placed here credited "word lemmatization + LinearSVC"
# with pushing accuracy past 80%. The model fitted later in this notebook is
# LogisticRegression, so that figure is not something this notebook measures.

# A single shared lemmatizer: building a new WordNetLemmatizer for every ingredient
# string is loop-invariant work and yields exactly the same lemmas.
lemmatizer = WordNetLemmatizer()


def add_ingredient_text_columns(recipes):
    """Add the ingredient text columns to ``recipes`` and return it.

    Parameters
    ----------
    recipes : pandas.DataFrame
        Must contain an ``ingredients`` column whose entries are lists of
        ingredient strings.

    Returns
    -------
    pandas.DataFrame
        The same frame (modified in place) with two extra columns:

        * ``ingredients_clean_string`` - the raw ingredients joined by ``' , '``.
        * ``ingredients_string`` - each ingredient with every non-letter replaced
          by a space and passed through the WordNet lemmatizer, then joined by
          single spaces; this is the text fed to the vectorizer.
    """
    recipes['ingredients_clean_string'] = [
        ' , '.join(ingredients).strip() for ingredients in recipes['ingredients']
    ]
    recipes['ingredients_string'] = [
        ' '.join(
            [lemmatizer.lemmatize(re.sub('[^A-Za-z]', ' ', ingredient)) for ingredient in ingredients]
        ).strip()
        for ingredients in recipes['ingredients']
    ]
    return recipes


# Train and test go through the exact same preprocessing.
traindf = add_ingredient_text_columns(pd.read_json("../labels/train.json"))
testdf = add_ingredient_text_columns(pd.read_json("../labels/test.json"))

In [4]:
# Text inputs for the two splits.
corpustr = traindf['ingredients_string']
corpusts = testdf['ingredients_string']

# TF-IDF over single word tokens. The vocabulary and IDF weights are learned from
# the training text only; the test text is then mapped onto that same vocabulary.
vectorizertr = TfidfVectorizer(
    analyzer="word",
    token_pattern=r'\w+',
    ngram_range=(1, 1),
    stop_words='english',
    max_df=0.57,
    binary=False,
    sublinear_tf=False,
)
tfidftr = vectorizertr.fit_transform(corpustr)  # stays sparse
tfidfts = vectorizertr.transform(corpusts)      # stays sparse

# Feature matrices and training labels under the names the rest of the notebook uses.
predictors_tr, predictors_ts = tfidftr, tfidfts
targets_tr = traindf['cuisine']

# Logistic regression fitted directly on the TF-IDF features (no extra scaling step);
# max_iter is raised to 1000 to give the solver more iterations.
clf = LogisticRegression(max_iter=1000).fit(predictors_tr, targets_tr)

# Predicted cuisine for every test recipe.
predictions = clf.predict(predictors_ts)

In [None]:
# Disabled export step. If uncommented it would attach `predictions` to the test
# frame as a `cuisine` column, order the rows by `id`, and write the id,
# ingredients_clean_string and cuisine columns to submission.csv.
# testdf['cuisine'] = predictions
# testdf = testdf.sort_values('id' , ascending=True)

# testdf[['id' , 'ingredients_clean_string' , 'cuisine' ]].to_csv("submission.csv")

# Classify the recipes dataset with the trained model

In [5]:
# Classify the full recipes file chunk by chunk so it never has to fit in memory,
# streaming the labelled rows into a new CSV.
chunk_size = 10000
total_rows = 0
output_file = "../recipes_with_cuisines.csv"

# The classifier was trained on text where every ingredient had its non-letter
# characters replaced by spaces and was run through the WordNet lemmatizer. The
# same normalization has to be applied here, otherwise the new text is tokenized
# differently from what the model saw during training.
_ner_lemmatizer = WordNetLemmatizer()
_normalized_cache = {}  # raw ingredient -> normalized form; ingredient names repeat a lot


def _normalize_ingredient(raw):
    """Return ``raw`` cleaned exactly like the training ingredients (memoized)."""
    cleaned = _normalized_cache.get(raw)
    if cleaned is None:
        cleaned = _ner_lemmatizer.lemmatize(re.sub('[^A-Za-z]', ' ', raw))
        _normalized_cache[raw] = cleaned
    return cleaned


# Every chunk is written through this single handle. newline='' is what pandas
# asks for when it is handed a text-mode file object, so line endings are not
# translated a second time.
with open(output_file, 'w', encoding='utf-8', newline='') as f:
    for chunk in pd.read_csv("../recipes_with_labels.csv", chunksize=chunk_size):
        # NER is stored as the text of a Python list; parse it back into a list.
        chunk['NER'] = chunk['NER'].apply(ast.literal_eval)

        # One space-separated string per recipe, normalized like the training text.
        ingredients_text = chunk['NER'].apply(
            lambda items: ' '.join(_normalize_ingredient(item) for item in items).strip()
        )

        # Reuse the vectorizer fitted on the training data (same vocabulary / IDF).
        tfidfts_new = vectorizertr.transform(ingredients_text)

        # Predict a cuisine for every recipe in the chunk.
        chunk['cuisine'] = clf.predict(tfidfts_new)

        # Only the first chunk carries the header row.
        chunk.to_csv(f, header=(total_rows == 0), index=False)

        total_rows += len(chunk)
        print(f"Processed {total_rows} rows...")

print(f"Updated dataset saved as {output_file}")


Processed 10000 rows...
Processed 20000 rows...
Processed 30000 rows...
Processed 40000 rows...
Processed 50000 rows...
Processed 60000 rows...
Processed 70000 rows...
Processed 80000 rows...
Processed 90000 rows...
Processed 100000 rows...
Processed 110000 rows...
Processed 120000 rows...
Processed 130000 rows...
Processed 140000 rows...
Processed 150000 rows...
Processed 160000 rows...
Processed 170000 rows...
Processed 180000 rows...
Processed 190000 rows...
Processed 200000 rows...
Processed 210000 rows...
Processed 220000 rows...
Processed 230000 rows...
Processed 240000 rows...
Processed 250000 rows...
Processed 260000 rows...
Processed 270000 rows...
Processed 280000 rows...
Processed 290000 rows...
Processed 300000 rows...
Processed 310000 rows...
Processed 320000 rows...
Processed 330000 rows...
Processed 340000 rows...
Processed 350000 rows...
Processed 360000 rows...
Processed 370000 rows...
Processed 380000 rows...
Processed 390000 rows...
Processed 400000 rows...
Processed

In [7]:
# Sanity check of the export: read back only the first 10 rows of the classified file.
df = pd.read_csv("../recipes_with_cuisines.csv", nrows=10)
# Leave the frame as the cell's last expression so Jupyter renders it as a table
# rather than the line-wrapped text print() produces for a wide frame.
df

                      title  \
0       No-Bake Nut Cookies   
1     Jewell Ball'S Chicken   
2               Creamy Corn   
3             Chicken Funny   
4      Reeses Cups(Candy)     
5  Cheeseburger Potato Soup   
6       Rhubarb Coffee Cake   
7            Scalloped Corn   
8      Nolan'S Pepper Steak   
9           Millionaire Pie   

                                         ingredients  \
0  ["1 c. firmly packed brown sugar", "1/2 c. eva...   
1  ["1 small jar chipped beef, cut up", "4 boned ...   
2  ["2 (16 oz.) pkg. frozen corn", "1 (8 oz.) pkg...   
3  ["1 large whole chicken", "2 (10 1/2 oz.) cans...   
4  ["1 c. peanut butter", "3/4 c. graham cracker ...   
5  ["6 baking potatoes", "1 lb. of extra lean gro...   
6  ["1 1/2 c. sugar", "1/2 c. butter", "1 egg", "...   
7  ["1 can cream-style corn", "1 can whole kernel...   
8  ["1 1/2 lb. round steak (1-inch thick), cut in...   
9  ["1 large container Cool Whip", "1 large can c...   

                                         