In [121]:
import pandas as pd
# Load the training set from train.json into a DataFrame and preview the first rows
data = pd.read_json("train.json")
print(data.head())

      id      cuisine                                        ingredients
0  10259        greek  [romaine lettuce, black olives, grape tomatoes...
1  25693  southern_us  [plain flour, ground pepper, salt, tomatoes, g...
2  20130     filipino  [eggs, pepper, salt, mayonaise, cooking oil, g...
3  22213       indian                [water, vegetable oil, wheat, salt]
4  13162       indian  [black pepper, shallots, cornflour, cayenne pe...


## 1. Data Understanding

### Total number of rows, cuisines, ingredients

In [122]:
# `data` is wrapped in a DataFrame named `df`, the name the following cells use
df = pd.DataFrame(data)

# Headline numbers: recipe count, distinct cuisines, distinct ingredient strings
total_rows = len(df)
total_cuisines = df['cuisine'].nunique()
unique_ingredients = {
    name
    for recipe_ingredients in df['ingredients']
    for name in recipe_ingredients
}
total_ingredients = len(unique_ingredients)

print(f"Total number of rows: {total_rows}")
print(f"Total number of cuisines: {total_cuisines}")
print(f"Total number of ingredients: {total_ingredients}")

Total number of rows: 39774
Total number of cuisines: 20
Total number of ingredients: 6714


### Cuisine Distribution

In [123]:
# Number of recipes per cuisine
cuisine_counts = df['cuisine'].value_counts()
print(cuisine_counts)

cuisine
italian         7838
mexican         6438
southern_us     4320
indian          3003
chinese         2673
french          2646
cajun_creole    1546
thai            1539
japanese        1423
greek           1175
spanish          989
korean           830
vietnamese       825
moroccan         821
british          804
filipino         755
irish            667
jamaican         526
russian          489
brazilian        467
Name: count, dtype: int64


### Ingredient Counts

In [124]:
from collections import Counter

# One flat list holding every ingredient mention across all recipes
all_ingredients = [
    name
    for recipe_ingredients in df['ingredients']
    for name in recipe_ingredients
]
ingredient_counts = Counter(all_ingredients)

# Ten most frequent (ingredient, count) pairs
most_common_10 = ingredient_counts.most_common(10)

# Ten least frequent pairs: take the tail of the full ranking, rarest first
least_common_10 = ingredient_counts.most_common()[-10:][::-1]

print("Most Common 10 Ingredients:")
print(most_common_10)

print("\nLeast Common 10 Ingredients:")
print(least_common_10)

Most Common 10 Ingredients:
[('salt', 18049), ('onions', 7972), ('olive oil', 7972), ('water', 7457), ('garlic', 7380), ('sugar', 6434), ('garlic cloves', 6237), ('butter', 4848), ('ground black pepper', 4785), ('all-purpose flour', 4632)]

Least Common 10 Ingredients:
[('crushed cheese crackers', 1), ('tomato garlic pasta sauce', 1), ('lop chong', 1), ('Hidden Valley® Greek Yogurt Original Ranch® Dip Mix', 1), ('Lipton® Iced Tea Brew Family Size Tea Bags', 1), ('ciabatta loaf', 1), ('cholesterol free egg substitute', 1), ('orange glaze', 1), ('Challenge Butter', 1), ('Oscar Mayer Cotto Salami', 1)]


## 2. Data PreProcessing

### Removing Descriptive Modifiers
The following factors were taken into consideration when merging ingredients:
- Frequency of usage
- Meaning of modifier
    - Although "Old Bay Seasoning" was used in 98 of the 2212 recipes that contain an ingredient ending with the word "seasoning", it was merged because the modifier "Old Bay" refers to a brand
- Spacing issues
    - e.g. poppyseeds > poppy seeds

In [125]:
# Case-folded set of every distinct ingredient string in the dataset
# (the variable is a set, despite the `_list` suffix)
unique_ingredients_list = {
    name.lower()
    for recipe_ingredients in df['ingredients']
    for name in recipe_ingredients
}

# Alphabetical order keeps the suffix listings in later cells easy to scan
sorted_unique_ingredients = sorted(unique_ingredients_list)

# Tally how many distinct ingredients share each final word
last_word_counts = Counter(name.split()[-1] for name in sorted_unique_ingredients)

# Keep only the final words shared by more than 15 distinct ingredients
frequent_last_words = dict(
    (word, count) for word, count in last_word_counts.items() if count > 15
)

# Show the retained words and how many there are
print("Words used more than 15 times:")
print(frequent_last_words)
print("count: ", len(frequent_last_words))

Words used more than 15 times:
{'sauce': 188, 'paste': 70, 'milk': 50, 'tomatoes': 51, 'beans': 80, 'cheese': 163, 'yogurt': 43, 'broth': 65, 'ham': 38, 'seasoning': 67, 'mix': 119, 'beef': 20, 'noodles': 45, 'juice': 71, 'powder': 83, 'squash': 17, 'flakes': 29, 'vinegar': 45, 'steaks': 29, 'water': 28, 'pepper': 50, 'flour': 70, 'butter': 38, 'extract': 30, 'liqueur': 30, 'oil': 70, 'syrup': 43, 'almonds': 16, 'pasta': 31, 'slices': 22, 'chile': 21, 'fillets': 55, 'sausage': 52, 'seeds': 37, 'apples': 18, 'bacon': 22, 'halves': 19, 'rice': 56, 'bread': 45, 'dressing': 44, 'leaves': 67, 'tortillas': 16, 'corn': 17, 'mushrooms': 32, 'potatoes': 24, 'salt': 41, 'chips': 29, 'crust': 20, 'chocolate': 18, 'spray': 16, 'sugar': 48, 'peppers': 21, 'mayonnaise': 17, 'soup': 45, 'roast': 32, 'steak': 36, 'meat': 34, 'stock': 44, 'onion': 18, 'cream': 56, 'dough': 21, 'salsa': 22, 'garlic': 17, 'chicken': 25, 'olives': 30, 'peas': 19, 'chops': 20, 'crumbs': 23, 'rolls': 33, 'buns': 19, 'mustar

#### Filter unique ingredients that end with a given word (e.g. "mushrooms")

In [126]:
def cleaned_with_frequency(ingredients, data):
    """Report how often each ingredient is used relative to its ingredient family.

    The family is identified by the last word of the first entry in
    ``ingredients`` (e.g. "squash" from "banana squash"). A recipe counts
    towards a term when any of its ingredients contains that term as a
    case-insensitive substring.

    Args:
        ingredients: List of ingredient names; the first one supplies the
            family word.
        data: DataFrame with an 'ingredients' column holding one list of
            ingredient strings per recipe. The counts are taken from this
            frame (not from any global DataFrame).

    Returns:
        dict mapping each name in ``ingredients`` to the percentage, rounded
        to two decimals, of family recipes that contain it. The value is 0
        when no recipe contains the family word. An empty ``ingredients``
        list yields an empty dict.
    """
    if not ingredients:
        return {}

    # Lowercase every recipe once instead of once per lookup
    lowered_recipes = [
        [item.lower() for item in recipe] for recipe in data['ingredients']
    ]

    def count_recipes(term):
        """Number of recipes with at least one ingredient containing ``term``."""
        needle = term.lower()
        return sum(
            any(needle in item for item in recipe) for recipe in lowered_recipes
        )

    # Extract the last word of the first ingredient (e.g., "squash" from "banana squash")
    last_word = ingredients[0].split()[-1]
    total_last_word_count = count_recipes(last_word)

    ingredient_frequencies = {}
    for ingredient in ingredients:
        if total_last_word_count > 0:
            percentage_frequency = (count_recipes(ingredient) / total_last_word_count) * 100
        else:
            percentage_frequency = 0
        ingredient_frequencies[ingredient] = round(percentage_frequency, 2)

    return ingredient_frequencies

In [128]:
# Collect every unique (lowercased) ingredient whose name ends with the chosen suffix
ingredients_with_ = [ingredient for ingredient in sorted_unique_ingredients if ingredient.endswith("mushrooms")]

# Prints the number of matches (the message itself does not name the suffix)
print("Unique ingredients that end with ", len(ingredients_with_))

# Generalized name -> keyword list; left empty here, so no ingredient is merged
mapping = {

}

# NOTE(review): clean_ingredients_with_mapping is defined in a cell further down
# the notebook, so this cell cannot run top-to-bottom on a fresh kernel. The
# recorded run stopped with "TypeError: 'list' object is not callable" —
# presumably the builtin `list` had been rebound in the kernel; confirm and restart.
cleaned_ = clean_ingredients_with_mapping(ingredients_with_, mapping)

print("Cleaned ingredients:", len(cleaned_))
print(cleaned_with_frequency(cleaned_, data))

Unique ingredients that end with  32


TypeError: 'list' object is not callable

In [129]:
# List every unique (lowercased) ingredient whose name ends with "mushrooms"
ingredients_with_ = [ingredient for ingredient in sorted_unique_ingredients if ingredient.endswith("mushrooms")]
print(ingredients_with_)

['baby portobello mushrooms', 'black mushrooms', 'black trumpet mushrooms', 'brown beech mushrooms', 'button mushrooms', 'chestnut mushrooms', 'chinese black mushrooms', 'cremini mushrooms', 'crimini mushrooms', 'diced mushrooms', 'dried black mushrooms', 'dried mushrooms', 'dried porcini mushrooms', 'dried shiitake mushrooms', 'dried wood ear mushrooms', 'fresh mushrooms', 'fresh shiitake mushrooms', 'green giant™ sliced mushrooms', 'maitake mushrooms', 'matsutake mushrooms', 'mixed mushrooms', 'mushrooms', 'oyster mushrooms', 'shimeji mushrooms', 'sliced mushrooms', 'straw mushrooms', 'tree ear mushrooms', 'white button mushrooms', 'white mushrooms', 'wild mushrooms', 'wood ear mushrooms', 'wood mushrooms']


### Function: Ingredient Cleaner

The `clean_ingredients_with_mapping` function standardizes a list of ingredient names using a `mapping` dictionary whose keys are generalized names and whose values are lists of keywords. Each ingredient is lowercased for matching (so keywords should be written in lowercase) and checked against the keywords of every entry in turn. As soon as a keyword is found within the ingredient name, the ingredient is replaced by that entry's generalized name (the dictionary key); if no keyword matches, the original ingredient name is preserved. This ensures that variations of ingredient names are grouped under a standardized name, reducing redundancy. The function returns the processed names with duplicates removed, in order of first appearance.

In [8]:
def clean_ingredients_with_mapping(ingredients, mapping):
    """Collapse ingredient names onto generalized names and drop duplicates.

    Args:
        ingredients: Iterable of ingredient name strings.
        mapping: Dict of generalized name -> iterable of keywords. Keywords
            are compared against the lowercased ingredient as substrings.

    Returns:
        List of cleaned names, without duplicates, in order of first
        appearance. An ingredient that matches no keyword is kept as given.
    """
    def _generalize(name):
        # First mapping entry with a matching keyword wins
        lowered = name.lower()
        for general_name, keywords in mapping.items():
            for keyword in keywords:
                if keyword in lowered:
                    return general_name
        return name

    # A dict keeps insertion order, which gives order-preserving de-duplication
    seen = {}
    for item in ingredients:
        seen.setdefault(_generalize(item), None)
    return list(seen)

#### Function: Merge or Leave Distinguisher
The function analyzes the usage of a specific ingredient in relation to a generalized ingredient within a recipe dataset to decide whether the specific ingredient should be merged with the generalized one or left as distinct. It calculates the total number of recipes that include the generalized ingredient and the specific ingredient, computes the percentage of recipes containing the specific ingredient relative to the generalized ingredient, and makes a decision based on a user-defined threshold. The function returns a dictionary with the total recipe counts for both ingredients, the computed percentage, and a decision ("merge" or "leave") depending on whether the specific ingredient’s percentage is below the threshold. This is particularly useful for cleaning and simplifying ingredient data while preserving significant variations in recipe datasets.

In [9]:
def analyze_ingredient_usage(data, specific_ingredient, generalized_ingredient, threshold=5):
    """Compare how often a specific ingredient is used relative to a generalized one.

    A recipe counts towards a term when any of its ingredients contains the
    term as a case-insensitive substring. The input frame is not modified.

    Args:
        data: DataFrame with an 'ingredients' column. Each cell is a list of
            ingredient strings, or the string representation of such a list
            (parsed with ``ast.literal_eval``).
        specific_ingredient: The narrower ingredient name, e.g. "cashew nuts".
        generalized_ingredient: The broader name it might be merged into.
        threshold: Percentage below which the decision is "merge".

    Returns:
        dict with keys "total_recipes_with_generalized",
        "total_recipes_with_specific", "percentage" (specific count as a
        percentage of the generalized count; 0 when the generalized count is
        0) and "decision" ("merge" if percentage < threshold, else "leave").
    """
    # Local import: the parser is needed only for string-encoded lists
    import ast

    # Normalize into a separate Series so the caller's column stays untouched
    ingredient_lists = data['ingredients'].apply(
        lambda cell: ast.literal_eval(cell) if isinstance(cell, str) else cell
    )

    def count_recipes(term):
        """Number of recipes with at least one ingredient containing ``term``."""
        needle = term.lower()
        matches = ingredient_lists.apply(
            lambda items: any(needle in item.lower() for item in items)
        )
        return int(matches.sum())

    total_with_generalized = count_recipes(generalized_ingredient)
    total_with_specific = count_recipes(specific_ingredient)

    # Guard against division by zero when the generalized term never occurs
    if total_with_generalized > 0:
        percentage = (total_with_specific / total_with_generalized) * 100
    else:
        percentage = 0

    # Rare variants get merged; common ones stay distinct
    decision = "merge" if percentage < threshold else "leave"

    return {
        "total_recipes_with_generalized": total_with_generalized,
        "total_recipes_with_specific": total_with_specific,
        "percentage": percentage,
        "decision": decision
    }

# Example usage
data = pd.read_json("train.json")  # Load the dataset


specific_ingredient = "cashew nuts"
generalized_ingredient = "water"
threshold = 5  # Percentage threshold

result = analyze_ingredient_usage(data, specific_ingredient, generalized_ingredient, threshold)

# Print results
print(f"Total recipes that use '{generalized_ingredient}': {result['total_recipes_with_generalized']}")
print(f"Total recipes that use '{specific_ingredient}': {result['total_recipes_with_specific']}")
print(f"Percentage of '{specific_ingredient}': {result['percentage']:.2f}%")
print(f"Decision: {result['decision']}")

Total recipes that use 'water': 9499
Total recipes that use 'cashew nuts': 218
Percentage of 'cashew nuts': 2.29%
Decision: merge


#### Mapping Applier Function

In [132]:
import pandas as pd
import json

def mapping_applier(data, mappings, output_json="train_cleaned.json", log_file="mapping_changes.txt"):
    """
    Applies ingredient mappings to clean the dataset and logs changes.

    An ingredient is looked up by its last word (lowercased). Within the
    matching group, the first canonical name that has a keyword occurring in
    the ingredient as a whole word, or as a run of consecutive whole words
    for multi-word keywords such as "diced mushrooms", replaces the ingredient.

    Args:
        data (pd.DataFrame): The dataset containing an 'ingredients' column with lists of ingredients.
            The column is overwritten in place with the mapped lists.
        mappings (dict): Maps a last word to a dict of
            {canonical ingredient: [keywords]}.
        output_json (str): Filename for the cleaned dataset.
        log_file (str): Filename for logging changes (written only if any
            ingredient was changed).

    Returns:
        None
    """
    changes = []  # One '"old" > "new"' line per replaced ingredient

    def contains_phrase(words, phrase_words):
        """True if phrase_words occurs as consecutive items of words."""
        span = len(phrase_words)
        return any(
            words[start:start + span] == phrase_words
            for start in range(len(words) - span + 1)
        )

    def map_ingredient(ingredient):
        """Maps an individual ingredient based on the last-word mappings."""
        if not isinstance(ingredient, str):  # Leave non-string entries untouched
            return ingredient

        # "dried shiitake mushrooms" -> ["dried", "shiitake", "mushrooms"]
        words = ingredient.lower().split()
        if not words:  # Empty or whitespace-only name: nothing to look up
            return ingredient

        word_mapping = mappings.get(words[-1])
        if not word_mapping:
            return ingredient

        # e.g. "mushrooms": ["diced mushrooms", "fresh mushrooms", "sliced mushrooms"],
        #      "shiitake mushrooms": ["shiitake"]
        for key, keywords in word_mapping.items():
            for keyword in keywords:
                keyword_words = keyword.lower().split()
                # Compare word sequences so multi-word keywords can match
                # while partial-word hits are still rejected
                if keyword_words and contains_phrase(words, keyword_words):
                    if ingredient != key:  # Only log changes
                        changes.append(f'"{ingredient}" > "{key}"')
                    return key  # Apply mapping

        return ingredient  # Return unchanged if no mapping applies

    # Map every ingredient of every recipe; non-list cells pass through unchanged
    data["ingredients"] = data["ingredients"].apply(
        lambda ingredient_list: [map_ingredient(ingredient) for ingredient in ingredient_list]
        if isinstance(ingredient_list, list) else ingredient_list
    )

    # Save the cleaned dataset
    data.to_json(output_json, orient="records", indent=4)

    # Save mapping log
    if changes:  # Only write if there are changes
        # Explicit UTF-8: ingredient names include characters such as "®" and "™"
        with open(log_file, "w", encoding="utf-8") as log:
            log.write("\n".join(changes))
        print(f"Mapping applied. Cleaned dataset saved to {output_json}. Changes logged in {log_file}.")
    else:
        print("No mappings applied. Check your mapping dictionary or input data.")

In [133]:
# Example mapping dictionary: last word -> {canonical ingredient: [keywords]}.
# Only the 'oil' and 'mushrooms' groups are shown here; the remaining groups still need to be added.
mappings = {
    "oil": {
        "olive oil" : ["olive"],
        "truffle oil" : ["truffle"],
        "coconut oil" : ["coconut", "palm"],
        "canola oil" : ["canola"],
        "vegetable oil" : ["vegetable"],
        "corn oil" : ["corn"],
        "almond oil" : ["almond"],
        "sesame oil" : ["sesame"],
        "tuna in oil" : ["tuna"]
    },
    "mushrooms": {
        "mushrooms" : ["diced mushrooms", "fresh mushrooms", "sliced mushrooms"],
        "shiitake mushrooms" : ["shiitake"]
    }
    
}

# Apply the mapping
# NOTE(review): the recorded run failed with "TypeError: isinstance() arg 2 must be a
# type, a tuple of types, or a union" — presumably the builtin `list` had been rebound
# in the kernel; confirm by restarting the kernel and re-running.
mapping_applier(data, mappings)

TypeError: isinstance() arg 2 must be a type, a tuple of types, or a union

In [70]:
# Debug print. NOTE(review): `ingredient` is not assigned at cell level anywhere
# visible above, so this appears to rely on leftover kernel state — confirm or remove.
print(ingredient)

nonstick spray


list

In [138]:
# Last word -> {canonical ingredient: [keywords]}; only the 'oil' group is defined here
mappings = {
    "oil": {
    "olive oil" : ["olive"],
    "truffle oil" : ["truffle"],
    "coconut oil" : ["coconut", "palm"],
    "canola oil" : ["canola"],
    "vegetable oil" : ["vegetable"],
    "corn oil" : ["corn"],
    "almond oil" : ["almond"],
    "sesame oil" : ["sesame"],
    "tuna in oil" : ["tuna"]
    }
}

# Reload the dataset so the mapping below starts from the file's contents
data = pd.read_json("train.json")

# Collects one '"old" > "new"' line per replacement made by map_ingredient
changes = []
def map_ingredient(ingredient):
    """Map one ingredient onto its canonical name using the global ``mappings``.

    The ingredient's last word (lowercased) selects a group in ``mappings``.
    Within that group, the first canonical name with a keyword that equals
    one of the ingredient's words is returned. Replacements that change the
    text are appended to the global ``changes`` list.

    Args:
        ingredient: Ingredient name; non-string values are returned as is.

    Returns:
        The canonical name when a keyword matches, otherwise ``ingredient``.
    """
    if not isinstance(ingredient, str):
        return ingredient

    words = ingredient.lower().split()
    last_word = words[-1] if words else ""

    for key, keywords in mappings.get(last_word, {}).items():
        if any(keyword in words for keyword in keywords):  # Whole-word match only
            if ingredient != key:  # Only log changes
                changes.append(f'"{ingredient}" > "{key}"')
            # Return on the first match even when the name is already
            # canonical, so a later entry cannot remap it
            return key
    return ingredient

# Map every ingredient of every recipe; cells that are not lists pass through unchanged.
# NOTE(review): the recorded run of this cell ended with "TypeError: isinstance() arg 2
# must be a type, a tuple of types, or a union" — presumably the builtin `list` had been
# rebound in the kernel; confirm by restarting the kernel.
data["ingredients"] = data["ingredients"].apply(
    lambda ingredient_list: [map_ingredient(ingredient) for ingredient in ingredient_list]
    if isinstance(ingredient_list, list) else ingredient_list  # Ensure it's a list
)

# The triple-quoted string below is a bare expression statement with no effect;
# it holds scratch notes on unrolling the lambda above.
'''lambda ingredient_list: [map_ingredient(ingredient) for ingredient in ingredient_list]
    if isinstance(ingredient_list, list) else ingredient_list  # Ensure it's a list

if isinstance(ingredient_list, list):
    for ingredient in ingredient_list:
        map_ingredient(ingredient)
    else:
        ingredient_list'''

# NOTE(review): `ingredient` is not assigned at cell level in this cell — this call
# appears to rely on leftover kernel state; confirm or remove.
map_ingredient(ingredient)

TypeError: isinstance() arg 2 must be a type, a tuple of types, or a union

In [151]:
# Last word -> {canonical ingredient: [keywords]}; only the 'oil' group is defined here
mappings = {
    "oil": {
    "olive oil" : ["olive"],
    "truffle oil" : ["truffle"],
    "coconut oil" : ["coconut", "palm"],
    "canola oil" : ["canola"],
    "vegetable oil" : ["vegetable"],
    "corn oil" : ["corn"],
    "almond oil" : ["almond"],
    "sesame oil" : ["sesame"],
    "tuna in oil" : ["tuna"]
    }
}

# Reload the dataset so the mapping below starts from the file's contents
data = pd.read_json("train.json")

# Collects one '"old" > "new"' line per replacement made by map_ingredient
changes = []
def map_ingredient(ingredient):
    """Map one ingredient onto its canonical name using the global ``mappings``.

    The ingredient's last word (lowercased) selects a group in ``mappings``.
    Within that group, the first canonical name with a keyword that equals
    one of the ingredient's words is returned. Replacements that change the
    text are appended to the global ``changes`` list.

    Args:
        ingredient: Ingredient name; non-string values are returned as is.

    Returns:
        The canonical name when a keyword matches, otherwise ``ingredient``.
    """
    if not isinstance(ingredient, str):
        return ingredient

    words = ingredient.lower().split()
    last_word = words[-1] if words else ""

    for key, keywords in mappings.get(last_word, {}).items():
        if any(keyword in words for keyword in keywords):  # Whole-word match only
            if ingredient != key:  # Only log changes
                changes.append(f'"{ingredient}" > "{key}"')
            # Return on the first match even when the name is already
            # canonical, so a later entry cannot remap it
            return key
    return ingredient
'''
# Apply mapping correctly by iterating through ingredient lists
data["ingredients"] = data["ingredients"].apply(
    lambda ingredient_list: [map_ingredient(ingredient) for ingredient in ingredient_list]
    if isinstance(ingredient_list, list) else ingredient_list  # Ensure it's a list
)


lambda ingredient_list: [map_ingredient(ingredient) for ingredient in ingredient_list]
    if isinstance(ingredient_list, list) else ingredient_list  # Ensure it's a list
'''
def replace_lambda(ingredient_list):
    """Return a recipe's ingredient list with every entry passed through map_ingredient.

    Intended for ``Series.apply`` on the 'ingredients' column, which stores
    whatever this function returns for each row.

    Args:
        ingredient_list: One cell of the 'ingredients' column.

    Returns:
        A new list of mapped ingredients when the cell is a list; otherwise
        the cell itself, unchanged.
    """
    if isinstance(ingredient_list, list):
        return [map_ingredient(ingredient) for ingredient in ingredient_list]
    return ingredient_list

# Replace each row of the ingredients column with whatever replace_lambda returns for it
data["ingredients"] = data["ingredients"].apply(replace_lambda)

# NOTE(review): `ingredient` is not assigned at cell level in this cell — this call
# appears to rely on leftover kernel state; confirm or remove.
map_ingredient(ingredient)

'almond oil'

In [98]:
# Canonical oil name -> keywords that identify it
word_mapping = {
    "olive oil" : ["olive"],
    "truffle oil" : ["truffle"],
    "coconut oil" : ["coconut", "palm"],
    "canola oil" : ["canola"],
    "vegetable oil" : ["vegetable"],
    "corn oil" : ["corn"],
    "almond oil" : ["almond"],
    "sesame oil" : ["sesame"],
    "tuna in oil" : ["tuna"]
}

# Show each canonical name next to its keyword list
# (a bare `print` without a call prints nothing)
for key, value in word_mapping.items():
    print(key, value)

olive oil ['olive']
truffle oil ['truffle']
coconut oil ['coconut', 'palm']
canola oil ['canola']
vegetable oil ['vegetable']
corn oil ['corn']
almond oil ['almond']
sesame oil ['sesame']
tuna in oil ['tuna']


In [150]:
# Display the ingredients column to inspect its current contents
data["ingredients"]

0        [romaine lettuce, black olives, grape tomatoes...
1        [plain flour, ground pepper, salt, tomatoes, g...
2        [eggs, pepper, salt, mayonaise, cooking oil, g...
3                      [water, vegetable oil, wheat, salt]
4        [black pepper, shallots, cornflour, cayenne pe...
                               ...                        
39769    [light brown sugar, granulated sugar, butter, ...
39770    [KRAFT Zesty Italian Dressing, purple onion, b...
39771    [eggs, citrus fruit, raisins, sourdough starte...
39772    [boneless chicken skinless thigh, minced garli...
39773    [green chile, jalapeno chilies, onions, ground...
Name: ingredients, Length: 39774, dtype: object