# Robert Thorstad - Recipe2Cuisine Data Challenge

Hi! The following code accompanies my Data Challenge submission for Recipe2Cuisine.

There are 4 sections below:
* Clean and preprocess data
* Exploratory data analysis
* Train model
* Produce figures

The code assumes a basic project directory structure, with the recipes in data/recipes.json and this Python code in scripts/. It also assumes that the directories models/ and results/ exist. Thus, while the code will not run in a Jupyter environment without these directories, it should be self-explanatory!

# Clean and Preprocess Data

*   **Remove** poorly formatted recipes, defined as recipes with <3 ingredients or no label (N = 225)

*   **Represent** recipes as lowercase bag-of-unigrams. Note: I tried stemming recipes (e.g. "onions" -> "onion") but this performs poorly on foreign-language words, which are frequent in the recipes. 



In [0]:
import os
import json
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
import pickle


def remove_bad_recipies():
    """
    Remove 'bad' recipes from the dataset and write the rest as JSON Lines.

    A recipe is kept only when it has a "cuisine" key and at least 3
    ingredients. Reads ../data/recipies.json (a JSON array of recipe
    objects) and writes one JSON object per line to
    ../data/2_recipes_bad_exs_removed.jsonl. Prints the number of recipes
    that were removed; returns nothing.
    """

    # Load dataset.
    # NOTE(review): the file name is spelled "recipies.json" here, while the
    # project notes refer to data/recipes.json -- confirm which name is on disk.
    data_p = os.path.join("..", "data", "recipies.json")
    with open(data_p, "r", encoding="utf-8") as in_f:
        recipes = json.load(in_f)

    # Create output file; the context manager closes it even if a row fails.
    of_p = os.path.join("..", "data", "2_recipes_bad_exs_removed.jsonl")
    n_bad = 0
    with open(of_p, "w", newline="", encoding="utf-8") as of:
        for recipe_j in recipes:

            valid_label = "cuisine" in recipe_j
            # A missing (or null) ingredient list counts as zero ingredients,
            # so such a recipe is dropped instead of raising KeyError.
            ingredients = recipe_j.get("ingredients") or []
            enough_ingredients = len(ingredients) >= 3

            if valid_label and enough_ingredients:
                of.write(json.dumps(recipe_j) + "\n")
            else:
                n_bad += 1

    print("done, removed {} recipes".format(n_bad))

def preprocess_recipes():
    """
    Represent each recipe as a lowercase bag of unigrams and save the
    resulting feature matrix, integer targets and fitted encoders.

    Reads ../data/2_recipes_bad_exs_removed.jsonl (one JSON recipe per line,
    each with "cuisine" and "ingredients"). No stemming is applied: the
    ingredient strings are only joined, lowercased and whitespace-normalised
    before being passed to CountVectorizer.

    Writes ../data/X.pkl, ../data/y.pkl, ../models/vectorizer.pkl and
    ../models/label_encoder.pkl. Returns nothing.
    """

    # Load recipes; blank lines are skipped so a trailing newline is harmless.
    print("load recipes")
    in_p = os.path.join("..", "data", "2_recipes_bad_exs_removed.jsonl")
    with open(in_p, "r", encoding="utf-8") as in_f:
        js = [json.loads(row) for row in in_f if row.strip()]

    # Represent recipes using two parallel lists: unigram strings and targets.
    print("represent recipes -> lowercase unigrams")
    unigrams = []  # list of strings, each idx is a recipe.
    cuisines = []  # each idx is a recipe, value is cuisine as string.

    for idx, j in enumerate(js):

        # progress counter.
        if idx % 5000 == 0:
            print(idx)

        # Join all ingredients into one string, lowercase it, and collapse
        # any run of whitespace into a single space.
        lower = " ".join(j["ingredients"]).lower()
        unigrams.append(" ".join(lower.split()))
        cuisines.append(j["cuisine"])

    # Create X feature matrix.
    print("vectorize ingredients")
    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(unigrams)
    print(X.shape)

    # Integer encode cuisines.
    target_encoder = LabelEncoder()
    y = target_encoder.fit_transform(cuisines)
    print(y[:10])

    # Save data and fitted encoders, closing each file as soon as it is written.
    outputs = [
        (X, "../data/X.pkl"),
        (y, "../data/y.pkl"),
        (vectorizer, "../models/vectorizer.pkl"),
        (target_encoder, "../models/label_encoder.pkl"),
    ]
    for obj, out_p in outputs:
        with open(out_p, "wb") as out_f:
            pickle.dump(obj=obj, file=out_f)
    print("done, saved data/X.pkl, data/y.pkl, models/vectorizer.pkl, models/label_encoder.pkl")

# Script entry point for the preprocessing step.
if __name__ == "__main__":
    # The filtering step is disabled here, so preprocess_recipes() relies on
    # ../data/2_recipes_bad_exs_removed.jsonl already being on disk.
    #remove_bad_recipies()
    preprocess_recipes()

# Exploratory Data Analysis



*   Plot **distribution of data** by class. (Note: moderately imbalanced) 

*   Check dataset **shapes** (~39.5k rows of data, 20 classes)

*   **Visual inspection** revealed data is relatively clean with few formatting errors, typos, etc.



In [0]:
import pickle

def do_eda():
    """
    Do simple EDA on the pickled dataset.

    Loads ../data/X.pkl and ../data/y.pkl, then reports:
      * the size of the dataset (shape of X),
      * the class distribution (histogram with one bar per class),
      * the total number of classes.

    Returns nothing; output is printed and shown in a matplotlib window.
    """
    import numpy as np
    import matplotlib.pyplot as plt

    # Load X, y data.
    print("load data")
    with open("../data/X.pkl", "rb") as x_f:
        X = pickle.load(x_f)
    with open("../data/y.pkl", "rb") as y_f:
        y = pickle.load(y_f)

    # Size of dataset.
    print("X shape : {}".format(X.shape))

    # Total number of classes, derived from the data rather than hard-coded.
    n_classes = len(np.unique(y))

    # Class distribution.
    # The default of 10 bins would merge neighbouring integer labels into one
    # bar, so use one unit-wide bin centred on each label.
    # Assumes labels are the contiguous integers 0..n_classes-1.
    bin_edges = np.arange(n_classes + 1) - 0.5
    plt.hist(y, bins=bin_edges)
    plt.xticks(range(n_classes))
    plt.title("class distribution")
    plt.show()

    print("# of classes {}".format(n_classes))

# Script entry point for the exploratory data analysis step.
if __name__ == "__main__":
    do_eda()

# Train Model

* Train a **ridge regression** model to classify the cuisine of a recipe, based on its ingredients. In evaluating this model I also tried a random forest. 

* Model the **"essence" of a cuisine** using feature importances (top-5 regression weights / class).

* Save **model statistics** including F1 by class, confusion matrix. 

In [0]:
import pickle
from sklearn.model_selection import train_test_split
from sklearn.linear_model import RidgeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

def train_model():
    """
    Train and evaluate the cuisine classifier(s) on the pickled dataset.

    Loads ../data/X.pkl, ../data/y.pkl and ../models/label_encoder.pkl,
    makes an 80/20 train/test split (random_state=42), then for each model:
      * fits it and prints a classification report on the test split,
      * saves the Ridge model to ../models/ridge.pkl,
      * writes a confusion-matrix heatmap to
        ../results/<label>_confusion_matrix.png.

    Returns nothing.
    """

    # Load data and relevant objects for ml.
    with open("../data/X.pkl", "rb") as x_f:
        X = pickle.load(x_f)
    with open("../data/y.pkl", "rb") as y_f:
        y = pickle.load(y_f)
    with open("../models/label_encoder.pkl", "rb") as enc_f:
        label_encoder = pickle.load(enc_f)

    # List idx -> label mapping. The class count comes from the fitted
    # encoder instead of a hard-coded 20.
    n_classes = len(label_encoder.classes_)
    ylabels = []
    for idx in range(n_classes):
        lbl = label_encoder.inverse_transform([idx])
        print(idx, lbl)
        ylabels.append(lbl[0])

    # Train/test split.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42)

    # Train and evaluate models.
    models = [RidgeClassifier()]
    labels = ["Ridge"]

    for clf, lbl in zip(models, labels):

        # train
        print("train " + lbl)
        clf.fit(X_train, y_train)

        # score. Predict once and reuse for the report and the matrix.
        y_pred = clf.predict(X_test)
        report = classification_report(y_test, y_pred)
        print(report)

        # optionally, save.
        if lbl == "Ridge":
            model_of_p = "../models/ridge.pkl"
            with open(model_of_p, "wb") as model_f:
                pickle.dump(clf, model_f)
            print("wrote {}".format(model_of_p))

        # confusion matrix. Passing the full label list keeps the matrix
        # aligned with ylabels even if a class is absent from the test split.
        cm = confusion_matrix(y_test, y_pred, labels=list(range(n_classes)))

        # Draw on a fresh figure so heatmaps from several models (or an
        # earlier plot) are never layered on top of each other.
        plt.figure()
        sns.heatmap(cm, xticklabels=ylabels, yticklabels=ylabels)
        # NOTE(review): this runs after the heatmap is drawn, so it presumably
        # only affects figures created later -- confirm whether that is intended.
        sns.set(font_scale=2)
        hm_p = "../results/{}_confusion_matrix.png".format(lbl)
        print("write {}".format(hm_p))
        plt.savefig(hm_p)

def explain_cuisines():
    """
    List the most important features for each cuisine.

    Loads the saved Ridge model, label encoder and vectorizer from
    ../models/, then for every row of the model's coefficient matrix prints
    the cuisine name and the 5 vocabulary terms with the largest weights.
    Returns nothing.
    """

    # Load model, vectorizer, cuisine labels.
    with open("../models/ridge.pkl", "rb") as model_f:
        clf = pickle.load(model_f)
    with open("../models/label_encoder.pkl", "rb") as enc_f:
        label_encoder = pickle.load(enc_f)
    with open("../models/vectorizer.pkl", "rb") as vec_f:
        vectorizer = pickle.load(vec_f)

    # get_feature_names() no longer exists in recent scikit-learn releases;
    # prefer get_feature_names_out() and fall back for older installations.
    if hasattr(vectorizer, "get_feature_names_out"):
        idx_to_term = vectorizer.get_feature_names_out()
    else:
        idx_to_term = vectorizer.get_feature_names()

    # Get feature importances.
    coef_ = clf.coef_  # shape (n_classes, n_features).

    # For each cuisine, list 5 most important features.
    for cuisine_idx in range(coef_.shape[0]):

        # Get indices of largest weights (descending order).
        weights = coef_[cuisine_idx]
        largest_weight_indices = np.argsort(weights)[::-1][:5]

        # Get feature names for largest weights, and cuisine name.
        # str() keeps the printed list as plain strings even when the
        # vocabulary is returned as a numpy array.
        cuisine_name = label_encoder.inverse_transform([cuisine_idx])
        important_feature_strs = [str(idx_to_term[idx]) for idx in largest_weight_indices]
        print(cuisine_name, important_feature_strs)

# Script entry point for the modelling step: train and save the model first,
# then print the top-weighted features from the saved model.
if __name__ == "__main__":
    train_model()
    explain_cuisines()

# Create Figures

* Plot **model performance** for ridge regression, random forest. 

* Plot **performance by cuisine**

In [0]:
import seaborn as sns
import matplotlib.pyplot as plt

def plot_model_performance():
    """
    Draw a two-bar chart of overall F1 scores, save it and display it.

    The scores are hard-coded. The figure is written to
    ../results/model_performance.png and then shown. Returns nothing.
    """
    # Start from a clean slate with a square canvas.
    plt.close("all")
    plt.figure(figsize=(6, 6))
    sns.set(font_scale=2)
    sns.set_palette("deep")

    # One bar per model, each drawn separately so it takes the next palette
    # colour.
    # NOTE(review): presumably ridge regression and random forest, per the
    # section notes -- confirm which score belongs to which model.
    f1_scores = (.75, .76)
    for position, score in enumerate(f1_scores):
        plt.bar(position, score, width=.5)

    # Axis cosmetics: label the y axis only and hide the x ticks.
    plt.ylabel("F1")
    plt.xticks([])
    plt.yticks([0, .2, .4, .6, .8])

    # Tighten the layout, then write to disk and display.
    plt.tight_layout()
    fig_p = "../results/model_performance.png"
    plt.savefig(fig_p)
    plt.show()

def plot_f1_by_class():
    """
    Draw a bar chart of per-cuisine F1 scores, save it and display it.

    The scores are hard-coded. The cuisine names are kept next to each
    score for reference only; they are not drawn on the figure. The plot
    is written to ../results/performance_by_class.png and then shown.
    Returns nothing.
    """
    # (cuisine, F1) pairs, in plotting order.
    f1_by_cuisine = [
        ("Brazilian", .57),
        ("British", .44),
        ("Cajun/Creole", .72),
        ("Chinese", .78),
        ("Filipino", .49),
        ("French", .59),
        ("Greek", .70),
        ("Indian", .85),
        ("Irish", .42),
        ("Italian", .81),
        ("Jamaican", .71),
        ("Japanese", .75),
        ("Korean", .73),
        ("Mexican", .88),
        ("Moroccan", .71),
        ("Russian", .53),
        ("Southern US", .73),
        ("Spanish", .46),
        ("Thai", .74),
        ("Vietnamese", .49),
    ]

    # Start from a clean slate with a wide canvas.
    plt.close("all")
    plt.figure(figsize=(12, 6))
    sns.set(font_scale=2)
    sns.set_palette("deep")

    # One bar per cuisine, each drawn separately so it takes the next
    # palette colour.
    for position, (_cuisine, score) in enumerate(f1_by_cuisine):
        plt.bar(position, score, width=0.8)

    # Axis cosmetics: label both axes and hide the x ticks.
    plt.ylabel("F1")
    plt.xlabel("Cuisine")
    plt.xticks([])

    # Tighten the layout, then write to disk and display.
    plt.tight_layout()
    fig_p = "../results/performance_by_class.png"
    plt.savefig(fig_p)
    plt.show()

# Script entry point for the figure step: produce both result figures.
if __name__ == "__main__":
    plot_model_performance()
    plot_f1_by_class()