## Run this notebook on Google Colab if you do not have a suitable GPU (running locally takes forever)

In [None]:
## Uncomment and run the lines below when running on Colab
# !pip install setfit
# !git clone https://github.com/krumeto/oss_nlp_tools_demos.git
# from oss_nlp_tools_demos.data import preprocess_data

In [5]:
import numpy as np
import pandas as pd
from pprint import pprint

from datasets import Dataset
from setfit import SetFitModel

from sentence_transformers.losses import CosineSimilarityLoss
from setfit import SetFitTrainer

import torch

# Report up front whether PyTorch can see a CUDA device.
cuda_status = "CUDA is available!" if torch.cuda.is_available() else "CUDA is not available."
print(cuda_status)

# Local runs import the recipe loader from the repository's `data` package.
# A missing module is tolerated on purpose: on Colab the loader is reached
# through `preprocess_data` from the (commented-out) setup cell instead.
try:
    from data.preprocess_data import combine_json_to_dataframe
except ModuleNotFoundError:
    pass

CUDA is not available.


In [2]:
# Annotated recipes are fetched straight from the demo repository.
ANNOTATED_RECIPES_URL = "https://raw.githubusercontent.com/krumeto/oss_nlp_tools_demos/main/data/recipe_classes.parquet"
annotated_df = pd.read_parquet(ANNOTATED_RECIPES_URL)

# Wrap the DataFrame in a `datasets.Dataset`; the bare name on the last line
# displays its summary as the cell output.
train_dataset = Dataset.from_pandas(annotated_df)
train_dataset

Dataset({
    features: ['recipe', 'label'],
    num_rows: 99
})

In [3]:
# Sentence-transformer checkpoint used as the SetFit model body.
model_id = "sentence-transformers/all-MiniLM-L12-v2"
model = SetFitModel.from_pretrained(model_id)

# Set the sequence length limit on the first module of the model body
# (presumably so that long recipes are not truncated early; TODO confirm).
first_body_module = model.model_body[0]
first_body_module.max_seq_length = 512

model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.


512


In [4]:
# Training hyperparameters handed to the trainer.
training_options = {
    "loss_class": CosineSimilarityLoss,
    "num_iterations": 20,
    "batch_size": 5,  # Reduce the batch size due to memory issues
}

# NOTE(review): eval_dataset is the same object as train_dataset, so any
# evaluation run through this trainer scores data the model was trained on.
trainer = SetFitTrainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=train_dataset,
    # The dataset's "recipe" column supplies the text; "label" stays "label".
    column_mapping={"recipe": "text", "label": "label"},
    **training_options,
)

In [5]:
# Run training with the trainer configured in the previous cell.
trainer.train()

Applying column mapping to training dataset
***** Running training *****
  Num examples = 3960
  Num epochs = 1
  Total optimization steps = 248
  Total train batch size = 16


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/248 [00:00<?, ?it/s]

: 

: 

In [None]:
# A long, multi-step recipe (osso buco) used to spot-check the trained model.
complicated_recipe = """Ingredients:
4 ounces pancetta, diced into 1/4 inch cubes
2 1/2 to 3 pounds veal shanks (4 to 6 pieces 2 to 3 inches thick)
1/2 cup diced onion
1/2 cup diced celery
1/2 cup diced carrot
3 garlic cloves , minced
1 1/2 cups canned chopped tomatoes
1 1/2 cups chicken broth
1/2 cup dry white wine
1 bay leaf
1 sprig fresh thyme
salt
freshly ground black pepper
all-purpose flour for dredging
2 tablespoons unsalted butter
2 tablespoons extra-virgin olive oil
4 3-inch strips of lemon zest

Directions:

Preheat oven to 375°F.
Heat the olive oil over medium heat in a large Dutch oven.
Cook pancetta until browned and crisp.
Remove pancetta with a slotted spoon and transfer to a paper towel-lined plate.
Season veal shanks with salt and pepper and dredge in flour.
Cook the veal until browned on all sides, working in batches if necessary, then transfer to a plate.
Add the onion, celery, carrot, garlic, and a pinch of salt to the Dutch oven and cook until softened.
Stir in the tomatoes, chicken broth, dry white wine, bay leaf, and thyme sprig.
Return the veal shanks and pancetta to the Dutch oven and bring the liquid to a simmer.
Cover the pot and place it in the oven to braise for 2-2 1/2 hours, until the veal is very tender.
Serve with gremolata and garnish with lemon zest strips.
Note: To make gremolata, finely chop 2 tablespoons fresh parsley, 1 tablespoon grated lemon zest, and 1 garlic clove. Mix together and sprinkle over the osso buco before serving."""

# predict() takes a list of texts, so the single recipe is wrapped in a list;
# the bare call on the last line shows the prediction as the cell output.
trainer.model.predict([complicated_recipe])

In [None]:
# A short recipe with three ingredients and two steps, used as the
# counterpart spot-check to the long recipe.
simple_recipe = """Ingredients:
2 large eggs
Salt and pepper to taste
1 tablespoon unsalted butter
Instructions:
Crack the eggs into a bowl and whisk them with a fork until the whites and yolks are well combined.
Season with salt and pepper to taste."""

# predict() takes a list of texts, so the single recipe is wrapped in a list;
# the bare call on the last line shows the prediction as the cell output.
trainer.model.predict([simple_recipe])

In [None]:
# Persist the whole trainer object to a file in the current working directory.
# NOTE(review): this serialises the full SetFitTrainer rather than only
# trainer.model; presumably it is pickled, so loading it back would need
# compatible library versions. Confirm whether saving just the model suffices.
torch.save(trainer, 'setfit-recipe-cls.pt')

## Download the recipes, preprocess and classify with the trained SetFit model

In [None]:
## Uncomment and run the line below when running on Colab
# !wget https://eightportions.com/recipes_raw.zip

In [None]:
# The recipe loader is reachable under a different name depending on where the
# notebook runs, so pick the call by checking which name is actually defined.
# An explicit check is used instead of catching NameError: a NameError raised
# *inside* the loader would otherwise be mistaken for "not on Colab" and
# silently rerouted to the local path, hiding the real error.
if "preprocess_data" in globals():
    # Colab: `preprocess_data` comes from the cloned repository (setup cell at
    # the top) and the archive was downloaded into the working directory.
    recipe_data = preprocess_data.combine_json_to_dataframe("recipes_raw.zip")
else:
    # Local: the loader was imported directly and the archive lives in ../data.
    recipe_data = combine_json_to_dataframe("../data/recipes_raw.zip")

### Classify the recipes

In [None]:
# Materialise the `full_text` field of every recipe as a plain Python list.
docs = list(recipe_data.full_text)

# Score all recipes with the trained model: once for the predicted class and
# once for the per-class probabilities.
setfit_model = trainer.model
class_predictions = setfit_model.predict(docs)
class_probas = setfit_model.predict_proba(docs)

### Quick test of the probabilities

In [None]:
def get_max_index(tensor, col):
    """Return the row index that holds the largest value in column ``col``.

    Args:
        tensor: Tensor indexed as ``tensor[row, col]`` (expected to be 2-D).
        col: Index of the column to scan.

    Returns:
        The row position of the column's maximum, as a plain Python ``int``.
    """
    column_scores = tensor[:, col]
    # Only the position of the maximum is needed, not the maximum itself.
    return column_scores.argmax(dim=0).item()

# Row with the highest probability in column 3, the column that is labelled
# 'pred_hard' when the scores are saved.
hardest_recipe = get_max_index(class_probas, col=3)
hardest_text = docs[hardest_recipe]
hardest_scores = class_probas[hardest_recipe]

# Show the recipe text followed by its full probability row.
pprint(hardest_text)
print(hardest_scores)

In [None]:
# Row with the highest probability in column 0, the column that is labelled
# 'pred_very_easy' when the scores are saved.
easiest_recipe = get_max_index(class_probas, col=0)
easiest_text = docs[easiest_recipe]
easiest_scores = class_probas[easiest_recipe]

# Show the recipe text followed by its full probability row.
pprint(easiest_text)
print(easiest_scores)

### Save the scores

In [None]:
# Put the predicted class (reshaped to a column vector) next to the per-class
# probabilities: one row per recipe, prediction first.
scores_tensor = torch.cat((class_predictions.unsqueeze(1), class_probas), dim=1)

# Give pandas a CPU NumPy array rather than a raw tensor. Converting explicitly
# keeps this working when the tensors live on a GPU and guarantees plain
# numeric columns regardless of how pandas treats tensor inputs.
scores_pd = pd.DataFrame(
    scores_tensor.detach().cpu().numpy(),
    columns=['pred_class', 'pred_very_easy', 'pred_easy', 'pred_medium', 'pred_hard'],
)

In [None]:
# Write the score table to a Parquet file in the current working directory.
scores_pd.to_parquet("setfit_scores.parquet")