In [None]:
# Install necessary packages
!pip install -q sentence-transformers faiss-cpu

# Imports
import pandas as pd
import random
from sentence_transformers import SentenceTransformer, InputExample, losses
from torch.utils.data import DataLoader
from sklearn.utils import shuffle
import torch
from torch.nn.functional import cosine_similarity

# Read the recipe table that drives both training and retrieval.
# Later cells read its 'tags', 'rating', 'recipe_id' and 'name' columns.
training_csv = "training_data.csv"
df = pd.read_csv(training_csv)

# Create training pairs
def create_training_pairs(df):
    """Build (query, positive, negative) tag triples, one per row of ``df``.

    For every row, the query and the positive are both that row's ``tags``
    value. The negative is the ``tags`` value of a randomly chosen row whose
    ``rating`` is below 3.0. A row is never used as its own negative unless
    it is the only low-rated row available.

    Args:
        df: DataFrame with a ``tags`` column and a numeric ``rating`` column.

    Returns:
        Three equally long lists ``(queries, positives, negatives)``, in row
        order. All three are empty when ``df`` has no rows.

    Raises:
        ValueError: If ``df`` has rows but none with ``rating`` below 3.0,
            so no negative can be sampled.
    """
    tags = df['tags'].tolist()
    if not tags:
        return [], [], []

    # Build the negative pool once (it does not change between rows) and keep
    # row *positions* rather than index labels, so duplicate or non-default
    # index labels cannot turn a lookup into a multi-row result.
    low_rated = [
        pos for pos, is_low in enumerate((df['rating'] < 3.0).tolist()) if is_low
    ]
    if not low_rated:
        raise ValueError(
            "create_training_pairs needs at least one row with rating < 3.0 "
            "to sample negatives from"
        )

    queries, positives, negatives = [], [], []
    for pos, tag in enumerate(tags):
        neg_pos = random.choice(low_rated)
        # A row paired with itself would be labelled both similar and
        # dissimilar downstream; resample unless it is the only candidate.
        if len(low_rated) > 1:
            while neg_pos == pos:
                neg_pos = random.choice(low_rated)
        queries.append(tag)
        positives.append(tag)
        negatives.append(tags[neg_pos])
    return queries, positives, negatives

# Seed the RNGs before any sampling so the randomly drawn negatives are the
# same on every Restart & Run All. torch.manual_seed seeds torch's global RNG.
SEED = 42
random.seed(SEED)
torch.manual_seed(SEED)

queries, positives, negatives = create_training_pairs(df)

# Initialize Sentence Transformer model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Prepare training data: each (query, positive) pair is labelled 1.0 and each
# (query, negative) pair 0.0, the target similarities for CosineSimilarityLoss.
train_examples = [
    InputExample(texts=[q, p], label=1.0) for q, p in zip(queries, positives)
] + [
    InputExample(texts=[q, n], label=0.0) for q, n in zip(queries, negatives)
]

train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=16)
train_loss = losses.CosineSimilarityLoss(model)

# Train the model
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=3,
    warmup_steps=100,
    show_progress_bar=True
)

# Save the model in a folder
model.save("nlp-recipe-finder")

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.3/31.3 MB[0m [31m70.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m110.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m99.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m60.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m42.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]



<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mharsh-t-verma[0m ([33mharsh-t-verma-university-at-buffalo[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
500,0.0158
1000,0.0075
1500,0.0068
2000,0.0063
2500,0.006
3000,0.006
3500,0.0056
4000,0.0055
4500,0.0053
5000,0.0052


In [None]:

# Encode recipes for retrieval
recipe_embeddings = model.encode(df['tags'].tolist(), convert_to_tensor=True)

# Sample query
query = "potato mushroom stew dinner easy-to-make"
query_embedding = model.encode(query, convert_to_tensor=True)

# Similarity ranking: give the single query an explicit leading dimension so it
# is compared row-wise against every recipe embedding along dim=1.
sims = cosine_similarity(query_embedding.unsqueeze(0), recipe_embeddings, dim=1)
# torch.topk raises when k exceeds the number of candidates, so cap it.
top_k = torch.topk(sims, k=min(10, sims.shape[0]))
top_indices = top_k.indices.tolist()

# Return top results sorted by rating (re-orders the similarity top-k by rating)
results = df.iloc[top_indices]
results = results.sort_values(by='rating', ascending=False)
print(results[['recipe_id', 'name', 'tags', 'rating']])

        recipe_id                               name  \
204785      24396   crispy mushroom   onion potatoes   
140942     385970        browned leeks and mushrooms   
3258        12100           amazing stuffed  shrooms   
190707      58676         chicken and mushroom saute   
43913       51489                scalloped mushrooms   
130798     315763  gilroy marinated garlic mushrooms   
29209       44786             fresh creamy mushrooms   
100208     192604         senior s sauteed mushrooms   
192967     360855    the best beef stroganoff recipe   
176195      47015      sauteed mushrooms with garlic   

                                                     tags  rating  
204785  weeknight time-to-make course main-ingredient ...    5.00  
140942  30-minutes-or-less time-to-make main-ingredien...    5.00  
3258    30-minutes-or-less time-to-make course prepara...    4.50  
190707  bacon 30-minutes-or-less time-to-make course m...    4.50  
43913   60-minutes-or-less time-to-make cou

In [None]:
# Recursively zip the saved model directory into nlp-recipe-finder.zip.
!zip -r nlp-recipe-finder.zip nlp-recipe-finder/

  adding: nlp-recipe-finder/ (stored 0%)
  adding: nlp-recipe-finder/model.safetensors (deflated 8%)
  adding: nlp-recipe-finder/1_Pooling/ (stored 0%)
  adding: nlp-recipe-finder/1_Pooling/config.json (deflated 57%)
  adding: nlp-recipe-finder/vocab.txt (deflated 53%)
  adding: nlp-recipe-finder/tokenizer.json (deflated 71%)
  adding: nlp-recipe-finder/README.md (deflated 74%)
  adding: nlp-recipe-finder/modules.json (deflated 62%)
  adding: nlp-recipe-finder/config.json (deflated 48%)
  adding: nlp-recipe-finder/2_Normalize/ (stored 0%)
  adding: nlp-recipe-finder/sentence_bert_config.json (deflated 4%)
  adding: nlp-recipe-finder/special_tokens_map.json (deflated 80%)
  adding: nlp-recipe-finder/tokenizer_config.json (deflated 73%)
  adding: nlp-recipe-finder/config_sentence_transformers.json (deflated 34%)
