In [5]:
# https://huggingface.co/Hum-Works/lodestone-base-4096-v1
# https://huggingface.co/Muennighoff/SGPT-125M-weightedmean-nli-bitfit
# https://huggingface.co/BAAI/bge-large-en-v1.5
# https://huggingface.co/andersonbcdefg/bge-small-4096
# jinaai/jina-embeddings-v2-small-en
# 'andersonbcdefg/bge-small-4096'

In [6]:
# !pip install transformers qdrant-client sentence-transformers

In [7]:
import qdrant_client
import numpy as np
from numpy.linalg import norm
from transformers import AutoModel
from numpy.linalg import norm
import torch
from qdrant_client.http.models import Distance, VectorParams, PointStruct
import pandas as pd
from tqdm import tqdm
import gc
from sentence_transformers import SentenceTransformer
import torch
import subprocess

In [8]:
model_name = 'jinaai/jina-embeddings-v2-small-en'
qdrant_path = "/tmp/recipe_store"
collection_name = "recipies"
batch_size=10
cos_sim = lambda a,b: (a @ b.T) / ((a)*norm(b))

In [9]:
def preprocess(df, nrows=None):
    df['content'] = "Name:\n" + df['name'] + '\nDescription:\n' + df['description'] + '\nSteps:\n' + df['steps']
    df['id'] = df.index
    df.dropna(subset=['content'], inplace=True)
    if nrows is not None:
      df = df[:nrows]
    return df

In [10]:
df = pd.read_csv("../../data/RAW_recipes.csv")
df = preprocess(df, nrows=None)
len(df)

226657

In [11]:
def get_model():
  model = AutoModel.from_pretrained('jinaai/jina-embeddings-v2-small-en', trust_remote_code=True)
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
  model = model.to(device)
  return model

In [14]:
with torch.no_grad():
  model = get_model()
  embeddings = model.encode(['How is the weather today?', 'What is the current weather like today?'])
  print(cos_sim(embeddings[0], embeddings[1]))
  vector_size = len(model.encode("Encode"))
  print(collection_name, vector_size)
  del model
  torch.cuda.empty_cache()

[ 1.04633447e+03 -1.80354118e+01 -6.82214355e+01  1.61469173e+01
  3.68936584e+02 -3.52913551e+01  5.83617744e+01  3.11420155e+01
 -1.87364563e+02 -2.45083313e+01 -1.81096100e+02 -1.14282532e+02
 -6.94248962e+01 -1.15028372e+01  1.68329277e+01 -1.69776440e+01
 -3.42685814e+01 -2.69710007e+01 -1.77627296e+01 -5.66288185e+01
 -3.88246498e+01 -1.89257336e+01  2.77676563e+01 -5.47682152e+01
  1.33862705e+01 -1.15945520e+01 -1.38271370e+01  4.09063492e+01
  2.01984959e+01  1.15810061e+01 -1.50290709e+01  1.84479599e+02
 -1.46108618e+01  9.90531158e+01 -2.53095150e+01  3.05182781e+01
  3.69845543e+01  3.22625580e+01 -4.86448860e+01 -4.04112701e+01
 -2.45597305e+01  1.72922516e+01  2.03023205e+01  1.85253067e+01
 -9.70133209e+00 -1.52091174e+03 -9.54076958e+00  5.83904839e+01
  3.56976166e+01  2.02787216e+02 -1.29416618e+01  2.15052605e+01
 -1.23727083e+01  5.86852455e+01  1.21544724e+02  2.75831165e+01
 -4.57627525e+01  3.82342339e+01 -2.59543037e+01  9.25773087e+01
  2.34367523e+01  1.81031

In [15]:
client = qdrant_client.QdrantClient(
    path=qdrant_path
)

In [16]:
client.delete_collection(collection_name=collection_name)
client.create_collection(
    collection_name=collection_name,
    vectors_config=VectorParams(size=512, distance=Distance.COSINE),
)

True

In [17]:
def batchify(to_batch, batch_size):
    for i in range(0, len(to_batch), batch_size):
        yield to_batch[i:i + batch_size]

In [18]:
model = get_model()
def upload(df):
  with torch.no_grad():
    texts = df['content'].to_list()
    embeddings = model.encode(texts).tolist()
    payloads = df.to_dict(orient='records')
    points = [PointStruct(id=payload['id'], vector=emb, payload=payload) for emb, payload in zip(embeddings, payloads)]
    client.upsert(
      collection_name=collection_name,
      wait=True,
      points=points
    )
    # del model
  del texts
  del embeddings
  del payloads
  del points
  gc.collect()
  torch.cuda.empty_cache()
  gc.collect()


In [19]:
batch_size = 1
for batch_df in tqdm(batchify(df[:10000], batch_size), total=len(df)//batch_size):
  upload(batch_df)

  4%|▍         | 10000/226657 [37:20<13:28:58,  4.46it/s]


In [22]:
def search(query):
  search_result = client.search(
      collection_name=collection_name, query_vector=model.encode(query), limit=5
  )
  return [e.payload['name']+ '\n'+ e.payload['description'] for e in search_result]

In [32]:
for result in search("almond milk  vegan  raw  gluten free"):
  print(result)

almond milk  vegan  raw  gluten free
nice and creamy. i tweaked recipe #174601 to be vegan although if you aren't vegan then that one is great on its own masha allah. this is also an algerian beverage!!
amazing almond milk
from the cookbook called "how it all vegan".
almond shake paleo and gluten free
this recipe came from cookingchanneltv.com by bal arneson. i changed it up a little to be paleo friendly.
almond custard
a simple dessert custard. serve with some fresh fruit or canned.  can be made suitable for a gluten-free diet by ensuring the cornflour/cornstarch used is gluten-free. ready in less than 10 minutes and can be served warm or cold as desired
almond vanilla pudding
really easy,  vegan pudding :) yummy!
