# Create an Annoy index

In [1]:
import pandas as pd
import numpy as np
from annoy import AnnoyIndex
import ast

# Read the question table from disk
df = pd.read_csv('data/questions.csv')

# The CSV holds each embedding as text, so parse every cell back into a Python list
df['embedding_all_mpnet_base_v2'] = df['embedding_all_mpnet_base_v2'].apply(ast.literal_eval)

# The first row's embedding length fixes the dimensionality of the index
f = len(df['embedding_all_mpnet_base_v2'].loc[0])
t = AnnoyIndex(f, 'angular')  # 'angular' is Annoy's cosine-style metric

# Register every embedding under its DataFrame index label
for i, vector in df['embedding_all_mpnet_base_v2'].items():
    t.add_item(i, vector)

# Build a forest of 10 trees, then persist it; save() stays the last
# expression so the cell still displays its return value
t.build(10)
t.save('questions.ann')


True

Now that the index is built and saved, you can load it and use it to find similar questions based on their embeddings. Here's a function that does that:

In [2]:
def find_similar_questions(question_id, index, df, n=10):
    """Return the rows of ``df`` that are nearest to a given question.

    Args:
        question_id: Id of the query item inside ``index``.
        index: Object exposing ``get_nns_by_item(item, n)`` (an Annoy index
            in this notebook).
        df: DataFrame whose index labels match the ids stored in ``index``.
        n: Number of neighbours to request from the index.

    Returns:
        The rows of ``df`` selected by label, in the order the index
        returned the neighbour ids. As the example output in this notebook
        shows, the query item itself can appear among the results.
    """
    neighbour_labels = index.get_nns_by_item(question_id, n)
    matching_rows = df.loc[neighbour_labels]
    return matching_rows

# Re-create an index object with the same dimension and metric used for the build
u = AnnoyIndex(f, 'angular')
u.load('questions.ann')  # fast: Annoy mmaps the saved file

# Show the neighbours of the question stored under id 0
print(find_similar_questions(question_id=0, index=u, df=df))

                                               question   
0     What is something that you're afraid of that y...  \
2829                 What's something you're afraid of?   
1661  What is one fear you have that you don't want ...   
4970                     What brings you the most fear?   
4456           What is a personal fear you've overcome?   
3398        What is the biggest fear you have overcome?   
1798  What is something you were afraid of as a chil...   
4052  What is a dream you have that you’re afraid to...   
1224  What's a fear about our relationship that keep...   
2695  What is something you've always wanted to do b...   

                                           tags   
0            ['Fear Topic', 'Overcoming Topic']  \
2829         ['Fear Topic', 'Overcoming Topic']   
1661         ['Fear Topic', 'Overcoming Topic']   
4970         ['Fear Topic', 'Overcoming Topic']   
4456         ['Fear Topic', 'Overcoming Topic']   
3398         ['Fear Topic', 'Overcoming Topi

## Searching and General Recommendations with Annoy

In [3]:
from sentence_transformers import SentenceTransformer
import pandas as pd

# Location of the CSV that holds the question texts
csv_file = "data/questions.csv"

# NOTE: this cell rebinds `df`, `f` and `t`, which the first section of the
# notebook also defines; cells from that section see the new values if they
# are re-run afterwards.
df = pd.read_csv(csv_file)
questions = df["question"].tolist()

# Encode every question with the sentence-transformers model
sentence_model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = sentence_model.encode(questions, show_progress_bar=False)

# The second axis of the embedding matrix gives the vector length for the index
f = embeddings.shape[1]
t = AnnoyIndex(f, 'angular')

# Register each embedding under its position in `questions`
for i in range(len(embeddings)):
    t.add_item(i, embeddings[i])

# Build a forest of 10 trees, then persist it; save() stays the last
# expression so the cell still displays its return value
t.build(10)
t.save('sentences.ann')

Downloading (…)e9125/.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)7e55de9125/README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading (…)55de9125/config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)125/data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)e9125/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading (…)9125/train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading (…)7e55de9125/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)5de9125/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

True

### Searching!

In [16]:
# Embed a free-text search term with the same model that produced the indexed vectors
search_term = "brother"
search_embedding = sentence_model.encode([search_term])

# Find the ids of the 20 most similar sentences
ids = t.get_nns_by_vector(search_embedding[0], 20)

# Print the similar sentences. The loop variable is deliberately not named
# `id`, so the built-in id() is not shadowed for the rest of the session.
for question_idx in ids:
    print(questions[question_idx])

Can you think of a moment when you felt proud of a family member?
What is the most important thing you've learned from your family?
What is the most important lesson you've learned in your life?
When was the last time you were proud of yourself?
What's one thing you're proud of yourself for this week?
What's the most important thing you've learned from someone online?
What is a personal goal you've recently achieved?
What is your 'anima' or 'animus' - the inner feminine/masculine aspect of yourself?
What is something you're proud of in your life?
What's the most important life lesson you've learned so far?
What's a moment when you felt very connected to me?
What's one thing you wish I understood better about you?
What is something you've done recently that you're proud of?
What inspires you to keep going when things get tough?
What is something you are proud of yourself for?
What is a childhood accomplishment that you are proud of?
What is something you appreciate more now that you are

## What about finding good recommendations based on liked questions?

In [18]:
def find_nearest_to_centroid(embeddings, annoy_index, num_results=10):
    """Query an index with the mean of a set of embedding vectors.

    Args:
        embeddings: Collection of vectors; they are averaged along axis 0.
        annoy_index: Object exposing ``get_nns_by_vector(vector, n)``.
        num_results: Number of neighbours to request from the index.

    Returns:
        Whatever ``annoy_index.get_nns_by_vector`` returns for the averaged
        vector (a list of item ids when used with an Annoy index).
    """
    # Average the vectors element-wise to obtain one query point
    mean_vector = np.mean(embeddings, axis=0)
    return annoy_index.get_nns_by_vector(mean_vector, num_results)


def create_embeddings(model, text_list):
    """Encode a list of texts with the given model.

    Args:
        model: Object exposing ``encode(texts)`` (a SentenceTransformer in
            this notebook).
        text_list: Texts to encode.

    Returns:
        The value returned by ``model.encode(text_list)``.
    """
    return model.encode(text_list)

In [21]:
# Questions the user liked; their embeddings are averaged to form the query
liked_questions = [
    "What is the biggest lesson you've learned in the past year?",
    "What is something your partner does that makes you feel loved?",
    "What is your idea of a supportive and loving partner?",
    "What does forgiveness mean to you?",
    "What is one mistake you've made that you've been able to laugh about?",
    "What's a moment when you felt very connected to me?",
]
# NOTE: this rebinds `embeddings`, which previously held the vectors for all
# questions in the CSV; from here on it holds only the liked-question vectors.
embeddings = create_embeddings(sentence_model, liked_questions)

# Ask the index for the 10 questions closest to the centroid of the liked ones
ids = find_nearest_to_centroid(embeddings, t, 10)

# The loop variable is deliberately not named `id`, so the built-in id() is
# not shadowed for the rest of the session.
for question_idx in ids:
    print(questions[question_idx])

What is something that you've done for someone else that made you feel good?
What is the biggest lesson you've learned from a past relationship?
What is the most important thing you've learned from your relationships?
What is a lesson you've learned from your relationship?
What do you believe is your greatest contribution to your relationships?
What is something your partner does that makes you feel loved?
What's one thing you appreciate about your partner?
What's one thing about our relationship that you're most grateful for?
What is the most important moral lesson you have learned from a past relationship?
What is something you appreciate in a partner?
