# KMeans for finding nearest clusters

In [25]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
import numpy as np

# Location of the question dataset
csv_file = "data/questions.csv"

# Read the dataset; only its "question" column is used in this cell
df = pd.read_csv(filepath_or_buffer=csv_file)

# Pull the question text out as a plain Python list
questions = df.loc[:, "question"].tolist()

# Load the sentence-embedding model by name
sentence_model = SentenceTransformer("all-MiniLM-L6-v2")

# Embed every question and store the vectors in double precision
embeddings = (
    sentence_model
    .encode(questions, show_progress_bar=False)
    .astype(np.float64)
)

In [26]:
# How many centroids to partition the question embeddings into
n_clusters = 160

# Build the estimator with a fixed seed, then fit it on the question embeddings
kmeans_model = KMeans(random_state=42, n_clusters=n_clusters)
kmeans_model.fit(X=embeddings)



In [27]:
def find_nearest_cluster(embedding, n=5):
    """Return up to ``n`` questions from the cluster nearest to the query's own.

    The query is first assigned to a cluster by ``kmeans_model``; that cluster
    is skipped and the next-closest centroid is selected. Members of the
    selected cluster are ordered by their distance to that cluster's centroid
    (not by distance to the query) and the closest ``n`` are returned.

    Args:
        embedding: A single query vector; it is wrapped in a list to form a
            one-sample batch for the model.
        n: Maximum number of questions to return.

    Returns:
        A list of question strings taken from ``df["question"]``.
    """
    batch = [embedding]

    # Cluster the query falls into, and its distance to every centroid
    own_cluster = kmeans_model.predict(batch)[0]
    centroid_distances = kmeans_model.transform(batch)[0]

    # Closest centroid other than the query's own (None if there is no other)
    nearest_cluster = next(
        (
            candidate
            for candidate in np.argsort(centroid_distances)
            if candidate != own_cluster
        ),
        None,
    )

    # Row positions of the questions assigned to the selected cluster
    member_indices = np.where(kmeans_model.labels_ == nearest_cluster)[0]

    # Never ask for more questions than the cluster holds
    n = min(n, len(member_indices))

    # Order the members by closeness to their own centroid and keep the first n
    centroid = kmeans_model.cluster_centers_[nearest_cluster]
    member_distances = np.linalg.norm(embeddings[member_indices] - centroid, axis=1)
    closest_first = np.argsort(member_distances)[:n]
    top_n_indices = member_indices[closest_first]

    return df["question"].iloc[top_n_indices].tolist()


## Get the top N questions in the cluster closest to the one containing the query embedding

In [28]:
def get_similar_questions(query, n=5):
    """Embed ``query`` and return similar questions from neighbouring clusters.

    Args:
        query: The question text to look up.
        n: Number of questions to request from ``find_nearest_cluster``.

    Returns:
        Whatever ``find_nearest_cluster`` returns for the query embedding
        (a list of question strings).
    """
    # Encode as a one-item batch and cast to float64, the dtype used for the
    # question embeddings the KMeans model was fit on
    embedding = sentence_model.encode([query], show_progress_bar=False).astype(np.float64)

    # Pass ``n`` by keyword: this notebook defines find_nearest_cluster more
    # than once with different positional layouts, and a positional argument
    # would land in the cluster-count slot of the later definition.
    return find_nearest_cluster(embedding[0], n=n)


In [29]:
# Example: look up similar questions for a sample prompt (uses the default n=5)
get_similar_questions("What's one thing you wish I understood better about you?")

['What qualities do you admire most about yourself?',
 'What are some of the things you admire about yourself?',
 "What's a characteristic you admire in yourself?",
 'What is something you admire about yourself?',
 'What is something you admire about yourself or your personality, and why?']

## Get the top N questions in each of the M clusters closest to the one containing the query embedding

### For a single query embedding

In [31]:
def find_nearest_cluster(embedding, num_nearest_clusters=5, n=5):
    """Return questions from the clusters nearest to the query's own cluster.

    The query is assigned to a cluster by ``kmeans_model``; that cluster is
    excluded and the ``num_nearest_clusters`` next-closest centroids are
    selected. For each selected cluster, its members are ordered by distance
    to that cluster's centroid (not to the query) and up to ``n`` are taken.

    Args:
        embedding: A single query vector; it is wrapped in a list to form a
            one-sample batch for the model.
        num_nearest_clusters: How many neighbouring clusters to draw from.
        n: Maximum number of questions to take from each cluster.

    Returns:
        A flat list of question strings from ``df["question"]``, grouped by
        cluster in order of increasing centroid distance from the query.
    """
    # Cluster the query falls into, and its distance to every centroid
    current_cluster = kmeans_model.predict([embedding])[0]
    distances = kmeans_model.transform([embedding])[0]

    # Centroids from closest to farthest, skipping the query's own cluster
    ranked_clusters = [
        cluster for cluster in np.argsort(distances) if cluster != current_cluster
    ]
    nearest_clusters = ranked_clusters[:num_nearest_clusters]

    nearest_questions = []
    for nearest_cluster in nearest_clusters:
        # Row positions of the questions assigned to this cluster
        member_indices = np.where(kmeans_model.labels_ == nearest_cluster)[0]

        # Order the members by closeness to their own centroid
        centroid = kmeans_model.cluster_centers_[nearest_cluster]
        member_distances = np.linalg.norm(embeddings[member_indices] - centroid, axis=1)

        # Slicing already stops at the cluster size, so ``n`` is deliberately
        # left untouched: shrinking it here would cap every later cluster at
        # the size of the smallest one seen so far.
        top_n_indices = member_indices[np.argsort(member_distances)[:n]]

        nearest_questions.extend(df["question"].iloc[top_n_indices].tolist())

    return nearest_questions

In [32]:
# Same sample prompt as before. get_similar_questions looks up
# find_nearest_cluster at call time, so this run uses the multi-cluster
# definition from the cell above.
get_similar_questions("What's one thing you wish I understood better about you?")


['What qualities do you admire most about yourself?',
 'What are some of the things you admire about yourself?',
 "What's a characteristic you admire in yourself?",
 'What is something you admire about yourself?',
 'What is something you admire about yourself or your personality, and why?',
 "What are the most important lessons you've learned in life?",
 "What is the most important lesson you've learned in your life?",
 "What's the most important life lesson you've learned so far?",
 'What are some of the important life lessons that your family has taught you?',
 "What are the three most important life lessons you've learned so far?",
 'What are some things you are grateful for in your life right now?',
 'What is one thing you are grateful for in your life right now?',
 "What's something you're grateful for that you have in your life right now?",
 'What is something that you are grateful for in your life?',
 "What are some things you're grateful for in your personal life?",
 'What are 

### For multiple query embeddings (all at once)

In [52]:
def find_nearest_clusters_for_multiple_queries(query_embeddings, num_nearest_clusters=5, n=5):
    """Collect neighbouring-cluster questions for several query embeddings.

    Each query is handled independently: it is assigned to a cluster by
    ``kmeans_model``, that cluster is excluded, and the
    ``num_nearest_clusters`` next-closest centroids are selected. From each
    selected cluster, up to ``n`` members closest to that cluster's centroid
    are taken.

    Args:
        query_embeddings: An iterable of query vectors; each is wrapped in a
            list to form a one-sample batch for the model.
        num_nearest_clusters: How many neighbouring clusters to draw from per
            query.
        n: Maximum number of questions to take from each cluster.

    Returns:
        A flat list of question strings from ``df["question"]``, in query
        order and then cluster order. Duplicates across queries are kept.
    """
    nearest_questions = []

    for embedding in query_embeddings:
        # Cluster this query falls into, and its distance to every centroid
        current_cluster = kmeans_model.predict([embedding])[0]
        distances = kmeans_model.transform([embedding])[0]

        # Centroids from closest to farthest, skipping the query's own cluster
        ranked_clusters = [
            cluster for cluster in np.argsort(distances) if cluster != current_cluster
        ]

        for nearest_cluster in ranked_clusters[:num_nearest_clusters]:
            # Row positions of the questions assigned to this cluster
            member_indices = np.where(kmeans_model.labels_ == nearest_cluster)[0]

            # Order the members by closeness to their own centroid
            centroid = kmeans_model.cluster_centers_[nearest_cluster]
            member_distances = np.linalg.norm(embeddings[member_indices] - centroid, axis=1)

            # Slicing already stops at the cluster size, so ``n`` is never
            # reassigned: shrinking it here would reduce the quota for every
            # later cluster and every later query.
            top_n_indices = member_indices[np.argsort(member_distances)[:n]]

            nearest_questions.extend(df["question"].iloc[top_n_indices].tolist())

    return nearest_questions


In [57]:
def get_nearest_questions_for_multiple_queries(queries, num_nearest_clusters=5, n=1):
    """Embed several query strings and return neighbouring-cluster questions.

    Args:
        queries: A list of question strings to look up.
        num_nearest_clusters: How many neighbouring clusters to draw from per
            query.
        n: Maximum number of questions to take from each cluster.

    Returns:
        The flat list of question strings produced by
        ``find_nearest_clusters_for_multiple_queries``.
    """
    # Cast to float64 like the single-query helpers do, matching the dtype of
    # the embeddings the KMeans model was fit on. The local name is distinct
    # from the module-level ``embeddings`` array to avoid confusing the two.
    query_embeddings = np.asarray(
        sentence_model.encode(queries, show_progress_bar=False),
        dtype=np.float64,
    )

    return find_nearest_clusters_for_multiple_queries(query_embeddings, num_nearest_clusters, n)


In [58]:
# Example: three unrelated prompts, looked up in one call with default settings
queries = [
    "What is the capital of France?",
    "Who won the world series?",
    "How does photosynthesis work?",
]
get_nearest_questions_for_multiple_queries(queries)

['What is the most picturesque outdoor location you have ever visited?',
 'What is your favorite piece of clothing and why?',
 'What is your love language?',
 'In your opinion, what is the ideal role of government?',
 "What's your favorite cuisine to cook or bake?",
 'What is the most you have ever won gambling?',
 'What is your favorite piece of clothing and why?',
 'Do you think there is intelligent life on other planets?',
 'What is the most fascinating scientific discovery that has been made in your lifetime?',
 "What's your favorite TV or movie franchise and why?",
 'What small changes can we make at home to reduce our environmental impact?',
 "Do you have any tips for getting a good night's sleep?",
 'What are some ways you prioritize your personal life while still meeting work responsibilities?',
 'What kind of photography do you enjoy most?',
 'How do you think climate change will affect our future?']

## Get the top N questions in each of the M clusters farthest from the one containing the query embedding
This can be useful to introduce new content that is not within the space the user has explored.

In [37]:
def find_questions_in_farthest_clusters(embedding, num_furthest_clusters=5, n=5):
    """Return questions from the clusters farthest from the query.

    The query is assigned to a cluster by ``kmeans_model``; that cluster is
    excluded and the ``num_furthest_clusters`` centroids with the largest
    distance from the query are selected. For each selected cluster, its
    members are ordered by distance to that cluster's centroid (closest
    first) and up to ``n`` are taken.

    Args:
        embedding: A single query vector; it is wrapped in a list to form a
            one-sample batch for the model.
        num_furthest_clusters: How many distant clusters to draw from.
        n: Maximum number of questions to take from each cluster.

    Returns:
        A flat list of question strings from ``df["question"]``, grouped by
        cluster in order of decreasing centroid distance from the query.
    """
    # Cluster the query falls into, and its distance to every centroid
    current_cluster = kmeans_model.predict([embedding])[0]
    distances = kmeans_model.transform([embedding])[0]

    # Centroids from farthest to closest, skipping the query's own cluster
    farthest_first = np.argsort(distances)[::-1]
    ranked_clusters = [
        cluster for cluster in farthest_first if cluster != current_cluster
    ]
    farthest_clusters = ranked_clusters[:num_furthest_clusters]

    farthest_questions = []
    for farthest_cluster in farthest_clusters:
        # Row positions of the questions assigned to this cluster
        member_indices = np.where(kmeans_model.labels_ == farthest_cluster)[0]

        # Order the members by closeness to their own centroid
        centroid = kmeans_model.cluster_centers_[farthest_cluster]
        member_distances = np.linalg.norm(embeddings[member_indices] - centroid, axis=1)

        # Slicing already stops at the cluster size, so ``n`` is deliberately
        # left untouched: shrinking it here would cap every later cluster at
        # the size of the smallest one seen so far.
        top_n_indices = member_indices[np.argsort(member_distances)[:n]]

        farthest_questions.extend(df["question"].iloc[top_n_indices].tolist())

    return farthest_questions

def find_farthest_questions(query, n=5):
    """Embed ``query`` and return questions from the clusters farthest from it.

    Args:
        query: The question text to look up.
        n: Maximum number of questions to take from each distant cluster.

    Returns:
        The list of question strings produced by
        ``find_questions_in_farthest_clusters``.
    """
    # Encode as a one-item batch and cast to float64, the dtype used for the
    # question embeddings the KMeans model was fit on
    embedding = sentence_model.encode([query], show_progress_bar=False).astype(np.float64)

    # Pass ``n`` by keyword so it sets the per-cluster question count; as the
    # second positional argument it would be read as num_furthest_clusters.
    return find_questions_in_farthest_clusters(embedding[0], n=n)


In [38]:
# Example: fetch questions from the clusters farthest from a sample prompt
find_farthest_questions("What's one thing you wish I understood better about you?")


['In what ways do you think globalization has influenced the economy?',
 'How do you think globalization has impacted different cultures?',
 'How do you think globalization has affected international relations?',
 'How do you think globalization has impacted urbanization?',
 'In what ways do you think globalization has affected the distribution of wealth?',
 'What is your favorite way to celebrate a festival or tradition?',
 'What is the most memorable tradition or festival that you have celebrated?',
 'What is your favorite aspect of celebrating traditions or festivals?',
 'What is the most unique or unusual festival or tradition in your culture?',
 'How do you usually celebrate traditions or festivals?',
 'What are some potential use cases for cryptocurrency beyond investment?',
 'What do you think the future holds for cryptocurrency?',
 'What are some common arguments for and against cryptocurrency?',
 'What are some potential risks and rewards of investing in cryptocurrency?',
 'Wh