In [1]:
from llama_index.embeddings.openai import OpenAIEmbedding
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Embedding model used to turn each phrase into a vector.
embed_model = OpenAIEmbedding(model="text-embedding-ada-002")

# Five unrelated trivia questions ...
trivia_questions = [
    "Who was the first president of the United States?",
    "What is the capital city of France?",
    "In what year did humans first land on the moon?",
    "Which element on the periodic table has the chemical symbol O?",
    "What is the largest planet in the solar system?",
]

# ... and their answers, listed in the same order as the questions.
trivia_answers = [
    "The first president of the United States was George Washington.",
    "The capital city of France is Paris.",
    "Humans first landed on the moon in the year 1969.",
    "The chemical symbol O represents the element Oxygen.",
    "The largest planet in the solar system is Jupiter.",
]

# Questions occupy indices 0-4 and answers indices 5-9, so the answer to
# phrases[k] is phrases[k + 5].
phrases = trivia_questions + trivia_answers

# Embed every phrase in one batch call and store the result as a numpy array
# (one row per phrase).
embeddings = np.array(embed_model.get_text_embedding_batch(phrases))

# Sanity check: show the first phrase next to the first five components of its vector.
first_phrase = phrases[0]
first_vector_head = embeddings[0][:5]
print(f"Phrase: {first_phrase}")
print(f"Embedding: {first_vector_head}")

Phrase: Who was the first president of the United States?
Embedding: [-0.00529869 -0.02196502 -0.01970232 -0.02279548 -0.00797962]


In [3]:
# Pairwise cosine similarity between all phrase embeddings; entry [i, j]
# compares phrases[i] with phrases[j].
similarity_matrix = cosine_similarity(embeddings)

# Two decimals are enough for eyeballing the matrix.
rounded_similarity_matrix = similarity_matrix.round(2)

# Display the rounded matrix.
print(f"Cosine Similarity Matrix: {rounded_similarity_matrix}")

Cosine Similarity Matrix: [[1.   0.78 0.83 0.76 0.76 0.92 0.75 0.78 0.72 0.73]
 [0.78 1.   0.74 0.75 0.77 0.75 0.94 0.72 0.73 0.73]
 [0.83 0.74 1.   0.75 0.78 0.78 0.72 0.93 0.72 0.74]
 [0.76 0.75 0.75 1.   0.77 0.73 0.74 0.73 0.93 0.75]
 [0.76 0.77 0.78 0.77 1.   0.72 0.75 0.75 0.73 0.93]
 [0.92 0.75 0.78 0.73 0.72 1.   0.79 0.78 0.74 0.76]
 [0.75 0.94 0.72 0.74 0.75 0.79 1.   0.74 0.76 0.78]
 [0.78 0.72 0.93 0.73 0.75 0.78 0.74 1.   0.74 0.76]
 [0.72 0.73 0.72 0.93 0.73 0.74 0.76 0.74 1.   0.76]
 [0.73 0.73 0.74 0.75 0.93 0.76 0.78 0.76 0.76 1.  ]]


In [8]:
# For every phrase, list its rounded similarity to each of the other phrases.
for row_idx, (anchor, row_scores) in enumerate(zip(phrases, rounded_similarity_matrix)):
    print(f"Phrase: {anchor}")
    for col_idx, (candidate, score) in enumerate(zip(phrases, row_scores)):
        # Skip the diagonal: a phrase compared with itself.
        if col_idx == row_idx:
            continue
        print(f"  {score:.2f}: {candidate}")
    # Blank line between phrase groups.
    print()

Phrase: Who was the first president of the United States?
  0.78: What is the capital city of France?
  0.83: In what year did humans first land on the moon?
  0.76: Which element on the periodic table has the chemical symbol O?
  0.76: What is the largest planet in the solar system?
  0.92: The first president of the United States was George Washington.
  0.75: The capital city of France is Paris.
  0.78: Humans first landed on the moon in the year 1969.
  0.72: The chemical symbol O represents the element Oxygen.
  0.73: The largest planet in the solar system is Jupiter.

Phrase: What is the capital city of France?
  0.78: Who was the first president of the United States?
  0.74: In what year did humans first land on the moon?
  0.75: Which element on the periodic table has the chemical symbol O?
  0.77: What is the largest planet in the solar system?
  0.75: The first president of the United States was George Washington.
  0.94: The capital city of France is Paris.
  0.72: Humans fi

Let's see how well cosine similarity separates matching pairs when the questions and answers are all about the same topic, for example astronomy.

In [9]:
# Five astronomy questions ...
astronomy_questions = [
    "What year did the first human land on the moon?",
    "Which planet is known as the Red Planet?",
    "What is the largest moon of Saturn?",
    "Who was the first person to travel into space?",
    "What is the name of NASA's rover that landed on Mars in 2021?",
]

# ... and their answers, listed in the same order as the questions.
astronomy_answers = [
    "The first human landed on the moon in 1969.",
    "The planet known as the Red Planet is Mars.",
    "The largest moon of Saturn is Titan.",
    "Yuri Gagarin was the first person to travel into space.",
    "NASA's rover that landed on Mars in 2021 is named Perseverance.",
]

# Questions occupy indices 0-4 and answers indices 5-9.
astronomy_phrases = astronomy_questions + astronomy_answers

# Embed all phrases in one batch; keep the raw result and a numpy-array copy.
astronomy_embeddings = embed_model.get_text_embedding_batch(astronomy_phrases)
astronomy_embeddings_array = np.array(astronomy_embeddings)

# Sanity check: first phrase and the first five components of its vector.
first_astronomy_phrase = astronomy_phrases[0]
first_astronomy_vector_head = astronomy_embeddings_array[0][:5]
print(f"Phrase: {first_astronomy_phrase}")
print(f"Embedding: {first_astronomy_vector_head}")

Phrase: What year did the first human land on the moon?
Embedding: [ 0.0055372  -0.03727422  0.00532086 -0.02473242 -0.02219898]


In [10]:

# Pairwise cosine similarity between all astronomy phrase embeddings; entry
# [i, j] compares astronomy_phrases[i] with astronomy_phrases[j].
astronomy_similarity_matrix = cosine_similarity(astronomy_embeddings_array)

# Two decimals are enough for eyeballing the matrix.
rounded_astronomy_similarity_matrix = astronomy_similarity_matrix.round(2)

# Display the rounded matrix.
print(f"Cosine Similarity Matrix: {rounded_astronomy_similarity_matrix}")


Cosine Similarity Matrix: [[1.   0.77 0.79 0.86 0.82 0.94 0.76 0.76 0.81 0.79]
 [0.77 1.   0.8  0.79 0.82 0.76 0.95 0.79 0.76 0.8 ]
 [0.79 0.8  1.   0.77 0.77 0.76 0.78 0.94 0.74 0.74]
 [0.86 0.79 0.77 1.   0.8  0.85 0.77 0.75 0.92 0.77]
 [0.82 0.82 0.77 0.8  1.   0.8  0.82 0.74 0.75 0.93]
 [0.94 0.76 0.76 0.85 0.8  1.   0.78 0.78 0.83 0.81]
 [0.76 0.95 0.78 0.77 0.82 0.78 1.   0.81 0.77 0.84]
 [0.76 0.79 0.94 0.75 0.74 0.78 0.81 1.   0.76 0.76]
 [0.81 0.76 0.74 0.92 0.75 0.83 0.77 0.76 1.   0.76]
 [0.79 0.8  0.74 0.77 0.93 0.81 0.84 0.76 0.76 1.  ]]


In [13]:
# For every astronomy phrase, list its rounded similarity to each of the others.
for row_idx, (anchor, row_scores) in enumerate(
    zip(astronomy_phrases, rounded_astronomy_similarity_matrix)
):
    print(f"Phrase: {anchor}")
    for col_idx, (candidate, score) in enumerate(zip(astronomy_phrases, row_scores)):
        # Skip the diagonal: a phrase compared with itself.
        if col_idx == row_idx:
            continue
        print(f"  {score:.2f}: {candidate}")
    # Blank line between phrase groups.
    print()

Phrase: What year did the first human land on the moon?
  0.77: Which planet is known as the Red Planet?
  0.79: What is the largest moon of Saturn?
  0.86: Who was the first person to travel into space?
  0.82: What is the name of NASA's rover that landed on Mars in 2021?
  0.94: The first human landed on the moon in 1969.
  0.76: The planet known as the Red Planet is Mars.
  0.76: The largest moon of Saturn is Titan.
  0.81: Yuri Gagarin was the first person to travel into space.
  0.79: NASA's rover that landed on Mars in 2021 is named Perseverance.

Phrase: Which planet is known as the Red Planet?
  0.77: What year did the first human land on the moon?
  0.80: What is the largest moon of Saturn?
  0.79: Who was the first person to travel into space?
  0.82: What is the name of NASA's rover that landed on Mars in 2021?
  0.76: The first human landed on the moon in 1969.
  0.95: The planet known as the Red Planet is Mars.
  0.79: The largest moon of Saturn is Titan.
  0.76: Yuri Gaga

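Q: Was the score able to find the matching questions and answers?
A: Yes! In the astronomy set, each question's highest-scoring phrase is its own answer (0.92–0.95), while every other pairing scores 0.86 or lower.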

# Only one answer is correct

Let's try one last experiment, the most challenging of all: one question, one correct answer, and four wrong answers.

In [14]:
# The question being asked.
mcq_question = "What spacecraft was used in the mission to carry the first humans to the moon?"

# Candidate answers: the first is correct, the remaining four are wrong.
mcq_answers = [
    "Apollo 11 was the spacecraft used to carry the first humans to the moon.",  # correct
    "Apollo 12 was the spacecraft used to carry the first humans to the moon.",  # wrong
    "Apollo 14 was the spacecraft used to carry astronauts on the third successful moon landing mission.",  # wrong
    "Apollo 10 was the spacecraft used to carry the first humans to the moon.",  # wrong
    "Apollo 16 was the spacecraft that carried astronauts to explore the lunar highlands.",  # wrong
]

# Index 0 is the question; indices 1-5 are the candidate answers.
multiple_choice_questions = [mcq_question, *mcq_answers]

# Embed the question and all candidates in one batch; keep the raw result
# and a numpy-array copy.
mcq_embeddings = embed_model.get_text_embedding_batch(multiple_choice_questions)
mcq_embeddings_array = np.array(mcq_embeddings)

# Sanity check: first phrase (the question) and the first five components of its vector.
first_mcq_phrase = multiple_choice_questions[0]
first_mcq_vector_head = mcq_embeddings_array[0][:5]
print(f"Phrase: {first_mcq_phrase}")
print(f"Embedding: {first_mcq_vector_head}")

Phrase: What spacecraft was used in the mission to carry the first humans to the moon?
Embedding: [ 0.02234936 -0.01276388  0.02098001 -0.01151388 -0.0080214 ]


In [18]:
# Pairwise cosine similarity between the question and every candidate answer.
# Row/column 0 is the question; rows/columns 1.. are the candidates.
mcq_similarity_matrix = cosine_similarity(mcq_embeddings_array)
rounded_mcq_similarity_matrix = np.round(mcq_similarity_matrix, 2)

# print the cosine similarity matrix
print(f"Cosine Similarity Matrix: {rounded_mcq_similarity_matrix}")

# Output comparison between question (first element) and answers with improved readability
print(f"Question: {multiple_choice_questions[0]}")
for i in range(1, len(multiple_choice_questions)):
    print(f"  {rounded_mcq_similarity_matrix[0, i]:.2f}: {multiple_choice_questions[i]}")

# At two decimals several candidates can show the same score, which hides
# whether the correct answer actually ranks first. Rank the candidates on the
# unrounded similarities so any remaining difference is visible.
answer_scores = mcq_similarity_matrix[0, 1:]  # question vs. each candidate
ranked_offsets = np.argsort(-answer_scores, kind="stable")  # highest score first
print("Ranking by unrounded similarity:")
for rank, offset in enumerate(ranked_offsets, start=1):
    # offset indexes answer_scores; +1 maps it back into multiple_choice_questions.
    print(f"  {rank}. {answer_scores[offset]:.4f}: {multiple_choice_questions[offset + 1]}")


Cosine Similarity Matrix: [[1.   0.93 0.93 0.89 0.93 0.89]
 [0.93 1.   0.97 0.93 0.97 0.91]
 [0.93 0.97 1.   0.93 0.96 0.92]
 [0.89 0.93 0.93 1.   0.93 0.93]
 [0.93 0.97 0.96 0.93 1.   0.91]
 [0.89 0.91 0.92 0.93 0.91 1.  ]]
  0.93: Apollo 11 was the spacecraft used to carry the first humans to the moon.
  0.93: Apollo 12 was the spacecraft used to carry the first humans to the moon.
  0.89: Apollo 14 was the spacecraft used to carry astronauts on the third successful moon landing mission.
  0.93: Apollo 10 was the spacecraft used to carry the first humans to the moon.
  0.89: Apollo 16 was the spacecraft that carried astronauts to explore the lunar highlands.
