# Embedding Adaptors

In [None]:
# Create document chunks for embedding
from pypdf import PdfReader

# Open the annual report and collect the stripped text of every page,
# keeping only pages whose extracted text is non-empty
reader = PdfReader("microsoft_annual_report_2022.pdf")
pdf_texts = []
for page in reader.pages:
    page_text = page.extract_text().strip()
    if page_text:
        pdf_texts.append(page_text)

from langchain.text_splitter import RecursiveCharacterTextSplitter, SentenceTransformersTokenTextSplitter

# Stage 1: character-based split of the whole report (pages joined by blank lines)
full_text = '\n\n'.join(pdf_texts)
character_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=0,
    separators=["\n\n", "\n", ". ", " ", ""],
)
character_split_texts = character_splitter.split_text(full_text)

# Stage 2: token-based split of each character chunk to prepare for embedding
token_splitter = SentenceTransformersTokenTextSplitter(
    tokens_per_chunk=256,
    chunk_overlap=0,
)

token_split_texts = []
for chunk in character_split_texts:
    token_split_texts.extend(token_splitter.split_text(chunk))

In [None]:
# Setup chroma
import chromadb
from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction

# The same embedding function is attached to the collection and reused
# later in the notebook to embed the generated queries
embedding_function = SentenceTransformerEmbeddingFunction()

chroma_client = chromadb.Client()
chroma_collection = chroma_client.create_collection(
    "microsoft_annual_report_2022",
    embedding_function=embedding_function,
)

# One string id per chunk: "0", "1", ...
ids = list(map(str, range(len(token_split_texts))))

# Add the chunks to the vector database and show how many were stored
chroma_collection.add(documents=token_split_texts, ids=ids)
chroma_collection.count()

In [None]:
import umap
import numpy as np
from tqdm import tqdm

# Define the projection function
def project_embeddings(embeddings, umap_transform):
    """Project embeddings to two dimensions, one vector at a time.

    Args:
        embeddings: Sized iterable of embedding vectors.
        umap_transform: Object exposing ``transform(list_of_vectors)``;
            called once per embedding with a single-element list.

    Returns:
        A NumPy array of shape ``(len(embeddings), 2)`` whose row ``i`` is
        the projection of ``embeddings[i]``.
    """
    projected = np.empty((len(embeddings), 2))
    for row, vector in enumerate(tqdm(embeddings)):
        # transform() receives a batch of one; its result fills this row
        projected[row] = umap_transform.transform([vector])
    return projected

In [None]:
# Fetch the stored embeddings back from the collection
_collection_contents = chroma_collection.get(include=['embeddings'])
embeddings = _collection_contents['embeddings']

# Fit UMAP on the dataset embeddings with fixed seeds, then project them
umap_transform = umap.UMAP(transform_seed=0, random_state=0).fit(embeddings)
projected_dataset_embeddings = project_embeddings(embeddings, umap_transform)

In [None]:
# Setup the OpenAI client for the RAG Operation
from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv()) # read local .env file
from gen_ai_hub.proxy.native.openai import chat

# Model name sent as `model_name` in the chat.completions.create calls
# made by generate_queries() and evaluate_results()
llm_model = "gpt-35-turbo"

## Creating a dataset

In [None]:
# Generate some relevant queries
def generate_queries(model="gpt-3.5-turbo"):
    """Ask the chat model for short questions to pose about an annual report.

    Args:
        model: Accepted for backward compatibility. NOTE: this value is not
            used by the body; the request is sent with the module-level
            ``llm_model`` name.

    Returns:
        list[str]: One question per element, with surrounding whitespace
        removed and blank lines dropped. Empty if the response has no text.
    """
    messages = [
        {
            "role": "system",
            "content": "You are a helpful expert financial research assistant. You help users analyze financial statements to better understand companies. "
            "Suggest 10 to 15 short questions that are important to ask when analyzing an annual report. "
            "Do not output any compound questions (questions with multiple sentences or conjunctions)."
            "Output each question on a separate line divided by a newline."
        },
    ]

    kwargs = dict(model_name=llm_model, messages=messages)
    response = chat.completions.create(**kwargs)

    # Treat a missing message body as empty text instead of failing on None
    content = response.choices[0].message.content or ""

    # Responses may contain blank separator lines or a trailing newline.
    # Dropping them keeps empty strings from being used as queries later.
    return [line.strip() for line in content.splitlines() if line.strip()]

In [None]:
# Request the candidate questions once, then echo them one per line
generated_queries = generate_queries()
for question in generated_queries:
    print(question)

In [None]:
# Retrieve the 10 nearest chunks for every generated query, asking for
# both the document text and the stored embeddings
results = chroma_collection.query(
    query_texts=generated_queries,
    n_results=10,
    include=['documents', 'embeddings'],
)
retrieved_documents = results['documents']

In [None]:
# Generate a function to pass each document through an LLM along with a query
# The LLM will rank each document according to its relevance to the query
# The output of the function will be 1 for relevant and -1 for irrelevant
# This is because the adaptor's loss function compares the label against a cosine similarity
# In cosine similarity opposite pointing vectors produce a score of -1 while identical vectors produce a score of 1
def evaluate_results(query, statement, model="gpt-3.5-turbo"):
    """Ask the chat model whether ``statement`` is relevant to ``query``.

    Args:
        query: The question text.
        statement: The retrieved document text to judge.
        model: Accepted for backward compatibility. NOTE: this value is not
            used by the body; the request is sent with the module-level
            ``llm_model`` name.

    Returns:
        int: ``1`` when the model's answer begins with "yes" (ignoring case,
        surrounding whitespace and surrounding quotes), otherwise ``-1``.
    """
    messages = [
        {
            "role": "system",
            "content": "You are a helpful expert financial research assistant. You help users analyze financial statements to better understand companies. "
            "For the given query, evaluate whether the following satement is relevant."
            "Output only 'yes' or 'no'."
        },
        {
            "role": "user",
            "content": f"Query: {query}, Statement: {statement}"
        },
    ]

    kwargs = dict(model_name=llm_model, messages=messages)
    response = chat.completions.create(**kwargs)

    # Treat a missing message body as empty text instead of failing on None
    content = response.choices[0].message.content or ""

    # An exact comparison with "yes" would label answers such as "Yes",
    # "yes." or " yes\n" as irrelevant, so normalise before checking.
    answer = content.strip().strip("'\"").lower()
    if answer.startswith("yes"):
        return 1
    return -1

In [None]:
# Embed the generated queries with the collection's embedding function
query_embeddings = embedding_function(generated_queries)
# Embeddings of the retrieved documents, taken from the query results
retrieved_embeddings = results['embeddings']

In [None]:
# Three parallel accumulators: entry i of each list describes one
# (query embedding, document embedding, relevance label) training example
adapter_query_embeddings, adapter_doc_embeddings, adapter_labels = [], [], []

In [None]:
# Create a dataset to train the adaptor
# The dataset is composed of triplets of query embeddings, document embeddings,
# and the evaluation model's attributed relevancy label
for q, query in enumerate(tqdm(generated_queries)):
    for d, document in enumerate(retrieved_documents[q]):
        # Obtain the label first: if the LLM call raises, nothing has been
        # appended for this pair, so the three lists keep equal lengths.
        relevance_label = evaluate_results(query, document)
        adapter_query_embeddings.append(query_embeddings[q])
        adapter_doc_embeddings.append(retrieved_embeddings[q][d])
        adapter_labels.append(relevance_label)

In [None]:
# Number of labels collected, one per (query, retrieved document) pair
len(adapter_labels)

In [None]:
import torch

# Turn the accumulated Python lists into torch tensors (via NumPy arrays)
adapter_query_embeddings = torch.Tensor(np.array(adapter_query_embeddings))
adapter_doc_embeddings = torch.Tensor(np.array(adapter_doc_embeddings))
# Labels get a trailing axis so each example's label is a length-1 vector
adapter_labels = torch.Tensor(np.array(adapter_labels)[:, np.newaxis])

In [None]:

# Pack the tensors into a dataset to train the adaptor model
# Iterating the dataset yields (query embedding, document embedding, label) triples
dataset = torch.utils.data.TensorDataset(adapter_query_embeddings, adapter_doc_embeddings, adapter_labels)

## Setting up the model

In [None]:
# The adaptor model is a simple linear layer
# The model will take the query and document embeddings as input
# Then it will multiply the query embeddings by the adaptor matrix
# Finally, it will calculate the cosine similarity between the updated query embeddings and the document embeddings
def model(query_embedding, document_embedding, adaptor_matrix):
    """Score a document against a query transformed by the adaptor matrix.

    Args:
        query_embedding: Query embedding tensor.
        document_embedding: Document embedding tensor.
        adaptor_matrix: Matrix applied to the query as
            ``adaptor_matrix @ query_embedding``.

    Returns:
        The cosine similarity, taken along dimension 0, between the
        transformed query and the document embedding.
    """
    adapted_query = adaptor_matrix @ query_embedding
    similarity = torch.nn.functional.cosine_similarity(
        adapted_query, document_embedding, dim=0
    )
    return similarity

In [None]:
# Define the loss function
# The loss function is the mean squared error between the model's output 
# and the true label (attributed by the evaluation model earlier)
def mse_loss(query_embedding, document_embedding, adaptor_matrix, label):
    """Mean squared error between the model's similarity score and ``label``.

    Args:
        query_embedding: Query embedding tensor passed to ``model``.
        document_embedding: Document embedding tensor passed to ``model``.
        adaptor_matrix: Adaptor matrix passed to ``model``.
        label: Target tensor (1 for relevant, -1 for irrelevant in this
            notebook).

    Returns:
        A scalar tensor holding the mean squared error.
    """
    prediction = model(query_embedding, document_embedding, adaptor_matrix)
    # In this notebook the prediction is a 0-d tensor while each label drawn
    # from the dataset has shape (1,). Broadcasting explicitly gives the same
    # value as before without the size-mismatch warning the loss would
    # otherwise emit on every call.
    prediction, target = torch.broadcast_tensors(prediction, label)
    return torch.nn.functional.mse_loss(prediction, target)

In [None]:
# Square adaptor matrix sized to the embedding length, filled with
# standard-normal random values and tracked by autograd
first_query_embedding = adapter_query_embeddings[0]
mat_size = len(first_query_embedding)
adapter_matrix = torch.randn((mat_size, mat_size), requires_grad=True)

In [None]:
# Lowest loss seen so far and the adaptor matrix that produced it
min_loss = float('inf')
best_matrix = None

# Train the adaptor model for 100 epochs
# The model will attempt to find the adaptor matrix that minimizes the loss function
# The model will update the adaptor matrix using the gradient of the loss function
# This is a similar process to training a single linear layer in a neural network
for epoch in tqdm(range(100)):
    # One gradient step per (query, document, label) example
    for query_embedding, document_embedding, label in dataset:
        loss = mse_loss(query_embedding, document_embedding, adapter_matrix, label)

        # NOTE(review): `loss` is the loss of this single example, so
        # min_loss / best_matrix track the best single-example loss rather
        # than a loss over the whole dataset - confirm this is intended.
        if loss < min_loss:
            min_loss = loss
            # Snapshot as a NumPy copy before the update below changes the matrix
            best_matrix = adapter_matrix.clone().detach().numpy()

        loss.backward()
        # Plain gradient-descent step with a fixed step size of 0.01, done
        # in place under no_grad so the update itself is not tracked
        with torch.no_grad():
            adapter_matrix -= 0.01 * adapter_matrix.grad
            # Clear the gradient so the next backward() starts from zero
            adapter_matrix.grad.zero_()

In [None]:
# min_loss is the loss tensor stored by the training loop; detach it and
# convert to NumPy for printing
print(f"Best loss: {min_loss.detach().numpy()}")

In [None]:
# Observe which dimensions of the query embedding are most affected by the adaptor matrix multiplication
test_vector = torch.ones((mat_size, 1))
# best_matrix is a NumPy array, so convert the tensor operand explicitly
# instead of passing a torch.Tensor to np.matmul and converting the result back
scaled_vector = np.matmul(best_matrix, test_vector.numpy())

In [None]:
# Plot the scaled dimension vector
import matplotlib.pyplot as plt
# One bar per embedding dimension; bar height is that dimension's scaled value
bar_heights = scaled_vector.flatten()
bar_positions = range(len(scaled_vector))
plt.bar(bar_positions, bar_heights)
plt.show()

In [None]:
# Embed the queries, apply the best adaptor matrix to each of them, and
# project both the original and the adapted embeddings for comparison
query_embeddings = embedding_function(generated_queries)

# Queries are rows, so transpose to columns for the product and back again
query_matrix = np.array(query_embeddings)
adapted_query_embeddings = np.matmul(best_matrix, query_matrix.T).T

projected_query_embeddings = project_embeddings(query_embeddings, umap_transform)
projected_adapted_query_embeddings = project_embeddings(adapted_query_embeddings, umap_transform)

In [None]:
# Scatter the projected dataset in gray, then overlay the queries:
# original queries in red, adapted queries in green
plt.figure()
plt.scatter(
    projected_dataset_embeddings[:, 0],
    projected_dataset_embeddings[:, 1],
    s=10,
    color='gray',
)
for points, colour, series_label in (
    (projected_query_embeddings, 'r', "original"),
    (projected_adapted_query_embeddings, 'green', "adapted"),
):
    plt.scatter(points[:, 0], points[:, 1], s=150, marker='X', color=colour, label=series_label)

plt.gca().set_aspect('equal', 'datalim')
plt.title("Adapted Queries")
plt.axis('off')
plt.legend()