# Cross-Encoder Re-Ranking

In [None]:
# Load the annual report PDF and cut it into chunks for embedding
from pypdf import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter, SentenceTransformersTokenTextSplitter

# Extract the text of every page, trimmed of surrounding whitespace,
# and keep only the pages that actually produced some text
reader = PdfReader("microsoft_annual_report_2022.pdf")
stripped_pages = (page.extract_text().strip() for page in reader.pages)
pdf_texts = [page_text for page_text in stripped_pages if page_text]

# First pass: split on progressively finer separators, up to 1000 characters per chunk
character_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", ". ", " ", ""],
    chunk_size=1000,
    chunk_overlap=0,
)
full_text = '\n\n'.join(pdf_texts)
character_split_texts = character_splitter.split_text(full_text)

# Second pass: re-split each character chunk by token count (256 tokens per chunk)
token_splitter = SentenceTransformersTokenTextSplitter(chunk_overlap=0, tokens_per_chunk=256)

token_split_texts = []
for character_chunk in character_split_texts:
    token_split_texts.extend(token_splitter.split_text(character_chunk))

In [None]:
# Create the Chroma collection and index every chunk in it
import chromadb
from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction

chroma_client = chromadb.Client()
embedding_function = SentenceTransformerEmbeddingFunction()
chroma_collection = chroma_client.create_collection(
    "microsoft_annual_report_2022",
    embedding_function=embedding_function,
)

# Each chunk gets its position in token_split_texts (as a string) for an id
ids = list(map(str, range(len(token_split_texts))))

# Add the documents to the vector database, then show how many were stored
chroma_collection.add(ids=ids, documents=token_split_texts)
chroma_collection.count()

## Re-ranking the long tail

In [None]:
# Ask for 10 results so that there is a longer tail of candidates to re-rank
query = "What has been the investment in research and development?"
results = chroma_collection.query(
    query_texts=query,
    n_results=10,
    include=['documents', 'embeddings'],
)

# Results are grouped per query; only one query was sent, so take the first group
retrieved_documents = results['documents'][0]

# Print each retrieved chunk followed by a blank separator line
for retrieved in retrieved_documents:
    print(retrieved)
    print('')

In [None]:
# Cross-encoder used below to score (query, document) pairs for re-ranking
from sentence_transformers import CrossEncoder
# Instantiate the model identified by 'cross-encoder/ms-marco-MiniLM-L-6-v2'
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

In [None]:
# Pair the query with each retrieved document and score the pairs with the cross-encoder
pairs = [[query, candidate] for candidate in retrieved_documents]
scores = cross_encoder.predict(pairs)

# Print the scores one per line, in the same order as retrieved_documents
print("Scores:")
for relevance in scores:
    print(relevance)

In [None]:
import numpy as np

# Rank the documents from highest to lowest cross-encoder score.
# argsort sorts ascending, so the index array is reversed to put the best first.
descending_indices = np.argsort(scores)[::-1]

# Printed values are 1-based positions in the original retrieval order
print("New Ordering:")
for position in descending_indices:
    print(position + 1)

## Re-ranking with Query Expansion

In [None]:
# The question we actually want answered; it is also the one used for re-ranking
original_query = (
    "What were the most important factors that contributed to increases in revenue?"
)

# Related questions used to widen the pool of retrieved documents
generated_queries = [
    "What were the major drivers of revenue growth?",
    "Were there any new product launches that contributed to the increase in revenue?",
    "Did any changes in pricing or promotions impact the revenue growth?",
    "What were the key market trends that facilitated the increase in revenue?",
    "Did any acquisitions or partnerships contribute to the revenue growth?",
]

In [None]:
# Put the original query first, followed by the generated ones, and retrieve
# 10 documents for each so that the candidate pool is larger
queries = [original_query, *generated_queries]

results = chroma_collection.query(
    query_texts=queries,
    n_results=10,
    include=['documents', 'embeddings'],
)

# One list of documents per query, in the same order as `queries`
retrieved_documents = results['documents']

In [None]:
# Deduplicate the retrieved documents across all queries.
# dict.fromkeys keeps the first occurrence of every document in retrieval
# order, so the resulting list (and any index into it) is the same on every
# run. Collecting the strings in a set instead can yield a different order
# from one interpreter session to the next, which makes printed indices
# impossible to compare between runs.
unique_documents = list(dict.fromkeys(
    document
    for documents in retrieved_documents
    for document in documents
))

In [None]:
# Pair the original query (not the generated ones) with every unique document
pairs = [[original_query, candidate] for candidate in unique_documents]

In [None]:
# Pass the [original_query, document] pairs through the cross-encoder.
# The result is iterated and arg-sorted by the following cells.
scores = cross_encoder.predict(pairs)

In [None]:
# Print the cross-encoder scores, one per line
print("Scores:")
for relevance in scores:
    print(relevance)

In [None]:
# Observe how documents that were considered less relevant in the original embedding
# similarity retrieval now score higher in their relevance to the original query.
# argsort sorts ascending, so the index array is reversed to list the best first;
# the printed values are 0-based positions into `scores`.
descending_indices = np.argsort(scores)[::-1]

print("New Ordering:")
for position in descending_indices:
    print(position)