# Cross-Encoder Re-Ranking

In [1]:
# Create document chunks for embedding
from pypdf import PdfReader

# Open the annual report and pull the text of every page
reader = PdfReader("microsoft_annual_report_2022.pdf")

# Strip surrounding whitespace from each page's text and keep only the
# pages that still contain something afterwards
pdf_texts = [
    page_text
    for page_text in (page.extract_text().strip() for page in reader.pages)
    if page_text
]

from langchain.text_splitter import RecursiveCharacterTextSplitter, SentenceTransformersTokenTextSplitter

# First pass: split on characters into chunks of at most 1000, no overlap.
# The separators are listed from coarsest (blank line) to finest (empty string).
character_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=0,
    separators=["\n\n", "\n", ". ", " ", ""],
)

# All pages are joined with a blank line between them before splitting
character_split_texts = character_splitter.split_text('\n\n'.join(pdf_texts))

# Second pass: cap every chunk at 256 tokens, again without overlap
token_splitter = SentenceTransformersTokenTextSplitter(tokens_per_chunk=256, chunk_overlap=0)

# Flatten the token-level pieces of every character chunk into one list,
# which is what gets embedded later
token_split_texts = [
    piece
    for chunk in character_split_texts
    for piece in token_splitter.split_text(chunk)
]

Ignoring wrong pointing object 6 0 (offset 0)
Ignoring wrong pointing object 8 0 (offset 0)
Ignoring wrong pointing object 20 0 (offset 0)
Ignoring wrong pointing object 22 0 (offset 0)
Ignoring wrong pointing object 43 0 (offset 0)
Ignoring wrong pointing object 49 0 (offset 0)
Ignoring wrong pointing object 51 0 (offset 0)
Ignoring wrong pointing object 53 0 (offset 0)
Ignoring wrong pointing object 55 0 (offset 0)
Ignoring wrong pointing object 57 0 (offset 0)
Ignoring wrong pointing object 72 0 (offset 0)
Ignoring wrong pointing object 162 0 (offset 0)
Ignoring wrong pointing object 229 0 (offset 0)
Ignoring wrong pointing object 231 0 (offset 0)
Ignoring wrong pointing object 252 0 (offset 0)
Ignoring wrong pointing object 257 0 (offset 0)
Ignoring wrong pointing object 294 0 (offset 0)
Ignoring wrong pointing object 299 0 (offset 0)
Ignoring wrong pointing object 319 0 (offset 0)
Ignoring wrong pointing object 331 0 (offset 0)
Ignoring wrong pointing object 336 0 (offset 0)
Ignor

In [2]:
# Setup chroma
import chromadb
from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction

# Embedding function built with its default arguments
embedding_function = SentenceTransformerEmbeddingFunction()

# Create a client and a fresh collection that embeds with the function above
chroma_client = chromadb.Client()
chroma_collection = chroma_client.create_collection(
    "microsoft_annual_report_2022",
    embedding_function=embedding_function,
)

# Each chunk's id is its position in token_split_texts, as a string
ids = list(map(str, range(len(token_split_texts))))

# Add the chunks to the collection, then display how many it now holds
chroma_collection.add(documents=token_split_texts, ids=ids)
chroma_collection.count()

451

## Re-ranking the long tail

In [4]:
# Ask for a larger-than-usual pool of candidates (10 results) for this query
query = "What has been the investment in research and development?"
results = chroma_collection.query(
    query_texts=query,
    n_results=10,
    include=['documents', 'embeddings'],
)

# Only one query was sent, so take the first entry of the returned documents
retrieved_documents = results['documents'][0]

# Print every retrieved chunk followed by an empty separator line
for hit in retrieved_documents:
    print(hit, end='\n\n')

research and development $ 24, 512 $ 20, 716 18 % as a percent of revenue 12 % 12 % 0ppt research and development expenses include payroll, employee benefits, stock - based compensation expense, and other headcount - related expenses associated with product development. research and development expenses also include third - party development and programming costs, localization costs incurred to translate software for international markets, and the amortization of purchased software code and services content. research and development expenses increased $ 3. 8 billion or 18 % driven by investments in cloud engineering, gaming, and linkedin. sales and marketing ( in millions, except percentages ) 2022 2021 percentage change

. investing in the future our success is based on our ability to create new and compelling products, services, and experiences for our users, to initiate and embrace disruptive technology trends, to enter new geographic and product markets, and to drive broad adoption

In [5]:
from sentence_transformers import CrossEncoder
# Instantiate the cross-encoder from the named model; later cells call
# cross_encoder.predict() on [query, document] pairs to obtain relevance scores.
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

config.json:   0%|          | 0.00/794 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [6]:
# Build one [query, document] pair per retrieved document and score them all;
# scores[i] belongs to retrieved_documents[i]
pairs = [[query, candidate] for candidate in retrieved_documents]
scores = cross_encoder.predict(pairs)

# Header line followed by one score per line
print("\n".join(["Scores:", *map(str, scores)]))

Scores:
1.883019
2.797129
2.9828472
-10.712097
-8.425993
-7.959864
-3.3212464
-9.8535
-10.928937
-8.527864


In [8]:
import numpy as np

# Rank the documents from highest to lowest cross-encoder score
print("New Ordering:")
for position in reversed(np.argsort(scores)):
    # +1 turns the 0-based index into scores into a 1-based rank
    print(position + 1)

New Ordering:
3
2
1
7
6
5
10
8
4
9


## Re-ranking with Query Expansion

In [10]:
# The question being answered; the re-ranking cells below score documents against this one
original_query = "What were the most important factors that contributed to increases in revenue?"

# Related questions, hard-coded here, used to widen the retrieval pool
# (presumably produced by an LLM in an earlier step that is not part of this notebook)
generated_queries = [
    "What were the major drivers of revenue growth?",
    "Were there any new product launches that contributed to the increase in revenue?",
    "Did any changes in pricing or promotions impact the revenue growth?",
    "What were the key market trends that facilitated the increase in revenue?",
    "Did any acquisitions or partnerships contribute to the revenue growth?",
]

In [11]:
# Put the original query first, followed by the generated ones, and retrieve
# 10 results for each so the candidate pool is larger
queries = [original_query, *generated_queries]

results = chroma_collection.query(
    query_texts=queries,
    n_results=10,
    include=['documents', 'embeddings'],
)

# One list of documents per query, in the same order as `queries`
retrieved_documents = results['documents']

In [12]:
# Deduplicate the retrieved documents while keeping first-seen order.
# A set() would also drop duplicates, but the iteration order of a set of
# strings changes between interpreter runs (string hashes are randomized), so
# the positions in unique_documents -- and every index printed from them in the
# cells below -- would differ on each kernel restart. dict keys keep insertion
# order, so the original query's results come first, then any new documents
# contributed by each generated query in turn.
unique_documents = list(dict.fromkeys(
    document
    for documents in retrieved_documents
    for document in documents
))

In [None]:
# Pair the original query with every unique document; pairs[i] lines up with unique_documents[i]
pairs = [[original_query, candidate] for candidate in unique_documents]

In [14]:
# Pass the pairs through the cross-encoder.
# scores[i] is the result for pairs[i]; presumably a higher value means the
# document is more relevant to the query -- confirm against the CrossEncoder docs.
scores = cross_encoder.predict(pairs)

In [15]:
# Header line followed by one cross-encoder score per line
print("\n".join(["Scores:", *map(str, scores)]))

Scores:
-5.6333246
-9.8078785
-1.1550931
-7.782469
-4.9443893
-7.900524
-3.9324174
-4.0156407
-8.623781
-9.5967045
-4.457102
-7.23612
-4.623309
-10.021421
-11.156834
-5.3622727
-4.7538257
-10.119462
-7.1127553
-10.171878


In [None]:
# Observe how documents that were considered less relevant in the original
# embedding similarity retrieval can now score higher in their relevance to
# the original query
print("New Ordering:")
for position in reversed(np.argsort(scores)):
    # Printed as a 0-based index into scores (and so into pairs / unique_documents),
    # highest score first
    print(position)

New Ordering:
2
6
7
10
12
16
4
15
0
18
11
3
5
8
9
1
13
17
19
14
