This notebook loads all the question-and-answer (QnA) pairs from a CSV file.
Given an input question, it then:
- Uses query decomposition to construct multiple candidate subqueries
- Feeds the subqueries and the stored questions from the QnA pairs to a Cross-Encoder and returns the top matching questions

In [None]:
import random
from collections import defaultdict
from sentence_transformers import CrossEncoder, SentenceTransformer, SentencesDataset, SentenceTransformerTrainer, SentenceTransformerTrainingArguments, losses, SimilarityFunction
from sentence_transformers.sampler import BatchSampler
from sentence_transformers.training_args import BatchSamplers
from sentence_transformers.losses import TripletLoss
from sentence_transformers.readers import LabelSentenceReader, InputExample
from torch.utils.data import DataLoader
from datasets import Dataset, load_dataset, Value
import chromadb

# Where the two models come from: a path to the fine-tuned Sentence-BERT
# model and the name of the cross-encoder used as re-ranker.
trained_model_path = "models/all-MiniLM-L6-v2-trained"
model_r_name = "cross-encoder/ms-marco-MiniLM-L6-v2"

# Load fine-tuned model - we are using a Sentence-BERT model
model2 = SentenceTransformer.load(trained_model_path)
# Load Re-ranker model (cross encoder specifically)
model_r = CrossEncoder(model_r_name)

# Read the QnA pairs and rename the CSV columns to anchor / positive in one step.
dataset = Dataset.from_csv("trading-v0.csv")
dataset = dataset.rename_columns({"Questions": "anchor", "Answers": "positive"})

# Every row gets the same constant 'negative' text. The value does not depend
# on the row or its index, so a plain string and a one-argument mapper suffice.
dataset = dataset.map(lambda _row: {'negative': "I do not know"}).cast_column('negative', Value(dtype='string'))

In [None]:
import dspy
# Language model used by every DSPy module in this notebook: llama3.2 served
# through the Ollama chat endpoint on localhost, with caching disabled and
# temperature 0.
lm = dspy.LM(
    'ollama_chat/llama3.2',
    api_base='http://localhost:11434',
    api_key='',
    cache=False,
    temperature=0,
)
dspy.configure(lm=lm)

In [None]:
import dspy
from dspy import Signature
# Predictor that maps a question to a list of sub-questions plus a confidence
# value. `question` is the only declared input because it is the only field
# the notebook passes when calling the predictor; declaring an input that is
# never supplied leaves a dangling field in every request.
decomposer = dspy.Predict('question -> subquestions: list, confidence: float')

In [None]:
# Input question for the rest of the notebook. The value below is a
# placeholder - replace it with the real question before running later cells.
question = ".................."

In [None]:
# Ask the LM to split the question into sub-questions. The instruction text is
# prepended to the question itself.
res = decomposer(question=f"Decompose this complex question into relevant sub-questions. \n\n{question}")

# The `subquestions` output is an untyped list: entries may be plain strings
# or dicts carrying the text under a 'question' key. Normalise to strings.
subquestions = []
for subq in res.subquestions or []:  # tolerate a missing/None result
    if isinstance(subq, dict):
        text = subq.get('question')
        if text is None:
            # Report instead of raising KeyError on an unexpected dict shape.
            print(f"Skipping sub-question without a 'question' key: {subq!r}")
            continue
    else:
        text = subq
    # The cross-encoder downstream expects text, so coerce defensively.
    subquestions.append(str(text))
print(subquestions)

In [None]:
import numpy as np

# Score every (sub-question, stored question) pair with the cross-encoder and
# report the single best-scoring pair together with its stored answer.
queries = subquestions
# Materialise the column as a plain list so that indexing and iteration behave
# the same whether `dataset['anchor']` yields a list or a lazy column object.
sentences = list(dataset['anchor'])
candidates = sentences

all_pairs = [(q, c) for q in queries for c in candidates]

if all_pairs:
    all_scores = model_r.predict(all_pairs)

    print(sorted(all_scores, reverse=True))

    # Plain int so it can be used to index both the list and the dataset.
    best_idx = int(np.argmax(all_scores))
    best_pair = all_pairs[best_idx]
    # Pairs are laid out query-major (all candidates for query 0, then query
    # 1, ...), so the candidate's dataset row is the index modulo the number
    # of candidates. This avoids scanning the whole dataset for matching text.
    response = dataset[best_idx % len(candidates)]['positive']
    print(f"Best Global Match:\n  Query: {best_pair[0]}\n  Candidate: {best_pair[1]}\n  Response: {response}\n  Score: {all_scores[best_idx]:.4f}")
else:
    # No sub-questions or no stored questions: argmax over an empty score
    # array would raise, so report the situation instead.
    all_scores = []
    best_idx = best_pair = response = None
    print("No (sub-question, candidate) pairs to score; nothing to match.")

In [None]:
# Rank the stored questions directly against the original (undecomposed)
# question and print each score next to its question text, in the order
# returned by `rank`.
ranks = model_r.rank(question, sentences)
print(f"Query ==> {question}")
for hit in ranks:
    matched_text = sentences[hit['corpus_id']]
    print(f"{hit['score']:.2f}\t {matched_text}")