This notebook loads all the Question and Answer pairs from a CSV file
Using a given input question, I would then:
- Use query decomposition to construct multiple candidate subqueries
- Feed the subqueries and the QnA pairs to a Cross-Encoder and return the top matching questions

In [1]:
import random
from collections import defaultdict
from sentence_transformers import CrossEncoder, SentenceTransformer, SentencesDataset, SentenceTransformerTrainer, SentenceTransformerTrainingArguments, losses, SimilarityFunction
from sentence_transformers.sampler import BatchSampler
from sentence_transformers.training_args import BatchSamplers
from sentence_transformers.losses import TripletLoss
from sentence_transformers.readers import LabelSentenceReader, InputExample
from torch.utils.data import DataLoader
from datasets import Dataset, load_dataset, Value
import chromadb

trained_model_path = "models/all-MiniLM-L6-v2-trained"
model_r_name = "cross-encoder/ms-marco-MiniLM-L6-v2"

# Load fine-tuned model - we are using a Sentence-BERT model
model2 = SentenceTransformer.load(trained_model_path)
# Load Re-ranker model (cross encoder specifically)
model_r = CrossEncoder(model_r_name)

dataset = Dataset.from_csv("trading-v0.csv")

dataset = dataset.rename_column("Questions", "anchor")
dataset = dataset.rename_column("Answers", "positive")
dataset = dataset.map(lambda e, i: {'negative': f"I do not know"}, with_indices=True).cast_column('negative', Value(dtype='string'))

Map:   0%|          | 0/47 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/47 [00:00<?, ? examples/s]

In [2]:
import dspy
lm = dspy.LM('ollama_chat/gemma3', api_base='http://localhost:11434', api_key='', cache=False, temperature=0)
dspy.configure(lm=lm)

In [3]:
import dspy
from dspy import Signature
decomposer = dspy.Predict('question, context -> subquestions: list, confidence: float')

In [34]:
question = "Do I have to report my HYSA?" # "Do I need preapproval for trading QQQ?"

In [35]:
res = decomposer(question=f"Decompose this complex question into relevant sub-questions. \n\n{question}")

subquestions = []
for subq in res.subquestions:
    if type(subq) is dict:
        subquestions.append(subq['question'])
    else:
        subquestions.append(subq)
print(subquestions)



['What is a HYSA?', 'Are there any tax implications of reporting a HYSA?', 'Does the IRS require reporting of HYSA interest income?', 'What are the potential penalties for failing to report HYSA interest?', 'Are there specific circumstances where reporting a HYSA is recommended (e.g., for tax planning)?']


In [None]:
import numpy as np

queries = subquestions
sentences = dataset['anchor']
candidates = sentences

all_pairs = [(q, c) for q in queries for c in candidates]
all_scores = model_r.predict(all_pairs)

print(sorted(all_scores, reverse=True))

best_idx = np.argmax(all_scores)
best_pair = all_pairs[best_idx]
response = next((x['positive'] for x in dataset if x['anchor'] == best_pair[1]), None)
print(f"Best Global Match:\n  Query: {best_pair[0]}\n  Candidate: {best_pair[1]}\n  Response: {response}\n  Score: {all_scores[best_idx]:.4f}")

[np.float32(-7.8458185), np.float32(-9.172959), np.float32(-9.357361), np.float32(-9.963114), np.float32(-10.273611), np.float32(-10.322607), np.float32(-10.483141), np.float32(-10.551), np.float32(-10.570628), np.float32(-10.734219), np.float32(-10.739819), np.float32(-10.793228), np.float32(-10.832656), np.float32(-10.850616), np.float32(-10.858981), np.float32(-10.888412), np.float32(-10.913237), np.float32(-10.962453), np.float32(-10.978567), np.float32(-11.002724), np.float32(-11.008127), np.float32(-11.0165825), np.float32(-11.017458), np.float32(-11.018847), np.float32(-11.019191), np.float32(-11.059926), np.float32(-11.074915), np.float32(-11.090278), np.float32(-11.093088), np.float32(-11.105385), np.float32(-11.108767), np.float32(-11.125722), np.float32(-11.134575), np.float32(-11.139936), np.float32(-11.141357), np.float32(-11.144301), np.float32(-11.147865), np.float32(-11.15118), np.float32(-11.151602), np.float32(-11.162624), np.float32(-11.168293), np.float32(-11.171454

In [37]:
ranks = model_r.rank(question, sentences)
print(f"Query ==> {question}")
for rank in ranks:
    print(f"{rank['score']:.2f}\t {sentences[rank['corpus_id']]}")

Query ==> Do I have to report my HYSA?
-7.83	 Do high yield savings need to be reported?
-8.65	 Do i have to declare restricted stocks awarded by my previous employer
-8.86	 Do I have to disclose my 529 account opened by my parents. I don't have access Do i need to disclose my parents trades, even though i have no access to them?
-8.93	 Do we have to disclose accounts for family members I live with but don't have any access to
-8.96	 Do i have to disclose accounts that I don't have any investments in? (Ex. Robinhood account with no investments in currently)
-9.83	 Are normal 401Ks required to be reported from prior employers?
-9.91	 Do we have to disclose index futures trading accounts?
-10.54	 Do we need to declare non restricted securities.
-10.56	 Do we have to disclose we have an account with Robinhood or Fidelity if there is no money or trades in said accounts?
-10.88	 Do my wife's 403B account have to be disclosed?
-10.95	 Do I need to transfer my Fundrise account?
-11.11	 Can yo