# 🔎 Retrieval (and QG?) playground

In [None]:
import json
import random

import matplotlib.pyplot as plt
from IPython.display import display, Markdown, Latex
from langchain_community.retrievers import KNNRetriever
# from langchain_chroma import Chroma
from langchain_core.documents import Document
from langchain_core.example_selectors import (
    MaxMarginalRelevanceExampleSelector,
    SemanticSimilarityExampleSelector,
)
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from langchain_text_splitters import CharacterTextSplitter, RecursiveCharacterTextSplitter
from sentence_transformers import CrossEncoder


In [2]:
!ollama pull qwen3-vl:32b

[?2026h[?25l[1Gpulling manifest ‚†ã [K[?25h[?2026l[?2026h[?25l[1Gpulling manifest ‚†ô [K[?25h[?2026l[?2026h[?25l[1Gpulling manifest ‚†π [K[?25h[?2026l[?2026h[?25l[1Gpulling manifest [K[?25h[?2026l
Error: pull model manifest: 412: 

The model you are attempting to pull requires a newer version of Ollama.

Please download the latest version at:

	https://ollama.com/download




## ⚙️ 0. Choose claim & docstore

In [3]:
random.seed(111)

In [15]:
DATA_DIR = "/mnt/data/factcheck/averimatec"
SPLIT = "val"

with open(f"{DATA_DIR}/{SPLIT}.json", encoding="utf-8") as f:
    datapoints = json.load(f)

# randrange excludes its upper bound; randint(0, len(datapoints)) could return
# len(datapoints) and make the lookup below raise IndexError.
CLAIM_ID = random.randrange(len(datapoints))
datapoint = datapoints[CLAIM_ID]
claim = datapoint["claim_text"]

# The knowledge store holds one JSON object per line; each record is read
# below through its "url" and "url2text" (list of sentences) keys.
docstore_path = (
    f"{DATA_DIR}/knowledge_store/{SPLIT}/text_related/"
    f"text_related_store_text_{SPLIT}/{CLAIM_ID}.json"
)
with open(docstore_path, encoding="utf-8") as f:
    docstore = [json.loads(line) for line in f if line.strip()]

# Print the claim (with its label) followed by a few sample evidence pages.
display(Markdown("### 🗯️ " + claim + " [" + datapoint["label"] + "]"))
# Sample at most 3 pages so a tiny docstore does not raise ValueError.
for page in random.sample(docstore, min(3, len(docstore))):
    preview = "\n".join(page["url2text"][:10])
    display(Markdown(f"**{page['url']}**\n\n * {preview}"))

### 🗯️ The White House violated flag code by putting a LGBTQ+ pride flag in between two U.S. flags. [Refuted]

**https://www.fordlibrarymuseum.gov/the-fords/gerald-r-ford/key-speeches-and-writings-gerald-r-ford**

 * Pre-Presidential Speeches
December 6, 1973
Mr. Speaker, Mr. Chief Justice, Mr. President pro tempore, distinguished guests and friends:
Together we have made history here today. For the first time we have carried out the command of the 25th Amendment.
In exactly 8 weeks, we have demonstrated to the world that our great Republic stands solid, stands strong upon the bedrock of the Constitution.
I am a Ford, not a Lincoln. My addresses will never be as eloquent as Mr. Lincoln's. But I will do my very best to equal his brevity and his plain speaking.
I am deeply grateful to you, Mr. President, for the trust and the confidence your nomination implies.
As I have throughout my public service under six administrations I will try to set a fine example of respect for the crushing and lonely burdens which the Nation lays upon the President of the United States.
Mr. President, you have my support and my loyalty.
To the Congress assembled, my former colleagues who have elected me on behalf of our fellow countrymen I express my heartfelt thanks.

**https://mainelegion.org/pages/resources/links/the-flag.php**

 * The American Legion Flag Advocacy Flag Code Flag Questions & Answers Folding the Flag Frequency Asked Questions The Pledge of Allegiance Unserviceable Flags

**https://give.hrc.org/page/23977/donate/1?locale=en-US**

 * Each and every day, the Human Rights Campaign flies our flag with pride above our Washington, D.C. offices.
Our logo is one of the most recognizable symbols of the lesbian, gay, bisexual, transgender and queer community (LGBTQ) -- and has become synonymous with the fight for equality.
Now you can honor a loved one or friend by flying the HRC flag above the heart of the nation's capital, only blocks away from the White House.
Once your flag has been flown, we will send it to you with a certificate to commemorate your contribution to the fight for equality.
Please allow 15 business days from the day your flag is flown for your shipment to arrive. Flag measures 3 ft. x 4 ft.
Your flag will be shipped via UPS Ground. Domestic shipping only. No PO Boxes.
If you have any questions, please contact [email protected].
Please note that the minimum contribution for the HRC Flag Program is $150.

### 🗂️ 0.1 Docstore formatting/scraping

In [12]:
# Naive version: one Document per scraped page, with the page's sentences
# joined by single spaces and only the source URL kept as metadata.
documents = []
for page in docstore:
    page_text = " ".join(page["url2text"])
    documents.append(Document(page_content=page_text, metadata={"url": page["url"]}))

TOKENS_PER_CHAR = 0.25
EMBEDDING_INPUT_SIZE = 512
# Character budget of one chunk, derived from the two constants above.
MAX_CHUNK_CHARS = EMBEDDING_INPUT_SIZE / TOKENS_PER_CHAR


def _append_chunk(chunks, url, text):
    """Append `text` as a new chunk Document and link it to its predecessor.

    If the previous chunk has the same URL, its "context_after" is set to
    `text` and the new chunk's "context_before" is set to the previous chunk's
    text; otherwise "context_before" stays empty.
    """
    context_before = ""
    if chunks and chunks[-1].metadata["url"] == url:
        chunks[-1].metadata["context_after"] = text
        context_before = chunks[-1].page_content
    chunks.append(
        Document(
            page_content=text,
            metadata={"url": url, "context_before": context_before, "context_after": ""},
        )
    )


# Greedily pack consecutive sentences of each page into chunks of at most
# roughly MAX_CHUNK_CHARS characters.
chunks = []
for doc in docstore:
    buffer = ""
    for sentence in doc["url2text"]:
        # Flush only a non-empty buffer, so an over-long first sentence does
        # not produce an empty chunk.
        if buffer and len(buffer) + len(sentence) >= MAX_CHUNK_CHARS:
            _append_chunk(chunks, doc["url"], buffer)
            buffer = ""
        buffer += sentence + " "
    # Flush the remainder; previously the last sentence of every page was
    # left in the buffer and silently dropped.
    if buffer:
        _append_chunk(chunks, doc["url"], buffer)

# Peek at a random chunk (random.choice cannot index past the end, unlike
# randint(0, len(chunks))).
random.choice(chunks) if chunks else None

In [10]:
from utils.chat import  pretty_print

In [13]:
# Index of the chunk to inspect (-1 selects the last chunk).
chid = -1

# Render the preceding context, the chunk itself and the following context,
# then show the chunk's source URL as the cell value.
inspected = chunks[chid]
for part in (
    inspected.metadata["context_before"],
    inspected.page_content,
    inspected.metadata["context_after"],
):
    display(Markdown(part))
inspected.metadata["url"]

In [14]:
from langchain_community.retrievers import BM25Retriever

# Lexical pre-filter: ask BM25 for up to 2500 chunks matching the claim.
bm25_top_k = 2500
retriever = BM25Retriever.from_documents(chunks, k=bm25_top_k)
chunks_pruned = retriever.invoke(claim)

### 📊 Docstore hist & truncation

In [17]:
len(chunks),len(chunks_pruned)

In [16]:
# Histogram of chunk lengths in characters, 50 bins over the range 0-3000.
chunk_lengths = [len(chunk.page_content) for chunk in chunks]
plt.hist(chunk_lengths, bins=50, range=(0, 3000))

In [18]:
# Histogram of chunk lengths in characters, 50 bins over the range 0-3000.
chunk_lengths = [len(chunk.page_content) for chunk in chunks]
plt.hist(chunk_lengths, bins=50, range=(0, 3000))

In [19]:
# Show the document with the most sentences.
# `documents` keeps only the URL in its metadata (no "sentences" entry), so the
# sentence count is taken from the raw docstore record each Document was built
# from; `documents` is constructed from `docstore` in the same order.
_, max_sentences = max(
    zip(docstore, documents),
    key=lambda pair: len(pair[0]["url2text"]),
)
# Print the URL explicitly: a bare expression in the middle of a cell is not shown.
print(max_sentences.metadata["url"])
max_sentences.page_content[:1000]

In [110]:
# Cap every document's text at its first 13000 characters (in place).
max_doc_chars = 13000
for document in documents:
    document.page_content = document.page_content[:max_doc_chars]

## 📐 1. Embedding searches

In [20]:
embeddings = HuggingFaceEmbeddings(model_name="Linq-AI-Research/Linq-Embed-Mistral")

In [None]:
# Dense retrieval over the whole-page documents: fetch the 10 nearest
# neighbours of the claim in embedding space.
reranker = KNNRetriever.from_documents(documents, embeddings, k=10)
# Use the Runnable `.invoke` API (as the BM25 cells do) instead of the
# deprecated `get_relevant_documents`.
results = reranker.invoke(claim)

display(Markdown("### 🗯️ " + claim))
display(Markdown("*Retrieved by " + embeddings.model_name + "*\n\n"))
# Show the URL and the first 256 characters of every retrieved document.
for result in results:
    display(Markdown(f"**{result.metadata['url']}**\n\n{result.page_content[:256]}"))

In [None]:
# Dense retrieval over the whole-page documents: fetch the 10 nearest
# neighbours of the claim in embedding space.
reranker = KNNRetriever.from_documents(documents, embeddings, k=10)
# Use the Runnable `.invoke` API (as the BM25 cells do) instead of the
# deprecated `get_relevant_documents`.
results = reranker.invoke(claim)

display(Markdown("### 🗯️ " + claim))
display(Markdown("*Retrieved by " + embeddings.model_name + "*\n\n"))
# Show the URL and the first 256 characters of every retrieved document.
for result in results:
    display(Markdown(f"**{result.metadata['url']}**\n\n{result.page_content[:256]}"))

In [None]:
# Free as much CUDA memory as possible between experiments.
# NOTE(review): this only helps for tensors that are no longer referenced;
# anything still bound to a notebook variable (presumably e.g. `embeddings`)
# keeps its memory — `del` such names first if a full purge is intended.
import gc
import torch
# Run the garbage collector first so unreachable objects are released ...
gc.collect()
# ... then ask PyTorch to release its cached, currently unused CUDA memory.
torch.cuda.empty_cache()

## 📐 Maximal Marginal Relevance (MMR)


In [92]:
%pip install --upgrade --quiet  rank_bm25

In [55]:
from langchain_community.retrievers import BM25Retriever

# Lexical pre-filter: ask BM25 for up to 3000 chunks matching the claim.
bm25_top_k = 3000
retriever = BM25Retriever.from_documents(chunks, k=bm25_top_k)
chunks_pruned = retriever.invoke(claim)

In [109]:
len(chunks_pruned)

In [110]:
# `Chroma` is not imported at the top of the notebook (the import there is
# commented out), so bring it into scope here.
from langchain_chroma import Chroma

collection_name = f"dev_{CLAIM_ID}_mxbai"
persist_directory = f"data_store/vector_store_dev/{CLAIM_ID}"

# Start from a clean slate: drop any collection left over from a previous run.
Chroma(collection_name=collection_name, persist_directory=persist_directory).delete_collection()

# Build fresh Documents without the "sentences" metadata entry, so the
# metadata dicts of `chunks_pruned` are not mutated (a shallow copy would
# share them).
documents_ = [
    Document(
        page_content=doc.page_content,
        metadata={key: value for key, value in doc.metadata.items() if key != "sentences"},
    )
    for doc in chunks_pruned
]

# `from_documents` is a classmethod: calling it on the instance ignored the
# instance's collection name and persist directory, so pass them explicitly.
chroma = Chroma.from_documents(
    documents_,
    embeddings,
    collection_name=collection_name,
    persist_directory=persist_directory,
)


In [106]:
# make or overwrite /home/ullriher/aic_averitec/data_store/vector_store_dev/CLAIM_ID and persist chroma there
# The `!` line is an IPython shell escape; it creates the target directory.
!mkdir -p /home/ullriher/aic_averitec/data_store/vector_store_dev/{CLAIM_ID}
# NOTE(review): this *calls* the `chroma` object with a path. If `chroma` is a
# vector-store instance, it is presumably not callable and this line would
# raise TypeError — confirm which persistence call was intended here.
chroma(f"/home/ullriher/aic_averitec/data_store/vector_store_dev/{CLAIM_ID}")

In [114]:
claim

In [14]:
embeddings = HuggingFaceEmbeddings(model_name="mixedbread-ai/mxbai-embed-large-v1")

In [56]:
len(chunks_pruned)

In [58]:
from langchain_community.retrievers import BM25Retriever

# Lexical pre-filter: ask BM25 for up to 6000 chunks matching the claim.
bm25_top_k = 6000
retriever = BM25Retriever.from_documents(chunks, k=bm25_top_k)
chunks_pruned = retriever.invoke(claim)

In [1]:
from langchain_community.vectorstores import FAISS

# Embed the BM25-pruned chunks into a FAISS index and store it on disk under
# a per-claim directory.
faiss_dir = f"data_store/vecstore/dev/6k/{CLAIM_ID}"
db = FAISS.from_documents(chunks_pruned, embeddings)
db.save_local(faiss_dir)

In [10]:
from langchain_community.vectorstores import FAISS

# Reload the per-claim FAISS index from disk.
# NOTE: allow_dangerous_deserialization=True opts in to unpickling the stored
# docstore — only load index directories you created yourself.
faiss_dir = f"data_store/vecstore/dev/6k/{CLAIM_ID}"
db = FAISS.load_local(
    faiss_dir,
    embeddings,
    allow_dangerous_deserialization=True,
)

In [11]:
from utils.chat import SimpleJSONChat

# System prompt for the query-generation model: it is asked for 10 search
# queries per claim. The text contains no placeholders, so a plain string is
# equivalent to the former f-string.
query_generation_prompt = """Pretend you are a researcher who receives a claim where your goal is to be as unbiased as possible. There are two teams: Query Generation (your team) and Information Acquisition. The goal of your team is to generate pertinent queries based on the claim such that when queried on Google/Bing, the results will be accurate and helpful in finding relevant sources. The second team then uses those sources to further refine whether the claim is supported (true), unsupported (false), or cherry picked (conflicting evidence). In summary, you will receive a claim and the task is to generate 10 queries that are relevant to the claim, paying mind to the different possible labels (i.e ensure the generated queries cover all possible labels). A necessary requirement is to include metadata like claim date, speaker, and reporting source. Use your imagination and take time to be thoughtful with queries, ensuring relevance to the claim. The queries should be formatted in a manner ready for querying via Google/Bing API, so no need for extra text or explanations intended for a user."""

chat = SimpleJSONChat(
    model="gpt-4o",
    system_prompt=query_generation_prompt,
    parse_output=False,
)

In [12]:
datapoint

In [13]:
claim

In [14]:
# Send the claim to the query generator, with speaker and claim date appended
# in parentheses, and show the raw response.
claim_metadata = f" (Speaker: {datapoint['speaker']}, {datapoint['claim_date']})"
res = chat(claim + claim_metadata)
res

In [15]:
import re

# Leading list marker of one response line, e.g. `1. `, `12) ` or `- 3. `.
_ENUMERATION_PREFIX = re.compile(r"^\s*(?:[-*]\s*)?\d+[.)]\s+")


def parse(res):
    """Extract the search queries from a numbered-list model response.

    Each non-blank line is expected to look like `1. "some query"`. The
    leading enumeration and surrounding double quotes are removed.

    Only the enumeration prefix is stripped, so queries that themselves
    contain ". " (for example "U.S. flag code") are returned in full, and
    blank or unnumbered lines no longer raise IndexError.

    Args:
        res: Raw text response, one query per line.

    Returns:
        List of query strings in their original order.
    """
    queries = []
    for line in res.splitlines():
        line = line.strip()
        if not line:
            continue
        query = _ENUMERATION_PREFIX.sub("", line, count=1).strip().strip('"')
        if query:
            queries.append(query)
    return queries

In [26]:
# Pick a random dev-set claim. To reproduce a specific example, replace the
# random draw with a fixed index (e.g. CLAIM_ID = 253).
# NOTE(review): the bound 500 is hard-coded — presumably the number of dev
# claims that have a saved FAISS index; confirm.
CLAIM_ID = random.randrange(500)
with open("data/dev.json", encoding="utf-8") as f:
    datapoint = json.load(f)[CLAIM_ID]
claim = datapoint["claim"]

# Generate search queries for the claim and load its pre-built vector store.
res = chat(claim + f" (Speaker: {datapoint['speaker']}, {datapoint['claim_date']})")
db = FAISS.load_local(
    f"data_store/vecstore/dev/6k/{CLAIM_ID}",
    embeddings,
    allow_dangerous_deserialization=True,
)

display(Markdown(f"## 🗯️ {CLAIM_ID}: {claim} [{datapoint['label']}]"))

# For every generated query, show the 2 most similar chunks.
for query in parse(res):
    results = db.similarity_search(query, k=2)

    display(Markdown("### 🔎 " + query))
    for result in results:
        display(Markdown(f"**{result.metadata['url']}**\n\n{result.page_content}"))

In [24]:
parse(res)

In [25]:
# Plain nearest-neighbour search: the 10 chunks most similar to the claim.
results = db.similarity_search(claim, k=10)

display(Markdown("### 🗯️ " + claim))
# This cell runs similarity search, not MMR, so label it accordingly.
display(Markdown("*Retrieved by " + embeddings.model_name + " (similarity search)*\n\n"))
for result in results:
    display(Markdown(f"**{result.metadata['url']}**\n\n{result.page_content}"))

In [119]:
# Maximal marginal relevance over the FAISS index: fetch 40 candidates and
# select 10 of them, with lambda_mult=0.5 weighting relevance vs. diversity.
results = db.max_marginal_relevance_search(claim, k=10, fetch_k=40, lambda_mult=0.5)

display(Markdown("### 🗯️ " + claim))
display(Markdown("*Retrieved by " + embeddings.model_name + " (mmr λ=0.5)*\n\n"))
# Show the URL and the first 256 characters of every selected chunk.
for result in results:
    display(Markdown(f"**{result.metadata['url']}**\n\n{result.page_content[:256]}"))

In [112]:
# Maximal marginal relevance over the Chroma store: fetch 40 candidates and
# select 10 of them, with lambda_mult=0.5 weighting relevance vs. diversity.
results = chroma.max_marginal_relevance_search(claim, k=10, fetch_k=40, lambda_mult=0.5)

display(Markdown("### 🗯️ " + claim))
display(Markdown("*Retrieved by " + embeddings.model_name + " (mmr λ=0.5)*\n\n"))
# Show the URL and the first 256 characters of every selected chunk.
for result in results:
    display(Markdown(f"**{result.metadata['url']}**\n\n{result.page_content[:256]}"))

## ❌ Cross-encoder re-ranking

In [100]:
# Load the cross-encoder re-ranking model.
model_name = "mixedbread-ai/mxbai-rerank-large-v1"
model = CrossEncoder(model_name)

# Score every BM25-pruned chunk against the claim and keep the 10 best hits.
results = model.rank(
    claim,
    [doc.page_content for doc in chunks_pruned],
    return_documents=True,
    top_k=10,
)

display(Markdown("### 🗯️ " + claim))
display(Markdown("*Retrieved by " + model_name + "*\n\n"))
# Each hit refers to its chunk through "corpus_id", an index into the list
# passed to rank(); look the chunk up to recover its URL metadata.
for hit in results:
    chunk = chunks_pruned[hit["corpus_id"]]
    display(Markdown(f"**{chunk.metadata['url']}**\n\n{chunk.page_content[:256]}"))

In [101]:
# Show the re-ranked chunks again, this time with their full text.
# Each hit's "corpus_id" is used as an index into `chunks_pruned`.
for hit in results:
    chunk = chunks_pruned[hit["corpus_id"]]
    display(Markdown(f"**{chunk.metadata['url']}**\n\n{chunk.page_content}"))