# Build vector store

In [None]:
import numpy as np
import pandas as pd
import json
import sys

sys.path.append("../src")
from utils import data_to_use
from build_rag_vectorstore import build_vector_db_from_df

In [None]:
# Placeholder locations: fill in the vector-store output directory and the
# pickled feature / label tables before running.
out_dir = "XXX"
data = "XXX"
labels = "XXX"

# Read both pickles, then attach the labels to the feature table as a new
# "viral_diagnosis" column.
# NOTE(review): assumes label_df lines up with data_df's index — confirm.
label_df = pd.read_pickle(labels)
data_df = pd.read_pickle(data)
data_df["viral_diagnosis"] = label_df

# Last expression of the cell: display the combined table.
data_df

In [None]:
# Display the column names of data_df as a Python list.
list(data_df.columns)

In [None]:
%%time
# Build the vector DB from data_df.
# NOTE(review): every column of data_df is currently passed through `cols`;
# narrow this selection if a shorter retrieval context is wanted.
cols_to_include = data_df.columns

vector_db_options = {
    "df": data_df,
    "out_dir": out_dir,
    "cols": cols_to_include,
    "id_col": "record_id",
    # General-purpose sentence embedding model.
    "model_name": "sentence-transformers/all-MiniLM-L6-v2",
    "normalize": True,
}

build_vector_db_from_df(**vector_db_options)

# Test performance of RAG on test data

In [None]:
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
import sys
sys.path.append("../src")
from model_llm_rag_funcs import load_vectorstore_for_rag, retrieve_context
from utils import row_to_json

%config InlineBackend.figure_format='retina'

In [None]:
# Placeholder locations: directory holding the saved RAG vector store and the
# pickled dataframe to evaluate against it.
rag_dir = "XXX"
data_path = "XXX"

# Load the evaluation rows.
data_df = pd.read_pickle(data_path)

In [None]:
%%time
# Per-row summaries of the retrieved values, appended in data_df row order.
averages = []
averages_round = []
weighted_averages = []
weighted_averages_round = []

# Retrieval settings are identical for every query.
rag_top_k = 20
rag_max_tokens = 1200

# Load the vector store once: rag_dir does not change between rows, so
# reloading it inside the loop only repeated the same work and log line.
index, texts, metas, model_info = load_vectorstore_for_rag(rag_dir)
rag_state = {"index": index, "texts": texts, "metas": metas, "model_info": model_info}
print(f"RAG loaded from {rag_dir} — {len(texts)} docs, embedder={model_info.get('model_name')}")

for _, row in data_df.iterrows():
    # Each row is serialized with row_to_json and used as the query.
    query_text = row_to_json(row)

    # NOTE(review): retrieved_block is averaged below, so it is presumably a
    # sequence of numeric values ordered by retrieval rank — confirm against
    # retrieve_context.
    retrieved_block = retrieve_context(
        query_text, rag_state, top_k=rag_top_k, max_ctx_tokens=rag_max_tokens
    )

    # Plain mean, kept both raw and rounded to the nearest integer.
    avg = np.mean(retrieved_block)
    averages.append(avg)
    averages_round.append(int(round(avg)))

    # Weighted mean where earlier positions count for more. The weights decay
    # linearly (n, n-1, ..., 1 for n retrieved values) and are normalized to
    # sum to 1.
    weights = np.arange(len(retrieved_block), 0, -1)
    weights = weights / weights.sum()

    weighted_avg = np.average(retrieved_block, weights=weights)
    weighted_averages.append(weighted_avg)
    weighted_averages_round.append(int(round(weighted_avg)))

In [None]:
# Overlay step histograms of the plain and weighted per-row averages.
fig, ax = plt.subplots(figsize=(8, 5))

# Each series is binned on its own (20 bins per hist call).
histogram_series = (
    (averages, 'Averages'),
    (weighted_averages, 'Weighted Averages'),
)
for series_values, series_label in histogram_series:
    ax.hist(series_values, bins=20, alpha=1, label=series_label, histtype='step', lw=1)

ax.set_title('Histogram of RAG Averages and Weighted Averages')
ax.set_xlabel('Value')
ax.set_ylabel('Frequency')
ax.legend()

plt.show()

In [None]:
from pathlib import Path

# Attach the per-row retrieval summaries to the evaluation table.
data_df["averages"] = averages
data_df["averages_round"] = averages_round
data_df["weighted_averages"] = weighted_averages
data_df["weighted_averages_round"] = weighted_averages_round

# Save next to the input as "<stem>_rag<suffix>". Building the name from the
# stem and suffix (rather than str.replace on ".pkl") guarantees the output
# path differs from data_path, so the input pickle is never overwritten when
# it has another extension or none, and directory names are left untouched.
source_path = Path(data_path)
rag_output_path = source_path.with_name(f"{source_path.stem}_rag{source_path.suffix}")
data_df.to_pickle(rag_output_path)

# Last expression of the cell: display the augmented table.
data_df