In [1]:
# Install dependencies (run once)
!pip install pandas python-dotenv requests sentence-transformers faiss-cpu scikit-learn



In [2]:
import pandas as pd
import numpy as np
import json
import os
import re
import pickle
from pathlib import Path
from dotenv import load_dotenv
from difflib import get_close_matches
import requests

from sentence_transformers import SentenceTransformer
import faiss
from sklearn.preprocessing import normalize

# Pull variables from a local .env file into the environment, then read the key.
load_dotenv()
OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY")

# Report whether a key is available; the notebook keeps running either way.
print(
    "API key loaded successfully"
    if OPENROUTER_API_KEY
    else "WARNING: OPENROUTER_API_KEY not found in .env file"
)

API key loaded successfully


In [3]:
# Locations of the input CSVs and of the on-disk cache for derived artifacts.
DATA_DIR = Path("data")
CACHE_DIR = DATA_DIR / "cache"
# parents=True: creating the cache folder must not fail just because `data/`
# itself is missing; a missing CSV is then reported by read_csv below.
CACHE_DIR.mkdir(parents=True, exist_ok=True)

TRADES_CSV = DATA_DIR / "trades.csv"
HOLDINGS_CSV = DATA_DIR / "holdings.csv"

# Load data. Everything is read as str so no value is silently re-typed;
# numeric companions are derived explicitly further down.
print("Loading data...")
trades = pd.read_csv(TRADES_CSV, dtype=str)
holdings = pd.read_csv(HOLDINGS_CSV, dtype=str)

print(f"Loaded {len(trades)} trades")
print(f"Loaded {len(holdings)} holdings")

Loading data...
Loaded 649 trades
Loaded 1022 holdings


In [4]:
# `trades` and `holdings` were already read (as str) in the previous cell;
# reading the same CSVs a second time was redundant, so only report the shapes.
print("Trades shape:", trades.shape)
print("Holdings shape:", holdings.shape)

Trades shape: (649, 31)
Holdings shape: (1022, 25)


In [5]:
# Preview the first three rows of each dataset, trades first.
for frame in (trades, holdings):
    display(frame.head(3))

Unnamed: 0,id,RevisionId,AllocationId,TradeTypeName,SecurityId,SecurityType,Name,Ticker,CUSIP,ISIN,...,AllocationFees,AllocationCash,PortfolioName,CustodianName,StrategyName,Strategy1Name,Strategy2Name,Counterparty,AllocationRule,IsCustomAllocation
0,3489863,2,3460886,Buy,270471,Equity,Berry Brand 4/11 Equity,,,,...,2800.0,7002800.0,HoldCo 1,JP MORGAN SECURITIES LLC,Default,DefaultS1,DefaultS2,ABGS,Single Fund Rule - HoldCo 1,1
1,3489864,1,3460887,Sell,270471,Equity,Berry Brand 4/11 Equity,,,,...,128.8,6999871.2,HoldCo 1,JP MORGAN SECURITIES LLC,Default,DefaultS1,DefaultS2,ABGS,Single Fund Rule - HoldCo 1,0
2,3496826,1,3462756,Sell,290063,Equity,META-US,META,30303M102,US30303M1027,...,46985.99,2553539898.0,HoldCo 3,CITIGROUP GLOBAL MARKETS INC.,Default,DefaultS1,DefaultS2,ABGS,Single Fund Rule - HoldCo 3,0


Unnamed: 0,AsOfDate,OpenDate,CloseDate,ShortName,PortfolioName,StrategyRefShortName,Strategy1RefShortName,Strategy2RefShortName,CustodianName,DirectionName,...,StartPrice,Price,StartFXRate,FXRate,MV_Local,MV_Base,PL_DTD,PL_QTD,PL_MTD,PL_YTD
0,01/08/23,04/03/20,,Garfield,Garfield,Default,Asset,DefaultS2,Well Prime,Long,...,96,96,1.33,1.33,568320.0,755865.6,92.504,10833.7294,92.504,41054.5854
1,01/08/23,04/03/20,,Garfield,Garfield,Default,Asset,DefaultS2,Well Prime,Long,...,96,96,1.33,1.33,84.48,112.3584,0.0138,1.6104,0.0138,6.1027
2,01/08/23,04/03/20,,Garfield,Garfield,Default,Asset,DefaultS2,Well Prime,Long,...,96,96,1.33,1.33,756000.0,1005480.0,123.0523,14411.4221,123.0523,54612.3074


In [6]:
# List the column names of both datasets.
print("Trades columns:", trades.columns.tolist())
print("Holdings columns:", holdings.columns.tolist())

Trades columns: ['id', 'RevisionId', 'AllocationId', 'TradeTypeName', 'SecurityId', 'SecurityType', 'Name', 'Ticker', 'CUSIP', 'ISIN', 'TradeDate', 'SettleDate', 'Quantity', 'Price', 'TradeFXRate', 'Principal', 'Interest', 'TotalCash', 'AllocationQTY', 'AllocationPrincipal', 'AllocationInterest', 'AllocationFees', 'AllocationCash', 'PortfolioName', 'CustodianName', 'StrategyName', 'Strategy1Name', 'Strategy2Name', 'Counterparty', 'AllocationRule', 'IsCustomAllocation']
Holdings columns: ['AsOfDate', 'OpenDate', 'CloseDate', 'ShortName', 'PortfolioName', 'StrategyRefShortName', 'Strategy1RefShortName', 'Strategy2RefShortName', 'CustodianName', 'DirectionName', 'SecurityId', 'SecurityTypeName', 'SecName', 'StartQty', 'Qty', 'StartPrice', 'Price', 'StartFXRate', 'FXRate', 'MV_Local', 'MV_Base', 'PL_DTD', 'PL_QTD', 'PL_MTD', 'PL_YTD']


In [7]:
def preprocess_df(df):
    """Return a cleaned copy of ``df``; the input frame is not modified.

    - String column labels are stripped of surrounding whitespace; non-string
      labels (e.g. integers) are kept as they are instead of raising.
    - Every text column (object dtype or a pandas string dtype) has missing
      values replaced by '' and each value converted to a stripped ``str``.
    - Other columns (numeric, datetime, ...) are left untouched.
    """
    cleaned = df.copy()
    cleaned.columns = [
        name.strip() if isinstance(name, str) else name for name in cleaned.columns
    ]
    for name in cleaned.columns:
        column = cleaned[name]
        # Checking only `dtype == object` would skip columns that use a dedicated
        # pandas string dtype, so both kinds are accepted here.
        if pd.api.types.is_object_dtype(column) or pd.api.types.is_string_dtype(column):
            cleaned[name] = column.fillna('').astype(str).str.strip()
    return cleaned

# Replace both raw frames with their cleaned copies.
trades, holdings = (preprocess_df(frame) for frame in (trades, holdings))

In [8]:
def safe_to_numeric(series):
    """Parse a Series of number-like text into floats.

    Thousands separators are removed and accounting-style parentheses become a
    leading minus sign ("(5.5)" -> -5.5). Values that still cannot be parsed
    become NaN. ``None`` yields an empty float Series.
    """
    if series is None:
        return pd.Series(dtype=float)

    cleaned = series.astype(str)
    # Applied in order: drop ',', turn '(' into '-', drop ')'.
    for old, new in ((',', ''), ('(', '-'), (')', '')):
        cleaned = cleaned.str.replace(old, new, regex=False)
    return pd.to_numeric(cleaned, errors='coerce')

# Add a parsed `<col>_num` companion next to every listed raw column that exists.
for frame, raw_cols in (
    (trades, ['Quantity', 'Price', 'Principal', 'TotalCash', 'AllocationQTY']),
    (holdings, ['Qty', 'StartQty', 'Price', 'MV_Base', 'MV_Local', 'PL_YTD', 'PL_MTD', 'PL_QTD', 'PL_DTD']),
):
    for col in raw_cols:
        if col not in frame.columns:
            continue
        frame[col + '_num'] = safe_to_numeric(frame[col])

print("Data preprocessed and numeric columns converted")

Data preprocessed and numeric columns converted


In [9]:
def unique_nonnull_values(df, col):
    """Return the sorted distinct, meaningful values of ``df[col]`` as strings.

    Missing values (NaN/None), empty or whitespace-only strings and the literal
    text 'NULL' (any casing) are excluded. Returns [] when the column does not
    exist.
    """
    if col not in df.columns:
        return []
    # Drop real missing values *before* casting to str; otherwise they would
    # turn into the strings 'nan' / 'None' and slip through the filter below.
    values = df[col].dropna().astype(str).str.strip()
    keep = values[(values != '') & (values.str.upper() != 'NULL')]
    return sorted(keep.unique().tolist())

In [10]:
# Known identifiers used later to recognise entities in questions.
PORTFOLIOS_TRADES = unique_nonnull_values(trades, 'PortfolioName')
PORTFOLIOS_HOLDINGS = unique_nonnull_values(holdings, 'PortfolioName')
SHORTNAMES_HOLDINGS = unique_nonnull_values(holdings, 'ShortName')
TICKERS = unique_nonnull_values(trades, 'Ticker')

# Union of every portfolio-like name (holdings ShortName acts as a fallback id).
ALL_PORTFOLIOS = sorted({*PORTFOLIOS_TRADES, *PORTFOLIOS_HOLDINGS, *SHORTNAMES_HOLDINGS})

print(f"\nFound {len(ALL_PORTFOLIOS)} unique portfolio identifiers (including ShortName fallbacks)")
print(f"Found {len(TICKERS)} unique tickers")


Found 41 unique portfolio identifiers (including ShortName fallbacks)
Found 44 unique tickers


In [11]:
def normalize_text(text):
    """Reduce ``text`` to a comparison key: lowercase, only the characters a-z and 0-9.

    ``None`` maps to the empty string; any other value is converted with ``str``.
    """
    return "" if text is None else re.sub(r'[^a-z0-9]', '', str(text).lower().strip())

def extract_entities_fuzzy(question, debug=False):
    """Extract filter entities from a free-text question.

    Returns a dict that may contain (each key only when detected):
      - 'portfolio': a known portfolio/short name matched exactly (after
        normalization) against the whole question or against a run of
        consecutive words in it.
      - 'portfolio_suggestion': closest known portfolio for a single word
        (difflib, cutoff 0.7); only set when no exact match was found.
      - 'ticker': a known ticker, or otherwise the first all-caps word of
        1-5 letters found in the question.
      - 'trade_type': 'Buy' or 'Sell'.
      - 'metric': one of the keys of ``metric_aliases`` below.
      - 'condition': 'negative'.

    ``question`` may be None or empty, in which case {} is returned.
    ``debug=True`` prints how the portfolio was matched.
    """
    q = question or ""
    q_lower = q.lower()
    q_norm = normalize_text(q)
    tokens = re.findall(r'[A-Za-z0-9]+', q)
    norm_tokens = [normalize_text(t) for t in tokens]

    entities = {}

    # normalized key -> display name (inner whitespace collapsed). Names that
    # normalize to '' are skipped so punctuation-only questions cannot match them.
    norms_to_portfolio = {}
    for p in ALL_PORTFOLIOS:
        p_clean = re.sub(r'\s+', ' ', p).strip()
        key = normalize_text(p_clean)
        if key:
            norms_to_portfolio[key] = p_clean

    if q_norm in norms_to_portfolio:
        entities['portfolio'] = norms_to_portfolio[q_norm]
        if debug:
            print("[debug] direct portfolio normalized match:", entities['portfolio'])
    else:
        # Exact match on runs of consecutive words so multi-word names such as
        # "HoldCo 3" are found. Longer runs win; ties go to the right-most run.
        longest = max(
            (len(re.findall(r'[A-Za-z0-9]+', name)) for name in norms_to_portfolio.values()),
            default=0,
        )
        for size in range(min(longest, len(tokens)), 0, -1):
            for start in range(len(tokens) - size, -1, -1):
                key = ''.join(norm_tokens[start:start + size])
                if key in norms_to_portfolio:
                    entities['portfolio'] = norms_to_portfolio[key]
                    if debug:
                        print("[debug] token portfolio match:", entities['portfolio'],
                              "token:", ' '.join(tokens[start:start + size]))
                    break
            if 'portfolio' in entities:
                break

    if 'portfolio' not in entities:
        # Nothing exact: offer the closest known name for the first word that
        # has one. Kept separate from 'portfolio' so callers can tell it is a guess.
        portfolio_norm_keys = list(norms_to_portfolio.keys())
        for cand_norm in norm_tokens:
            matches = get_close_matches(cand_norm, portfolio_norm_keys, n=1, cutoff=0.7)
            if matches:
                entities['portfolio_suggestion'] = norms_to_portfolio[matches[0]]
                if debug:
                    print("[debug] fuzzy portfolio suggestion:", entities['portfolio_suggestion'])
                break

    # A question that is exactly a holdings ShortName always resolves to it.
    for sn in SHORTNAMES_HOLDINGS:
        if normalize_text(sn) == q_norm:
            entities['portfolio'] = sn
            break

    # Ticker: all-caps words first, preferring one that is a known ticker.
    ticker_match = re.findall(r'\b[A-Z]{1,5}\b', q)
    if ticker_match:
        known_tickers = set(TICKERS)
        # NOTE(review): when no all-caps word is a known ticker, the first one is
        # still used (so an unknown symbol filters to nothing rather than to
        # everything). Abbreviations such as "PL" or "YTD" are therefore also
        # reported as tickers -- confirm this is the intended behaviour.
        entities['ticker'] = next(
            (tk for tk in ticker_match if tk in known_tickers), ticker_match[0]
        )
    else:
        # Case-insensitive lookup on whole words (or up to three adjacent words,
        # for symbols containing punctuation). Matching inside a word would turn
        # e.g. "metadata" into the ticker META.
        word_keys = {
            ''.join(norm_tokens[start:start + size])
            for size in (1, 2, 3)
            for start in range(len(norm_tokens) - size + 1)
        }
        for tk in TICKERS:
            if normalize_text(tk) in word_keys:
                entities['ticker'] = tk
                break

    if any(word in q_lower for word in ['buy', 'bought', 'purchase', 'purchased']):
        entities['trade_type'] = 'Buy'
    elif any(word in q_lower for word in ['sell', 'sold', 'sale']):
        entities['trade_type'] = 'Sell'

    metric_aliases = {
        'pl_ytd': ['pl ytd', 'pl_ytd', 'pl-ytd', 'ytd profit', 'yearly profit', 'annual profit', 'profit ytd'],
        'pl_mtd': ['pl mtd', 'pl_mtd', 'pl-mtd', 'monthly profit', 'mtd profit', 'profit month'],
        'pl_qtd': ['pl qtd', 'pl_qtd', 'pl-qtd', 'quarterly profit', 'qtd profit'],
        'mv_base': ['mv base', 'mv_base', 'mv-base', 'market value base', 'base mv'],
        'mv_local': ['mv local', 'mv_local', 'mv-local', 'market value local'],
        'principal': ['principal', 'principle', 'notional', 'trade value'],
        'quantity': ['quantity', 'qty', 'amount', 'volume', 'shares'],
        'totalcash': ['total cash', 'totalcash', 'cash']
    }
    # Underscores in the question are treated as spaces ("PL_YTD" -> "pl ytd").
    # The first metric (in dict order) with any alias present wins.
    q_metric = q_lower.replace('_', ' ')
    for metric, aliases in metric_aliases.items():
        if any(alias in q_metric for alias in aliases):
            entities['metric'] = metric
            break

    # "< 0" / "<0" counts as negative; "< 0.5" does not. No leading \b here:
    # it would require a word character directly before '<' and miss "pl < 0".
    if 'negative' in q_lower or re.search(r'<\s*0\b(?!\.\d)', q_lower) or 'below zero' in q_lower:
        entities['condition'] = 'negative'

    return entities

In [12]:
# Sentence-embedding model used for both the row texts and the questions.
EMBED_MODEL = "all-MiniLM-L6-v2"

print("Loading embedding model...")
model = SentenceTransformer(EMBED_MODEL)
print("Embedding model loaded")

Loading embedding model...
Embedding model loaded


In [13]:
def trade_row_to_text(row):
    """Render a trade row (dict-like) as "Field: value. Field: value" text.

    Only the fields listed below are used, in this order; a field is skipped
    when it is absent, missing (NaN/None), empty, or the literal 'NULL'.
    """
    fields = ('PortfolioName', 'TradeTypeName', 'Name', 'Ticker',
              'Quantity', 'Price', 'Principal', 'TotalCash')
    return ". ".join(
        f"{k}: {row[k]}"
        for k in fields
        if k in row and pd.notna(row[k]) and str(row[k]) not in ('', 'NULL')
    )

def holding_row_to_text(row):
    """Render a holdings row (dict-like) as "Field: value. Field: value" text.

    Only the fields listed below are used, in this order; a field is skipped
    when it is absent, missing (NaN/None), empty, or the literal 'NULL'.
    """
    fields = ('PortfolioName', 'ShortName', 'SecName', 'SecurityTypeName',
              'Qty', 'Price', 'MV_Base', 'PL_YTD', 'PL_MTD')
    return ". ".join(
        f"{k}: {row[k]}"
        for k in fields
        if k in row and pd.notna(row[k]) and str(row[k]) not in ('', 'NULL')
    )

In [14]:
# One record per data row: the text that gets embedded plus the raw values.
# Records are appended in frame order, so a record's list position equals the
# row's position in the frame.
print("Building row structures...")

rows_trades = []
for row_label, row in trades.iterrows():
    record = row.to_dict()
    rows_trades.append(dict(
        source="trades",
        index=int(row_label),
        text=trade_row_to_text(record),
        row_dict=record,
    ))

rows_holdings = []
for row_label, row in holdings.iterrows():
    record = row.to_dict()
    rows_holdings.append(dict(
        source="holdings",
        index=int(row_label),
        text=holding_row_to_text(record),
        row_dict=record,
    ))

print(f"Created {len(rows_trades)} trade rows")
print(f"Created {len(rows_holdings)} holding rows")

Building row structures...
Created 649 trade rows
Created 1022 holding rows


In [15]:
# Pickle files caching the row embeddings of each dataset.
EMB_TRADES_FILE = CACHE_DIR.joinpath("embeddings_trades.pkl")
EMB_HOLDINGS_FILE = CACHE_DIR.joinpath("embeddings_holdings.pkl")

def build_embeddings(rows, cache_file):
    """Return one embedding per entry of ``rows``, using ``cache_file`` as a cache.

    Args:
        rows: list of dicts, each with a 'text' entry to embed.
        cache_file: ``pathlib.Path`` of the pickle cache.

    Returns:
        The value returned by ``model.encode(..., convert_to_numpy=True)`` for
        the row texts, either freshly computed or loaded from the cache.

    A cached value is reused only when it can be unpickled and has exactly
    ``len(rows)`` entries; otherwise the rows are re-encoded and the cache is
    overwritten. Without that check, embeddings cached for an older version of
    the data would be paired with the wrong rows.
    """
    if cache_file.exists():
        print(f"Loading embeddings from {cache_file}")
        # SECURITY: pickle.load can execute arbitrary code. Only load cache
        # files that this notebook wrote itself.
        try:
            with open(cache_file, 'rb') as f:
                cached = pickle.load(f)
            usable = len(cached) == len(rows)
        except (pickle.UnpicklingError, EOFError, TypeError):
            usable = False
        if usable:
            return cached
        print(f"Cached embeddings do not match the {len(rows)} current rows; re-encoding")

    print(f"Encoding {len(rows)} rows...")
    texts = [r['text'] for r in rows]
    embeddings = model.encode(texts, show_progress_bar=True, convert_to_numpy=True)
    cache_file.parent.mkdir(parents=True, exist_ok=True)
    with open(cache_file, 'wb') as f:
        pickle.dump(embeddings, f)
    return embeddings

# Embed (or load cached embeddings for) every row text of both datasets.
emb_trades = build_embeddings(rows_trades, EMB_TRADES_FILE)
emb_holdings = build_embeddings(rows_holdings, EMB_HOLDINGS_FILE)

print(
    f"Trade embeddings shape: {emb_trades.shape}",
    f"Holding embeddings shape: {emb_holdings.shape}",
    sep="\n",
)

Loading embeddings from data\cache\embeddings_trades.pkl
Loading embeddings from data\cache\embeddings_holdings.pkl
Trade embeddings shape: (649, 384)
Holding embeddings shape: (1022, 384)


In [16]:
print("Building FAISS indexes...")
def build_faiss_index(embeddings):
    """Build an inner-product FAISS index over L2-normalized copies of ``embeddings``.

    Each row is scaled to unit length and cast to float32 before being added,
    so the inner product the index computes equals cosine similarity.
    """
    unit_vectors = normalize(embeddings, axis=1, norm='l2').astype('float32')
    index = faiss.IndexFlatIP(unit_vectors.shape[1])
    index.add(unit_vectors)
    return index

# One similarity index per dataset, in the same order as the row lists.
index_trades = build_faiss_index(emb_trades)
index_holdings = build_faiss_index(emb_holdings)

print(
    f"Trade index: {index_trades.ntotal} vectors",
    f"Holdings index: {index_holdings.ntotal} vectors",
    sep="\n",
)

Building FAISS indexes...
Trade index: 649 vectors
Holdings index: 1022 vectors


In [17]:
def semantic_search(question, dataset='trades', top_k=30, min_score=None):
    """Return the rows most similar to ``question``, best first.

    Args:
        question: text to embed and search with.
        dataset: 'trades' searches the trades index; any other value searches
            the holdings index.
        top_k: maximum number of neighbours to request (clamped to the number
            of indexed rows).
        min_score: minimum similarity score to keep a hit; None means 0.45.

    Returns:
        A list of dicts with keys 'score', 'index' (position in the row list),
        'text', 'row_dict' and 'source', sorted by descending score. Empty when
        nothing qualifies or the index holds no rows.
    """
    if dataset == 'trades':
        index, rows = index_trades, rows_trades
    else:
        index, rows = index_holdings, rows_holdings

    if min_score is None:
        min_score = 0.45

    # Never ask for more neighbours than there are indexed rows.
    top_k = min(int(top_k), len(rows), index.ntotal)
    if top_k <= 0:
        return []

    q_emb = model.encode([question], convert_to_numpy=True)
    q_norm = normalize(q_emb, axis=1).astype('float32')

    scores, indices = index.search(q_norm, top_k)

    results = []
    for score, idx in zip(scores[0], indices[0]):
        idx = int(idx)
        # FAISS pads missing neighbours with label -1; used as a list index
        # that would silently return the *last* row.
        if idx < 0 or idx >= len(rows):
            continue
        if score >= min_score:
            results.append({
                'score': float(score),
                'index': idx,
                'text': rows[idx]['text'],
                'row_dict': rows[idx]['row_dict'],
                'source': dataset
            })
    results.sort(key=lambda hit: hit['score'], reverse=True)
    return results

In [18]:
def filter_dataframe_fuzzy(df, entities, debug=False):
    """Return the rows of ``df`` that satisfy the filters described by ``entities``.

    Filters are applied in this order; each one is skipped when its entity is
    absent or the column it needs does not exist in ``df``:
      - 'portfolio' (or, failing that, 'portfolio_suggestion'): normalized
        equality against PortfolioName and/or ShortName. If neither column
        exists, a case-insensitive literal substring search over all text
        columns is used instead.
      - 'ticker': normalized equality against Ticker.
      - 'trade_type': case-insensitive equality against TradeTypeName.
      - 'condition' == 'negative': keeps rows whose P&L column is < 0. The
        column follows entities['metric'] when that is pl_ytd / pl_mtd / pl_qtd
        and the column exists; otherwise the first available of
        PL_YTD_num, PL_MTD_num, PL_QTD_num.

    ``df`` itself is never modified. ``debug=True`` prints the row count after
    each applied filter.
    """
    df_filtered = df.copy()

    portfolio = entities.get('portfolio') or entities.get('portfolio_suggestion')
    if portfolio:
        name_cols = [c for c in ('PortfolioName', 'ShortName') if c in df_filtered.columns]
        mask = pd.Series(False, index=df_filtered.index)
        if name_cols:
            target = normalize_text(portfolio)  # normalized once, not once per row
            for c in name_cols:
                mask = mask | (df_filtered[c].map(normalize_text) == target)
        else:
            needle = str(portfolio).lower()
            for c in df_filtered.select_dtypes(include=['object', 'string']).columns:
                # regex=False: the name is literal text; characters such as '('
                # must not be interpreted as a pattern.
                mask = mask | df_filtered[c].str.lower().str.contains(needle, na=False, regex=False)
        df_filtered = df_filtered[mask]

        if debug:
            print(f"[debug] filter_dataframe_fuzzy after portfolio filter: {len(df_filtered)} rows")

    if 'ticker' in entities and 'Ticker' in df_filtered.columns:
        ticker_key = normalize_text(entities['ticker'])
        df_filtered = df_filtered[df_filtered['Ticker'].map(normalize_text) == ticker_key]
        if debug:
            print(f"[debug] filter_dataframe_fuzzy after ticker filter: {len(df_filtered)} rows")

    if 'trade_type' in entities and 'TradeTypeName' in df_filtered.columns:
        df_filtered = df_filtered[df_filtered['TradeTypeName'].str.lower() == entities['trade_type'].lower()]
        if debug:
            print(f"[debug] filter_dataframe_fuzzy after trade_type filter: {len(df_filtered)} rows")

    if entities.get('condition') == 'negative':
        pl_columns = {'pl_ytd': 'PL_YTD_num', 'pl_mtd': 'PL_MTD_num', 'pl_qtd': 'PL_QTD_num'}
        # Prefer the period the question asked about; always testing YTD would
        # answer "negative PL_QTD" with the wrong rows.
        requested = pl_columns.get(entities.get('metric'))
        candidates = ([requested] if requested else []) + [
            c for c in pl_columns.values() if c != requested
        ]
        pl_col = next((c for c in candidates if c in df_filtered.columns), None)
        if pl_col:
            df_filtered = df_filtered[df_filtered[pl_col] < 0]
            if debug:
                print(f"[debug] filter_dataframe_fuzzy after negative condition: {len(df_filtered)} rows")

    return df_filtered

In [19]:
def exec_count(dataset, entities, debug=False):
    """Count the rows matching ``entities``.

    ``dataset`` 'trades' selects the trades frame; any other value selects
    holdings. Returns ``(row_count, matching_rows)``.
    """
    source = holdings if dataset != 'trades' else trades
    matched = filter_dataframe_fuzzy(source, entities, debug=debug)
    return len(matched), matched

In [20]:
def exec_aggregate(dataset, metric, entities, agg='sum', debug=False):
    """Aggregate one numeric metric over the rows matching ``entities``.

    Args:
        dataset: 'trades' selects the trades frame, anything else holdings.
        metric: a key of ``metric_map`` below (case-insensitive).
        entities: filters, as understood by ``filter_dataframe_fuzzy``.
        agg: 'sum', 'mean', 'max' or 'min'; any other value falls back to 'sum'.

    Returns:
        ``(value, matching_rows)``. ``value`` is a float, or None when no rows
        match, the metric is unknown, its column is missing from the selected
        dataset, or the column holds no numeric values.
    """
    df = trades if dataset == 'trades' else holdings
    df_filtered = filter_dataframe_fuzzy(df, entities, debug=debug)

    if df_filtered.empty:
        return None, df_filtered

    metric_map = {
        'principal': 'Principal_num',
        'totalcash': 'TotalCash_num',
        'quantity': 'Quantity_num',
        'pl_ytd': 'PL_YTD_num',
        'pl_mtd': 'PL_MTD_num',
        'pl_qtd': 'PL_QTD_num',
        'mv_base': 'MV_Base_num',
        'mv_local': 'MV_Local_num',
        'qty': 'Qty_num',
    }

    col = metric_map.get(str(metric).lower()) if metric is not None else None
    # No substitution with "some other" metric column: that would return a
    # number the caller then reports under the wrong metric name.
    if not col or col not in df_filtered.columns:
        return None, df_filtered

    reducers = {
        # min_count=1: a column with no numeric values yields NaN, not 0.0.
        'sum': lambda s: s.sum(min_count=1),
        'mean': lambda s: s.mean(),
        'max': lambda s: s.max(),
        'min': lambda s: s.min(),
    }
    result = reducers.get(agg, reducers['sum'])(df_filtered[col])

    return (None if pd.isna(result) else float(result)), df_filtered


In [21]:
def exec_rank(metric='PL_YTD', top_n=5, ascending=False):
    """Rank holdings groups by the summed value of a numeric metric column.

    The column is ``metric + '_num'``; if that exact name is missing, the first
    column whose lowercase name starts with ``metric.lower()`` and that ends in
    '_num' is used. Rows are grouped by PortfolioName, or by ShortName when
    PortfolioName is absent.

    Returns the first ``top_n`` entries of the sorted sums as a Series
    (descending unless ``ascending``), or None when no suitable metric or
    grouping column exists.
    """
    col = metric + '_num'
    if col not in holdings.columns:
        prefix = metric.lower()
        col = next(
            (c for c in holdings.columns if c.lower().startswith(prefix) and c.endswith('_num')),
            None,
        )
        if col is None:
            return None

    group_key = next(
        (k for k in ('PortfolioName', 'ShortName') if k in holdings.columns),
        None,
    )
    if group_key is None:
        return None

    totals = holdings.groupby(group_key)[col].sum()
    return totals.sort_values(ascending=ascending).head(top_n)

In [22]:
def classify_query(question):
    """Decide which dataset(s) and which operation a question refers to.

    Returns ``(dataset, operation)`` where dataset is 'trades', 'holdings' or
    'both' and operation is 'count', 'aggregate', 'rank', 'show' or 'unknown'.

    Keywords are matched as whole words or phrases, optionally followed by a
    simple inflection (s, es, d, ed, ing). Digits and underscores count as word
    separators, so "PL_YTD" contains the word "pl" and "top3" the word "top".
    Plain substring matching would misfire, e.g. "count" inside "counterparty"
    or "pl" inside "please".
    """
    q = (question or "").lower()

    def mentions(terms):
        # True when any term occurs in the question as a whole word/phrase.
        for term in terms:
            pattern = r'(?<![a-z])' + re.escape(term) + r'(?:s|es|d|ed|ing)?(?![a-z])'
            if re.search(pattern, q):
                return True
        return False

    dataset = None
    if mentions(['trade', 'trades', 'buy', 'sell', 'traded', 'counterparty', 'principal']):
        dataset = 'trades'
    if mentions(['holding', 'holdings', 'fund', 'portfolio', 'performed', 'performance', 'pl', 'profit', 'loss']):
        # Vocabulary from both datasets -> search both.
        dataset = 'both' if dataset == 'trades' else 'holdings'

    if dataset is None:
        dataset = 'both'

    # Order matters: the first matching rule decides the operation.
    if mentions(['how many', 'count', 'number of', 'total number']):
        operation = 'count'
    elif mentions(['total', 'sum', 'aggregate']) and mentions(['principal', 'quantity', 'cash', 'pl', 'mv', 'qty']):
        operation = 'aggregate'
    elif mentions(['best', 'worst', 'top', 'rank', 'performed', 'highest', 'lowest']):
        operation = 'rank'
    elif mentions(['show']) and not mentions(['profit', 'pl', 'mtd', 'ytd']):
        operation = 'show'
    else:
        operation = 'unknown'

    return dataset, operation

In [23]:
def format_evidence_rows(results, max_rows=10):
    """Format up to ``max_rows`` results as numbered lines: "[1] <text>", "[2] <text>", ...

    Each result must have a 'text' entry. Returns '' for an empty list.
    """
    return "\n".join(
        f"[{position}] {hit['text']}"
        for position, hit in enumerate(results[:max_rows], start=1)
    )

In [24]:
def call_llm_formatter(question, summary, evidence_text):
    """Ask the LLM to rephrase an already-computed ``summary`` in natural language.

    The model is only asked to reword: all numbers come from ``summary``.
    This is best-effort; ``summary`` is returned unchanged when
      - no OPENROUTER_API_KEY is configured,
      - the HTTP call fails or the response has an unexpected shape
        (a warning naming the error type is emitted), or
      - the model returns empty text.

    Args:
        question: the user's original question.
        summary: the computed result to rephrase.
        evidence_text: supporting rows, included in the prompt for context.

    Returns:
        The rephrased answer, or ``summary`` as described above.
    """
    import warnings

    if not OPENROUTER_API_KEY:
        return summary

    system_prompt = """You are a financial data assistant that formats results.

CRITICAL RULES:
1. You receive a COMPUTED SUMMARY and EVIDENCE ROWS
2. Your job is to REPHRASE the summary into natural language
3. DO NOT compute, calculate, or aggregate anything
4. DO NOT make up information not in the summary
5. If summary says "No data", respond EXACTLY: "Sorry can not find the answer"
6. Be concise and clear

EXAMPLES:
Summary: "Count = 15"
Response: "There are 15 trades matching your criteria."

Summary: "Total Principal = 1500000.50"
Response: "The total principal is $1,500,000.50."

Summary: "No data found"
Response: "Sorry can not find the answer"
"""

    user_message = f"""QUESTION: {question}

COMPUTED SUMMARY:
{summary}

EVIDENCE ROWS:
{evidence_text}

Rephrase the summary into a natural language answer. Do not compute anything."""

    try:
        response = requests.post(
            "https://openrouter.ai/api/v1/chat/completions",
            headers={
                "Authorization": f"Bearer {OPENROUTER_API_KEY}",
                "Content-Type": "application/json"
            },
            json={
                "model": "openai/gpt-4o-mini",
                "messages": [
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_message}
                ],
                "temperature": 0.1,
                "max_tokens": 300,
            },
            timeout=30
        )
        response.raise_for_status()
        result = response.json()
        content = result['choices'][0]['message']['content']
    except (requests.RequestException, KeyError, IndexError, TypeError, ValueError) as exc:
        # Only the expected transport/shape failures are handled, and not
        # silently: the error type is surfaced (never the key or the payload).
        warnings.warn(f"LLM formatting failed ({type(exc).__name__}); returning the raw summary")
        return summary

    # An empty or non-text reply would otherwise become a blank answer.
    if not isinstance(content, str) or not content.strip():
        return summary
    return content.strip()

In [25]:
def answer_question(question, debug=False):
    """Answer a natural-language question about the trades/holdings data.

    Pipeline: extract entities, classify the question, run the matching
    structured operation (count / aggregate / rank / show) and let the LLM
    rephrase the computed summary. Questions that fit no operation fall back to
    a filtered row count and finally to pure semantic search.

    Returns the answer text, or "Sorry can not find the answer" when nothing
    relevant is found. ``debug=True`` prints the intermediate decisions.
    """
    not_found = "Sorry can not find the answer"
    if not question or not str(question).strip():
        return not_found

    entities = extract_entities_fuzzy(question, debug=debug)

    dataset, operation = classify_query(question)
    if debug:
        print("[debug] entities:", entities)
        print("[debug] dataset, operation:", dataset, operation)

    q_lower = str(question).lower()
    # Datasets to consult, trades first.
    targets = [ds for ds in ('trades', 'holdings') if dataset in (ds, 'both')]

    if operation == 'count':
        for ds in targets:
            count, _ = exec_count(ds, entities, debug=debug)
            if count > 0:
                evidence = semantic_search(question, dataset=ds, top_k=10, min_score=0.40)
                summary = f"Count: {count} {'trades' if ds == 'trades' else 'holdings'}"
                evidence_text = format_evidence_rows(evidence[:5])
                return call_llm_formatter(question, summary, evidence_text)
        return not_found

    if operation == 'aggregate':
        metric = entities.get('metric')
        if not metric:
            return not_found

        # Trade-side metrics live in the trades data, everything else in holdings.
        ds_target = 'trades' if metric in ('principal', 'totalcash', 'quantity') else 'holdings'

        result, _ = exec_aggregate(ds_target, metric, entities, agg='sum', debug=debug)
        if result is None:
            return not_found
        evidence = semantic_search(question, dataset=ds_target, top_k=8, min_score=0.40)
        summary = f"Total {metric}: {result:,.2f}"
        evidence_text = format_evidence_rows(evidence[:5])
        return call_llm_formatter(question, summary, evidence_text)

    if operation == 'rank':
        ascending = 'worst' in q_lower or 'lowest' in q_lower
        metric = entities.get('metric', 'pl_ytd')
        if metric not in ('pl_ytd', 'pl_mtd', 'pl_qtd', 'mv_base', 'mv_local'):
            metric = 'pl_ytd'
        top_n = 5
        m = re.search(r'top\s*(\d+)', q_lower)
        if m:
            top_n = int(m.group(1))
        ranked = exec_rank(metric.upper(), top_n=top_n, ascending=ascending)
        if ranked is None or ranked.empty:
            return not_found
        lines = [f"Ranking by {metric.upper()}:"]
        for i, (portfolio, value) in enumerate(ranked.items(), 1):
            lines.append(f"{i}. {portfolio}: {value:,.2f}")
        summary = "\n".join(lines)
        top_portfolio = ranked.index[0]
        evidence = semantic_search(f"{top_portfolio} {metric}", dataset='holdings', top_k=5, min_score=0.40)
        evidence_text = format_evidence_rows(evidence[:3])
        return call_llm_formatter(question, summary, evidence_text)

    if operation == 'show':
        strong_filter_present = any(k in entities for k in ['portfolio', 'ticker', 'trade_type', 'condition'])
        results = []
        if strong_filter_present:
            # Exact filters beat similarity search: list the matching rows directly.
            row_formatters = {'trades': trade_row_to_text, 'holdings': holding_row_to_text}
            for ds in targets:
                _, matched = exec_count(ds, entities)
                for i, row in matched.head(20).iterrows():
                    record = row.to_dict()
                    results.append({'score': 1.0, 'index': int(i), 'text': row_formatters[ds](record),
                                    'row_dict': record, 'source': ds})
            if results:
                summary = f"Found {len(results)} relevant rows (showing top {min(10,len(results))}):"
                evidence_text = format_evidence_rows(results[:10])
                return call_llm_formatter(question, summary, evidence_text)

        for ds in targets:
            results.extend(semantic_search(question, ds, top_k=20, min_score=0.35))

        results = sorted(results, key=lambda x: x['score'], reverse=True)
        if not results:
            return not_found

        # Report the number actually shown, not a fixed "top 10".
        summary = f"Found {len(results)} relevant rows (showing top {min(10, len(results))}):"
        evidence_text = format_evidence_rows(results[:10])
        return call_llm_formatter(question, summary, evidence_text)

    # Unknown operation: report a row count only when the extracted filters
    # really narrowed the dataset. Otherwise any unrelated question would be
    # answered with "found <all rows>".
    for ds in (targets or ['trades', 'holdings']):
        count, _ = exec_count(ds, entities)
        full_size = len(trades if ds == 'trades' else holdings)
        if 0 < count < full_size:
            summary = f"Found {count} matching rows in {ds}"
            evidence = semantic_search(question, ds, top_k=8, min_score=0.40)
            evidence_text = format_evidence_rows(evidence[:5])
            return call_llm_formatter(question, summary, evidence_text)

    results = []
    results.extend(semantic_search(question, 'trades', top_k=15, min_score=0.35))
    results.extend(semantic_search(question, 'holdings', top_k=15, min_score=0.35))
    results = sorted(results, key=lambda x: x['score'], reverse=True)
    if not results:
        return not_found
    summary = f"Found {len(results)} relevant results"
    evidence_text = format_evidence_rows(results[:10])
    return call_llm_formatter(question, summary, evidence_text)

In [26]:
# Sample questions used to exercise answer_question() in the next cell.
test_questions = [
    "Total holdings for garfield",
    "Total PL YTD for Garfield",
    "Which fund has best yearly profit?",
    "Total quantity of AA stock traded?",
    "Which fund performed best based on PL_YTD?",
    "Show top 3 funds by profit",
    "Show holdings with negative PL_YTD",
    "Who is the CEO?",
    "Total holdings for Sandberg Fund",
    "Total PL YTD for Wellington Portfolio",
    "Which fund has best quarterly profit?",
    "Total quantity of TSLA stock traded?",
    "Which portfolio performed best based on PL_MTD?",
    "Show top 3 portfolios by market value",
    "Show holdings with negative PL_QTD",
    "Who is the portfolio manager?",
    "Total MV Base for Redwood Income Fund",
    "Total trades for GreenRock Capital?",
    "Which fund has worst yearly PL?",
    "Count all FX trades for USD/JPY",
    "Show monthly profit for Riverdale Growth Fund"
]


In [27]:
# Run every sample question through the pipeline and print question/answer pairs.
print("=" * 80, "HYBRID RAG CHATBOT TEST RESULTS", "=" * 80, sep="\n")

for number, prompt in enumerate(test_questions, start=1):
    print(f"\n[Q{number}] {prompt}")
    reply = answer_question(prompt, debug=False)
    print(f"[A{number}] {reply}")
    print("-" * 80)

HYBRID RAG CHATBOT TEST RESULTS

[Q1] Total holdings for garfield
[A1] There are 221 holdings associated with Garfield.
--------------------------------------------------------------------------------

[Q2] Total PL YTD for Garfield
[A2] The total profit and loss year-to-date for Garfield is -$168,551,028.29.
--------------------------------------------------------------------------------

[Q3] Which fund has best yearly profit?
[A3] The fund with the best yearly profit is Ytum, with a profit of $7,229,903.10.
--------------------------------------------------------------------------------

[Q4] Total quantity of AA stock traded?
[A4] The total quantity of AA stock traded is 15,100.00.
--------------------------------------------------------------------------------

[Q5] Which fund performed best based on PL_YTD?
[A5] The fund that performed best based on year-to-date profit and loss (PL_YTD) is Ytum, with a total of $7,229,903.10.
------------------------------------------------------