In [118]:
# %% [markdown]
# # QA-supervised GNN (clean version)
# - No "question" nodes inside the graph
# - Train the GNN on your code KG
# - Supervise using external QA embeddings: pull each question toward its mapped cluster

# %%
import os, ast, json, math, random, warnings
import numpy as np
import pandas as pd
from typing import Optional, Iterable

import torch
import torch.nn as nn
import torch.nn.functional as F

# PyG
from torch_geometric.data import HeteroData
from torch_geometric.nn import HGTConv

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)

# Clean printing
pd.set_option('display.max_colwidth', 160)
warnings.filterwarnings("ignore")


Device: cpu


In [119]:
# %%
def parse_vec_cell(x, fallback_dim: int = 384) -> np.ndarray:
    """Robustly parse an embedding stored as list/np/str (even with '...')."""
    if isinstance(x, np.ndarray):
        return x.astype(np.float32)
    if isinstance(x, list):
        return np.array(x, dtype=np.float32)
    if isinstance(x, (tuple,)):
        return np.array(list(x), dtype=np.float32)
    if isinstance(x, str):
        s = x.strip()
        if "..." in s:
            s = s.replace("...", "")
        # Try JSON first
        try:
            return np.array(json.loads(s), dtype=np.float32)
        except Exception:
            pass
        # Try ast
        try:
            return np.array(ast.literal_eval(s), dtype=np.float32)
        except Exception:
            pass
        # Fallback: comma-split floats
        try:
            s = s.strip("[]")
            vals = [float(t) for t in s.split(",") if t.strip()]
            return np.array(vals, dtype=np.float32)
        except Exception:
            return np.zeros((fallback_dim,), dtype=np.float32)
    # unknown -> zeros
    return np.zeros((fallback_dim,), dtype=np.float32)


def stack_series_to_tensor(series: pd.Series, fallback_dim: int = 384) -> torch.Tensor:
    """Vectorizes a DF/Series column with arbitrary embedding representations into a 2D torch tensor."""
    vecs = [parse_vec_cell(v, fallback_dim=fallback_dim) for v in series.tolist()]
    dim = max((len(v) for v in vecs), default=fallback_dim) or fallback_dim
    arr = np.zeros((len(vecs), dim), dtype=np.float32)
    for i, v in enumerate(vecs):
        if v.size:
            d = min(dim, v.size)
            arr[i, :d] = v[:d]
    return torch.tensor(arr, dtype=torch.float)


# Optional text encoder for cluster summaries, if you want semantic text features.
# Falls back to TF-IDF if transformers aren't available.
def build_text_encoder():
    try:
        from sentence_transformers import SentenceTransformer
        model = SentenceTransformer("all-MiniLM-L6-v2")
        def encode(texts: Iterable[str]) -> np.ndarray:
            return model.encode(list(texts), convert_to_numpy=True, show_progress_bar=False).astype(np.float32)
        print("Text encoder: sentence-transformers (all-MiniLM-L6-v2)")
        return encode
    except Exception:
        from sklearn.feature_extraction.text import TfidfVectorizer
        vec = TfidfVectorizer(max_features=512)
        print("Text encoder: TF-IDF fallback (512 dims)")
        def encode(texts: Iterable[str]) -> np.ndarray:
            return vec.fit_transform(list(texts)).toarray().astype(np.float32)
        return encode


In [120]:
import pickle
with open("graph_v3.pkl", "rb") as f:
    repograph = pickle.load(f)


In [123]:
import ast
import numpy as np
import torch

def safe_parse_embedding(x):
    """Biztonságosan konvertálja az embedding stringet listává."""
    if isinstance(x, list):
        return x
    try:
        val = ast.literal_eval(x)
        if isinstance(val, list):
            return val
    except (ValueError, SyntaxError):
        pass
    # ha valami hiba van, adunk egy nullvektort ugyanakkora dimenzióval, mint az első helyes elem
    return None

# próbáljuk meg kinyerni a jó sorokat
parsed = func_nodes["docstring_embedding"].apply(safe_parse_embedding)

# eldobunk minden hibás sort
valid = parsed.dropna()
print(f"✅ {len(valid)} valid embedding of {len(parsed)} total")

# stack only valid embeddings
X = np.stack(valid.values)
x = torch.tensor(X, dtype=torch.float)

# ha kell, a df-et is szűkítsd ehhez:
func_nodes = func_nodes.loc[valid.index].reset_index(drop=True)

# Feltételezzük, hogy ez már megvan:
func_edges = repograph["function_edges"]

# A PyG által elvárt formátum: 2 x num_edges tensor
edge_index = torch.tensor(
    func_edges[["source", "target"]].T.values,  # Transpose → [2, num_edges]
    dtype=torch.long
)


✅ 11128 valid embedding of 11128 total


In [124]:
min_id = min(func_edges["source"].min(), func_edges["target"].min())
func_edges["source"] -= min_id
func_edges["target"] -= min_id

edge_index = torch.tensor(
    func_edges[["source", "target"]].T.values,
    dtype=torch.long
)


In [125]:
from torch_geometric.data import Data

data = Data(x=x, edge_index=edge_index)
print(data)


Data(x=[11128, 384], edge_index=[2, 19239])


In [126]:
qa = pd.read_csv("stackowerQnA_context.csv", sep=",", on_bad_lines='warn', engine='python')
qa

Unnamed: 0,question_ids,questions,answer_ids,answers,contexts
0,79779783,"<p>I have recently noticed that when I change the order of the observations in a sparse array, <code>scikit-learn</code> PCA with <code>svd_solver=&quot;arp...",79779909,"<p>In short - yes, this is to be expected, however the differences should be small as they are caused by precision limits of the floating point arithmetic.<...",[]
1,79748223,"<p>I would like to perform a regression analysis and test different transformations of the input variables for the same model. To accomplish this, I created...",79751118,"<p>I actually found a solution similar to Ben Reiniger, but using GridSearchCV. The transformation of the target variable was not smooth at first, but Ben's...",['GridSearchCV._run_search']
2,79748461,"<p>I want to undersample 3 cross-validation folds from a dataset, using say, RandomUnderSampler from imblearn, and then, optimize the hyperparameters of var...",79751379,<p>You can do this:</p>\n<ol>\n<li><p>Get initial folds using <code>.split()</code> method of your sklearn CV object. It returns indices for train and test ...,['HalvingRandomSearchCV._generate_candidate_params']
3,79749078,<p>I’m trying to evaluate classification models on a highly imbalanced fraud dataset using the Brier Skill Score (BSS) as the evaluation metric.</p>\n<br>\n...,79749170,<p>I was getting <code>NaN</code> values when using <strong>Brier Skill Score</strong> with cross-validation.<br />\nThe issue was in how I defined the scor...,"['DummyClassifier.predict_proba', '_ConstantPredictor.predict_proba', 'OneVsRestClassifier.predict_proba', 'MultiOutputClassifier.predict_proba', 'Pipeline...."
4,79730533,<p>I am trying to install <code>scikit-learn</code> and <code>imbalanced-learn</code> for ML project using <code>poetry</code>.</p>\n<pre><code># File pypro...,79730580,<p>It seems <code>imbalanced-learn</code> uses <code>sklearn-compat</code> which needs <code>scikit-learn &lt; 1.7</code></p>\n<p>So if you can work with ol...,[]
...,...,...,...,...,...
4793,59881343,"<p>As part of pursuing a course, I was trying to implement L1 logistic regression using scikit-learn in Python. Unfortunately for the code</p>\n\n<pre><code...",59881394,"<p>You can do it like you are doing in the first code snippet, but you have to define another solver. Use either ‘liblinear’ or ‘saga’, <a href=""https://sci...",[]
4794,59864306,<p>i'm building a neural network using <code>sklearn.neural_network.MLPClassifier</code> :</p>\n\n<pre><code> clf = sklearn.neural_network.MLPClassifier(...,59865247,"<p>Уour method may give different scaling factors. It is for single scaling jobs, but not for the ones requiring consistent transformation.</p>\n\n<p>I sugg...","['RBFSampler.transform', 'AdditiveChi2Sampler.transform', 'Nystroem.transform', 'Pipeline.fit_transform', 'Pipeline.transform', 'FeatureUnion.fit_transform'..."
4795,59812995,<pre><code>from sklearn.feature_extraction.text import CountVectorizer\nvectorizer = CountVectorizer()\nvector = vectorizer.fit_transform(X_train).toarray()...,59813181,<p>You should call <code>.toarray()</code> as you have done for train data:</p>\n\n<p><code>test_vectors = vectorizer.transform(X_test).toarray()</code></p>\n,[]
4796,59839096,"<p>Whenever I make a confusion matrix, I see the cells are not separated by a boundary line. I want to put a black line bordering between all cells. Can thi...",59860688,"<p>use linewidths</p>\n\n<pre><code>sns.heatmap(confusion_mat_df, annot=True,linewidths=2,cmap=""Blues"")\n</code></pre>\n\n<p>see <a href=""https://seaborn.py...",[]


In [127]:
import pandas as pd
import ast
import re
from bs4 import BeautifulSoup

qa_stack = qa.copy()

# --- 1️⃣ HTML szövegek megtisztítása ---
def clean_html(text):
    if not isinstance(text, str):
        return ""
    return BeautifulSoup(text, "html.parser").get_text(" ", strip=True)

qa_stack["q_clean"] = qa_stack["questions"].apply(clean_html)
qa_stack["a_clean"] = qa_stack["answers"].apply(clean_html)

# --- 2️⃣ Kontextusos kérdés-szöveg létrehozása ---
qa_stack["context"] = (
    qa_stack["q_clean"].fillna('') + " " + qa_stack["a_clean"].fillna('')
).str.strip()

print("✅ context column ready:", qa_stack["context"].iloc[0][:200])

# --- 3️⃣ A 'contexts' oszlop feldolgozása ---
# pl. "['GridSearchCV._run_search']" → ['GridSearchCV._run_search']
qa_stack["contexts"] = qa_stack["contexts"].apply(
    lambda x: ast.literal_eval(x) if isinstance(x, str) and x.startswith('[') else []
)

print("📊 Golden context coverage:", (qa_stack["contexts"].str.len() > 0).mean(), "have function names")


✅ context column ready: I have recently noticed that when I change the order of the observations in a sparse array, scikit-learn PCA with svd_solver="arpack" returns different floating point numbers. Is this an expected beha
📊 Golden context coverage: 0.23280533555648186 have function names


In [128]:
# %% -------------------------------------------------------------
# 4️⃣ Map golden context function names → node indices in graph
# --------------------------------------------------------------
import torch

# Create lookup dictionary for function names → node indices
fn_name_to_idx = {
    name: i for i, name in enumerate(repograph["function_nodes"]["combinedName"].fillna("").tolist())
}

pos_function_idx = []
unmatched = 0

for contexts in qa_stack["contexts"]:
    if not contexts:  # üres lista
        pos_function_idx.append(-1)
        continue

    # Use the first known function name that exists in the graph
    found_idx = None
    for c in contexts:
        if c in fn_name_to_idx:
            found_idx = fn_name_to_idx[c]
            break

    if found_idx is not None:
        pos_function_idx.append(found_idx)
    else:
        pos_function_idx.append(-1)
        unmatched += 1

pos_function_idx = torch.tensor(pos_function_idx, dtype=torch.long, device=device)

print(f"✅ Created pos_function_idx tensor: {pos_function_idx.shape}")
print(f"⚠️ {unmatched} questions had no matching function in the graph.")
print(f"📈 Match rate: {(1 - unmatched/len(pos_function_idx)) * 100:.2f}%")

# Optional: filter QA pairs that have valid matches only
mask_valid = pos_function_idx >= 0
qa_stack_valid = qa_stack[mask_valid.cpu().numpy()].reset_index(drop=True)
pos_function_idx = pos_function_idx[mask_valid]
print(f"✅ Filtered valid QA pairs: {len(qa_stack_valid)} remaining.")

# Save for reuse
torch.save(pos_function_idx, "pos_function_idx_stack.pt")
qa_stack_valid.to_csv("qa_stack_valid.csv", index=False)
print("💾 Saved pos_function_idx_stack.pt and qa_stack_valid.csv")


✅ Created pos_function_idx tensor: torch.Size([4798])
⚠️ 0 questions had no matching function in the graph.
📈 Match rate: 100.00%
✅ Filtered valid QA pairs: 1117 remaining.
💾 Saved pos_function_idx_stack.pt and qa_stack_valid.csv
💾 Saved pos_function_idx_stack.pt and qa_stack_valid.csv


In [130]:
# %% ------------------------------------------------------------
# Train GNN using golden StackOverflow QA supervision
# ---------------------------------------------------------------
import torch
import torch.nn.functional as F
import math

# Load valid QA pairs
qa_stack_valid = pd.read_csv("qa_stack_valid.csv")
pos_function_idx = torch.load("pos_function_idx_stack.pt").to(device)

# SentenceTransformer model (same as before)
qa_encoder = SentenceTransformer("all-MiniLM-L6-v2").to(device)

# Context = kérdés + válasz
qa_texts = (qa_stack_valid["questions"].fillna("") + " " +
             qa_stack_valid["answers"].fillna("")).str.strip()

# Encode embeddings
q_emb = qa_encoder.encode(qa_texts.tolist(), convert_to_tensor=True, device=device)
print(f"✅ Encoded {len(q_emb)} QA pairs → shape: {q_emb.shape}")

# Align lengths
n_q = min(len(q_emb), len(pos_function_idx))
q_emb = q_emb[:n_q]
pos_function_idx = pos_function_idx[:n_q]

indices = torch.arange(n_q, device=device)
print(f"📘 Using {n_q} golden QA pairs for training")

# --- InfoNCE loss ---
def info_nce(q, items, pos_index, temperature=0.07):
    q = F.normalize(q, dim=-1)
    items = F.normalize(items, dim=-1)
    logits = q @ items.T / temperature
    return F.cross_entropy(logits, pos_index.to(q.device)), logits

# --- Training loop ---
EPOCHS = 200
BATCH = 64

for epoch in range(1, EPOCHS+1):
    model.train()
    total_loss = 0.0
    perm = indices[torch.randperm(len(indices))]
    num_batches = math.ceil(len(perm) / BATCH)

    for start in range(0, len(perm), BATCH):
        idx = perm[start:start+BATCH]
        q_batch = q_emb[idx]
        pos_batch = pos_function_idx[idx]

        opt.zero_grad()
        func_emb = F.normalize(model(data.x.to(device), data.edge_index.to(device)), dim=-1)

        loss, _ = info_nce(q_batch, func_emb, pos_batch)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        opt.step()
        total_loss += loss.item()

    avg_loss = total_loss / num_batches
    print(f"Epoch {epoch:02d} | loss = {avg_loss:.4f}")


✅ Encoded 1117 QA pairs → shape: torch.Size([1117, 384])
📘 Using 1117 golden QA pairs for training
Epoch 01 | loss = 3.8803
Epoch 01 | loss = 3.8803
Epoch 02 | loss = 3.8498
Epoch 02 | loss = 3.8498
Epoch 03 | loss = 3.8668
Epoch 03 | loss = 3.8668
Epoch 04 | loss = 3.8324
Epoch 04 | loss = 3.8324
Epoch 05 | loss = 3.7753
Epoch 05 | loss = 3.7753
Epoch 06 | loss = 3.8437
Epoch 06 | loss = 3.8437
Epoch 07 | loss = 3.8066
Epoch 07 | loss = 3.8066
Epoch 08 | loss = 3.7691
Epoch 08 | loss = 3.7691
Epoch 09 | loss = 3.8327
Epoch 09 | loss = 3.8327
Epoch 10 | loss = 3.8012
Epoch 10 | loss = 3.8012
Epoch 11 | loss = 3.8083
Epoch 11 | loss = 3.8083
Epoch 12 | loss = 3.7970
Epoch 12 | loss = 3.7970
Epoch 13 | loss = 3.8131
Epoch 13 | loss = 3.8131
Epoch 14 | loss = 3.7930
Epoch 14 | loss = 3.7930
Epoch 15 | loss = 3.8307
Epoch 15 | loss = 3.8307
Epoch 16 | loss = 3.7856
Epoch 16 | loss = 3.7856
Epoch 17 | loss = 3.7512
Epoch 17 | loss = 3.7512
Epoch 18 | loss = 3.7401
Epoch 18 | loss = 3.7401
E

In [107]:
from sentence_transformers import SentenceTransformer
import torch
import torch.nn.functional as F

# ugyanaz az encoder, amit a tanításnál is használtunk
qa_encoder = SentenceTransformer("all-MiniLM-L6-v2").to(device)
model.eval()

@torch.no_grad()
def get_function_embeddings():
    """Lekérdezi és normalizálja a GNN által tanult function node embeddingeket."""
    func_emb = model(data.x.to(device), data.edge_index.to(device))
    func_emb = F.normalize(func_emb, dim=-1)
    print(f"✅ Function embeddings shape: {func_emb.shape}")
    return func_emb

function_emb = get_function_embeddings()

def ask(question: str, k: int = 5):
    """
    Kérdés megválaszolása a GNN + text encoder alapján.
    Kiszámolja a hasonlóságot a kérdés embeddingje és a function node embeddingek között.
    """
    # Kérdés embeddingje
    q_vec = qa_encoder.encode([question], convert_to_tensor=True).to(device)
    q_vec = F.normalize(q_vec, dim=-1)

    # Cosine-similaritás
    sim = q_vec @ function_emb.T
    top_vals, top_idx = torch.topk(sim, k)

    print(f"\n❓ Question: {question}\n")
    print("Top-k related functions:\n")
    for rank, (fid, score) in enumerate(zip(top_idx[0].tolist(), top_vals[0].tolist()), start=1):
        name = repograph["function_nodes"]["combinedName"].iloc[fid]
        doc = repograph["function_nodes"]["docstring"].iloc[fid]
        print(f"{rank:>2}. {name} (score={score:.4f})")
        print(f"   {doc[:200]}{'...' if len(doc)>200 else ''}\n")


✅ Function embeddings shape: torch.Size([11128, 384])


In [131]:
# %% ----------------------- RAG over GNN functions -----------------------
import re
import torch
import torch.nn.functional as F
from sentence_transformers import SentenceTransformer, util
import numpy as np
import pandas as pd

# --- Előfeltételek (ezeknek már létezniük kell a notebookodban) ---
# - model: a betanított FunctionGNN (forward(x, edge_index) -> [N, D])
# - data: PyG Data(x=[N,384], edge_index=[2,E])
# - repograph["function_nodes"]: DataFrame (combinedName, docstring, ...)
# - qa_stack_valid: DataFrame a StackOverflow QA-kkal (questions, answers, contexts)
# - device: "cuda" vagy "cpu"
# - qa_encoder: SentenceTransformer("all-MiniLM-L6-v2").to(device)

def _ensure_func_emb_cache():
    """Számolja és cache-eli a normalizált function embeddingeket (GNN output)."""
    global _FUNC_EMB_CACHE
    if "_FUNC_EMB_CACHE" not in globals() or _FUNC_EMB_CACHE is None:
        model.eval()
        with torch.no_grad():
            emb = model(data.x.to(device), data.edge_index.to(device))      # [N, D]
            emb = F.normalize(emb, dim=-1)
        _FUNC_EMB_CACHE = emb
    return _FUNC_EMB_CACHE

def _sent_tokenize(text: str, max_sent=6):
    """Nagyon egyszerű mondat-szeletelő, fallback arra, ha nincs nltk."""
    if not isinstance(text, str) or not text.strip():
        return []
    # darabolás pont/kérdőjel/felkiáltójel mentén
    parts = re.split(r'(?<=[\.\?\!])\s+', text.strip())
    return [s.strip() for s in parts if s.strip()][:max_sent]

def _gather_function_contexts(func_name: str, top_rows=6):
    """
    Kikeresi a qa_stack_valid sorait, ahol a contexts oszlop tartalmazza a func_name-t.
    Visszaad (row_idx, question_text, answer_text) triplákat.
    """
    if "contexts" not in qa_stack_valid.columns:
        return []
    hits = []
    # contexts oszlop listákat tartalmaz (pl. ['GridSearchCV._run_search', ...])
    for i, ctx_list in enumerate(qa_stack_valid["contexts"]):
        try:
            if isinstance(ctx_list, str):
                # ha valamiért stringként maradt, próbáljuk listává
                ctx_list = eval(ctx_list)  # a datasetet már megtisztítottad korábban
        except Exception:
            ctx_list = []
        if isinstance(ctx_list, (list, tuple)) and func_name in ctx_list:
            q = qa_stack_valid.loc[i, "questions"]
            a = qa_stack_valid.loc[i, "answers"]
            hits.append((i, q, a))
            if len(hits) >= top_rows:
                break
    return hits

@torch.no_grad()
def ask_rag(question: str, k_funcs: int = 10, k_ctx: int = 5, max_snips_per_source: int = 2):
    """
    RAG pipeline:
      1) top-k function jelölt a GNN embeddingekkel,
      2) ezekhez docstring + StackOverflow context gyűjtése,
      3) újrarangsorolás a kérdéshez képest (MiniLM),
      4) rövid válasz + források.
    """
    # 1) Kérdés embedding
    q_vec = qa_encoder.encode([question], convert_to_tensor=True).to(device)
    q_vec = F.normalize(q_vec, dim=-1)  # [1, 384]

    # 2) top-k function jelöltek a GNN-ből
    func_emb = _ensure_func_emb_cache()               # [N, D]
    sim = (q_vec @ func_emb.T).squeeze(0)            # [N]
    vals, idxs = torch.topk(sim, k=min(k_funcs, func_emb.size(0)))
    idxs = idxs.tolist()
    vals = vals.tolist()

    # 3) Kontextek összegyűjtése (docstring + SO QA)
    fn_df = repograph["function_nodes"]
    cand_contexts = []
    for rank, (fid, score) in enumerate(zip(idxs, vals), start=1):
        row = fn_df.iloc[fid]
        fname = row.get("combinedName", f"Function_{fid}")
        doc = row.get("docstring", "") or ""
        # docstringből mondatok
        doc_sents = _sent_tokenize(doc, max_sent=6)

        # SO kontextusok (kérdés+válasz), ahol ez a function szerepel a contexts-ben
        so_hits = _gather_function_contexts(fname, top_rows=k_ctx)

        # minden forrást külön elemként tárolunk (szöveg + meta)
        if doc_sents:
            cand_contexts.append({
                "type": "docstring",
                "function_id": fid,
                "function_name": fname,
                "text": " ".join(doc_sents),
                "source": f"docstring:{fname}"
            })
        for (row_id, q_html, a_html) in so_hits:
            # nagyon alap HTML → plain text (ha nem tisztítottad korábban)
            q_plain = re.sub("<[^>]+>", " ", q_html or "")
            a_plain = re.sub("<[^>]+>", " ", a_html or "")
            joined = (q_plain + " " + a_plain).strip()
            if joined:
                # vágjunk ki mondatokat, hogy tömörebb legyen
                snips = _sent_tokenize(joined, max_sent=6)
                if snips:
                    cand_contexts.append({
                        "type": "stack_overflow",
                        "function_id": fid,
                        "function_name": fname,
                        "text": " ".join(snips[:max_snips_per_source]),
                        "source": f"SO row {row_id} (contexts contains {fname})"
                    })

    if not cand_contexts:
        print("⚠️ Nem találtam felhasználható kontextust. Visszaadom a top-k function-öket debugra.")
        return {
            "answer": "",
            "functions": [(fn_df.iloc[fid].get("combinedName", f"Function_{fid}"), score) for fid, score in zip(idxs, vals)],
            "sources": []
        }

    # 4) Újrarangsorolás a kérdéshez képest (MiniLM)
    ctx_texts = [c["text"] for c in cand_contexts]
    ctx_emb = qa_encoder.encode(ctx_texts, convert_to_tensor=True, device=device)
    ctx_emb = F.normalize(ctx_emb, dim=-1)
    rel = (q_vec @ ctx_emb.T).squeeze(0)  # [M]
    top_rel_vals, top_rel_idx = torch.topk(rel, k=min(8, rel.size(0)))  # 8 kontextust viszünk tovább
    top_rel_idx = top_rel_idx.tolist()
    top_rel_vals = top_rel_vals.tolist()

    top_contexts = [cand_contexts[i] for i in top_rel_idx]

    # 5) Egyszerű válasz-összeállítás: top kontextusok első 2-3 mondatából
    # (ha szeretnéd, itt használhatsz LLM-et is a generáláshoz)
    answer_snips = []
    for c in top_contexts[:4]:
        sents = _sent_tokenize(c["text"], max_sent=3)
        answer_snips.extend(sents)
    # dedup + rövidítés
    seen = set()
    final_lines = []
    for s in answer_snips:
        t = s.strip()
        if t and t not in seen:
            seen.add(t)
            final_lines.append(t)
        if len(final_lines) >= 6:  # maximum 6 rövid sor
            break

    answer = " ".join(final_lines) if final_lines else "(Nem találtam elég releváns, rövid kontextust.)"

    # 6) Források listázása
    source_list = []
    for c, score in zip(top_contexts, top_rel_vals):
        source_list.append({
            "function": c["function_name"],
            "type": c["type"],
            "score": float(score),
            "source": c["source"]
        })

    # 7) Debug: top-k function jelöltek
    top_functions = []
    for fid, s in zip(idxs, vals):
        nm = fn_df.iloc[fid].get("combinedName", f"Function_{fid}")
        top_functions.append((nm, float(s)))

    return {
        "answer": answer,
        "functions": top_functions,
        "sources": source_list
    }


def print_rag_answer(question):
    res = ask_rag(question, k_funcs=15, k_ctx=6)
    """Segédfüggvény a RAG válasz kiírásához."""
    print("\n🟩 Answer:\n", res["answer"])
    print("\n🔎 Top function candidates:")
    for name, sc in res["functions"][:8]:
        print(f" - {name:60s}  (sim={sc:.4f})")

    print("\n📚 Sources (reranked):")
    for s in res["sources"][:8]:
        print(f" - [{s['type']}] {s['function']}  | sim={s['score']:.4f}  | {s['source']}")
# ---------------- Example usage ----------------



In [132]:
print_rag_answer("How can I evaluate classifier performance with cross-validation?")



🟩 Answer:
 Generate cross-validated estimates for each input data point. The data is split according to the cv parameter. Each sample belongs
to exactly one test set, and its prediction is computed with an
estimator fitted on the corresponding training set. I am training a model to solve binary classification problem usign scikitlearn, and i wish to perform cross validation with 5 folds. As metrics, i would like to get both the average accuracy and a confusion matrix over the 5 folds. This is my minimal reproducible example: 
  import numpy as np
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_validate

x = np.array([
   [1, 2],
   [3, 4],
   [5, 6],
   [6, 7]
])  
y = [1, 0, 0, 1]

model = GaussianNB()
scores = cross_validate(model, x, y, cv=2, scoring=(&quot;accuracy&quot;))

model.predict([8,9])
  
 What I intended to do is instantiating a  Gaussian Naive Bayes Classifier  and use  sklearn.model_selection.cross_validate  for cross validate my mo

In [133]:
print_rag_answer("How does LogisticRegression perform optimization?")


🟩 Answer:
 I've built a model using  LogisticRegression()  and after a grid search the data suggests for my inverse of regularization strength,  C = .0000001  is the &quot;best&quot; value to make my predictions. This parameter works fine for  LogisticRegression() , but seeing as I want to cross-validate I decide to use  LogisticRegressionCV()  the equivalent  c  parameter here is denoted as  Cs , yet when I try to pass the same variable  Cs = .0000001 , I get an error: 
      797     warm_start_sag = {&quot;coef&quot;: np.expand_dims(w0, axis=1)}
    799 coefs = list()
--&gt; 800 n_iter = np.zeros(len(Cs), dtype=np.int32)
    801 for i, C in enumerate(Cs):
    802     if solver == &quot;lbfgs&quot;:

TypeError: object of type 'float' has no len()
  
 When referring to the  documents  it seems that for  LogisticRegressionCV() : 
 
 If Cs is as an int, then a grid of Cs values are chosen in a
logarithmic scale between 1e-4 and 1e4. I tried to find the best combination of hyperparameter

In [134]:
print_rag_answer("How does PCA handle sparse data?")



🟩 Answer:
 Partially fit underlying estimators. Should be used when memory is inefficient to train all data. Chunks of data can be passed in several iterations. for example we have: 
  from sklearn.decomposition import PCA
import numpy as np 

xx = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
pca = PCA()
pca.fit_transform(xx)
  
 otput: 
  array([[ 1.38340578,  0.2935787 ],
   [ 2.22189802, -0.25133484],
   [ 3.6053038 ,  0.04224385],
   [-1.38340578, -0.2935787 ],
   [-2.22189802,  0.25133484],
   [-3.6053038 , -0.04224385]])
  
 In this case i am not reducing the size however the array is changed... why? Background:  
I'm doing research using EigenFaces with Python.

🔎 Top function candidates:
 - PCA.fit                                                       (sim=0.3625)
 - _infer_dimension                                              (sim=0.3510)
 - PCA.fit_transform                                             (sim=0.2964)
 - RBFSampler.transform                 

# No need to run the cells below!

In [53]:
qa = pd.read_csv("generated_qna_large_gpt.csv", sep="\t")
qa_df = qa.copy()

In [81]:
# %% ---------------------------------------------------------
# 1️⃣ - Compute embeddings for QA questions and summaries
# ------------------------------------------------------------
from sentence_transformers import SentenceTransformer, util
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"

# Use the same model as for the function_nodes
qa_encoder = SentenceTransformer("all-MiniLM-L6-v2").to(device)

# Take the relevant text fields
qa_texts = (
    qa_df["questions"].fillna("") + " " + qa_df["summaries"].fillna("") + " " + qa_df["statements"].fillna("")
).str.strip()

print(f"📘 Total QA examples: {len(qa_texts)}")

# Encode all questions (batch mode)
qa_df["context"] = (
    qa_df["questions"].fillna('') + " " +
    qa_df["summaries"].fillna('') + " " +
    qa_df["comments"].fillna('') + " " +
    qa_df["statements"].fillna('')
).str.strip()

# Encode context-rich text instead of plain questions
q_emb = qa_encoder.encode(qa_df["context"].tolist(), convert_to_tensor=True, device=device)
print("✅ Encoded QA questions + summaries →", q_emb.shape)

# ------------------------------------------------------------
# 2️⃣ - Compute similarity with function node docstrings
# ------------------------------------------------------------
func_texts = repograph["function_nodes"]["combinedName"].fillna("").tolist()
func_embs = qa_encoder.encode(func_texts, convert_to_tensor=True, device=device, show_progress_bar=True)
print("✅ Encoded function names →", func_embs.shape)

# ------------------------------------------------------------
# 3️⃣ - Find the most similar function for each question
# ------------------------------------------------------------
pos_function_idx = []
for q in q_emb:
    sim = util.cos_sim(q.unsqueeze(0), func_embs)[0]
    best_func = torch.argmax(sim).item()
    pos_function_idx.append(best_func)

pos_function_idx = torch.tensor(pos_function_idx, dtype=torch.long, device=device)
print(f"✅ Mapped {len(pos_function_idx)} questions → function nodes.")

# Optional: save for reuse
torch.save(pos_function_idx, "pos_function_idx.pt")


📘 Total QA examples: 433
✅ Encoded QA questions + summaries → torch.Size([433, 384])


Batches: 100%|██████████| 348/348 [00:22<00:00, 15.65it/s]


✅ Encoded function names → torch.Size([11128, 384])
✅ Mapped 433 questions → function nodes.


In [84]:
# %% ---------------------------------------------
# Nézzük meg, mely function node-okat rendelt a model a kérdésekhez
# ---------------------------------------------
import pandas as pd

# Convert tensors back to CPU for readability
pos_idx_cpu = pos_function_idx.cpu().numpy()

print("🔍 10 random QA → function mapping példa:\n")

sample_ids = torch.randperm(len(qa_df))[:10].tolist()

for i in sample_ids:
    q_text = qa_df.loc[i, "questions"]
    f_id = pos_idx_cpu[i]

    f_name = repograph["function_nodes"].iloc[f_id]["combinedName"]
    f_doc = repograph["function_nodes"].iloc[f_id]["docstring"]

    print(f"🧩 [Q{i}] {q_text.strip()[:100]}{'...' if len(q_text)>100 else ''}")
    print(f"➡️ Function: {f_name}")
    print(f"📘 Docstring: {f_doc.strip()[:200]}{'...' if len(f_doc)>200 else ''}\n")
    print("-" * 80)


🔍 10 random QA → function mapping példa:

🧩 [Q245] What change is being applied in the referenced PR to address platform‑dependency issues with NumPy's...
➡️ Function: test_yeojohnson_for_different_scipy_version
📘 Docstring: Check that the results are consistent across different SciPy versions.

--------------------------------------------------------------------------------
🧩 [Q224] In which pull request was the issue about the MNT Fix issue template link to a blank issue reported?
➡️ Function: test_knn_imputer_drops_all_nan_features
📘 Docstring: 

--------------------------------------------------------------------------------
🧩 [Q283] Why did the Binder build for the scikit‑learn repository fail after the mybinder.org deploy update (...
➡️ Function: ContainerAdapterProtocol.is_supported_container
📘 Docstring: Return True if X is a supported container.

Parameters
----------
Xs: container
    Containers to be checked.

Returns
-------
is_supported_container : bool
    True if X is a 

In [85]:
# %% ----------------------------------------------------------
# 📘 Enhanced Function-GNN with Text Alignment + Annealing
# ------------------------------------------------------------
import torch
import torch.nn as nn
import torch.nn.functional as F
import math
from torch_geometric.nn import GCNConv

# ------------------------------------------------------------
# 1️⃣ Define model
# ------------------------------------------------------------
class FunctionGNN(nn.Module):
    def __init__(self, in_dim=384, hidden_dim=128, out_dim=384):
        super().__init__()
        self.conv1 = GCNConv(in_dim, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, hidden_dim)
        self.text_align = nn.Linear(hidden_dim, out_dim)
        self.dropout = nn.Dropout(0.2)

    def forward(self, x, edge_index):
        x = F.relu(self.conv1(x, edge_index))
        x = self.dropout(x)
        x = self.conv2(x, edge_index)
        x = self.text_align(x)   # project to text embedding space
        return x


# ------------------------------------------------------------
# 2️⃣ Rebuild / reuse model + optimizer
# ------------------------------------------------------------
model = FunctionGNN(in_dim=x.size(1)).to(device)
opt = torch.optim.AdamW(model.parameters(), lr=2e-4, weight_decay=1e-4)


# ------------------------------------------------------------
# 3️⃣ InfoNCE loss with annealing temperature
# ------------------------------------------------------------
def info_nce(q, items, pos_index, temperature):
    q = F.normalize(q, dim=-1)
    items = F.normalize(items, dim=-1)
    logits = q @ items.T / temperature
    pos_index = pos_index.clamp(0, items.size(0) - 1)
    loss = F.cross_entropy(logits, pos_index.to(q.device))
    return loss


# ------------------------------------------------------------
# 4️⃣ Continue training with temperature annealing
# ------------------------------------------------------------
EPOCHS = 400
BATCH_SIZE = 64
indices = torch.arange(len(q_emb), device=device)

for epoch in range(1, EPOCHS + 1):
    model.train()
    total_loss = 0.0
    perm = indices[torch.randperm(len(indices))]
    num_batches = math.ceil(len(perm) / BATCH_SIZE)

    # slowly decrease temperature
    temperature = max(0.03, 0.07 * (0.97 ** epoch))

    for start in range(0, len(perm), BATCH_SIZE):
        idx = perm[start:start + BATCH_SIZE]
        q_batch = q_emb[idx]
        pos_batch = pos_function_idx[idx]

        opt.zero_grad()
        func_emb = model(data.x, data.edge_index)
        loss = info_nce(q_batch, func_emb, pos_batch, temperature)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        opt.step()
        total_loss += loss.item()

    avg_loss = total_loss / num_batches
    print(f"Epoch {epoch:02d} | loss = {avg_loss:.4f} | temp = {temperature:.4f}")

    # optional checkpoint
    if epoch % 10 == 0:
        torch.save(model.state_dict(), f"function_gnn_aligned_epoch{epoch:02d}.pt")


Epoch 01 | loss = 9.3351 | temp = 0.0679
Epoch 02 | loss = 9.2665 | temp = 0.0659
Epoch 03 | loss = 9.1974 | temp = 0.0639
Epoch 04 | loss = 9.1304 | temp = 0.0620
Epoch 05 | loss = 9.0681 | temp = 0.0601
Epoch 06 | loss = 9.0163 | temp = 0.0583
Epoch 07 | loss = 8.9336 | temp = 0.0566
Epoch 08 | loss = 8.8473 | temp = 0.0549
Epoch 09 | loss = 8.7614 | temp = 0.0532
Epoch 10 | loss = 8.7037 | temp = 0.0516
Epoch 11 | loss = 8.5612 | temp = 0.0501
Epoch 12 | loss = 8.4967 | temp = 0.0486
Epoch 13 | loss = 8.4051 | temp = 0.0471
Epoch 14 | loss = 8.3323 | temp = 0.0457
Epoch 15 | loss = 8.2388 | temp = 0.0443
Epoch 16 | loss = 8.1590 | temp = 0.0430
Epoch 17 | loss = 8.0642 | temp = 0.0417
Epoch 18 | loss = 7.9674 | temp = 0.0405
Epoch 19 | loss = 7.8651 | temp = 0.0392
Epoch 20 | loss = 7.6986 | temp = 0.0381
Epoch 21 | loss = 7.6825 | temp = 0.0369
Epoch 22 | loss = 7.4833 | temp = 0.0358
Epoch 23 | loss = 7.4217 | temp = 0.0347
Epoch 24 | loss = 7.3028 | temp = 0.0337
Epoch 25 | loss 

In [86]:
# %% -----------------------------------------------
# ✅ Updated ask() function – compatible with text_align GNN
# ---------------------------------------------------
from sentence_transformers import SentenceTransformer
import torch.nn.functional as F

# Ugyanaz a MiniLM encoder, mint a tanításkor
qa_encoder = SentenceTransformer("all-MiniLM-L6-v2").to(device)
model.eval()

@torch.no_grad()
def get_function_embeddings():
    """Lekéri és szinkronizálja a GNN function embeddingeket a szövegtérbe."""
    model.eval()
    func_emb = model(data.x.to(device), data.edge_index.to(device))
    func_emb = F.normalize(func_emb, dim=-1)  # cosine-simhez normalizálás
    return func_emb

func_emb = get_function_embeddings()

def ask(question: str, k: int = 5):
    """Kérdés → top-k legrelevánsabb function node."""
    q_vec = qa_encoder.encode([question], convert_to_tensor=True).to(device)
    q_vec = F.normalize(q_vec, dim=-1)

    # cosine-similarity a szinkronizált térben
    sim = q_vec @ func_emb.T
    top_vals, top_idx = torch.topk(sim, k)

    print(f"\n❓ Question: {question}\n")
    print("Top-k related functions:\n")
    for rank, (fid, score) in enumerate(zip(top_idx[0].tolist(), top_vals[0].tolist()), start=1):
        row = repograph["function_nodes"].iloc[fid]
        name = row.get("combinedName", f"Function_{fid}")
        doc = row.get("docstring", "")
        print(f"{rank:>2}. {name} (score={score:.4f})")
        if isinstance(doc, str) and doc.strip():
            print(f"   {doc[:180]}{'...' if len(doc) > 180 else ''}\n")


In [87]:
ask("Which test in `sklearn/ensemble/tests/test_bagging.py` was updated to use the `global_random_seed` fixture?")


❓ Question: Which test in `sklearn/ensemble/tests/test_bagging.py` was updated to use the `global_random_seed` fixture?

Top-k related functions:

 1. test_newrand_set_seed (score=0.4108)
   Test that `set_seed` produces deterministic results

 2. pytest_generate_tests (score=0.2841)
   Parametrization of global_random_seed fixture

based on the SKLEARN_TESTS_GLOBAL_RANDOM_SEED environment variable.

The goal of this fixture is to prevent tests that use it to be s...

 3. test_newrand_set_seed_overflow (score=0.2510)
   Test that `set_seed_wrap` is defined for unsigned 32bits ints

 4. test_affinity_propagation_random_state (score=0.2418)
   Check that different random states lead to different initialisations
by looking at the center locations after two iterations.

 5. BaseSearchCV._run_search (score=0.2135)
   Repeatedly calls `evaluate_candidates` to conduct a search.

This method, implemented in sub-classes, makes it possible to
customize the scheduling of evaluations: GridSearchC

In [88]:
ask("** Which Naive Bayes estimators in scikit‑learn are being targeted for array‑API support in the described pull request?")


❓ Question: ** Which Naive Bayes estimators in scikit‑learn are being targeted for array‑API support in the described pull request?

Top-k related functions:

 1. test_fetch_openml_requires_pandas_error (score=0.1916)
   Check that we raise the proper errors when we require pandas.

 2. test_config_array_api_dispatch_error_scipy (score=0.1624)
   Check error when SciPy is too old

 3. bench_scikit_tree_regressor (score=0.1620)
   Benchmark with scikit-learn decision tree regressor

 4. test_raises_value_error_on_same_number_of_classes_and_samples (score=0.1561)
   Tests that if the number of samples equals the number
of classes, a ValueError is raised.

 5. AdamOptimizer._get_updates (score=0.1560)
   Get the values used to update params with given gradients

Parameters
----------
grads : list, length = len(coefs_) + len(intercepts_)
    Containing gradients with respect to coef...



In [70]:
# További tanítás — folytatás az előző checkpointból
EXTRA_EPOCHS = 200   # pl. még 25 epoch
START = 75          # előző utolsó epoch után

for epoch in range(START, START + EXTRA_EPOCHS):
    model.train()
    total_loss = 0.0
    perm = indices[torch.randperm(len(indices))]
    num_batches = math.ceil(len(perm) / BATCH_SIZE)

    for start in range(0, len(perm), BATCH_SIZE):
        idx = perm[start:start + BATCH_SIZE]
        q_batch = q_emb[idx]
        pos_batch = pos_function_idx[idx]

        opt.zero_grad()
        func_emb = model(data.x, data.edge_index)
        loss = info_nce(q_batch, func_emb, pos_batch)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        opt.step()
        total_loss += loss.item()

    avg_loss = total_loss / num_batches
    print(f"Epoch {epoch:02d} | loss = {avg_loss:.4f}")


Epoch 75 | loss = 6.8991
Epoch 76 | loss = 6.8660
Epoch 77 | loss = 6.8823
Epoch 78 | loss = 6.8583
Epoch 79 | loss = 6.8662
Epoch 80 | loss = 6.8413
Epoch 81 | loss = 6.8158
Epoch 82 | loss = 6.8090
Epoch 83 | loss = 6.7740
Epoch 84 | loss = 6.7791
Epoch 85 | loss = 6.7624
Epoch 86 | loss = 6.7714
Epoch 87 | loss = 6.7430
Epoch 88 | loss = 6.7469
Epoch 89 | loss = 6.7392
Epoch 90 | loss = 6.7352
Epoch 91 | loss = 6.6878
Epoch 92 | loss = 6.6808
Epoch 93 | loss = 6.6869
Epoch 94 | loss = 6.6737
Epoch 95 | loss = 6.6768
Epoch 96 | loss = 6.6642
Epoch 97 | loss = 6.6386
Epoch 98 | loss = 6.6098
Epoch 99 | loss = 6.6234
Epoch 100 | loss = 6.6356
Epoch 101 | loss = 6.6045
Epoch 102 | loss = 6.5987
Epoch 103 | loss = 6.5859
Epoch 104 | loss = 6.5764
Epoch 105 | loss = 6.5702
Epoch 106 | loss = 6.5543
Epoch 107 | loss = 6.5470
Epoch 108 | loss = 6.5381
Epoch 109 | loss = 6.5478
Epoch 110 | loss = 6.5346
Epoch 111 | loss = 6.5383
Epoch 112 | loss = 6.5444
Epoch 113 | loss = 6.4825
Epoch 114 |

In [64]:
# %% -----------------------------------------------
# Real-time kérdezés a tanított function-GNN modellen
# -----------------------------------------------
from sentence_transformers import SentenceTransformer
import torch.nn.functional as F

qa_encoder = SentenceTransformer("all-MiniLM-L6-v2").to(device)
model.eval()

@torch.no_grad()
def get_function_embeddings():
    """GNN által tanult function embeddingek (384-D térben)."""
    func_emb = model(data.x.to(device), data.edge_index.to(device))
    func_emb = F.normalize(func_emb, dim=-1)
    return func_emb

func_emb = get_function_embeddings()

def ask(question: str, k: int = 5):
    """Adott kérdéshez legrelevánsabb function node-ok lekérdezése."""
    q_vec = qa_encoder.encode([question], convert_to_tensor=True).to(device)
    q_vec = F.normalize(q_vec, dim=-1)
    sim = q_vec @ func_emb.T
    top_vals, top_idx = torch.topk(sim, k)

    print(f"\n❓ Question: {question}\n")
    print("Top-k related functions:\n")
    for rank, (fid, score) in enumerate(zip(top_idx[0].tolist(), top_vals[0].tolist()), start=1):
        name = repograph["function_nodes"].iloc[fid]["combinedName"]
        doc = repograph["function_nodes"].iloc[fid]["docstring"]
        print(f"{rank:>2}. {name} (score={score:.4f})")
        if isinstance(doc, str) and doc.strip():
            print(f"   {doc[:180]}{'...' if len(doc) > 180 else ''}\n")

# Példa:
ask("How is PCA implemented?")



❓ Question: How is PCA implemented?

Top-k related functions:

 1. BayesianGaussianMixture._estimate_precisions (score=0.1896)
   Estimate the precisions parameters of the precision distribution.

Parameters
----------
nk : array-like of shape (n_components,)

xk : array-like of shape (n_components, n_feature...

 2. Kernel.requires_vector_input (score=0.1804)
   Returns whether the kernel is defined on fixed-length feature
vectors or generic objects. Defaults to True for backward
compatibility.

 3. _check_precision_positivity (score=0.1803)
   Check a precision vector is positive-definite.

 4. BayesianGaussianMixture._check_precision_parameters (score=0.1760)
   Check the prior parameters of the precision distribution.

Parameters
----------
X : array-like of shape (n_samples, n_features)

 5. KNeighborsClassifier.score (score=0.1671)
   Return the mean accuracy on the given test data and labels.

In multi-label classification, this is the subset accuracy
which is a harsh metric si

In [69]:
ask("Which test in `sklearn/ensemble/tests/test_bagging.py` was updated to use the `global_random_seed` fixture?")


❓ Question: Which test in `sklearn/ensemble/tests/test_bagging.py` was updated to use the `global_random_seed` fixture?

Top-k related functions:

 1. test_newrand_set_seed (score=0.4508)
   Test that `set_seed` produces deterministic results

 2. pytest_generate_tests (score=0.3875)
   Parametrization of global_random_seed fixture

based on the SKLEARN_TESTS_GLOBAL_RANDOM_SEED environment variable.

The goal of this fixture is to prevent tests that use it to be s...

 3. test_newrand_set_seed_overflow (score=0.3630)
   Test that `set_seed_wrap` is defined for unsigned 32bits ints

 4. test_affinity_propagation_random_state (score=0.3146)
   Check that different random states lead to different initialisations
by looking at the center locations after two iterations.

 5. test_kernel_pca_precomputed (score=0.2819)
   Test that kPCA works with a precomputed kernel, for all solvers



In [4]:
# %%
# Assumes your KG dict is already in `repograph` (from your notebook).
# If not, uncomment and adjust:

data = HeteroData()

# --- Node features ---
# function: use precomputed docstring embeddings (robust parser)
if "function_nodes" in repograph and "docstring_embedding" in repograph["function_nodes"].columns:
    fn_x = stack_series_to_tensor(repograph["function_nodes"]["docstring_embedding"], fallback_dim=384)
else:
    # If you had no function embeddings, you could fall back to text encodings here.
    raise RuntimeError("function_nodes['docstring_embedding'] not found; please ensure it's in the notebook.")

data["function"].x = fn_x
data["function"].num_nodes = len(repograph["function_nodes"])

# cluster: encode summaries text (or fallback to zeros)
if "cluster_nodes" in repograph and "summary" in repograph["cluster_nodes"].columns:
    encode = build_text_encoder()
    cl_text = repograph["cluster_nodes"]["summary"].fillna("").astype(str).tolist()
    cl_x = torch.tensor(encode(cl_text), dtype=torch.float)
else:
    cl_x = torch.zeros((len(repograph["cluster_nodes"]), 384), dtype=torch.float)

data["cluster"].x = cl_x
data["cluster"].num_nodes = len(repograph["cluster_nodes"])

# --- Edges (keep the core structural edge; add more later if you like) ---
assert "cluster_function_edges" in repograph, "Expected 'cluster_function_edges' in repograph."
cfe = repograph["cluster_function_edges"][["source","target"]].to_numpy(dtype=np.int64).T
data[("cluster","has_function","function")].edge_index = torch.tensor(cfe, dtype=torch.long)

print("Node types:", data.node_types)
print("Edge types:", data.edge_types)
for et in data.edge_types:
    ei = data[et].edge_index
    print(f"{et}: {ei.size(1)} edges")


Text encoder: sentence-transformers (all-MiniLM-L6-v2)
Node types: ['function', 'cluster']
Edge types: [('cluster', 'has_function', 'function')]
('cluster', 'has_function', 'function'): 11128 edges


In [5]:
# %%
# --- Load QA dataset (text → embedding) ---
# If not loaded yet:
qa_df = pd.read_csv("generated_qna_large_gpt.csv", sep="\t").fillna("")

print("QA dataset:", qa_df.shape)
print("Columns:", qa_df.columns.tolist())

# Choose which text fields to embed
QUESTION_TEXT_COL = "questions"
CONTEXT_TEXT_COL  = "summaries"   # or "answers" if you prefer

from tqdm.auto import tqdm
tqdm.pandas()

# --- Text encoder (same as clusters) ---
encode = build_text_encoder()

# --- Compute embeddings ---
print("Encoding QA questions and contexts...")
q_emb_np = encode(qa_df[QUESTION_TEXT_COL].astype(str).fillna("").tolist())
ctx_emb_np = encode(qa_df[CONTEXT_TEXT_COL].astype(str).fillna("").tolist())

# --- Convert to tensors ---
q_emb = torch.tensor(q_emb_np, dtype=torch.float).to(device)
ctx_emb = torch.tensor(ctx_emb_np, dtype=torch.float).to(device)

print(f"Generated embeddings: q_emb={tuple(q_emb.shape)}, ctx_emb={tuple(ctx_emb.shape)}")


QA dataset: (433, 6)
Columns: ['statements', 'comments', 'summaries', 'questions', 'answers', 'scores']
Text encoder: sentence-transformers (all-MiniLM-L6-v2)
Encoding QA questions and contexts...
Generated embeddings: q_emb=(433, 384), ctx_emb=(433, 384)


In [9]:
import pandas as pd

# %%
# Build a text embedding for cluster summaries using the SAME encoder as earlier (or parse if you precomputed them).
# If you already have 'cluster_summary_emb' cached in your notebook, replace cl_text_emb accordingly.

# If we used sentence-transformers above:
try:
    # Reuse the same encoder function built earlier
    cl_text = repograph["cluster_nodes"]["summary"].fillna("").astype(str).tolist()
    cl_text_emb = torch.tensor(encode(cl_text), dtype=torch.float).to(device)
    print("Cluster text embeddings:", tuple(cl_text_emb.shape))
except Exception:
    # fallback: use data["cluster"].x as is
    cl_text_emb = data["cluster"].x.to(device)
    print("Using cluster node features as text embeddings:", tuple(cl_text_emb.shape))

# Normalize
ctx_n = F.normalize(ctx_emb, dim=-1)
clt_n = F.normalize(cl_text_emb, dim=-1)

# Map each QA row to its nearest cluster ID in text space (teacher)
with torch.no_grad():
    sim = ctx_n @ clt_n.T            # [num_qa, num_clusters]
    pos_cluster_idx = sim.argmax(dim=1)  # [num_qa]
pos_cluster_idx = pos_cluster_idx.detach()  # LongTensor
print("positive cluster ids (sample):", pos_cluster_idx[:100].tolist())

# Show the counts of each cluster assignment
pd.Series(pos_cluster_idx.cpu().numpy()).value_counts()

Cluster text embeddings: (20, 384)
positive cluster ids (sample): [11, 19, 19, 11, 11, 17, 17, 17, 11, 7, 17, 17, 17, 17, 18, 17, 19, 17, 17, 19, 11, 11, 17, 17, 17, 17, 11, 11, 17, 17, 17, 11, 17, 19, 5, 11, 17, 17, 11, 17, 11, 11, 17, 17, 17, 17, 11, 17, 4, 17, 11, 19, 19, 17, 19, 17, 17, 17, 17, 1, 17, 13, 17, 17, 11, 11, 1, 19, 5, 11, 17, 1, 17, 19, 17, 17, 17, 17, 17, 14, 11, 17, 17, 17, 17, 11, 17, 17, 17, 6, 17, 17, 11, 9, 17, 10, 17, 4, 17, 17]


17    219
11    112
19     29
10     14
5      13
1       8
4       8
7       6
3       5
9       5
18      4
6       3
13      2
14      2
16      2
2       1
Name: count, dtype: int64

In [25]:
import torch
import torch.nn as nn
from torch_geometric.nn import HGTConv

import torch.nn as nn

class HeteroEncoder(nn.Module):
    def __init__(self, data, hidden=256, num_layers=2, heads=4, dropout=0.1):
        super().__init__()
        self.metadata = (data.node_types, data.edge_types)
        self.ntypes = data.node_types

        self.proj = nn.ModuleDict({
            n: nn.Linear(data[n].x.size(-1), hidden)
            for n in self.ntypes if hasattr(data[n], "x") and data[n].x is not None
        })

        self.convs = nn.ModuleList([
            HGTConv(
                in_channels=hidden,
                out_channels=hidden,
                metadata=self.metadata,
                heads=heads
            )
            for _ in range(num_layers)
        ])

        self.text_align = nn.Linear(hidden, hidden)  # <-- projection head

        self.act = nn.GELU()
        self.drop = nn.Dropout(dropout)

    def forward(self, data):
        x = {n: self.act(self.proj[n](data[n].x)) for n in self.proj.keys()}
        for conv in self.convs:
            x = conv(x, data.edge_index_dict)
            for n in x:
                x[n] = self.drop(self.act(x[n]))
        # Apply projection to improve text–graph alignment
        for n in x:
            x[n] = F.normalize(self.text_align(x[n]), dim=-1)
        return x



# ⚠️ Add reverse edge
rev = data[("cluster", "has_function", "function")].edge_index.flip(0)
data[("function", "rev_has_function", "cluster")].edge_index = rev

# ✅ Reinitialize model + optimizer
model = HeteroEncoder(data, hidden=64, num_layers=2, heads=4, dropout=0.05).to(device)
opt = torch.optim.AdamW(model.parameters(), lr=2e-4, weight_decay=1e-4)
data = data.to(device)


In [26]:
# %%
import math
import torch
import torch.nn.functional as F
from sentence_transformers import util

# ----------------------------------------------------------
# 1️⃣ InfoNCE (CLIP-style in-batch variant)
# ----------------------------------------------------------
def info_nce_inbatch(q, items, temperature=0.02):
    """
    Contrastive InfoNCE with in-batch negatives.
    Each query is paired with its corresponding item in the same batch.
    """
    q = F.normalize(q, dim=-1)
    items = F.normalize(items, dim=-1)

    logits = q @ items.T / temperature          # [B, B]
    labels = torch.arange(q.size(0), device=q.device)  # positives are diagonal
    loss = F.cross_entropy(logits, labels)
    return loss, logits


# ----------------------------------------------------------
# 2️⃣ Ensure pos_function_idx exists and data aligned
# ----------------------------------------------------------
if "pos_function_idx" not in locals() or len(pos_function_idx) != len(qa_df):
    print("Generating pos_function_idx from QA dataset via semantic similarity...")

    func_texts = repograph["function_nodes"]["combinedName"].tolist()
    func_embs = qa_encoder.encode(
        func_texts,
        convert_to_tensor=True,
        device=device,
        show_progress_bar=True
    )

    pos_function_idx_list = []
    for q in qa_df["questions"]:
        q_emb = qa_encoder.encode(q, convert_to_tensor=True, device=device)
        cos_sim = util.cos_sim(q_emb, func_embs)[0]
        best_func = int(torch.argmax(cos_sim))
        pos_function_idx_list.append(best_func)

    pos_function_idx = torch.tensor(pos_function_idx_list, dtype=torch.long, device=device)
    print(f"✅ pos_function_idx generated for {len(pos_function_idx)} questions.")

# ----------------------------------------------------------
# 3️⃣ Align all arrays to the same length
# ----------------------------------------------------------
n_q = min(len(q_emb), len(pos_cluster_idx), len(pos_function_idx))
q_emb = q_emb[:n_q]
pos_cluster_idx = pos_cluster_idx[:n_q]
pos_function_idx = pos_function_idx[:n_q]
indices = torch.arange(n_q, device=device)
print(f"✅ Using {n_q} aligned QA pairs for training")

# ----------------------------------------------------------
# 4️⃣ Dual in-batch InfoNCE training loop (cluster + function)
# ----------------------------------------------------------
EPOCHS = 15
BATCH_SIZE = 64

for epoch in range(1, EPOCHS + 1):
    model.train()
    total_loss = 0.0

    perm = indices[torch.randperm(len(indices))]
    num_batches = math.ceil(len(perm) / BATCH_SIZE)

    for start in range(0, len(perm), BATCH_SIZE):
        idx = perm[start:start + BATCH_SIZE]

        # slice aligned batches
        q_batch = q_emb[idx]
        pos_cluster_batch = pos_cluster_idx[idx]
        pos_func_batch = pos_function_idx[idx]

        # ensure 2D queries
        if q_batch.ndim == 1:
            q_batch = q_batch.unsqueeze(0)

        opt.zero_grad(set_to_none=True)

        # Forward pass through GNN
        xdict = model(data)
        cl_all = F.normalize(xdict["cluster"], dim=-1)
        func_all = F.normalize(xdict["function"], dim=-1)

        # gather the positive cluster/function embeddings for this batch
        cl_batch = cl_all[pos_cluster_batch]
        func_batch = func_all[pos_func_batch]

        # compute dual InfoNCE losses
        loss_cluster, _ = info_nce_inbatch(q_batch, cl_batch, temperature=0.02)
        loss_func, _ = info_nce_inbatch(q_batch, func_batch, temperature=0.02)
        loss = 0.5 * loss_cluster + 0.5 * loss_func

        # backward
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        opt.step()

        total_loss += float(loss.detach().cpu())

    avg_loss = total_loss / num_batches
    print(f"Epoch {epoch:02d} | loss = {avg_loss:.4f}")


Generating pos_function_idx from QA dataset via semantic similarity...


Batches: 100%|██████████| 348/348 [00:22<00:00, 15.52it/s]


✅ pos_function_idx generated for 433 questions.
✅ Using 384 aligned QA pairs for training
Epoch 01 | loss = 4.2364
Epoch 02 | loss = 4.3104
Epoch 03 | loss = 4.2208
Epoch 04 | loss = 4.3034
Epoch 05 | loss = 4.2189
Epoch 06 | loss = 4.2377
Epoch 07 | loss = 4.0687
Epoch 08 | loss = 4.1512
Epoch 09 | loss = 4.1198
Epoch 10 | loss = 4.1452
Epoch 11 | loss = 4.2068
Epoch 12 | loss = 4.1260
Epoch 13 | loss = 4.2332
Epoch 14 | loss = 4.1671
Epoch 15 | loss = 4.0344


In [32]:
# Add this small adapter layer once
model.text_align = torch.nn.Linear(64, 384).to(device)

@torch.no_grad()
def get_function_embeddings():
    xdict = model(data.to(device))
    func_emb = F.normalize(model.text_align(xdict["function"]), dim=-1)
    return func_emb


In [33]:
ask("How is PCA implemented?")


RuntimeError: mat1 and mat2 shapes cannot be multiplied (1x384 and 64x11128)

In [37]:
from sentence_transformers import SentenceTransformer
import torch
import torch.nn.functional as F

qa_encoder = SentenceTransformer("all-MiniLM-L6-v2").to(device)
model.eval()

# 🔧 Fix: make sure projection matches GNN output
if not hasattr(model, "text_align"):
    model.text_align = torch.nn.Linear(384, 384).to(device)  # in_dim must match your GNN output dim

@torch.no_grad()
def get_function_embeddings():
    """Extract normalized function embeddings in text space (384-D)."""
    xdict = model(data.to(device))
    func_emb = model.text_align(xdict["function"])   # project to text space (no-op if already 384)
    func_emb = F.normalize(func_emb, dim=-1)
    return func_emb

function_emb = get_function_embeddings()

@torch.no_grad()
def ask(question: str, k: int = 5):
    """Retrieve top-k most relevant functions for a natural language question."""
    q_vec = qa_encoder.encode([question], convert_to_tensor=True, device=device)
    q_vec = F.normalize(q_vec, dim=-1)  # 384-D
    sim = (q_vec @ function_emb.T).squeeze(0)

    top_vals, top_idx = torch.topk(sim, k)
    print(f"\n❓ Question: {question}\n")
    print("Top-k related functions:\n")
    for rank, (fid, score) in enumerate(zip(top_idx.tolist(), top_vals.tolist()), start=1):
        name = repograph["function_nodes"]["combinedName"].iloc[fid]
        doc = repograph["function_nodes"]["docstring"].iloc[fid]
        snippet = (doc or "").replace("\n", " ")[:180]
        if len(doc) > 180:
            snippet += "..."
        print(f"{rank:>2}. {name} (score={score:.4f})\n   {snippet}\n")


RuntimeError: mat1 and mat2 shapes cannot be multiplied (11128x384 and 64x384)

In [35]:
ask("How is PCA implemented?")


RuntimeError: mat1 and mat2 shapes cannot be multiplied (1x64 and 384x11128)