# Upper layer의 long bridge 조작 - trial1 w/ faiss

### Data Preparation

In [5]:
import faiss  # hnswlib 대신 faiss 임포트
import numpy as np
import umap
import matplotlib.pyplot as plt
from matplotlib.collections import LineCollection
from sklearn.cluster import KMeans
import h5py
import struct
from collections import defaultdict  # cluster-wise order에 필요

def read_fvecs(filename):
    """Reads .fvecs binary file into np.ndarray of shape (n, d)."""
    with open(filename, 'rb') as f:
        data = f.read()
    dim = struct.unpack('i', data[:4])[0]
    vecs = np.frombuffer(data, dtype=np.float32)
    vecs = vecs.reshape(-1, dim + 1)[:, 1:]  # drop the leading 'dim'
    return vecs

def read_ivecs(filename):
    """Reads .ivecs binary file into np.ndarray of shape (n, k)."""
    with open(filename, 'rb') as f:
        data = f.read()
    dim = struct.unpack('i', data[:4])[0]
    vecs = np.frombuffer(data, dtype=np.int32)
    vecs = vecs.reshape(-1, dim + 1)[:, 1:]
    return vecs

# 데이터셋 경로 (현재 구조에 맞춰 수정)
base_path = "./datasets"
# 데이터셋 경로
file_path = base_path + "/glove-200-angular.hdf5"

# h5py를 사용하여 파일 열기
with h5py.File(file_path, 'r') as f:
    # HDF5 파일 내의 데이터셋 키 확인 (어떤 데이터가 있는지 모를 경우 유용)
    print(f"Keys in HDF5 file: {list(f.keys())}")

    # 각 데이터셋을 numpy 배열로 불러오기
    train = np.array(f['train'])
    test = np.array(f['test'])
    neighbors = np.array(f['neighbors'])
    # distances 데이터셋이 있다면 같이 로드할 수 있습니다.
    # distances = np.array(f['distances'])

# random sample 100,000 from train
seed = 42
n_target = 100_000
rng = np.random.RandomState(seed)
idx = rng.choice(train.shape[0], n_target, replace=False)
train = train[idx]

dim = train.shape[1]
efConstruction = 100
paramM = 16
distance_method = 'cosine'

# --- 3. Naive vs Cluster-wise: 두 인덱스 빌드 & 시각화 ---
print("Building Faiss HNSW (naive vs cluster-wise) ...")

# (1) Cosine ~= Inner Product 를 위해 L2 정규화
train_norm = train.copy()
faiss.normalize_L2(train_norm)

# (2) 하나의 공통 UMAP 레이아웃(공정 비교)
print("Running UMAP for 2D layout...")
reducer = umap.UMAP(
    n_components=2,
    n_neighbors=15,
    min_dist=0.1,
    metric=distance_method,   # 'cosine'
    random_state=42
)
embedding_2d = reducer.fit_transform(train_norm)  # (N, 2)

def build_faiss_hnsw_index(vectors, order, M, efc):
    """지정한 삽입 순서(order)로 HNSW 인덱스를 빌드."""
    index = faiss.IndexHNSWFlat(dim, M, faiss.METRIC_INNER_PRODUCT)
    index.hnsw.efConstruction = efc
    index.add(vectors[order])  # 순서대로 add
    return index

# 쿼리를 아예 제외하고 생각해야함
def extract_layer0_edges(hnsw_index):
    """Faiss HNSW의 layer-0 엣지(인접리스트) 추출."""
    offsets = faiss.vector_to_array(hnsw_index.hnsw.offsets)
    neighbors_flat = faiss.vector_to_array(hnsw_index.hnsw.neighbors)

    edges = []
    for i in range(len(offsets) - 1):
        s, e = offsets[i], offsets[i + 1]
        for nb in neighbors_flat[s:e]:
            if nb >= 0:
                edges.append((i, int(nb)))
    return edges

def to_edge_set_in_original_ids(edges, order):
    """
    hnsw 내부 id로 표현된 edges 를 '원본 샘플 id' 쌍(무방향, 정렬된 튜플)으로 변환해 set으로 반환.
    - edges: [(u_internal, v_internal), ...]
    - order: 내부 id -> 원본 id 로의 매핑을 담은 배열(naive_order / clustered_order)
    """
    order = np.asarray(order)
    s = set()
    for u, v in edges:
        ou = int(order[u])
        ov = int(order[v])
        if ou == ov:
            continue
        # 무방향(edge 중복 제거)을 위해 정렬된 튜플로 저장
        if ou < ov:
            s.add((ou, ov))
        else:
            s.add((ov, ou))
    return s

def filter_by_umap_length(edge_set, coords, threshold=None):
    """
    UMAP 상에서 일정 길이(threshold) 이상인 엣지만 남기고 싶을 때 사용.
    threshold가 None이면 필터링 하지 않음.
    - edge_set: {(u_orig, v_orig), ...}  (원본 id 공간)
    - coords: UMAP 좌표 (원본 id 순서의 (N, 2) 배열)
    """
    if threshold is None:
        return edge_set
    out = set()
    for u, v in edge_set:
        if np.linalg.norm(coords[u] - coords[v]) >= threshold:
            out.add((u, v))
    return out

# (3-A) Naive(랜덤 삽입 순서)
rng_vis = np.random.RandomState(42)
naive_order = rng_vis.permutation(n_target)

index_naive = build_faiss_hnsw_index(train_norm, naive_order, paramM, efConstruction)
edges_naive = extract_layer0_edges(index_naive)
coords_naive = embedding_2d[naive_order]  # 인덱스의 내부 id와 좌표 정렬을 맞춤

# (3-B) Cluster-wise(클러스터 순서로 삽입)
kmeans = KMeans(n_clusters=100, n_init='auto', random_state=21).fit(train_norm)
cluster_data = defaultdict(list)
for i, lbl in enumerate(kmeans.labels_):
    cluster_data[int(lbl)].append(i)

clustered_order = []
for c in sorted(cluster_data.keys()):
    clustered_order.extend(cluster_data[c])

index_clustered = build_faiss_hnsw_index(train_norm, clustered_order, paramM, efConstruction)
edges_clustered = extract_layer0_edges(index_clustered)
coords_clustered = embedding_2d[clustered_order]


Keys in HDF5 file: ['distances', 'neighbors', 'test', 'train']
Building Faiss HNSW (naive vs cluster-wise) ...
Running UMAP for 2D layout...


  warn(


### Upper-layer Long-Bridge Audit & Repair (ULBAR) – scaffolding

In [2]:
import networkx as nx
from itertools import combinations

# --------------- FAISS HNSW helpers (levels & overlay) ---------------

def get_hnsw_levels(hnsw_index):
    """Return numpy array of per-node max level (0-based)."""
    return faiss.vector_to_array(hnsw_index.hnsw.levels)

levels_naive     = get_hnsw_levels(index_naive)
levels_clustered = get_hnsw_levels(index_clustered)

print("Max level (naive, clustered):", levels_naive.max(), levels_clustered.max())

# Build an overlay graph among nodes whose level >= min_level.
# Since FAISS Python API doesn't expose per-level adjacency, we approximate an upper-layer graph
# by constructing a small kNN graph within the upper-node set using cosine (IP on normalized).

def build_upper_overlay_knn(X_norm, node_ids, k=8):
    """Build a small undirected kNN overlay graph among node_ids.
    Returns a NetworkX Graph over original ids and an adjacency dict {u: set(vs)}.
    """
    if len(node_ids) == 0:
        G = nx.Graph(); G.add_nodes_from([])
        return G, {}
    sub = X_norm[node_ids]
    index_flat = faiss.IndexFlatIP(sub.shape[1])
    index_flat.add(sub)
    D, I = index_flat.search(sub, min(k+1, len(node_ids)))  # self included
    adj = {int(node_ids[i]): set() for i in range(len(node_ids))}
    for i in range(len(node_ids)):
        u = int(node_ids[i])
        for j in I[i]:
            j = int(j)
            if j == i:  # skip self
                continue
            v = int(node_ids[j])
            adj[u].add(v)
            adj[v].add(u)
    G = nx.Graph()
    for u, vs in adj.items():
        for v in vs:
            if u < v:
                G.add_edge(u, v)
    return G, adj

# Choose a target upper layer (e.g., min_level = 2 or max_level-1 if shallow)
min_level = int(max(2, int(levels_naive.max()) - 1))
upper_nodes_naive     = np.where(levels_naive     >= min_level)[0]
upper_nodes_clustered = np.where(levels_clustered >= min_level)[0]
print(f"Upper nodes (>=L{min_level}): naive={len(upper_nodes_naive)}, clustered={len(upper_nodes_clustered)}")

G_up_naive,     adj_up_naive     = build_upper_overlay_knn(train_norm, upper_nodes_naive,     k=8)
G_up_clustered, adj_up_clustered = build_upper_overlay_knn(train_norm, upper_nodes_clustered, k=8)

# --------------- Community & medoids ---------------

def louvain_partition(G):
    try:
        from networkx.algorithms.community import louvain_communities
        comms = louvain_communities(G, seed=42)
    except Exception:
        from networkx.algorithms.community import greedy_modularity_communities
        comms = greedy_modularity_communities(G)
    comm_id = {}
    for cid, C in enumerate(comms):
        for v in C:
            comm_id[int(v)] = cid
    return comm_id, [set(c) for c in comms]

def medoid_of_nodes(nodes, X_norm):
    nodes = list(nodes)
    if not nodes:
        return None
    sub = X_norm[nodes]
    S = sub @ sub.T
    Dsum = (1.0 - S).sum(axis=1)
    midx = int(np.argmin(Dsum))
    return int(nodes[midx])

comm_naive,     comms_naive     = louvain_partition(G_up_naive)
comm_clustered, comms_clustered = louvain_partition(G_up_clustered)
print(f"#Communities: naive={len(comms_naive)}, clustered={len(comms_clustered)}")

# --------------- Audit rules ---------------

def coverage_and_ratio(u, adj, comm_id):
    vs = list(adj.get(u, []))
    if not vs:
        return 0, 0.0
    cov = len({comm_id.get(v, -1) for v in vs})
    ic  = sum(1 for v in vs if comm_id.get(v, -1) != comm_id.get(u, -1)) / len(vs)
    return cov, ic

def cosine_dist(u, v, X_norm):
    return 1.0 - float(np.dot(X_norm[u], X_norm[v]))

def jaccard(u, v, adj):
    a, b = adj.get(u, set()), adj.get(v, set())
    if not a and not b:
        return 0.0
    inter = len(a & b); uni = len(a | b)
    return inter / max(uni, 1)

# (A) Isolated highway entrance candidates (low coverage/inter-cluster)
C_min, r_min = 2, 0.25

def find_isolated_nodes(adj, comm_id):
    cands = []
    for u in adj.keys():
        cov, ic = coverage_and_ratio(u, adj, comm_id)
        if cov < C_min or ic < r_min:
            cands.append((u, cov, ic))
    return sorted(cands, key=lambda x: (x[1], x[2]))

iso_naive     = find_isolated_nodes(adj_up_naive,     comm_naive)
iso_clustered = find_isolated_nodes(adj_up_clustered, comm_clustered)
print(f"Isolated@upper: naive={len(iso_naive)}, clustered={len(iso_clustered)} (show 5)")
print("naive examples:", iso_naive[:5])

# (B) Misplaced highway candidates (inter-comm & long & redundant)
length_q = 0.7

def find_long_edges(adj, X_norm, q=0.7):
    # collect all edge lengths
    lens = []
    seen = set()
    for u, vs in adj.items():
        for v in vs:
            a, b = (u, v) if u < v else (v, u)
            if (a, b) in seen: continue
            seen.add((a, b))
            lens.append(cosine_dist(a, b, X_norm))
    lens = np.array(lens) if lens else np.array([0.0])
    tau = float(np.quantile(lens, q))
    return tau


tau_na = find_long_edges(adj_up_naive, train_norm, q=length_q)

def bad_edge_candidates(adj, comm_id, X_norm, tau):
    bad = []
    seen = set()
    for u, vs in adj.items():
        for v in vs:
            a, b = (u, v) if u < v else (v, u)
            if (a, b) in seen: continue
            seen.add((a, b))
            if comm_id.get(a, -1) == comm_id.get(b, -1):
                continue
            d = cosine_dist(a, b, X_norm)
            if d < tau:
                continue  # not long enough
            jac = jaccard(a, b, adj)
            if jac > 0.5:
                bad.append((a, b, d, jac))
    return sorted(bad, key=lambda x: (-x[3], -x[2]))

bad_naive = bad_edge_candidates(adj_up_naive, comm_naive, train_norm, tau_na)
print(f"Bad long-edges (naive): {len(bad_naive)}; examples:", bad_naive[:5])

# (C) Missing-city detection: communities present at layer-0 but absent in upper
# For a quick proxy, use k-means on X_norm as 'semantic clusters' (or reuse existing kmeans above if available)
K_sem = 50
km_sem = KMeans(n_clusters=K_sem, n_init='auto', random_state=123).fit(train_norm)
labels_sem = km_sem.labels_

upper_presence = np.zeros(K_sem, dtype=bool)
for u in upper_nodes_naive:
    upper_presence[labels_sem[u]] = True
missing_clusters = np.where(~upper_presence)[0].tolist()
print(f"Missing (no upper node) semantic clusters: {len(missing_clusters)} (show 5):", missing_clusters[:5])

# For each missing cluster, pick the medoid (closest to its centroid) as a promotion candidate
def cluster_medoid_indices(X_norm, labels, cluster_ids):
    c2idx = {cid: np.where(labels == cid)[0] for cid in cluster_ids}
    promos = []
    for cid, idxs in c2idx.items():
        if len(idxs) == 0: continue
        sub = X_norm[idxs]
        centroid = sub.mean(axis=0)
        d = 1.0 - sub @ centroid  # cosine dist to centroid (approx)
        m = int(idxs[np.argmin(d)])
        promos.append((cid, m))
    return promos

promote_naive = cluster_medoid_indices(train_norm, labels_sem, missing_clusters)
print("Promotion candidates (cluster_id, medoid_id) examples:", promote_naive[:5])


Max level (naive, clustered): 7 7
Upper nodes (>=L6): naive=7, clustered=7
#Communities: naive=1, clustered=1
Isolated@upper: naive=7, clustered=7 (show 5)
naive examples: [(592, 1, 0.0), (1751, 1, 0.0), (3123, 1, 0.0), (3913, 1, 0.0), (5033, 1, 0.0)]
Bad long-edges (naive): 0; examples: []
Missing (no upper node) semantic clusters: 43 (show 5): [0, 1, 2, 3, 4]
Promotion candidates (cluster_id, medoid_id) examples: [(0, 5426), (1, 7359), (2, 9674), (3, 6944), (4, 8408)]


### Overlay proposal (no in-place modification)

In [3]:
# We won't mutate FAISS index. Instead, we propose overlay operations for experimental greedy descent.
# Proposed operations:
#  - ADD for isolated nodes: connect to top-L nearby communities' hubs/medoids
#  - DROP for bad edges

# Build per-community hub list on the upper overlay graph (by degree)

def top_hubs_by_comm(G, comm_id, top_p=0.1, min_per_comm=3):
    deg = dict(G.degree())
    per_comm = defaultdict(list)
    for v, d in deg.items():
        c = comm_id.get(v, -1)
        per_comm[c].append((d, v))
    hubs = {}
    for c, arr in per_comm.items():
        arr.sort(reverse=True)
        k = max(min_per_comm, int(len(arr) * top_p))
        hubs[c] = [v for _, v in arr[:k]]
    return hubs

hubs_naive = top_hubs_by_comm(G_up_naive, comm_naive, top_p=0.2, min_per_comm=2)

# Propose ADDs for the top-10 isolated
L_neigh = 3
K_per_comm = 3

def nearest_communities(node, comm_id, comms, X_norm, L=3):
    cu = comm_id.get(node, -1)
    # distance between node and other communities' medoids
    medoids = {}
    for cid, C in enumerate(comms):
        m = medoid_of_nodes(C, X_norm)
        if m is None: continue
        medoids[cid] = m
    cand = []
    for cid, m in medoids.items():
        if cid == cu: continue
        d = cosine_dist(node, m, X_norm)
        cand.append((d, cid))
    cand.sort()
    return [cid for _, cid in cand[:L]]

add_proposals = []
for (u, cov, ic) in iso_naive[:10]:
    ncomms = nearest_communities(u, comm_naive, comms_naive, train_norm, L=L_neigh)
    for cid in ncomms:
        for v in hubs_naive.get(cid, [])[:K_per_comm]:
            d = cosine_dist(u, v, train_norm)
            jac = jaccard(u, v, adj_up_naive)
            add_proposals.append((u, v, d, jac))

print("ADD proposals (u,v,dist,jacc) examples:", sorted(add_proposals, key=lambda x: (x[2], x[3]))[:10])

# DROP proposals from bad_naive
print("DROP proposals (a,b,dist,jacc) examples:", bad_naive[:10])

# NOTE: next step would be to build an overlay adjacency by applying a small number of ADD/DROP
# and run a custom greedy descent (overlay for upper layers, FAISS for layer-0) to measure recall/visited.


ADD proposals (u,v,dist,jacc) examples: []
DROP proposals (a,b,dist,jacc) examples: []


### Tiny overlay apply (dry run)

In [4]:
from copy import deepcopy

overlay_adj = {u: set(vs) for u, vs in adj_up_naive.items()}  # shallow copy

# Apply up to 10 DROP and 10 ADD from our proposals
for (a,b,dist,jac) in bad_naive[:10]:
    if b in overlay_adj.get(a, set()):
        overlay_adj[a].discard(b)
        overlay_adj.setdefault(b, set()).discard(a)

for (u,v,dist,jac) in sorted(add_proposals, key=lambda x: (x[2], -x[3]))[:10]:
    overlay_adj.setdefault(u, set()).add(v)
    overlay_adj.setdefault(v, set()).add(u)

# Report coverage/inter-cluster for a few isolated nodes before/after
def coverage_ratio_on(adj, comm_id, nodes):
    out = []
    for u in nodes:
        cov, ic = coverage_and_ratio(u, adj, comm_id)
        out.append((u, cov, ic))
    return out

print("Before (first 5 iso):", coverage_ratio_on(adj_up_naive, comm_naive, [u for u,_,_ in iso_naive[:5]]))
print("After  (first 5 iso):", coverage_ratio_on(overlay_adj,  comm_naive, [u for u,_,_ in iso_naive[:5]]))


Before (first 5 iso): [(592, 1, 0.0), (1751, 1, 0.0), (3123, 1, 0.0), (3913, 1, 0.0), (5033, 1, 0.0)]
After  (first 5 iso): [(592, 1, 0.0), (1751, 1, 0.0), (3123, 1, 0.0), (3913, 1, 0.0), (5033, 1, 0.0)]
