# This notebook shows a step-by-step on how the semantic network is built

## 0. Imports

In [1]:
import pandas as pd
import numpy as np
import networkx as nx
from sentence_transformers import SentenceTransformer
from sklearn.neighbors import NearestNeighbors
from collections import defaultdict

## 1. Config

In [2]:
DATA_PATH = "../data/materials_data_final.csv"   # ['source', 'functions', 'application']

# sBERT model (small, fast, good quality)
SBERT_MODEL = "all-MiniLM-L6-v2"

# Similarity graph parameters for *within-category* edges
SIM_THRESHOLD = 0.62     # only add intra-category edges if cosine sim >= this
TOP_K = 10               # OR, keep up to top-K neighbors per node (after threshold)

# Function splitting
FUNCTION_SEPARATORS = [";", "|", "\n", ","]  # adjust if needed
LOWERCASE_KEYS = True    # normalize node keys to avoid duplicates

## 2. Load and check data

In [3]:
df = pd.read_csv(DATA_PATH)

# Basic sanity checks
expected_cols = {"source", "functions", "application"}
missing = expected_cols - set(df.columns)
assert not missing, f"Missing required columns: {missing}"

# Clean helpers
def norm_text(s):
    if pd.isna(s): 
        return ""
    s = str(s).strip()
    return s.lower() if LOWERCASE_KEYS else s

def split_functions(s):
    if pd.isna(s) or not str(s).strip():
        return []
    text = str(s)
    for sep in FUNCTION_SEPARATORS:
        text = text.replace(sep, ";")
    parts = [p.strip() for p in text.split(";")]
    parts = [p for p in parts if p]  # drop empties
    return parts

# Normalize/expand rows
rows = []
for _, r in df.iterrows():
    src = norm_text(r["source"])
    app = norm_text(r["application"])
    funcs = [norm_text(f) for f in split_functions(r["functions"])]
    # If no functions, keep an empty list (we'll still add source<->application)
    rows.append({"source": src, "functions": funcs, "application": app})

df_norm = pd.DataFrame(rows)
df_norm.head()


Unnamed: 0,source,functions,application
0,denim,"[recycle-yarn, weave-fabric, support-innovatio...",large work of art presented to the dutch royal...
1,bacterial dye,"[produce pigments, create sustainable alternat...",microbial colour library for dyeing textiles
2,basalt,"[reinforce-fabric, prevent-algal growth, exten...",reinforcement fabric for maritime applications
3,flax fibers,"[provide structural support, reduce environmen...",interior wall panels
4,cotton,"[recycle-textiles, create-carpets, reduce-wast...",high-quality recycled carpets


## 3. Build node dictionaries

In [4]:
# We’ll keep canonical keys for graph nodes, and also store a display label.
# (Here, since we normalized to lower-case, the key == label; if you want
#  nicer labels, keep a separate mapping.)

sources = set()
functions = set()
applications = set()

for r in rows:
    if r["source"]: sources.add(r["source"])
    if r["application"]: applications.add(r["application"])
    for f in r["functions"]:
        functions.add(f)

print(f"#sources={len(sources)}, #functions={len(functions)}, #applications={len(applications)}")


#sources=1314, #functions=4144, #applications=1342


## 4. Initialize graph and add nodes

In [5]:
G = nx.Graph()

def add_nodes(category, items):
    for x in items:
        if not x: 
            continue
        G.add_node(
            x,
            label=x,           # you can store original (non-lowercased) label if you kept it
            category=category  # 'source' | 'function' | 'application'
        )

add_nodes("source", sources)
add_nodes("function", functions)
add_nodes("application", applications)

print(f"Graph now has {G.number_of_nodes()} nodes.")


Graph now has 6800 nodes.


## 5. Add inter-category edges

In [6]:
def bump_edge(u, v, edge_type="cooccurrence", weight=1.0):
    if u == v:
        return
    if G.has_edge(u, v):
        # accumulate weight if already present
        G[u][v]["weight"] += weight
        # if we’re mixing edge types, we can keep a set; here we keep the last
        G[u][v]["edge_type"] = edge_type
    else:
        G.add_edge(u, v, weight=weight, edge_type=edge_type)

for r in rows:
    s = r["source"]
    a = r["application"]
    f_list = r["functions"]

    # source <-> each function
    for f in f_list:
        if s and f:
            bump_edge(s, f, edge_type="cooccurrence", weight=1.0)

    # source <-> application
    #if s and a:
    #    bump_edge(s, a, edge_type="cooccurrence", weight=1.0)

    # each function <-> application
    for f in f_list:
        if f and a:
            bump_edge(f, a, edge_type="cooccurrence", weight=1.0)

print(f"After co-occurrence edges: {G.number_of_edges()} edges.")


After co-occurrence edges: 10764 edges.


## 6. Embed each category with sBERT

In [7]:
model = SentenceTransformer(SBERT_MODEL)

def embed_list(items):
    # items: list of node keys (strings)
    # returns np.array (n, d)
    if not items:
        return np.empty((0, 384))  # MiniLM dim
    return np.array(model.encode(items, show_progress_bar=True, normalize_embeddings=True))

# Prepare per-category lists for stable ordering
src_list = sorted(list(sources))
fun_list = sorted(list(functions))
app_list = sorted(list(applications))

src_emb = embed_list(src_list)
fun_emb = embed_list(fun_list)
app_emb = embed_list(app_list)

src_emb.shape, fun_emb.shape, app_emb.shape

Batches:   0%|          | 0/42 [00:00<?, ?it/s]

Batches:   0%|          | 0/130 [00:00<?, ?it/s]

Batches:   0%|          | 0/42 [00:00<?, ?it/s]

((1314, 384), (4144, 384), (1342, 384))

## 7. Add intra-category semantic edges with nearest neighbors

In [8]:
def add_semantic_edges(labels, embeddings, edge_type="semantic",
                       sim_threshold=SIM_THRESHOLD, top_k=TOP_K):
    n = len(labels)
    if n <= 1:
        return 0

    # NearestNeighbors with cosine metric returns distances; similarity = 1 - distance
    nbrs = NearestNeighbors(n_neighbors=min(top_k+1, n), metric="cosine", algorithm="auto")
    nbrs.fit(embeddings)
    distances, indices = nbrs.kneighbors(embeddings)

    added = 0
    for i, (dists, nbr_idx) in enumerate(zip(distances, indices)):
        u = labels[i]
        for dist, j in zip(dists, nbr_idx):
            if i == j:
                continue
            sim = 1.0 - float(dist)
            if sim >= sim_threshold:
                v = labels[j]
                # Avoid inter-category mixing here — we're passed a single category list
                if not G.has_edge(u, v):
                    G.add_edge(u, v, weight=sim, edge_type=edge_type)
                    added += 1
                else:
                    # If an edge already exists (e.g., from cooccurrence), keep the larger weight?
                    # Here we keep both concepts: cooccurrence edges are counts ≥1; semantic are ≤1
                    # We’ll simply set the edge_type to 'semantic+cooccurrence' if both existed.
                    e = G[u][v]
                    if e.get("edge_type") == "cooccurrence":
                        e["edge_type"] = "semantic+cooccurrence"
                        # Keep the cooccurrence weight (count) as-is.
                        # Optionally store a separate field:
                        e["semantic_sim"] = sim
                    else:
                        # Another semantic candidate; update to max sim
                        e["weight"] = max(e["weight"], sim)
                        e["edge_type"] = "semantic"
    return added

added_src = add_semantic_edges(src_list, src_emb, edge_type="semantic")
added_fun = add_semantic_edges(fun_list, fun_emb, edge_type="semantic")
added_app = add_semantic_edges(app_list, app_emb, edge_type="semantic")

print(f"Semantic edges added — sources: {added_src}, functions: {added_fun}, applications: {added_app}")
print(f"Total edges now: {G.number_of_edges()}")


Semantic edges added — sources: 4971, functions: 15845, applications: 5115
Total edges now: 36695


## 8. Diagnostics

In [9]:
# Node counts by category
from collections import Counter
cats = Counter(nx.get_node_attributes(G, "category").values())
print("Node categories:", cats)

# Edge-type distribution
etype_counts = Counter(nx.get_edge_attributes(G, "edge_type").values())
print("Edge types:", etype_counts)

# Example: top neighbors by weight for a sample node
def top_neighbors(node, k=10):
    if node not in G:
        return []
    nbrs = []
    for v in G[node]:
        nbrs.append((v, G[node][v].get("weight", 0.0), G[node][v].get("edge_type", "")))
    return sorted(nbrs, key=lambda x: x[1], reverse=True)[:k]

sample = next(iter(sources)) if sources else None
if sample:
    print(f"\nTop neighbors for: {sample}")
    for v, w, et in top_neighbors(sample, k=10):
        print(f"  - {v}  (w={w:.3f}, type={et})")


Node categories: Counter({'function': 4144, 'application': 1342, 'source': 1314})
Edge types: Counter({'semantic': 25931, 'cooccurrence': 10764})

Top neighbors for: optical tiles
  - reflect-light  (w=1.000, type=cooccurrence)
  - sculpt-image  (w=1.000, type=cooccurrence)
  - transform-surface  (w=1.000, type=cooccurrence)
  - ceramic tiles  (w=0.667, type=semantic)


## 9. Save graph

In [10]:
# Graph files
nx.write_gexf(G, "materials_semantic_network.gexf")     # Gephi/pyvis friendly
nx.write_graphml(G, "materials_semantic_network.graphml")

# Optional: CSV edge list
edge_rows = []
for u, v, data in G.edges(data=True):
    edge_rows.append({
        "u": u,
        "v": v,
        "weight": data.get("weight", 1.0),
        "edge_type": data.get("edge_type", "")
    })
pd.DataFrame(edge_rows).to_csv("materials_semantic_edges.csv", index=False)

# Optional: Save node table
node_rows = []
for n, data in G.nodes(data=True):
    node_rows.append({
        "node": n,
        "label": data.get("label", n),
        "category": data.get("category", "")
    })
pd.DataFrame(node_rows).to_csv("materials_semantic_nodes.csv", index=False)

print("Saved: GEXF, GraphML, nodes.csv, edges.csv")


Saved: GEXF, GraphML, nodes.csv, edges.csv
