# Graph ML Template – Node Classification & Link Signals (Classical)

This is a **lightweight skeleton** for graph-based problems, where you have:

- **Nodes**: entities (players, users, items)  
- **Edges**: relationships (teammates, co-occurrence, similarity)  

Instead of full GNNs, this template uses:

- NetworkX to build the graph  
- Graph-based node features (degree, clustering coefficient, PageRank)  
- Classical ML for **node classification**  
- A sketch of simple link features for **link prediction**  


In [None]:
# ========== 1. Imports & Config (Graph ML) ==========

from pathlib import Path
from typing import Optional

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import networkx as nx

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.ensemble import RandomForestClassifier

# Display settings: show up to 100 rows and 100 columns of a DataFrame.
pd.options.display.max_rows = 100
pd.options.display.max_columns = 100

# Plot defaults. The seaborn style is applied first; the matplotlib
# rcParams are set afterwards, in the same order as before.
sns.set(style="whitegrid")
plt.rcParams.update({"figure.figsize": (10, 6), "figure.dpi": 100})

# ---- Config ----
# Directory that holds the input CSV files.
DATA_DIR = Path("../input")


# Node table - columns: node_id, optional features, optional label
NODES_FILE = "nodes.csv"
# Edge table - columns: source, target
EDGES_FILE = "edges.csv"

NODE_ID_COL = "node_id"
# Name of the label column; set to None for unsupervised use.
NODE_LABEL_COL: Optional[str] = "label"


In [None]:
# ========== 2. Load Nodes & Edges, Build Graph ==========

def load_nodes_edges(data_dir: Path = DATA_DIR, nodes_file: str = NODES_FILE, edges_file: str = EDGES_FILE):
    """Read the node and edge CSV files and return them as DataFrames.

    Args:
        data_dir: Directory containing both CSV files.
        nodes_file: File name of the node table inside ``data_dir``.
        edges_file: File name of the edge table inside ``data_dir``.

    Returns:
        A ``(nodes, edges)`` tuple of DataFrames.

    Raises:
        FileNotFoundError: If either file does not exist.
    """
    nodes_csv, edges_csv = data_dir / nodes_file, data_dir / edges_file

    # Fail early with an explicit message before any file is parsed.
    if not nodes_csv.exists():
        raise FileNotFoundError(f"Nodes file not found: {nodes_csv}")
    if not edges_csv.exists():
        raise FileNotFoundError(f"Edges file not found: {edges_csv}")

    node_table = pd.read_csv(nodes_csv)
    edge_table = pd.read_csv(edges_csv)

    # Quick sanity output: shapes first, then a preview of each table.
    print("Nodes shape:", node_table.shape)
    print("Edges shape:", edge_table.shape)
    display(node_table.head())
    display(edge_table.head())

    return node_table, edge_table


# Load both tables using the configured directory and file names.
nodes_df, edges_df = load_nodes_edges()

# Undirected graph built from the "source"/"target" columns of the edge table.
G = nx.from_pandas_edgelist(
    edges_df,
    source="source",
    target="target",
    create_using=nx.Graph(),
)

# An edge list only yields nodes that take part in at least one edge.
# Add every node listed in the nodes table explicitly so that nodes without
# edges are kept in the graph (and therefore in everything derived from it).
G.add_nodes_from(nodes_df[NODE_ID_COL].tolist())

# Attach the remaining node-table columns as node attributes, keyed by node id.
attr_dict = nodes_df.set_index(NODE_ID_COL).to_dict(orient="index")
nx.set_node_attributes(G, attr_dict)

print("Graph built:", G.number_of_nodes(), "nodes,", G.number_of_edges(), "edges")


### 3️⃣ Node Features from Graph Structure

We compute simple graph features:

- Degree  
- Clustering coefficient  
- PageRank  

For large graphs, you may want to skip the more expensive features.


In [None]:
# Structural features; each is a {node: value} mapping that is also stored
# on the graph as a node attribute.
deg = dict(G.degree())
nx.set_node_attributes(G, deg, "degree")


clust = nx.clustering(G)
nx.set_node_attributes(G, clust, "clustering")


pr = nx.pagerank(G, alpha=0.85)
nx.set_node_attributes(G, pr, "pagerank")


# Build the feature table from the graph-derived metrics only.
# Exporting *all* node attributes would copy every nodes_df column (including
# the label) back out of the graph, and the merge below would then duplicate
# them as "<col>_orig" / "<col>" - leaving a copy of the label among the
# features used for classification.
nodes_feat = pd.DataFrame({"degree": deg, "clustering": clust, "pagerank": pr})
nodes_feat.index.name = NODE_ID_COL
nodes_feat.reset_index(inplace=True)

print("Node features from graph:")
display(nodes_feat.head())

# The suffixes now only apply if nodes_df already has a column named like one
# of the graph metrics; in that case the original column gets "_orig".
nodes_full = nodes_df.merge(nodes_feat, on=NODE_ID_COL, suffixes=("_orig", ""))
display(nodes_full.head())


### 4️⃣ Node Classification Baseline

If labels are present, use the graph-derived features (and any original node features) to predict node labels.


In [None]:
if NODE_LABEL_COL is not None and NODE_LABEL_COL in nodes_full.columns:
    # Only rows with a known label can be used for supervised training;
    # missing labels would also break the stratified split below.
    labelled = nodes_full.dropna(subset=[NODE_LABEL_COL])

    # Remove the id, the label, and the "<label>_orig" copy that a merge with
    # suffixes=("_orig", "") produces when the label exists on both sides.
    # Leaving that copy in X would leak the target into the features.
    non_feature_cols = [NODE_ID_COL, NODE_LABEL_COL, f"{NODE_LABEL_COL}_orig"]
    X = labelled.drop(columns=non_feature_cols, errors="ignore")

    # Keep only numeric/boolean columns: free-text or categorical string
    # columns cannot be passed to the classifier without encoding.
    X = X.select_dtypes(include=["number", "bool"])
    y = labelled[NODE_LABEL_COL]

    X_train, X_valid, y_train, y_valid = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    clf = RandomForestClassifier(
        n_estimators=300,
        max_depth=None,
        n_jobs=-1,
        random_state=42,
    )
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_valid)

    print("Node classification baseline:")
    print("Accuracy:", accuracy_score(y_valid, y_pred))
    print("F1 (weighted):", f1_score(y_valid, y_pred, average="weighted"))
    print(classification_report(y_valid, y_pred, digits=4))
else:
    print("NODE_LABEL_COL not set or missing – skipping node classification.")


### 5️⃣ Simple Link Prediction Features (Sketch)

For link prediction you usually:

1. Build examples of existing edges (positives) and sampled non-edges (negatives).  
2. Compute edge-level features:
   - Common neighbors count  
   - Jaccard coefficient  
   - Preferential attachment, etc.  
3. Train a classifier on these features.

Below we show an example feature (common neighbors) for existing edges.


In [None]:
def common_neighbors_count(G, u, v):
    """Return the number of common neighbors of ``u`` and ``v`` in ``G``."""
    shared = nx.common_neighbors(G, u, v)
    # Count by iterating, without building an intermediate list.
    return sum(1 for _ in shared)


# One feature row per existing edge. Iterating the two columns directly
# (instead of DataFrame.iterrows) keeps each id in its own column dtype -
# iterrows packs a row into a single Series, which can turn integer ids into
# floats when the edge table also has float columns - and avoids building a
# Series object for every row.
edge_features = [
    {"source": u, "target": v, "common_neighbors": common_neighbors_count(G, u, v)}
    for u, v in zip(edges_df["source"], edges_df["target"])
]

# Explicit columns keep the frame's layout stable even for an empty edge list.
edge_feat_df = pd.DataFrame(
    edge_features, columns=["source", "target", "common_neighbors"]
)
display(edge_feat_df.head())

print("For real link prediction, sample non-edges as negatives,")
print("compute same features, and train a binary classifier.")
