# Tlearn2rec: Unsupervised GNN on Colab

This notebook runs the unsupervised GNN training pipeline on Google Colab using your repository.

- Installs PyTorch Geometric and dependencies
- Uses your `new/` modules (`config.py`, `datasets.py`, `model.py`, `train_unsupervised.py`, `visualize.py`)
- Trains on `data/processed/facilities` if present; otherwise falls back to a KarateClub demo
- Saves best models under `models/facilities/UnsupervisedGNN`

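Tip: On Colab, either place this project folder at `/content/Tlearn2rec` (upload or clone it there), or keep it elsewhere (e.g., on a mounted Google Drive) and set the `TLEARN2REC_ROOT` environment variable to its path before running the project-location cell below.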


In [None]:
import os, sys, subprocess, pathlib, re

# Detect Colab: the presence of either Colab-specific environment variable
# is treated as "running on Colab".
IN_COLAB = any(marker in os.environ for marker in ('COLAB_RELEASE_TAG', 'COLAB_GPU'))
print(f"Running in Colab: {IN_COLAB}")

# Install deps (PyG wheels matched to current torch/cuda)
def pip_install(args):
    """Run a quiet ``pip install`` with the current interpreter.

    Args:
        args: List of pip arguments (package names and/or options such as
            ``'-f', url``). It is echoed to stdout and then appended to the
            base ``python -m pip install -q`` command.

    Raises:
        subprocess.CalledProcessError: If pip exits with a non-zero status.
    """
    print('pip install', ' '.join(args))
    # Use sys.executable so packages land in the environment running this code.
    base_command = [sys.executable, '-m', 'pip', 'install', '-q']
    subprocess.check_call(base_command + args)

# Torch is usually preinstalled on Colab; ensure availability
try:
    # Probe import only: the unconditional import after this try/except is
    # what binds `torch` for the rest of the notebook in both outcomes.
    import torch  # noqa: F401
except Exception:
    # Any failure while importing torch triggers a pip install of torch,
    # torchvision and torchaudio before the import is retried below.
    pip_install(['torch', 'torchvision', 'torchaudio'])
import torch

# Compute wheel index URL for PyG.
# The index page is selected by (torch version, CUDA tag). The CUDA tag is
# derived from the CUDA version this torch build reports (e.g. '12.1' ->
# 'cu121') rather than being pinned to a single release, so the URL keeps
# matching when the preinstalled torch is built against a different CUDA.
# The CPU index is still used whenever no usable GPU is visible.
_torch_cuda = torch.version.cuda  # None for CPU-only builds
if torch.cuda.is_available() and _torch_cuda:
    cuda_tag = 'cu' + _torch_cuda.replace('.', '')
else:
    cuda_tag = 'cpu'
# Example: 2.4.0 from '2.4.0+cu121'
torch_ver = torch.__version__.split('+')[0]
whl_url = f"https://data.pyg.org/whl/torch-{torch_ver}+{cuda_tag}.html"
print('Torch version:', torch.__version__, '| CUDA:', torch.version.cuda, '| wheel idx:', whl_url)

# Dependency groups, installed with one pip invocation per group, in order.
_dependency_groups = [
    # Core scientific deps
    ['scikit-learn', 'matplotlib', 'networkx'],
    # PyG deps (looked up via the wheel index computed above)
    ['pyg_lib', 'torch_scatter', 'torch_sparse', 'torch_cluster', 'torch_spline_conv', '-f', whl_url],
    ['torch_geometric'],
    # Optimizers
    ['torch-optimizer'],
]
for _group in _dependency_groups:
    pip_install(_group)

# Report GPU visibility; the device name is only queried when CUDA is usable.
_cuda_ok = torch.cuda.is_available()
print('CUDA available:', _cuda_ok)
if _cuda_ok:
    print('Device name:', torch.cuda.get_device_name(0))


In [None]:
# Project location
# Option A (default): project at /content/Tlearn2rec (upload or clone)
# Option B: set env var TLEARN2REC_ROOT to your path (e.g., on Drive)
DEFAULT_ROOT = '/content/Tlearn2rec' if os.path.exists('/content') else str(pathlib.Path.cwd())
PROJECT_ROOT = os.environ.get('TLEARN2REC_ROOT', DEFAULT_ROOT)
print('PROJECT_ROOT =', PROJECT_ROOT)

# Validate with explicit exceptions rather than `assert`, which is skipped
# when Python runs with optimizations enabled.
if not os.path.isdir(PROJECT_ROOT):
    raise FileNotFoundError("Project folder not found.\n"
        "- Upload the Tlearn2rec folder to /content, or\n"
        "- Mount Drive and set os.environ['TLEARN2REC_ROOT'] to the folder path.")

# Make sure Python can import from the 'new/' package
NEW_DIR = str(pathlib.Path(PROJECT_ROOT) / 'new')
if not os.path.isdir(NEW_DIR):
    # Fail here with a pointed message instead of a bare chdir error.
    raise FileNotFoundError(
        f"Expected a 'new' folder inside the project root, but it was not found: {NEW_DIR}"
    )
# Later cells import config/datasets/model by bare module name, so both the
# working directory and sys.path are pointed at 'new/'.
os.chdir(NEW_DIR)
if NEW_DIR not in sys.path:
    sys.path.insert(0, NEW_DIR)
print('Working dir =', os.getcwd())


In [None]:
# Imports from your project
import os
from pathlib import Path
import torch

import config  # resolved from new/config.py due to cwd
from datasets import build_loaders, load_karate
from model import SimpleGCN, SimpleGAT, SimpleSAGE

# Optional: quiet shape prints and reduce workers on Colab
config.DEBUG_SHAPES = False
config.NUM_WORKERS = min(2, config.NUM_WORKERS)

# Shorten epochs for Colab demos; override via EPOCHS env var.
# When EPOCHS is unset the notebook default of 50 is applied.
_epochs_raw = os.environ.get('EPOCHS', 50)
try:
    config.MAX_EPOCHS = int(_epochs_raw)
except ValueError:
    # A non-integer EPOCHS keeps the value from config.py; say so instead of
    # ignoring the override silently.
    print(f"Ignoring invalid EPOCHS={_epochs_raw!r}; config.MAX_EPOCHS left unchanged")

# Check dataset availability
TRAIN_DIR = config.TRAIN_DIR
VALID_DIR = config.VALID_DIR

# Demo mode is used unless the training directory exists and holds at least
# one '*.pt' file.
_train_path = Path(TRAIN_DIR)
_has_train_files = _train_path.exists() and any(_train_path.glob('*.pt'))
DEMO_MODE = not _has_train_files
print('DEMO_MODE =', DEMO_MODE)
print('Train dir:', TRAIN_DIR)
print('Valid dir:', VALID_DIR)


In [None]:
# Training utilities from your script
import importlib
import train_unsupervised as tu  # uses new/train_unsupervised.py via cwd

from train_unsupervised import ContrastiveLoss, train_one_epoch, eval_loss, eval_on_karate

# Try Muon optimizer; when the import fails for any reason, Muon is None and
# the flag below is False so callers can fall back to another optimizer.
try:
    from torch_optimizer import Muon
except Exception:
    Muon = None  # type: ignore
_HAS_MUON = Muon is not None

# Use the GPU when torch reports CUDA as available; otherwise run on CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print('Device:', device)

# Recreate the training loop with optional demo mode
from torch.utils.data import DataLoader
from torch_geometric.data import Data
from torch_geometric.utils import to_undirected

if not DEMO_MODE:
    # Full training using your existing main() across three models
    tu.main()
else:
    # Demo on KarateClub: build a small loader of repeated graphs.
    # PyG's own DataLoader is used because the default collate function of
    # torch.utils.data.DataLoader cannot batch torch_geometric `Data` objects.
    from torch_geometric.loader import DataLoader as GraphDataLoader

    data = load_karate().to(device)
    if data.x is None:
        # No node features: fall back to an identity matrix (one row per node).
        data.x = torch.eye(data.num_nodes, device=device)
    repeats = 8
    demo_loader = GraphDataLoader([data.cpu()] * repeats, batch_size=1, shuffle=True)

    in_channels = data.x.size(-1)
    models = [SimpleGAT, SimpleSAGE, SimpleGCN]

    # Accept EXPERIMENT_DIR as either str or Path, and make sure the folder
    # exists before the first torch.save.
    experiment_dir = Path(config.EXPERIMENT_DIR)
    experiment_dir.mkdir(parents=True, exist_ok=True)

    for M in models:
        model = M(in_channels=in_channels, hidden_channels=config.HIDDEN_DIM, embedding_dim=config.EMBED_DIM).to(device)
        if _HAS_MUON and Muon is not None:
            optimizer = Muon(model.parameters(), lr=config.LR)
        else:
            optimizer = torch.optim.Adam(model.parameters(), lr=config.LR)
        criterion = ContrastiveLoss(temperature=config.TEMPERATURE)

        best_val = float('inf')
        best_path = experiment_dir / f'best_{M.__name__}.pt'
        checkpoint_saved = False

        for epoch in range(1, config.MAX_EPOCHS + 1):
            train_loss = train_one_epoch(model, demo_loader, criterion, optimizer, device)
            # reuse demo loader for val as well in demo mode
            val_loss = eval_loss(model, demo_loader, criterion, device)
            karate_score = eval_on_karate(model, device)
            if epoch % 5 == 0 or epoch == 1:
                print(f"{M.__name__} | epoch {epoch:03d} | train {train_loss:.4f} | val {val_loss:.4f} | karate {karate_score:.4f}")
            # Checkpoint only on a strict improvement (beyond a 1e-6 tolerance).
            if val_loss < best_val - 1e-6:
                best_val = val_loss
                torch.save({'state_dict': model.state_dict(), 'in_channels': in_channels}, best_path)
                checkpoint_saved = True

        # Only claim a checkpoint exists when one was actually written
        # (e.g. not after zero epochs or when the loss never improved).
        if checkpoint_saved:
            print(f"Saved best {M.__name__} to {best_path}")
        else:
            print(f"No checkpoint saved for {M.__name__}: validation loss never improved")


In [None]:
# Visualization on KarateClub using the best SimpleGCN (as an example)
from pathlib import Path
from sklearn.cluster import KMeans, SpectralClustering, AgglomerativeClustering
import torch.nn.functional as F
from visualize import plot_embeddings_and_clusters, plot_network_clusters

ckpt_path = Path(config.EXPERIMENT_DIR).joinpath('best_SimpleGCN.pt')
assert ckpt_path.exists(), f"Checkpoint not found: {ckpt_path}"

# The input width for the model is taken from the KarateClub features.
karate = load_karate().to(device)
if karate.x is None:
    # No node features: fall back to an identity matrix (one row per node).
    karate.x = torch.eye(karate.num_nodes, device=device)
in_channels = karate.x.size(-1)

# Rebuild SimpleGCN with the configured sizes and restore the saved weights.
model = SimpleGCN(
    in_channels=in_channels,
    hidden_channels=config.HIDDEN_DIM,
    embedding_dim=config.EMBED_DIM,
)
model = model.to(device)
state = torch.load(ckpt_path, map_location=device)
model.load_state_dict(state['state_dict'])
model.eval()

# Inference only, so gradient tracking is switched off.
with torch.no_grad():
    emb = model(karate)

# Prepare clustering: the cluster count is the number of distinct labels when
# the graph carries labels, otherwise 2.
_has_labels = getattr(karate, 'y', None) is not None
if _has_labels:
    k = int(torch.unique(karate.y).numel())
else:
    k = 2
emb_np = emb.detach().cpu().numpy()

# One estimator per method; they are fitted in insertion order.
_clusterers = {
    'kmeans': KMeans(n_clusters=k, n_init=10, random_state=0),
    'spectral': SpectralClustering(n_clusters=k, affinity='nearest_neighbors', assign_labels='kmeans', random_state=0),
    'hierarchical': AgglomerativeClustering(n_clusters=k),
}
clustering_results = {
    method: estimator.fit_predict(emb_np)
    for method, estimator in _clusterers.items()
}
kmeans_labels = clustering_results['kmeans']
spectral_labels = clustering_results['spectral']
hier_labels = clustering_results['hierarchical']

# Without ground-truth labels, the k-means assignment stands in for them.
if _has_labels:
    true_labels = karate.y.detach().cpu().numpy()
else:
    true_labels = kmeans_labels

plot_embeddings_and_clusters(emb, clustering_results, true_labels, karate)
plot_network_clusters(karate.cpu(), clustering_results, true_labels)


## Notes
- To use Google Drive, add a cell that mounts it: `from google.colab import drive; drive.mount('/content/drive')`. Then set `os.environ['TLEARN2REC_ROOT'] = '/content/drive/MyDrive/Tlearn2rec'` before running the project-location cell, which is where `TLEARN2REC_ROOT` is read.
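- Adjust `config.MAX_EPOCHS` via the `EPOCHS` environment variable for quicker runs on Colab; when `EPOCHS` is unset, this notebook uses 50. Set it before running the project-imports cell, which is where it is read.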
- Checkpoints are saved to `models/facilities/UnsupervisedGNN` under your project folder.
