# HNet SMILES on Google Colab

This notebook installs all dependencies on a Colab GPU runtime and runs training/generation for the SMILES dataset in this repository.

Requirements:

- Colab runtime set to GPU (Runtime -> Change runtime type -> Hardware accelerator: GPU)
- Internet access (to install dependencies and to clone or update the repo)

Tips:

- You can save checkpoints and outputs to Google Drive by mounting it (optional cell included).


In [None]:
# Verify GPU runtime: report interpreter/PyTorch versions and CUDA visibility
import platform
import torch

print("Python:", platform.python_version())
print("Torch:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())
print("CUDA device count:", torch.cuda.device_count())

# Either remind the user to switch the runtime, or name the first CUDA device.
if not torch.cuda.is_available():
    print("Please enable GPU in Runtime -> Change runtime type")
else:
    print("GPU:", torch.cuda.get_device_name(0))

In [None]:
# Optional: mount Google Drive to persist checkpoints
#
# Sets WORKDIR (and exports it as the WORKDIR environment variable, which the
# repo-clone cell reads) to either a Drive folder or /content.
import os

USE_DRIVE = False  # set to True to enable

if USE_DRIVE:
    # Imported only on this branch: google.colab is not needed when Drive is off.
    from google.colab import drive

    drive.mount("/content/drive")
    WORKDIR = "/content/drive/MyDrive/hnet_smiles"
    # Create the folder in-process; raises OSError if it cannot be created
    # instead of failing silently.
    os.makedirs(WORKDIR, exist_ok=True)
else:
    WORKDIR = "/content"

os.environ["WORKDIR"] = WORKDIR
print("Working directory:", WORKDIR)

In [None]:
# Get the repo into the Colab runtime
#
# Clones the repository on first run; on re-runs (a .git directory already
# exists) it attempts a fast-forward pull. Finally changes the working
# directory to the checkout so later cells can use repo-relative paths.
import os, subprocess

repo_url = "https://github.com/jordiferrero/hnet_smiles"
workdir = os.environ.get("WORKDIR", "/content")
target_dir = os.path.join(workdir, "hnet_smiles")

if not os.path.isdir(os.path.join(target_dir, ".git")):
    # A failed clone is fatal: nothing below can work without the sources.
    subprocess.run(["git", "clone", repo_url, target_dir], check=True)
else:
    # Best-effort update. Passing argv directly (git -C <dir>) avoids building a
    # shell command string around the path; a failed pull is reported but does
    # not stop the notebook, so the existing checkout is used as-is.
    pull_result = subprocess.run(
        ["git", "-C", target_dir, "pull", "--ff-only"], check=False
    )
    if pull_result.returncode != 0:
        print("Warning: 'git pull --ff-only' failed; using the existing checkout.")

os.chdir(target_dir)
print("CWD:", os.getcwd())

In [None]:
# Install core Python deps and CUDA-specific libs
import sys, subprocess


def pip_install(*args, required=False):
    """Run ``pip install <args>`` using the interpreter that runs this kernel.

    Args:
        *args: Command-line arguments appended after ``pip install``.
        required: When True, a non-zero pip exit status raises
            ``subprocess.CalledProcessError``. When False, the failure is
            reported with a warning and execution continues.

    Returns:
        bool: True if pip exited with status 0, False otherwise.
    """
    result = subprocess.run(
        [sys.executable, "-m", "pip", "install", *args], check=required
    )
    succeeded = result.returncode == 0
    if not succeeded:
        # Surface best-effort failures so a later ImportError is easier to trace.
        print("WARNING: pip install failed (continuing):", " ".join(args))
    return succeeded


# Upgrade pip
pip_install("--upgrade", "pip", required=True)

# Core deps (best-effort)
pip_install("-r", "setup/requirements.txt")

# CUDA-specific libs (Colab has CUDA; these may take a while). If wheels exist, they will be used.
# FlashAttention
pip_install("flash-attn==2.6.3", "--no-build-isolation")

# mamba_ssm and causal-conv1d pinned to known SHAs
pip_install(
    "git+https://github.com/state-spaces/mamba.git@a6a1dae6efbf804c9944a0c2282b437deb4886d8"
)
pip_install(
    "git+https://github.com/Dao-AILab/causal-conv1d.git@e940ead2fd962c56854455017541384909ca669f"
)

# Install local hnet package last (allow resolving deps already installed)
pip_install("-e", "original_resources/hnet-github-repo", "--no-deps", required=True)

import torch

print("Torch:", torch.__version__, "| CUDA available:", torch.cuda.is_available())

In [None]:
# (Optional) Quick data analysis plot
import json
from pathlib import Path

!python data/analyze_smiles.py --csv-path datasets/PI1M/PI1M_v2.csv --plot --output-dir visualizations

stats_path = Path('visualizations/smiles_statistics.json')
if stats_path.exists():
    with open(stats_path) as f:
        stats = json.load(f)
    print({k: stats['length_stats'][k] for k in ['mean','median','q90','q95']})
else:
    print('Stats file not found (skipping)')



In [None]:
# Train: Small phase (1K samples)
# Runs train_smiles.py with configs/hnet_smiles_small.json, passing
# --phase small, --max-samples 1000, --batch-size 8 and --epochs 5.
# --output-dir is "checkpoints"; the generation and visualization cells look
# for checkpoints/checkpoint_phase_small_epoch_*.pt, so this cell is expected
# to produce those files (confirm the naming in train_smiles.py).
!python train_smiles.py \
  --config configs/hnet_smiles_small.json \
  --phase small \
  --max-samples 1000 \
  --batch-size 8 \
  --epochs 5 \
  --output-dir checkpoints



In [None]:
# Generate a few tokens from the trained small-phase checkpoint
from pathlib import Path
import re, subprocess, sys


def checkpoint_epoch(path):
    """Return the integer that follows ``epoch_`` in the file name of ``path``.

    Args:
        path: A ``str`` or ``Path`` pointing at a checkpoint file.

    Returns:
        int: The parsed number, or -1 when the name has no ``epoch_<digits>``
        part (such files then sort before all numbered ones).
    """
    found = re.search(r"epoch_(\d+)", Path(path).name)
    return int(found.group(1)) if found else -1


# Sort by the numeric epoch rather than by string: a plain sorted() is
# lexicographic, so "epoch_10" would come before "epoch_9" and the last entry
# would not be the latest checkpoint. The name is a deterministic tie-break.
# NOTE(review): assumes the epoch number directly follows "epoch_" in the
# file name -- confirm against train_smiles.py.
ckpt_candidates = (
    sorted(
        Path("checkpoints").glob("checkpoint_phase_small_epoch_*.pt"),
        key=lambda p: (checkpoint_epoch(p), p.name),
    )
    if Path("checkpoints").exists()
    else []
)
ckpt = ckpt_candidates[-1] if ckpt_candidates else None
print("Using checkpoint:", ckpt)

if ckpt is not None:
    # check=True: a failing generation script raises instead of passing silently.
    subprocess.run(
        [
            sys.executable,
            "generate_smiles.py",
            "--checkpoint",
            str(ckpt),
            "--config",
            "configs/hnet_smiles_small.json",
            "--prompt",
            "*",
            "--max-tokens",
            "256",
            "--temperature",
            "1.0",
        ],
        check=True,
    )
else:
    print("No checkpoint found; run the training cell first.")

In [None]:
# Visualize dynamic chunking (creates an animated GIF)
from pathlib import Path
import re, subprocess, sys


def checkpoint_epoch(path):
    """Return the integer that follows ``epoch_`` in the file name of ``path``.

    Defined in this cell so the cell can be run on its own.

    Args:
        path: A ``str`` or ``Path`` pointing at a checkpoint file.

    Returns:
        int: The parsed number, or -1 when the name has no ``epoch_<digits>``
        part (such files then sort before all numbered ones).
    """
    found = re.search(r"epoch_(\d+)", Path(path).name)
    return int(found.group(1)) if found else -1


# Sort by the numeric epoch rather than by string: a plain sorted() is
# lexicographic, so "epoch_10" would come before "epoch_9" and the last entry
# would not be the latest checkpoint. The name is a deterministic tie-break.
# NOTE(review): assumes the epoch number directly follows "epoch_" in the
# file name -- confirm against train_smiles.py.
ckpt_candidates = (
    sorted(
        Path("checkpoints").glob("checkpoint_phase_small_epoch_*.pt"),
        key=lambda p: (checkpoint_epoch(p), p.name),
    )
    if Path("checkpoints").exists()
    else []
)
ckpt = ckpt_candidates[-1] if ckpt_candidates else None
print("Using checkpoint:", ckpt)

# The output folder is created up front, whether or not a checkpoint exists.
out_dir = Path("visualizations/output")
out_dir.mkdir(parents=True, exist_ok=True)
out_path = out_dir / "chunking_example.gif"

if ckpt is not None:
    # check=True: "Saved:" is only printed when the script exits successfully.
    subprocess.run(
        [
            sys.executable,
            "visualizations/visualize_chunking.py",
            "--checkpoint",
            str(ckpt),
            "--config",
            "configs/hnet_smiles_small.json",
            "--text",
            "*CCC[Fe]CCCC(=O)OCCCCOCCCNCC(*)=O",
            "--output",
            str(out_path),
        ],
        check=True,
    )
    print("Saved:", out_path)
else:
    print("No checkpoint found; run the training cell first.")