# Simplified Constraint-Based Molecule Generation
## GPT2-Zinc Baseline + SmileyLlama

In [None]:
!pip install -r requirements.txt
import torch
print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("Device:", torch.cuda.get_device_name(0))


## 1. Analyze Training Data and Compute Constraint Ranges

In [None]:
from src.analyze_train_data import main as analyze_train_data

# Run the project's training-data analysis entry point, which computes the
# constraint ranges for the loose / tight / ultra_tight levels.
# NOTE(review): the next cell reads data/train_property_ranges.json, which is
# presumably written by this call — confirm against src/analyze_train_data.
analyze_train_data()


In [None]:
import json
import pandas as pd

# Load the constraint ranges from the JSON file on disk.
with open("data/train_property_ranges.json", "r", encoding="utf-8") as fp:
    ranges = json.load(fp)

# Print every constraint level that is present for the Combined dataset,
# one "property: low - high" line per property (two decimal places).
print("Constraint Levels (Combined dataset):")
combined_levels = ranges.get("Combined", {})
for level in ("loose", "tight", "ultra_tight"):
    if level not in combined_levels:
        # Levels missing from the file are skipped silently.
        continue
    print(f"\n{level.upper()}:")
    for prop, bounds in combined_levels[level].items():
        low, high = bounds
        print(f"  {prop}: {low:.2f} - {high:.2f}")


## 2. View Available Prompts

In [None]:
from src.utils import SMILEY_PROMPTS, GPT_ZINC_PROMPTS

# Tabulate the SmileyLlama prompts (instruction-style) by name and text.
print("SmileyLlama Prompts (instruction-style):")
smiley_df = pd.DataFrame(
    {
        "name": [p.name for p in SMILEY_PROMPTS],
        "text": [p.text for p in SMILEY_PROMPTS],
    }
)
# Jupyter only auto-renders the LAST bare expression of a cell, so a bare
# `smiley_df` here would never be shown. Render it explicitly instead;
# `display` is provided by the IPython kernel without an import.
display(smiley_df)

# Tabulate the GPT2-Zinc prefixes (prefix-style) by name and text.
print("\nGPT2-Zinc Prefixes (prefix-style):")
gpt_zinc_df = pd.DataFrame(
    {
        "name": [p.name for p in GPT_ZINC_PROMPTS],
        "text": [p.text for p in GPT_ZINC_PROMPTS],
    }
)
# Last expression of the cell: rendered automatically as the cell output.
gpt_zinc_df


## 3. GPT2-Zinc Baseline Generation (Constraint-Based)

In [None]:
import glob

from src.baseline_generate_constraint import run_constraint_experiment

# Settings shared by every baseline run; only the constraint level and the
# per-level output paths change between iterations.
baseline_settings = dict(
    property_ranges_path="data/train_property_ranges.json",
    dataset="Combined",
    n=1_000,
    temperature=1.0,
    top_p=0.9,
    batch_size=256,
)

# One GPT2-Zinc baseline experiment per constraint level.
for level in ("loose", "tight", "ultra_tight"):
    print(f"\nRunning {level} constraints...")
    run_constraint_experiment(
        constraint_level=level,
        out_csv=f"results/baseline_{level}_results.csv",
        summary_csv=f"results/baseline_{level}_summary.csv",
        **baseline_settings,
    )

# Concatenate every per-level results CSV found on disk into a single file.
# The merged file's own name does not match the glob pattern, so a re-run
# does not fold the previous merge back into itself.
files = sorted(glob.glob('results/baseline_*_results.csv'))
if files:
    combined = pd.concat([pd.read_csv(path) for path in files], ignore_index=True)
    combined.to_csv('results/baseline_results.csv', index=False)
    print(f"\nCombined {len(files)} files into baseline_results.csv")


## 4. SmileyLlama Generation (Constraint-Based)

In [None]:
# Imported here so this cell runs on its own: `glob` is otherwise only bound
# partway through the baseline cell, so skipping that section would raise
# NameError when the per-level results are merged below.
import glob

import pandas as pd

# NOTE: this rebinds `run_constraint_experiment`, replacing the baseline
# function of the same name imported in the previous section.
from src.smiley_generate_constraint import run_constraint_experiment

# Run one SmileyLlama experiment per constraint level, writing per-level
# results and summary CSVs under results/.
for level in ["loose", "tight", "ultra_tight"]:
    print(f"\nRunning {level} constraints...")
    run_constraint_experiment(
        constraint_level=level,
        property_ranges_path="data/train_property_ranges.json",
        dataset="Combined",
        base_prompt_name="mw_logp_rotb",
        n=1_000,
        temperature=1.0,
        top_p=0.9,
        batch_size=128,
        quantize=True,
        out_csv=f"results/smiley_{level}_results.csv",
        summary_csv=f"results/smiley_{level}_summary.csv",
    )

# Concatenate every per-level results CSV found on disk into a single file.
# The merged file's own name does not match the glob pattern, so a re-run
# does not fold the previous merge back into itself.
files = sorted(glob.glob('results/smiley_*_results.csv'))
if files:
    dfs = [pd.read_csv(f) for f in files]
    combined = pd.concat(dfs, ignore_index=True)
    combined.to_csv('results/smiley_results.csv', index=False)
    print(f"\nCombined {len(files)} files into smiley_results.csv")


## 5. Evaluation

In [None]:
from src import evaluate

# Run the project's evaluation entry point.
# NOTE(review): presumably this consumes the generation CSVs under results/
# and produces results/summary_table.csv and results/panel_table.csv, which
# the following cells load — confirm against src/evaluate.
evaluate.main()


In [None]:
# View summary table: load results/summary_table.csv as a DataFrame; being the
# last expression of the cell, it is rendered as the cell output.
pd.read_csv("results/summary_table.csv")


In [None]:
# View panel table: load results/panel_table.csv as a DataFrame; being the
# last expression of the cell, it is rendered as the cell output.
pd.read_csv("results/panel_table.csv")


## 6. Generate Figures

In [None]:
from src import plots

# Run the project's plotting entry point to generate the figures.
# NOTE(review): where the figures are written (or whether they are shown
# inline) is not visible from this notebook — confirm against src/plots.
plots.main()
