# Clustering Amines and Aldehydes

This notebook runs the clustering script on `data.csv` and displays its outputs.
Make sure `data.csv` and `clustering_amines_aldehydes.py` are in the notebook's working directory (the repo root), since every path below is relative.

In [None]:
import pandas as pd
pd.options.display.max_columns = 200

# Load CSV (expected to be semicolon-separated).
#
# A wrong delimiter normally does NOT raise: pandas simply returns a frame with
# a single wide column. The delimiter-sniffing fallback is therefore triggered
# both on an outright parse failure and when the ';' parse yields one column.
# Other errors (e.g. a missing file) propagate directly instead of being retried.
try:
    df = pd.read_csv('data.csv', sep=';')
except pd.errors.ParserError:
    df = pd.read_csv('data.csv', sep=None, engine='python')
else:
    if df.shape[1] == 1:
        # Possibly the wrong delimiter -- or a genuine single-column file, for
        # which sniffing may fail. Keep the ';' result unless sniffing succeeds
        # and actually finds more than one column.
        try:
            sniffed = pd.read_csv('data.csv', sep=None, engine='python')
        except Exception:
            sniffed = None
        if sniffed is not None and sniffed.shape[1] > 1:
            df = sniffed

df.head()

In [None]:
# Run the clustering script
import subprocess, sys
# Invoke the script with the same interpreter that runs this kernel so it sees
# the same installed packages.
cmd = [sys.executable, 'clustering_amines_aldehydes.py', '--input', 'data.csv', '--out_prefix', 'binder_results']
print('Running:', ' '.join(cmd))
proc = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)

# Always show what the script printed, even when it failed, to aid debugging.
print(proc.stdout)
if proc.stderr:
    print('--- STDERR ---')
    print(proc.stderr)

# Stop here on a non-zero exit status (raises subprocess.CalledProcessError).
# Otherwise the cells below could silently display stale binder_results_* files
# left over from an earlier successful run.
proc.check_returncode()

In [None]:
# Load summaries
import os

def safe_read(p):
    """Read the CSV at path ``p`` into a DataFrame, tolerating a missing file.

    Returns the result of ``pd.read_csv(p)`` when the path exists. When it
    does not, prints a ``File not found`` notice and returns ``None``.
    """
    if not os.path.exists(p):
        print(f'File not found: {p}')
        return None
    return pd.read_csv(p)

# Cluster-level summaries (one table per compound class).
a_cluster = safe_read('binder_results_amines_cluster_summary.csv')
d_cluster = safe_read('binder_results_aldehydes_cluster_summary.csv')
# Per-SMILES tables; loaded here, shown in a later cell.
a_per = safe_read('binder_results_amines_per_smiles.csv')
d_per = safe_read('binder_results_aldehydes_per_smiles.csv')

# Show each cluster summary under a plain-text heading.
for heading, table in (
    ('Amines cluster summary:', a_cluster),
    ('\nAldehydes cluster summary:', d_cluster),
):
    print(heading)
    display(table)

In [None]:
# Display per-smiles and UMAP images
from IPython.display import display, Image, Markdown

# Per-SMILES tables first, each under a plain-text heading.
for heading, table in (
    ('Amines per-smiles:', a_per),
    ('\nAldehydes per-smiles:', d_per),
):
    print(heading)
    display(table)

# Then each UMAP figure under a bold caption; a missing image is skipped.
for caption, png_path in (
    ('**Amines UMAP**', 'binder_results_amines_umap.png'),
    ('**Aldehydes UMAP**', 'binder_results_aldehydes_umap.png'),
):
    if os.path.exists(png_path):
        display(Markdown(caption))
        display(Image(png_path))

---

You can modify the parameters in `clustering_amines_aldehydes.py` (e.g., Butina cutoff, `smoothing_alpha`), then re-run the script cell and the cells below it to refresh the displayed results.