# LibraryA Reaction Clustering Notebook

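Этот ноутбук загружает CSV с реакциями, генерирует отпечатки Моргана (Morgan fingerprints) для аминов и альдегидов, снижает размерность с помощью UMAP, кластеризует полученное вложение с помощью HDBSCAN, визуализирует кластеры и считает среднюю успешность реакций по кластерам.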

In [None]:
# Импорты
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem, DataStructs
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
import umap
import hdbscan


In [None]:
# Load the reaction CSV through the Google Colab upload widget
from google.colab import files
print("Upload your CSV with columns: AMINE, ALDEHYDE, SUCCESS")
uploaded = files.upload()
if not uploaded:
    # NOTE(review): presumably the mapping is empty when the upload dialog is
    # cancelled; fail with a clear message instead of an opaque IndexError.
    raise ValueError("No file was uploaded; re-run this cell and select a CSV file.")

# Only the first uploaded file is used; any additional files are ignored.
fname = next(iter(uploaded))
df = pd.read_csv(fname)

# Fail early if the columns that the later cells index by name are absent.
missing_columns = {"AMINE", "ALDEHYDE", "SUCCESS"} - set(df.columns)
if missing_columns:
    raise ValueError(
        f"{fname} is missing required column(s): {sorted(missing_columns)}"
    )

# Last expression of the cell: shows the first rows as the cell output.
df.head()

In [None]:
# Helpers for SMILES parsing and fingerprint generation
def mol_from_smiles(smi):
    """Parse a SMILES string with ``Chem.MolFromSmiles``.

    Args:
        smi: SMILES string. Non-string values (for example the float NaN that
            pandas produces for empty CSV cells) are treated as invalid.

    Returns:
        Whatever ``Chem.MolFromSmiles`` returns for a string input, or
        ``None`` when the input is not a string or the parser raises.
    """
    # Reject non-string input explicitly rather than relying on the parser
    # raising and the exception being swallowed.
    if not isinstance(smi, str):
        return None
    try:
        return Chem.MolFromSmiles(smi)
    except Exception:
        # Best-effort parsing is kept, but unlike a bare ``except`` this lets
        # KeyboardInterrupt / SystemExit propagate.
        return None

def fp_array_from_mol(mol, nBits=1024, radius=2):
    """Return the Morgan bit-vector fingerprint of ``mol`` as a NumPy array.

    Args:
        mol: Molecule object accepted by
            ``AllChem.GetMorganFingerprintAsBitVect``, or ``None``.
        nBits: Length of the fingerprint / returned array.
        radius: Radius passed to the Morgan fingerprint call.

    Returns:
        1-D integer array of length ``nBits``. When ``mol`` is ``None`` the
        array is all zeros.
    """
    bits = np.zeros((nBits,), dtype=int)
    if mol is not None:
        bit_vect = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits)
        # Copies the bit vector into the pre-allocated array in place.
        DataStructs.ConvertToNumpyArray(bit_vect, bits)
    return bits

def smiles_to_fp(smi):
    """Convert a SMILES string to a fingerprint array.

    Parses ``smi`` with ``mol_from_smiles`` and passes the result to
    ``fp_array_from_mol`` using that function's default settings.
    """
    return fp_array_from_mol(mol_from_smiles(smi))

In [None]:
# Generate fingerprints for amines and aldehydes.
# Parse each SMILES once so that validity can be checked explicitly:
# mol_from_smiles returns None for input it cannot parse.
amine_mols = df['AMINE'].apply(mol_from_smiles)
aldehyde_mols = df['ALDEHYDE'].apply(mol_from_smiles)

# Drop rows with invalid SMILES.
# Filtering must happen on the parsed molecules: the fingerprint columns always
# contain arrays (fp_array_from_mol returns an all-zero vector for None), so a
# dropna() on those columns never removes anything.
valid_rows = amine_mols.notna() & aldehyde_mols.notna()
n_dropped = int((~valid_rows).sum())
if n_dropped:
    print(f"Dropping {n_dropped} row(s) with invalid AMINE/ALDEHYDE SMILES")

df = df.loc[valid_rows].copy()
# Index is still the original one here, so the assignments align row by row.
df['AMINE_FP'] = amine_mols.loc[valid_rows].apply(fp_array_from_mol)
df['ALDEHYDE_FP'] = aldehyde_mols.loc[valid_rows].apply(fp_array_from_mol)
df = df.reset_index(drop=True)

In [None]:
# Concatenate the amine and aldehyde fingerprints of each reaction into one
# vector, then stack all vectors into the feature matrix X (one row per reaction).
df['COMBO_FP'] = [
    np.concatenate(fp_pair)
    for fp_pair in zip(df['AMINE_FP'], df['ALDEHYDE_FP'])
]
X = np.stack(df['COMBO_FP'].to_numpy())

In [None]:
# Standardize the features, then embed them with UMAP
scaler = StandardScaler()
X_scaled = scaler.fit(X).transform(X)

reducer = umap.UMAP(
    n_neighbors=15,
    min_dist=0.1,
    random_state=42,  # fixed seed, same value on every run
)
X_umap = reducer.fit_transform(X_scaled)

# Keep the first two embedding coordinates next to the reaction data.
df['UMAP1'], df['UMAP2'] = X_umap[:, 0], X_umap[:, 1]

In [None]:
# HDBSCAN clustering on the UMAP embedding; labels are stored per reaction
clusterer = hdbscan.HDBSCAN(min_samples=5, min_cluster_size=8)
clusterer.fit(X_umap)
df['CLUSTER'] = clusterer.labels_

In [None]:
# Scatter plot of the UMAP embedding, colored by cluster label
plt.figure(figsize=(8, 6))
sns.scatterplot(
    x='UMAP1',
    y='UMAP2',
    hue='CLUSTER',
    palette='tab10',
    data=df,
).set_title('Clusters of Reagents')  # scatterplot returns the Axes it drew on
plt.show()

In [None]:
# Reaction success per cluster: number of reactions and mean SUCCESS
cluster_count = df.groupby('CLUSTER').size()
cluster_success = df.groupby('CLUSTER')['SUCCESS'].mean()

# Last expression of the cell: the summary table is shown as the cell output.
cluster_count.to_frame('n_reactions').assign(avg_success=cluster_success)