# LibraryA Reaction Analysis Notebook

Анализируем вашу библиотеку реакций:
1. Наиболее успешные амины
2. Наиболее успешные альдегиды
3. Кластеры пар реагентов (амин + альдегид) по структуре и средний success rate в каждом кластере

In [None]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem, DataStructs
import umap
import hdbscan
import matplotlib.pyplot as plt
import seaborn as sns
from io import StringIO
from ipywidgets import FileUpload
sns.set(style='whitegrid')

In [None]:
# Load the reaction CSV from the local machine through a FileUpload widget
upload = FileUpload(accept='.csv', multiple=False)
display(upload)


def _read_uploaded_csv(value):
    """Parse the first file held by a FileUpload ``value`` into a DataFrame.

    Accepts both layouts the widget has used: a dict keyed by file name
    (ipywidgets 7) and a tuple of per-file dicts (ipywidgets 8). The
    SUCCESS column is coerced to numeric and rows where that fails are
    dropped.
    """
    if isinstance(value, dict):
        entry = next(iter(value.values()))
    else:
        entry = value[0]
    # ipywidgets 8 hands the content over as a memoryview, which has no
    # .decode(); bytes() is a no-op for the plain bytes of ipywidgets 7.
    text = bytes(entry['content']).decode('utf-8')
    # sep=None with the python engine lets pandas sniff the delimiter
    frame = pd.read_csv(StringIO(text), sep=None, engine='python')
    frame['SUCCESS'] = pd.to_numeric(frame['SUCCESS'], errors='coerce')
    return frame.dropna(subset=['SUCCESS']).reset_index(drop=True)


def _on_upload(change):
    """Rebuild the global ``df`` whenever a file is selected in the widget."""
    global df
    if change['new']:
        df = _read_uploaded_csv(change['new'])


# The widget is still empty while this cell runs, so the data can only be
# read after the user picks a file: load it from the value-change callback.
upload.observe(_on_upload, names='value')

# Kept for the case where the widget already holds a file at this point.
if upload.value:
    df = _read_uploaded_csv(upload.value)
    # A bare df.head() inside an if block is not shown; display it explicitly
    display(df.head())

## Функции для генерации Morgan fingerprints

In [None]:
def mol_from_smiles(smi):
    """Parse a SMILES string with ``Chem.MolFromSmiles``.

    Returns whatever ``Chem.MolFromSmiles`` returns for string input, and
    ``None`` when ``smi`` is not a string (for example a NaN coming from an
    empty CSV cell) or when the parser raises.
    """
    # Missing values are not SMILES; answer None without calling the parser.
    if not isinstance(smi, str):
        return None
    try:
        return Chem.MolFromSmiles(smi)
    except Exception:
        # Parsing stays best-effort, but unlike a bare ``except`` this lets
        # KeyboardInterrupt and SystemExit propagate.
        return None

def fp_array_from_mol(mol, nBits=1024, radius=2):
    """Return the Morgan bit fingerprint of ``mol`` as a 1-D integer array.

    The array has ``nBits`` entries. When ``mol`` is ``None`` no fingerprint
    is computed and the array is all zeros.
    """
    bits = np.zeros((nBits,), dtype=int)
    if mol is not None:
        bit_vect = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits)
        # Copies the bit vector into the preallocated NumPy buffer
        DataStructs.ConvertToNumpyArray(bit_vect, bits)
    return bits

def smiles_to_fp(smi):
    """Convert a SMILES string to its fingerprint array.

    Chains ``mol_from_smiles`` and ``fp_array_from_mol`` with the latter's
    default parameters.
    """
    return fp_array_from_mol(mol_from_smiles(smi))

In [None]:
# Fingerprint generation
# Parse every SMILES first: fp_array_from_mol returns an all-zero array for a
# molecule that failed to parse, so a dropna() on the fingerprint columns can
# never remove such rows. Filter on the parsed molecules instead.
amine_mols = df['AMINE'].apply(mol_from_smiles)
aldehyde_mols = df['ALDEHYDE'].apply(mol_from_smiles)
parsed = amine_mols.notna() & aldehyde_mols.notna()

df = df[parsed].reset_index(drop=True)
if df.empty:
    raise ValueError("No rows with parseable AMINE and ALDEHYDE SMILES")

# Re-index the molecule series the same way as df so the rows stay aligned
df['AMINE_FP'] = amine_mols[parsed].reset_index(drop=True).apply(fp_array_from_mol)
df['ALDEHYDE_FP'] = aldehyde_mols[parsed].reset_index(drop=True).apply(fp_array_from_mol)

# Combine both fingerprints into one feature vector per reaction
df['COMBO_FP'] = df.apply(lambda row: np.concatenate([row['AMINE_FP'], row['ALDEHYDE_FP']]), axis=1)
X = np.stack(df['COMBO_FP'].values)

## UMAP + HDBSCAN кластеризация

In [None]:
# Standardise the fingerprint matrix, then embed it with UMAP
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

reducer = umap.UMAP(n_neighbors=15, min_dist=0.1, random_state=42)
X_umap = reducer.fit_transform(X_scaled)
df['UMAP1'], df['UMAP2'] = X_umap[:, 0], X_umap[:, 1]

# HDBSCAN clustering on the UMAP embedding
clusterer = hdbscan.HDBSCAN(min_cluster_size=8, min_samples=5)
df['CLUSTER'] = clusterer.fit_predict(X_umap)

# Scatter plot of the embedding coloured by cluster label
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(data=df, x='UMAP1', y='UMAP2', hue='CLUSTER', palette='tab10', ax=ax)
ax.set_title('UMAP + HDBSCAN Clusters')
plt.show()

## Наиболее успешные амины и альдегиды

In [None]:
# Mean SUCCESS per amine, highest first
amine_means = df.groupby('AMINE')['SUCCESS'].mean()
amine_success = amine_means.sort_values(ascending=False)
print("Топ-10 аминов по успеху реакции:")
display(amine_success.iloc[:10])

# Mean SUCCESS per aldehyde, highest first
aldehyde_means = df.groupby('ALDEHYDE')['SUCCESS'].mean()
aldehyde_success = aldehyde_means.sort_values(ascending=False)
print("Топ-10 альдегидов по успеху реакции:")
display(aldehyde_success.iloc[:10])

## Кластеры и success rate

In [None]:
# Reaction count and mean SUCCESS for every cluster label, best clusters first
success_by_cluster = df.groupby('CLUSTER')['SUCCESS']
cluster_stats = success_by_cluster.agg(n_reactions='count', avg_success='mean')
cluster_stats = cluster_stats.sort_values(by='avg_success', ascending=False)
print("Статистика по кластерам:")
display(cluster_stats)