# 🧬 AI-Assisted Drug Design: Presentation & Analysis

This notebook provides visualizations and analysis to demonstrate the effectiveness of our AI model in screening potential drug candidates for the Beta-2 Adrenergic Receptor (ADRB2).

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from rdkit import Chem
from rdkit.Chem import Draw, AllChem
from sklearn.decomposition import PCA
import numpy as np
import os

# Plot defaults shared by every figure in this notebook
sns.set_style("whitegrid")
plt.rc('figure', figsize=(10, 6))

## 1. 📊 Model Performance: AI vs. Docking

We compare the predictions from our AI model (pChEMBL affinity) against molecular docking scores (kcal/mol) for the top candidates. Because a higher pChEMBL value and a lower (more negative) docking score both indicate better binding, a strong *negative* correlation between the two indicates that the AI model effectively prioritizes high-affinity binders.

In [None]:
# Location of the report produced by the screening run
results_path = "../results/screening_run/final_screening_report.csv"

if not os.path.exists(results_path):
    print("Results file not found. Please run the virtual screening pipeline first.")
else:
    df = pd.read_csv(results_path)
    print(f"Loaded results for {len(df)} top candidates.")

    # Column keys used for both axes and the point labels
    x_col = 'ai_pchembl_value'
    y_col = 'docking_score_kcal_mol'

    # One marker per row of the report, coloured by its docking score
    plt.figure(figsize=(8, 6))
    sns.scatterplot(data=df, x=x_col, y=y_col, s=100, hue=y_col, palette='viridis')

    # Write each molecule's name just to the right of its marker
    for _, candidate in df.iterrows():
        plt.text(
            candidate[x_col] + 0.02,
            candidate[y_col],
            candidate['molecule_name_x'],
            fontsize=9,
        )

    plt.title("AI Prediction vs. Docking Score for Top Candidates", fontsize=14)
    plt.xlabel("AI Predicted pChEMBL (Higher is Better)", fontsize=12)
    plt.ylabel("Docking Score (kcal/mol) (Lower is Better)", fontsize=12)
    plt.grid(True, linestyle='--', alpha=0.7)
    plt.show()

## 2. 🧪 Top Hits: Chemical Structures

Visualizing the 2D structures of our top-ranked drug candidates.

In [None]:
# Draw the five best-docking candidates (lowest docking score first) as a grid
# of 2D structures. Relies on `df` and `results_path` from the results-loading
# cell above.
if os.path.exists(results_path):
    top_hits = df.sort_values('docking_score_kcal_mol').head(5)

    # Build molecules and legends together so they stay aligned when a row
    # has to be dropped.
    mols, legends, skipped = [], [], []
    for _, row in top_hits.iterrows():
        smiles = row['canonical_smiles']
        # An empty CSV cell is read as NaN (a float), which the SMILES parser
        # rejects with an exception, so only strings are passed to it.
        mol = Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None
        if mol is None:
            skipped.append(row['molecule_name_x'])
            continue
        mols.append(mol)
        legends.append(f"{row['molecule_name_x']}\nDock: {row['docking_score_kcal_mol']} kcal/mol")

    if skipped:
        print(f"Skipped {len(skipped)} hit(s) with missing or unparsable SMILES: {skipped}")

    if mols:
        img = Draw.MolsToGridImage(mols, molsPerRow=5, subImgSize=(200, 200), legends=legends)
        display(img)
    else:
        # Empty report, or no row had a usable structure
        print("No results to display.")
else:
    print("No results to display.")

## 3. 🌌 Chemical Space Exploration

Here we visualize where our screened molecules (blue) lie in chemical space compared to the known active molecules used for training (light gray). This shows the diversity of our screening library.

In [None]:
# Input CSVs for the chemical-space comparison: the training set and the
# screening library. Both paths are relative, so they resolve against the
# notebook's working directory. The cell below reads the 'canonical_smiles'
# column from each file.
train_data_path = "../data/raw/chembl_adrb2.csv"
screening_data_path = "../data/new_molecules/screening_set.csv"

def get_fingerprints(smiles_list, radius=2, n_bits=1024):
    """Compute Morgan fingerprint bit vectors for the usable SMILES in a list.

    Args:
        smiles_list: Iterable of SMILES strings. Non-string entries (for
            example the NaN pandas yields for an empty CSV cell) and strings
            RDKit cannot parse are skipped.
        radius: Morgan fingerprint radius. Defaults to 2.
        n_bits: Length of each fingerprint bit vector. Defaults to 1024.

    Returns:
        A tuple ``(fps, valid_indices)``. ``fps`` is a NumPy array built from
        the fingerprints; when no entry could be fingerprinted it is an empty
        array of shape ``(0, n_bits)`` so it can still be stacked with other
        fingerprint arrays. ``valid_indices`` holds the positions in
        ``smiles_list`` that produced a fingerprint, in the same order as the
        fingerprints.
    """
    fps = []
    valid_indices = []
    for i, s in enumerate(smiles_list):
        # Guard before parsing: the SMILES parser raises on non-string input.
        if not isinstance(s, str):
            continue
        m = Chem.MolFromSmiles(s)
        if m is None:
            continue
        fps.append(AllChem.GetMorganFingerprintAsBitVect(m, radius, nBits=n_bits))
        valid_indices.append(i)

    if not fps:
        # np.array([]) would be 1-D with shape (0,), which cannot be vstacked
        # with a (n, n_bits) array.
        return np.empty((0, n_bits), dtype=int), valid_indices
    return np.array(fps), valid_indices

# Project training and screening molecules into a shared 2D PCA space of their
# Morgan fingerprints and plot them, marking the docked top hits when the
# screening report is available.
if os.path.exists(train_data_path) and os.path.exists(screening_data_path):
    # Load data
    df_train = pd.read_csv(train_data_path)
    df_screen = pd.read_csv(screening_data_path)

    # Generate fingerprints. The screening set's valid indices are kept so
    # rows of the fingerprint matrix can be mapped back to rows of df_screen
    # (the two differ whenever a SMILES is skipped).
    X_train, _ = get_fingerprints(df_train['canonical_smiles'].tolist())
    X_screen, screen_valid_idx = get_fingerprints(df_screen['canonical_smiles'].tolist())

    if len(X_train) == 0 or len(X_screen) == 0:
        # Nothing to stack or fit; avoid a shape error in vstack / PCA.
        print("No valid molecules to plot.")
    else:
        # PCA is fitted on both sets together so they share the same axes
        pca = PCA(n_components=2)
        X_combined = np.vstack([X_train, X_screen])
        X_pca = pca.fit_transform(X_combined)

        # Split back: training rows come first, screening rows after them
        X_pca_train = X_pca[:len(X_train)]
        X_pca_screen = X_pca[len(X_train):]

        # Plot
        plt.figure(figsize=(10, 8))
        plt.scatter(X_pca_train[:, 0], X_pca_train[:, 1], alpha=0.5, label='Training Data (ChEMBL)', c='lightgray', s=30)
        plt.scatter(X_pca_screen[:, 0], X_pca_screen[:, 1], alpha=0.8, label='Screening Library', c='blue', s=50)

        # Highlight top hits (uses `df` from the results-loading cell)
        if os.path.exists(results_path):
            # Match report rows to screening rows by exact SMILES string
            top_smiles = set(df['canonical_smiles'])
            top_indices = [i for i, s in enumerate(df_screen['canonical_smiles']) if s in top_smiles]

            # Translate df_screen row numbers into rows of X_pca_screen;
            # rows whose SMILES produced no fingerprint have no PCA point.
            pca_row_of = {orig: pos for pos, orig in enumerate(screen_valid_idx)}
            top_rows = [pca_row_of[i] for i in top_indices if i in pca_row_of]

            if top_rows:
                plt.scatter(
                    X_pca_screen[top_rows, 0],
                    X_pca_screen[top_rows, 1],
                    marker='*',
                    s=250,
                    c='red',
                    edgecolors='black',
                    label='Top Hits',
                    zorder=3,  # draw above the library points
                )

        plt.title("Chemical Space Visualization (PCA of Morgan Fingerprints)", fontsize=14)
        plt.xlabel("Principal Component 1")
        plt.ylabel("Principal Component 2")
        plt.legend()
        plt.show()
else:
    print("Data files not found.")

## 4. 📝 Summary Report

Key metrics for the top drug candidate identified by our pipeline.

In [None]:
# Print the headline numbers for the best-docking candidate (lowest docking
# score). Relies on `df` and `results_path` from the results-loading cell.
if os.path.exists(results_path):
    ranked = df.sort_values('docking_score_kcal_mol')
    if ranked.empty:
        # A report with a header but no rows: iloc[0] would raise IndexError
        print("No results available.")
    else:
        best_hit = ranked.iloc[0]
        print("🏆 Top Drug Candidate:")
        print(f"   Name: {best_hit['molecule_name_x']}")
        print(f"   Docking Score: {best_hit['docking_score_kcal_mol']} kcal/mol (Strong Binding)")
        print(f"   AI Predicted Affinity: {best_hit['ai_pchembl_value']:.2f} pChEMBL")
        print(f"   Structure (SMILES): {best_hit['canonical_smiles']}")
else:
    print("No results available.")