In [None]:
import pandas as pd
# Draw PCA plot

from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np


def plot_pca(data, n_components=2, feature_cols=None, color_col=None, title='PCA Plot'):
    """
    Fit a PCA on selected columns of ``data`` and scatter-plot the projection.

    Rows with a missing value in any feature column are dropped before the
    fit. Rows are tracked by position, so ``data`` may carry repeated index
    labels (e.g. after ``pd.concat`` without ``ignore_index``) and the colour
    values still line up one-to-one with the plotted points.

    Parameters:
    -----------
    data : pandas.DataFrame
        Input dataframe
    n_components : int, default=2
        Number of principal components. A scatter is drawn only for 2 (2-D
        axes) or 3 (3-D axes); for any other value the axes are left empty
        and only the title is set.
    feature_cols : list, optional
        List of column names to use as features. If None, uses all numeric
        columns, excluding ``color_col`` when it is one of them.
    color_col : str, optional
        Column name to use for color coding points. Its values are passed to
        matplotlib as ``c=`` with the 'viridis' colormap. Ignored when it is
        not a column of ``data``.
    title : str, default='PCA Plot'
        Title for the plot

    Returns:
    --------
    pca : PCA object
        Fitted PCA transformer
    transformed_data : numpy.ndarray
        Transformed data in PCA space, one row per complete input row
    """
    # Select features
    if feature_cols is None:
        feature_cols = data.select_dtypes(include=[np.number]).columns.tolist()
        if color_col and color_col in feature_cols:
            feature_cols.remove(color_col)

    features = data[feature_cols]
    # Positional boolean mask of rows without missing features. Selecting by
    # label (``data.loc[X.index]``) would return every row sharing a label and
    # so produce more colour values than points when the index has duplicates.
    complete_rows = features.notna().all(axis=1).to_numpy()
    X = features.loc[complete_rows]

    # Perform PCA
    pca = PCA(n_components=n_components)
    transformed_data = pca.fit_transform(X)

    # Create exactly one figure; the 3-D case needs its own projection.
    draw_3d = n_components == 3
    if draw_3d:
        # Kept for older matplotlib releases, where this import is what
        # registers the '3d' projection.
        from mpl_toolkits.mplot3d import Axes3D  # noqa: F401
        fig = plt.figure(figsize=(12, 10))
        ax = fig.add_subplot(111, projection='3d')
    else:
        fig, ax = plt.subplots(figsize=(10, 8))

    if draw_3d or n_components == 2:
        n_axes = 3 if draw_3d else 2
        coords = [transformed_data[:, axis] for axis in range(n_axes)]
        variance = pca.explained_variance_ratio_

        if color_col and color_col in data.columns:
            colors = data.loc[complete_rows, color_col]
            scatter = ax.scatter(*coords, c=colors, cmap='viridis', alpha=0.6)
            plt.colorbar(scatter, ax=ax, label=color_col)
        else:
            ax.scatter(*coords, alpha=0.6)

        ax.set_xlabel(f'PC1 ({variance[0]:.2%} variance)')
        ax.set_ylabel(f'PC2 ({variance[1]:.2%} variance)')
        if draw_3d:
            ax.set_zlabel(f'PC3 ({variance[2]:.2%} variance)')

    ax.set_title(title)
    plt.tight_layout()
    plt.show()

    # Print explained variance
    print(f"Total variance explained: {pca.explained_variance_ratio_.sum():.2%}")
    for i, var in enumerate(pca.explained_variance_ratio_):
        print(f"PC{i + 1}: {var:.2%}")

    return pca, transformed_data


In [28]:
def _read_t12_exports(csv_paths):
    """Read ';'-separated CSV exports and stack them into one DataFrame.

    ``ignore_index=True`` gives the result a fresh 0..n-1 index. Without it
    every file keeps the row labels it was read with, so labels repeat across
    files and label-based selection (``.loc``) can match more than one row.
    """
    return pd.concat(
        [pd.read_csv(csv_path, sep=';') for csv_path in csv_paths],
        ignore_index=True,
    )


# Half-life (T1/2) exports, one frame per species.
human_t12 = _read_t12_exports([
    '../data/admet_transfer/liver/human_liver/property_T1.2/DOWNLOAD-0W2aR9HOKc-e6jQguSiw6Da3-xgRgX3_uFeK3mjAFnU_eq_.csv',
    '../data/admet_transfer/liver/human_liver/property_T1.2/DOWNLOAD-C5ZUF5VlzwrTyQGo_VzTqCsTljdxv6Mj4lqs2ZUOSKQ_eq_.csv',
])

rat_t12 = _read_t12_exports([
    '../data/admet_transfer/liver/rat_liver/property_T1.2/DOWNLOAD-U3s1LO542aSgGMcS9uc2G38-UMz1FIlZL_ptLVDBaUA_eq_.csv',
    '../data/admet_transfer/liver/rat_liver/property_T1.2/DOWNLOAD-UWN-ICI0Cnm2tn2RZCmzlAKx0Zcd_7pS6Ui6ZC8tNr4_eq_.csv',
])

mouse_t12 = _read_t12_exports([
    '../data/admet_transfer/liver/mouse_liver/property_T1.2/DOWNLOAD-C6jc_zSTvXC8ADTTm2jX14YCnEKSjAAP7LURqk645-4_eq_.csv',
])

In [36]:
import rdkit.Chem as Chem
from rdkit.Chem.SaltRemover import SaltRemover
from rdkit.Chem.MolStandardize.rdMolStandardize import Uncharger
from typing import List
from rdkit import RDLogger
import logging

def clean_smiles(smiles_list: List[str]) -> List[str | None]:
    """Remove invalid SMILES from a list of SMILES strings, strip salts, and remove duplicates."""
    un = Uncharger()
    salt_remover = SaltRemover()
    cleaned_smiles = []

    for smiles in smiles_list:
        mol = Chem.MolFromSmiles(smiles)
        if mol is not None:
            # Remove salts
            mol = salt_remover.StripMol(mol)
            # Uncharge the molecule
            mol = un.uncharge(mol)
            # Convert back to SMILES
            cleaned_smiles.append(Chem.MolToSmiles(mol))
        else:
            logging.debug(f"Invalid SMILES in the dataset: {smiles}")
            cleaned_smiles.append(None)

    return cleaned_smiles

# Standardise the 'Smiles' column of each species table in place
# (human, then rat, then mouse).
for t12_frame in (human_t12, rat_t12, mouse_t12):
    t12_frame['Smiles'] = clean_smiles(t12_frame['Smiles'])

In [38]:
# Show the human T1/2 table as the cell's rich output; pandas elides the
# middle rows and columns of a large frame in the rendered view.
human_t12

Unnamed: 0,Molecule ChEMBL ID,Molecule Name,Molecule Max Phase,Molecular Weight,#RO5 Violations,AlogP,Compound Key,Smiles,Standard Type,Standard Relation,...,Document ChEMBL ID,Source ID,Source Description,Document Journal,Document Year,Cell ChEMBL ID,Properties,Action Type,Standard Text Value,Value
0,CHEMBL1587877,,,305.36,0.0,2.28,IND116071,COc1ccc(S(=O)(=O)N2CCOc3ccccc32)cc1,T1/2,'=',...,CHEMBL3120106,1,Scientific Literature,Bioorg Med Chem,2014,,,,,2.00
1,CHEMBL3235780,,,396.72,0.0,3.50,26f,Cc1ccc2c(c1)n(C)c(=N)n2CCOc1ccc(Cl)cc1,T1/2,'>',...,CHEMBL3232983,1,Scientific Literature,Eur J Med Chem,2014,,,,,60.00
2,CHEMBL3353869,,,335.45,0.0,2.80,10,CC(C)(C)c1cc(NC(=O)[C@@H]2CCCN2CC2CCOCC2)no1,T1/2,'=',...,CHEMBL3352390,1,Scientific Literature,Bioorg Med Chem Lett,2015,,,,,13.00
3,CHEMBL3353878,,,335.45,0.0,2.68,19,CC(C)(C)c1cc(NC(=O)[C@@H]2C[C@@H](O)CN2C2CCCCC...,T1/2,'=',...,CHEMBL3352390,1,Scientific Literature,Bioorg Med Chem Lett,2015,,,,,88.00
4,CHEMBL3353879,,,363.85,0.0,3.20,20,CC(C)(C)c1cc(NC(=O)[C@@H]2C[C@@H](O)CN2c2ccc(C...,T1/2,'=',...,CHEMBL3352390,1,Scientific Literature,Bioorg Med Chem Lett,2015,,,,,35.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3482,CHEMBL45,MELATONIN,4.0,232.28,0.0,1.86,Melatonin,COc1ccc2[nH]cc(CCNC(C)=O)c2c1,T1/2,'=',...,CHEMBL2321713,1,Scientific Literature,Bioorg Med Chem,2013,,,,,73.20
3483,CHEMBL17157,TERFENADINE,4.0,471.69,1.0,6.45,terfenadine,CC(C)(C)c1ccc(C(O)CCCN2CCC(C(O)(c3ccccc3)c3ccc...,T1/2,'=',...,CHEMBL2057155,1,Scientific Literature,Bioorg Med Chem Lett,2012,,,,,29.87
3484,CHEMBL2408322,,,324.27,0.0,3.27,39,O=C(Nc1ccc(F)c(-c2nc3ncccc3o2)c1)c1cnco1,T1/2,'=',...,CHEMBL2407056,1,Scientific Literature,Eur J Med Chem,2013,,,,,124.00
3485,CHEMBL2325503,,,529.97,1.0,4.48,"66, G007-LK",CS(=O)(=O)c1ccc(-c2nnc(/C=C/c3nnc(-c4ccc(C#N)c...,T1/2,'=',...,CHEMBL2321812,1,Scientific Literature,J Med Chem,2013,,,,,101.00
