In [1]:
import pandas as pd 
import numpy as np 
import pickle 

In [2]:
# Load the pickled object that the following cells inspect through `data.assay_dic`.
# NOTE(review): this is an absolute, machine-specific path, so the cell only runs on
# the original author's machine; consider a configurable data directory.
# NOTE(review): pickle.load can execute arbitrary code while deserialising; only
# load files that come from a trusted source.
with open('/home/warmachine/Desktop/Data_for_publication/fsmol/all_data_fp_.pkl', 'rb') as f:
    data = pickle.load(f)

In [5]:
# Show the `smiles` attribute of the first experiment of assay CHEMBL1243966.
data.assay_dic['CHEMBL1243966'].experiments[0].smiles

'CC(C)n1nc(-c2cc(O)cc(Br)c2)c2c(N)ncnc21'

In [6]:
# Show the `cpd_id` attribute of the same experiment.
# NOTE(review): the saved output is an int32 array of 0/1 values, which looks like a
# fingerprint rather than an identifier; confirm what `cpd_id` actually stores.
data.assay_dic['CHEMBL1243966'].experiments[0].cpd_id

array([0, 1, 0, ..., 0, 0, 0], dtype=int32)

In [29]:
import numpy as np
import pandas as pd
from rdkit import Chem
from rdkit.Chem import MACCSkeys
from typing import Union, List, Dict
import warnings
from tqdm import tqdm

class MACCSFingerprints:
    """
    Helper for turning SMILES strings into MACCS-key fingerprints (via RDKit)
    and for concatenating them with pre-computed ECFP fingerprints.
    """

    def __init__(self):
        """
        Initialize the MACCS fingerprint generator
        """
        self.maccs_length = 167  # MACCS keys are always 167 bits

    def sanitize_smiles(self, smiles: str) -> Union[str, None]:
        """
        Sanitize SMILES and return canonical SMILES

        Args:
            smiles (str): Input SMILES string

        Returns:
            str or None: Canonical SMILES if valid, None if invalid
        """
        mol = self.generate_mol_from_smiles(smiles)
        if mol is None:
            return None
        try:
            return Chem.MolToSmiles(mol, canonical=True)
        except Exception:
            # Only ordinary errors mean "invalid"; KeyboardInterrupt and
            # SystemExit must propagate so a long run can still be stopped.
            return None

    def generate_mol_from_smiles(self, smiles: str) -> Union["Chem.Mol", None]:
        """
        Generate RDKit molecule object from SMILES

        Args:
            smiles (str): Input SMILES string

        Returns:
            RDKit Mol object or None (None when the SMILES cannot be parsed
            or the parser raises, e.g. for a non-string input)
        """
        try:
            # MolFromSmiles itself returns None for unparsable input.
            return Chem.MolFromSmiles(smiles)
        except Exception:
            # Deliberately not a bare ``except``: Ctrl-C must not be reported
            # as an invalid SMILES.
            return None

    def generate_maccs(self, mol: "Chem.Mol") -> np.ndarray:
        """
        Generate MACCS fingerprint

        Args:
            mol (RDKit Mol): RDKit molecule object

        Returns:
            np.ndarray: MACCS fingerprint as an integer numpy array of 0/1 values
        """
        maccs = MACCSkeys.GenMACCSKeys(mol)
        # The bit string is a sequence of '0'/'1' characters; one array entry per bit.
        return np.array(list(maccs.ToBitString())).astype(int)

    def generate_maccs_dict(self, smiles_dict: Dict[str, str]) -> Dict[str, np.ndarray]:
        """
        Generate MACCS fingerprints from dictionary of compound IDs and SMILES

        Compounds whose SMILES cannot be parsed, or whose fingerprint generation
        fails, are left out of the result; a single warning reports how many
        were skipped.

        Args:
            smiles_dict: Dictionary with compound IDs as keys and SMILES as values

        Returns:
            Dictionary with compound IDs as keys and MACCS fingerprints as values
        """
        results = {}
        invalid_smiles = []

        for cpd_id, smi in tqdm(smiles_dict.items(), desc="Generating MACCS fingerprints"):
            # Sanitize SMILES
            canonical_smiles = self.sanitize_smiles(smi)
            if canonical_smiles is None:
                invalid_smiles.append((cpd_id, smi))
                continue

            # Generate molecule from the canonical form
            mol = self.generate_mol_from_smiles(canonical_smiles)
            if mol is None:
                invalid_smiles.append((cpd_id, smi))
                continue

            # Generate fingerprint; a failure skips this compound only
            try:
                results[cpd_id] = self.generate_maccs(mol)
            except Exception:
                invalid_smiles.append((cpd_id, smi))

        # Report invalid SMILES
        if invalid_smiles:
            warnings.warn(f"Failed to process {len(invalid_smiles)} SMILES strings")

        return results

    def combine_with_ecfp(self, maccs_dict: Dict[str, np.ndarray],
                         ecfp_dict: Dict[str, np.ndarray]) -> Dict[str, np.ndarray]:
        """
        Combine MACCS fingerprints with existing ECFP dictionary

        Only compound IDs present in both dictionaries are combined. Each
        combined vector is the ECFP fingerprint followed by the MACCS
        fingerprint. The result follows the key order of ``maccs_dict`` so the
        output is the same on every run (iterating a set of string keys is not
        order-stable across interpreter sessions).

        Args:
            maccs_dict: Dictionary with compound IDs and MACCS fingerprints
            ecfp_dict: Dictionary with compound IDs and ECFP fingerprints

        Returns:
            Dictionary with compound IDs and combined fingerprints
        """
        combined_dict = {
            cpd_id: np.concatenate([ecfp_dict[cpd_id], maccs_fp])
            for cpd_id, maccs_fp in maccs_dict.items()
            if cpd_id in ecfp_dict
        }

        # IDs that appear in exactly one of the two inputs.
        unmatched = set(maccs_dict.keys()) ^ set(ecfp_dict.keys())

        print(f"Combined fingerprints for {len(combined_dict)} compounds")
        print(f"Missing compounds: {len(unmatched)}")

        return combined_dict

# Example usage
def main():
    """
    Small end-to-end demonstration of MACCSFingerprints.

    Builds MACCS fingerprints for four example SMILES (one deliberately
    invalid), concatenates them with placeholder 1024-element ECFP vectors,
    prints summary counts, and finally prints the fraction of matching
    positions between the first two combined fingerprints.
    """
    # Compound ID -> SMILES
    demo_smiles = {
        'CPD1': "CC(=O)OC1=CC=CC=C1C(=O)O",  # Aspirin
        'CPD2': "CN1C=NC2=C1C(=O)N(C(=O)N2C)C",  # Caffeine
        'CPD3': "CC12CCC3C(C1CCC2O)CCC4=CC(=O)CCC34C",  # Testosterone
        'CPD4': "INVALID_SMILES",  # Invalid SMILES
    }

    # Stand-in ECFP vectors; real values would come from an earlier step
    demo_ecfp = {
        'CPD1': np.zeros(1024),
        'CPD2': np.ones(1024),
        'CPD3': np.ones(1024),
    }

    generator = MACCSFingerprints()
    maccs_by_cpd = generator.generate_maccs_dict(demo_smiles)
    merged = generator.combine_with_ecfp(maccs_by_cpd, demo_ecfp)

    print("\nFingerprint Information:")
    print(f"Number of compounds with MACCS keys: {len(maccs_by_cpd)}")
    print(f"MACCS fingerprint length: {generator.maccs_length}")

    def fraction_matching(vec_a, vec_b):
        # Share of positions at which both vectors hold the same value
        return np.sum(vec_a == vec_b) / len(vec_a)

    # Compare the first two combined entries, if there are at least two
    ordered_ids = list(merged.keys())
    if len(ordered_ids) < 2:
        return

    first_id, second_id = ordered_ids[0], ordered_ids[1]
    score = fraction_matching(merged[first_id], merged[second_id])
    print(f"\nSimilarity between {first_id} and {second_id}: {score:.3f}")

# Run the demo when this code executes as the top-level module; the captured
# output below shows that this is the case when the notebook cell is run.
if __name__ == "__main__":
    main()

Generating MACCS fingerprints:   0%|          | 0/4 [00:00<?, ?it/s][10:09:26] SMILES Parse Error: syntax error while parsing: INVALID_SMILES
[10:09:26] SMILES Parse Error: Failed parsing SMILES 'INVALID_SMILES' for input: 'INVALID_SMILES'
Generating MACCS fingerprints: 100%|██████████| 4/4 [00:00<00:00, 673.08it/s]

Combined fingerprints for 3 compounds
Missing compounds: 0

Fingerprint Information:
Number of compounds with MACCS keys: 3
MACCS fingerprint length: 167

Similarity between CPD2 and CPD1: 0.107





In [31]:
# Example input: compound ID -> SMILES (the last entry is deliberately unparsable)
smiles_dict = {
    'CPD1': "CC(=O)OC1=CC=CC=C1C(=O)O",  # Aspirin
    'CPD2': "CN1C=NC2=C1C(=O)N(C(=O)N2C)C",  # Caffeine
    'CPD3': "CC12CCC3C(C1CCC2O)CCC4=CC(=O)CCC34C",  # Testosterone
    'CPD4': "INVALID_SMILES",  # Invalid SMILES
}

# Build the generator and compute one MACCS fingerprint per parsable compound
fp_gen = MACCSFingerprints()
maccs_dict = fp_gen.generate_maccs_dict(smiles_dict)

Generating MACCS fingerprints:   0%|          | 0/4 [00:00<?, ?it/s][10:10:21] SMILES Parse Error: syntax error while parsing: INVALID_SMILES
[10:10:21] SMILES Parse Error: Failed parsing SMILES 'INVALID_SMILES' for input: 'INVALID_SMILES'
Generating MACCS fingerprints: 100%|██████████| 4/4 [00:00<00:00, 315.97it/s]


In [32]:
# Display the full compound ID -> MACCS fingerprint mapping computed above.
# NOTE(review): this prints every array in full; a summary (e.g. keys and array
# shapes) would keep the notebook output short.
maccs_dict

{'CPD1': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0,
        0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0,
        1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0]),
 'CPD2': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0,
        0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0,
        1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 