In [None]:
!pip install jpype1

In [None]:
!pip install rdkit-pypi

In [None]:
from jpype import isJVMStarted, startJVM, getDefaultJVMPath, JPackage
# Path of the CDK jar that is put on the JVM class path.
cdk_path = '/kaggle/input/cdk-2-7-1/cdk-2.7.1.jar'

# Start the JVM only once per kernel; the guard makes re-running this cell safe.
if not isJVMStarted():
    startJVM(getDefaultJVMPath(), "-ea", "-Djava.class.path=%s" % cdk_path)
    print("JVMStarted is success")

# Bind `cdk` on every run of this cell. Previously it was assigned only inside
# the branch above, so if the JVM was already running when the cell executed,
# `cdk` stayed undefined and later cells failed with NameError.
cdk = JPackage('org').openscience.cdk

In [None]:
from rdkit import Chem
from rdkit.Chem import AllChem
import numpy as np

def featsmi(fp_type, smis, size=1024, depth=6):
    """Compute one fingerprint family for a sequence of SMILES strings.

    Parameters
    ----------
    fp_type : str
        Fingerprint family. CDK-backed keys are "AP2D", "CKD", "CKDExt",
        "CKDGraph", "MACCS", "PubChem", "Estate", "KR", "FP4", "FP4C",
        "Circle", "Hybrid" and "KRC". "RDKit" selects an RDKit Morgan
        fingerprint (radius 2, 2048 bits).
    smis : iterable of str
        SMILES strings, one molecule each.
    size, depth : int, optional
        Passed to the CDK "CKD", "CKDExt" and "CKDGraph" fingerprinters only.
        The "RDKit" branch always produces 2048 bits and ignores `size`.

    Returns
    -------
    numpy.ndarray
        2-D integer array with one row per SMILES and one column per
        fingerprint position. "KRC" and "FP4C" hold counts; all other
        families hold 0/1 bits. Empty input yields an array with zero rows.

    Raises
    ------
    KeyError
        If `fp_type` is not one of the keys listed above.
    ValueError
        If RDKit cannot parse a SMILES string ("RDKit" branch only).

    Notes
    -----
    Relies on the module-level names `cdk` (for every CDK family), `Chem`,
    `AllChem` (for "RDKit") and `np` being in scope when called.
    """
    # Width of the RDKit Morgan bit vector; kept at 2048 independently of `size`.
    rdkit_nbits = 2048

    if fp_type == "RDKit":
        rows = []
        for smi in smis:
            mol = Chem.MolFromSmiles(smi)
            if mol is None:
                raise ValueError(f"Invalid SMILES: {smi}")

            fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=rdkit_nbits)

            # Size the destination from the fingerprint width, not from `size`,
            # so the buffer matches the vector that is copied into it.
            feat = np.zeros((rdkit_nbits,), dtype=int)
            AllChem.DataStructs.ConvertToNumpyArray(fp, feat)
            rows.append(feat)

        if not rows:
            # No molecules: return a well-formed empty matrix instead of failing.
            return np.zeros((0, rdkit_nbits), dtype=int)
        # Stack once at the end instead of re-copying the matrix per molecule.
        return np.vstack(rows)

    # Factories rather than instances: only the requested fingerprinter is
    # constructed, and `cdk` is not touched until a CDK family is requested.
    factories = {
        "AP2D": lambda: cdk.fingerprint.AtomPairs2DFingerprinter(),
        "CKD": lambda: cdk.fingerprint.Fingerprinter(size, depth),
        "CKDExt": lambda: cdk.fingerprint.ExtendedFingerprinter(size, depth),
        "CKDGraph": lambda: cdk.fingerprint.GraphOnlyFingerprinter(size, depth),
        "MACCS": lambda: cdk.fingerprint.MACCSFingerprinter(),
        "PubChem": lambda: cdk.fingerprint.PubchemFingerprinter(
            cdk.silent.SilentChemObjectBuilder.getInstance()
        ),
        "Estate": lambda: cdk.fingerprint.EStateFingerprinter(),
        "KR": lambda: cdk.fingerprint.KlekotaRothFingerprinter(),
        "FP4": lambda: cdk.fingerprint.SubstructureFingerprinter(),
        "FP4C": lambda: cdk.fingerprint.SubstructureFingerprinter(),
        "Circle": lambda: cdk.fingerprint.CircularFingerprinter(),
        "Hybrid": lambda: cdk.fingerprint.HybridizationFingerprinter(),
        "KRC": lambda: cdk.fingerprint.KlekotaRothFingerprinter(),
    }
    fingerprinter = factories[fp_type]()  # KeyError for an unknown fp_type

    # "KRC" and "FP4C" are the count-based variants of "KR" and "FP4".
    use_counts = fp_type in ("KRC", "FP4C")

    # Loop-invariant objects are created once, outside the per-molecule loop.
    sp = cdk.smiles.SmilesParser(cdk.DefaultChemObjectBuilder.getInstance())
    nbit = int(fingerprinter.getSize())

    rows = []
    for smi in smis:
        mol = sp.parseSmiles(smi)
        if use_counts:
            fp = fingerprinter.getCountFingerprint(mol)
            rows.append([int(fp.getCount(k)) for k in range(nbit)])
        else:
            fp = fingerprinter.getFingerprint(mol)
            rows.append([int(fp.get(k)) for k in range(nbit)])

    # The explicit reshape keeps the result 2-D even when there are no rows.
    return np.array(rows, dtype=int).reshape(len(rows), nbit)


In [None]:
from rdkit import Chem
from rdkit.Chem import AllChem

#def featex(smis, y_all):
def featex(smis):
    """Compute all CDK fingerprint families for `smis` and fuse them column-wise.

    For each family, in the fixed order below, the fingerprint matrix is
    computed with `featsmi`, written to "<family>.csv" in the current working
    directory (with the row index included), and appended to the fused matrix.

    Parameters
    ----------
    smis : iterable of str
        SMILES strings, one molecule each.

    Returns
    -------
    allfeat_pos : numpy.ndarray
        All per-family matrices stacked horizontally, one row per SMILES.
    f : list of list of int
        For each family, the column indices it occupies in `allfeat_pos`.
    fname : list of str
        Family names, in the same order as `f`.

    Notes
    -----
    Relies on the module-level names `featsmi`, `pd` and `np` being in scope
    when called.
    """
    # Order matters: it fixes both the column layout of the fused matrix and
    # the order of the returned `f` / `fname` lists.
    fp_types = (
        "AP2D",
        "CKD",
        "CKDExt",
        "CKDGraph",
        "MACCS",
        "PubChem",
        "Estate",
        "KR",
        "FP4",
        "FP4C",
        "Circle",
        "Hybrid",
        "KRC",
    )

    fname = []
    blocks = []
    f = []
    before = 0
    for fp_type in fp_types:
        feat = featsmi(fp_type, smis)

        # Persist each family on its own, as soon as it has been computed.
        pd.DataFrame(feat).to_csv("%s.csv" % fp_type, index=True)

        fname.append(fp_type)
        blocks.append(feat)

        # Column range of this family inside the fused matrix.
        after = before + feat.shape[1]
        f.append(list(range(before, after)))
        before = after

    allfeat_pos = np.hstack(blocks)
    return allfeat_pos, f, fname

In [None]:
import pandas as pd
import numpy as np
# Read the input table; a 'SMILES' column is required.
df = pd.read_csv('file.csv')

# Pull the SMILES column out as an array, keeping the row order of the file.
smiles = df.loc[:, 'SMILES'].values

# Compute every fingerprint family and unpack: the fused feature matrix, the
# per-family column indices, and the family names.
(Xall, f, fname) = featex(smiles)