In [138]:
import numpy as np
import pandas as pd
import time

# rklearn
import sklearn
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold, cross_val_score
from sklearn.neural_network import MLPClassifier
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
from rdkit.Chem import AllChem

# rdkit
from rdkit import Chem
import rdkit.Chem.rdMolDescriptors as d
import rdkit.Chem.Fragments as f
import rdkit.Chem.Lipinski as l

In [139]:
from platform import python_version

print(f"Python version: {python_version()}")
print(f"NumPy version: {np.__version__}")
print(f"Pandas version: {pd.__version__}")

Python version: 3.9.13
NumPy version: 1.21.5
Pandas version: 1.4.4


# Features selection

In [149]:
class SMILEActive:
    def __init__(self):
        column_filter = None
        imputation = None
        one_hot = None
        labels = None
        model = None

    def featureExtraction(self, df):
        #Involve features:
        df_copy = df.copy()
        df_feature = df.copy()

        # Adding columns for desired feature
        # basic info
        df_feature["NumAtoms"] = np.nan
        df_feature["MolWt"] = np.nan
        df_feature["HeavyAtom"] = np.nan
        # specific features or fragment
        df_feature["AroRing"] = np.nan
        df_feature["AmideBond"] = np.nan
        df_feature["RotatableBond"] = np.nan
        df_feature["SaturatedRing"] = np.nan
        df_feature["AL_COO"] = np.nan
        df_feature["Benzene"] = np.nan

        # Loop through index, calculate and adding features to the feature dataframe
        for i in df_copy.index:
            m = Chem.MolFromSmiles(df_copy['SMILES'][i])
            numatoms = m.GetNumAtoms()  # numbers of atoms
            molwt = d.CalcExactMolWt(m)  # molecule's exact molecular weight
            heavyatom = l.HeavyAtomCount(m)  # Number of heavy atoms a molecule
            aroring = d.CalcNumAromaticRings(m)  # number of aromatic rings for a molecule, they are very stable and do not break apart easily
            amidebond = d.CalcNumAmideBonds(m)  # number of amide bonds in a molecule
            rotabond= d.CalcNumRotatableBonds(m)  # number of rotatable bonds for a molecule
            saturatedring = d.CalcNumSaturatedRings(m)  # returns the number of saturated rings for a molecule
            alcoo = f.fr_Al_COO(m)  # Number of aliphatic carboxylic acids
            benzene = f.fr_benzene(m) # Number of benzene rings

            df_feature.loc[i, 'NumAtoms'] = numatoms
            df_feature.loc[i, 'MolWt'] = molwt
            df_feature.loc[i, 'HeavyAtom'] = heavyatom
            df_feature.loc[i, 'AroRing'] = aroring
            df_feature.loc[i, 'AmideBond'] = amidebond
            df_feature.loc[i, 'RotatableBond'] = rotabond
            df_feature.loc[i, 'SaturatedRing'] = saturatedring
            df_feature.loc[i, 'AL_COO'] = alcoo
            df_feature.loc[i, 'Benzene'] = benzene


        # move ACTIVE to the end of the dataframe
        #tempcolumn = df_feature.columns.tolist()
        #activeindex = df_feature.columns.get_loc("ACTIVE")
        #newcolumn = tempcolumn[0:activeindex] + tempcolumn[activeindex + 1:] + tempcolumn[activeindex:activeindex + 1]
        #print(newcolumn)
        #df_feature = df_feature[newcolumn]

        return df_feature
    
    def featureCollection(self, df, option):

        #Involve features:
        # Chem
        # rdMolDescriptors:
        # Fragments:
        # Lipinski:
        df_copy = df.copy()
        df_featureCollection = df.copy() #make a copy of the DataFrame for the collection of

        if (option == "library"): #option 1: use original library to collect the features from it

            # Adding columns for desired feature
            # basic info
            df_featureCollection["NumAtoms"] = np.nan
            df_featureCollection["MolWt"] = np.nan
            df_featureCollection["HeavyAtom"] = np.nan
            # specific features or fragment
            df_featureCollection["AroRing"] = np.nan
            df_featureCollection["AmideBond"] = np.nan
            df_featureCollection["RotatableBond"] = np.nan
            df_featureCollection["SaturatedRing"] = np.nan
            df_featureCollection["AL_COO"] = np.nan
            df_featureCollection["Benzene"] = np.nan

            # Loop through index, calculate and adding features to the feature dataframe
            for i in df_copy.index:
                m = Chem.MolFromSmiles(df_copy['SMILES'][i]) # convert SMILES to molecule. SMILES string and m will be assigned an object representing the corresonding
                #chemical compound, for which various properties might be derived
                numatoms = m.GetNumAtoms()  # numbers of atoms
                molwt = d.CalcExactMolWt(m)  # molecule's exact molecular weight
                heavyatom = l.HeavyAtomCount(m)  # Number of heavy atoms a molecule
                aroring = d.CalcNumAromaticRings(m)  # number of aromatic rings for a molecule, they are very stable and do not break apart easily
                amidebond = d.CalcNumAmideBonds(m)  # number of amide bonds in a molecule
                rotabond= d.CalcNumRotatableBonds(m)  # number of rotatable bonds for a molecule
                saturatedring = d.CalcNumSaturatedRings(m)  # returns the number of saturated rings for a molecule
                alcoo = f.fr_Al_COO(m)  # Number of aliphatic carboxylic acids
                benzene = f.fr_benzene(m) # Number of benzene rings

                df_featureCollection.loc[i, 'NumAtoms'] = numatoms
                df_featureCollection.loc[i, 'MolWt'] = molwt
                df_featureCollection.loc[i, 'HeavyAtom'] = heavyatom
                df_featureCollection.loc[i, 'AroRing'] = aroring
                df_featureCollection.loc[i, 'AmideBond'] = amidebond
                df_featureCollection.loc[i, 'RotatableBond'] = rotabond
                df_featureCollection.loc[i, 'SaturatedRing'] = saturatedring
                df_featureCollection.loc[i, 'AL_COO'] = alcoo
                df_featureCollection.loc[i, 'Benzene'] = benzene

            # move ACTIVE to the end of the dataframe
            tempcolumn = df_featureCollection.columns.tolist()
            activeindex = df_featureCollection.columns.get_loc("ACTIVE")
            newcolumn = tempcolumn[0:activeindex] + tempcolumn[activeindex + 1:] + tempcolumn[activeindex:activeindex + 1]
            print(newcolumn)
            df_featureCollection = df_featureCollection[newcolumn]

            return df_featureCollection

        if (option == "fingerprints"): #option 2: use fingerprints

            # Adding columns for desired feature
            # specific features or fragment
            df_featureCollection["ecpc"] = np.nan
            df_featureCollection["rd_form"] = np.nan
            
            # fingerprints for properties of pharmacs
            ecfp_list = []
            for ecpc in range(124):
                df_featureCollection["ecpc" + str(ecpc)] = 0
                ecfp_list.append("ecpc" + str(ecpc))
            ecpc_x = df_featureCollection["rd_form"].apply(lambda x: np.array(AllChem.GetMorganFingerprintAsBitVect(x,2,nBits=124)))
            ecpc_vector_lists = list(itertools.chain(*ecpc_x))
            df_featureCollection.loc[:, ecfp_list] = np.array(ecpc_vector_lists).reshape(len(df_featureCollection),124)
            
            # fingerprints for properties of the atoms
            fcfp_list = []    
            for fcpc in range(124):
                df_featureCollection["fcpc" + str(fcpc)] = 0
                fcfp_list.append("fcpc" + str(fcpc))
            fcpc_x = df_featureCollection["rd_form"].apply(lambda x: np.array(AllChem.GetMorganFingerprintAsBitVect(x,2,nBits=124, useFeatures=True)))
            fcpc_vector_lists = list(itertools.chain(*fcpc_x))
            df_featureCollection.loc[:, fcfp_list] = np.array(fcpc_vector_lists).reshape(len(df_featureCollection),124)

            # move ACTIVE to the end of the dataframe
            tempcolumn = df_featureCollection.columns.tolist()
            activeindex = df_featureCollection.columns.get_loc("ACTIVE")
            newcolumn = tempcolumn[0:activeindex] + tempcolumn[activeindex + 1:] + tempcolumn[activeindex:activeindex + 1]
            print(newcolumn)
            df_featureCollection = df_featureCollection[newcolumn]

            return df_featureCollection
        
        if (option == "libraryPlusFingerprints"):
            
            # Adding columns for desired feature
            # basic info
            df_featureCollection["NumAtoms"] = np.nan
            df_featureCollection["MolWt"] = np.nan
            df_featureCollection["HeavyAtom"] = np.nan
            # specific features or fragment
            df_featureCollection["AroRing"] = np.nan
            df_featureCollection["AmideBond"] = np.nan
            df_featureCollection["RotatableBond"] = np.nan
            df_featureCollection["SaturatedRing"] = np.nan
            df_featureCollection["AL_COO"] = np.nan
            df_featureCollection["Benzene"] = np.nan
            # specific features for fingerprints
            df_featureCollection["ecpc"] = np.nan
            df_featureCollection["rd_form"] = np.nan

            # Loop through index, calculate and adding features to the feature dataframe
            for i in df_copy.index:
                m = Chem.MolFromSmiles(df_copy['SMILES'][i]) # convert SMILES to molecule. SMILES string and m will be assigned an object representing the corresonding
                #chemical compound, for which various properties might be derived
                numatoms = m.GetNumAtoms()  # numbers of atoms
                molwt = d.CalcExactMolWt(m)  # molecule's exact molecular weight
                heavyatom = l.HeavyAtomCount(m)  # Number of heavy atoms a molecule
                aroring = d.CalcNumAromaticRings(m)  # number of aromatic rings for a molecule, they are very stable and do not break apart easily
                amidebond = d.CalcNumAmideBonds(m)  # number of amide bonds in a molecule
                rotabond= d.CalcNumRotatableBonds(m)  # number of rotatable bonds for a molecule
                saturatedring = d.CalcNumSaturatedRings(m)  # returns the number of saturated rings for a molecule
                alcoo = f.fr_Al_COO(m)  # Number of aliphatic carboxylic acids
                benzene = f.fr_benzene(m) # Number of benzene rings

                df_featureCollection.loc[i, 'NumAtoms'] = numatoms
                df_featureCollection.loc[i, 'MolWt'] = molwt
                df_featureCollection.loc[i, 'HeavyAtom'] = heavyatom
                df_featureCollection.loc[i, 'AroRing'] = aroring
                df_featureCollection.loc[i, 'AmideBond'] = amidebond
                df_featureCollection.loc[i, 'RotatableBond'] = rotabond
                df_featureCollection.loc[i, 'SaturatedRing'] = saturatedring
                df_featureCollection.loc[i, 'AL_COO'] = alcoo
                df_featureCollection.loc[i, 'Benzene'] = benzene

            # fingerprints for properties of pharmacs
            ecfp_list = []
            for ecpc in range(124):
                df_featureCollection["ecpc" + str(ecpc)] = 0
                ecfp_list.append("ecpc" + str(ecpc))
            ecpc_x = df_featureCollection["rd_form"].apply(lambda x: np.array(AllChem.GetMorganFingerprintAsBitVect(x,2,nBits=124)))
            ecpc_vector_lists = list(itertools.chain(*ecpc_x))
            df_featureCollection.loc[:, ecfp_list] = np.array(ecpc_vector_lists).reshape(len(df_featureCollection),124)
            
            # fingerprints for properties of the atoms
            fcfp_list = []    
            for fcpc in range(124):
                df_featureCollection["fcpc" + str(fcpc)] = 0
                fcfp_list.append("fcpc" + str(fcpc))
            fcpc_x = df_featureCollection["rd_form"].apply(lambda x: np.array(AllChem.GetMorganFingerprintAsBitVect(x,2,nBits=124, useFeatures=True)))
            fcpc_vector_lists = list(itertools.chain(*fcpc_x))
            df_featureCollection.loc[:, fcfp_list] = np.array(fcpc_vector_lists).reshape(len(df_featureCollection),124)

            # move ACTIVE to the end of the dataframe
            #tempcolumn = df_featureCollection.columns.tolist()
            #activeindex = df_featureCollection.columns.get_loc("ACTIVE")
            #newcolumn = tempcolumn[0:activeindex] + tempcolumn[activeindex + 1:] + tempcolumn[activeindex:activeindex + 1]
            #print(newcolumn)
            #df_featureCollection = df_featureCollection[newcolumn]
            
            print(df_featureCollection)

            return df_featureCollection

# Train data features extraction

In [154]:
train_df = pd.read_csv("training_smiles.csv")

smile = SMILEActive()

#features_df = smile.featureCollection(train_df, "libraryPlusFingerprints")

features_df = smile.featureExtraction(train_df)


display(features_df)



Unnamed: 0,INDEX,SMILES,ACTIVE,NumAtoms,MolWt,HeavyAtom,AroRing,AmideBond,RotatableBond,SaturatedRing,AL_COO,Benzene
0,1,CC(C)N1CC(=O)C(c2nc3ccccc3[nH]2)=C1N,0.0,19.0,256.132411,19.0,2.0,0.0,2.0,0.0,0.0,1.0
1,2,COc1ccc(-c2ccc3c(N)c(C(=O)c4ccc(OC)c(OC)c4)sc3...,0.0,30.0,420.114378,30.0,4.0,0.0,6.0,0.0,0.0,2.0
2,3,CCc1ccc(C(=O)COC(=O)CCc2nc(=O)c3ccccc3[nH]2)cc1,0.0,27.0,364.142307,27.0,3.0,0.0,7.0,0.0,0.0,2.0
3,4,O=C(CN1CCOCC1)Nc1ccc(S(=O)(=O)N2CCCCCC2)cc1,0.0,26.0,381.172227,26.0,1.0,1.0,5.0,2.0,0.0,1.0
4,5,C=CCC(Nc1ccccc1)c1ccc(OC)c(OC)c1,0.0,21.0,283.157229,21.0,2.0,0.0,7.0,0.0,0.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...
156253,156254,O=C(N/N=C\c1ccc(F)cc1)c1ccc(Cn2cc(Br)c([N+](=O...,0.0,27.0,434.997844,27.0,3.0,1.0,6.0,0.0,0.0,1.0
156254,156255,COc1ccc(NS(=O)(=O)c2cc(NC(=O)/C=C/c3cc(OC)ccc3...,0.0,39.0,551.209007,39.0,3.0,1.0,10.0,1.0,0.0,3.0
156255,156256,O=c1nc(N2CCN(Cc3ccccc3)CC2)[nH]c2c1CCC2,0.0,23.0,310.179361,23.0,2.0,0.0,3.0,1.0,0.0,1.0
156256,156257,Cc1onc(-c2ccccc2)c1C(=O)N/N=C/c1ccco1,0.0,22.0,295.095691,22.0,3.0,1.0,4.0,0.0,0.0,1.0


# Fingerprint

In [155]:
import itertools
#features_df = pd.read_csv("training_smiles.csv")

#y_total = features_df["ACTIVE"]

#print(y_total)

def get_smile(dataframe):
    data = dataframe.copy()
    data["rd_form"] = data.SMILES.apply(lambda x: Chem.MolFromSmiles(x))
    data.drop(["SMILES"], inplace=True, axis=1)
    return data

features_df = get_smile(features_df)

def fingerprint(data):    
    # fingerprint
    fcfp_list = []    
    for fcpc in range(124):
        data["fcpc" + str(fcpc)] = 0
        fcfp_list.append("fcpc" + str(fcpc))
    fcpc_x = data["rd_form"].apply(lambda x: np.array(AllChem.GetMorganFingerprintAsBitVect(x,2,nBits=124, useFeatures=True)))
    fcpc_vector_lists = list(itertools.chain(*fcpc_x))
    data.loc[:, fcfp_list] = np.array(fcpc_vector_lists).reshape(len(data),124)
    return data

features_df_fingerprint = fingerprint(features_df)
###

# x_total = features_df.drop(["INDEX", "HeavyAtom", "ACTIVE"], axis = 1)

# display(train)
# display(x_train)
# display(x_validate)
# display(y_train)
# display(y_validate)
display(features_df)
display(features_df_fingerprint)




  data["fcpc" + str(fcpc)] = 0
  data["fcpc" + str(fcpc)] = 0
  data["fcpc" + str(fcpc)] = 0
  data["fcpc" + str(fcpc)] = 0
  data["fcpc" + str(fcpc)] = 0
  data["fcpc" + str(fcpc)] = 0
  data["fcpc" + str(fcpc)] = 0
  data["fcpc" + str(fcpc)] = 0
  data["fcpc" + str(fcpc)] = 0
  data["fcpc" + str(fcpc)] = 0
  data["fcpc" + str(fcpc)] = 0
  data["fcpc" + str(fcpc)] = 0
  data["fcpc" + str(fcpc)] = 0
  data["fcpc" + str(fcpc)] = 0
  data["fcpc" + str(fcpc)] = 0
  data["fcpc" + str(fcpc)] = 0
  data["fcpc" + str(fcpc)] = 0
  data["fcpc" + str(fcpc)] = 0
  data["fcpc" + str(fcpc)] = 0
  data["fcpc" + str(fcpc)] = 0
  data["fcpc" + str(fcpc)] = 0
  data["fcpc" + str(fcpc)] = 0
  data["fcpc" + str(fcpc)] = 0
  data["fcpc" + str(fcpc)] = 0
  data["fcpc" + str(fcpc)] = 0
  data["fcpc" + str(fcpc)] = 0
  data["fcpc" + str(fcpc)] = 0


Unnamed: 0,INDEX,ACTIVE,NumAtoms,MolWt,HeavyAtom,AroRing,AmideBond,RotatableBond,SaturatedRing,AL_COO,...,fcpc114,fcpc115,fcpc116,fcpc117,fcpc118,fcpc119,fcpc120,fcpc121,fcpc122,fcpc123
0,1,0.0,19.0,256.132411,19.0,2.0,0.0,2.0,0.0,0.0,...,0,0,1,1,0,0,0,0,1,0
1,2,0.0,30.0,420.114378,30.0,4.0,0.0,6.0,0.0,0.0,...,1,0,0,0,0,0,0,0,0,0
2,3,0.0,27.0,364.142307,27.0,3.0,0.0,7.0,0.0,0.0,...,0,0,0,0,0,0,1,0,0,0
3,4,0.0,26.0,381.172227,26.0,1.0,1.0,5.0,2.0,0.0,...,1,0,0,0,0,1,0,0,0,1
4,5,0.0,21.0,283.157229,21.0,2.0,0.0,7.0,0.0,0.0,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
156253,156254,0.0,27.0,434.997844,27.0,3.0,1.0,6.0,0.0,0.0,...,0,0,1,0,1,0,0,0,0,1
156254,156255,0.0,39.0,551.209007,39.0,3.0,1.0,10.0,1.0,0.0,...,1,0,0,0,0,0,1,1,1,1
156255,156256,0.0,23.0,310.179361,23.0,2.0,0.0,3.0,1.0,0.0,...,0,0,1,0,0,0,1,0,0,0
156256,156257,0.0,22.0,295.095691,22.0,3.0,1.0,4.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,1


Unnamed: 0,INDEX,ACTIVE,NumAtoms,MolWt,HeavyAtom,AroRing,AmideBond,RotatableBond,SaturatedRing,AL_COO,...,fcpc114,fcpc115,fcpc116,fcpc117,fcpc118,fcpc119,fcpc120,fcpc121,fcpc122,fcpc123
0,1,0.0,19.0,256.132411,19.0,2.0,0.0,2.0,0.0,0.0,...,0,0,1,1,0,0,0,0,1,0
1,2,0.0,30.0,420.114378,30.0,4.0,0.0,6.0,0.0,0.0,...,1,0,0,0,0,0,0,0,0,0
2,3,0.0,27.0,364.142307,27.0,3.0,0.0,7.0,0.0,0.0,...,0,0,0,0,0,0,1,0,0,0
3,4,0.0,26.0,381.172227,26.0,1.0,1.0,5.0,2.0,0.0,...,1,0,0,0,0,1,0,0,0,1
4,5,0.0,21.0,283.157229,21.0,2.0,0.0,7.0,0.0,0.0,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
156253,156254,0.0,27.0,434.997844,27.0,3.0,1.0,6.0,0.0,0.0,...,0,0,1,0,1,0,0,0,0,1
156254,156255,0.0,39.0,551.209007,39.0,3.0,1.0,10.0,1.0,0.0,...,1,0,0,0,0,0,1,1,1,1
156255,156256,0.0,23.0,310.179361,23.0,2.0,0.0,3.0,1.0,0.0,...,0,0,1,0,0,0,1,0,0,0
156256,156257,0.0,22.0,295.095691,22.0,3.0,1.0,4.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,1


# Data separation

In [156]:
y_total = features_df["ACTIVE"]

#x_total = features_df.drop(["INDEX", "SMILES", "HeavyAtom", "ACTIVE"], axis = 1)

tempcolumn = features_df.columns.tolist()
activeindex = features_df.columns.get_loc("ACTIVE")
newcolumn = tempcolumn[0:activeindex] + tempcolumn[activeindex + 1:] + tempcolumn[activeindex:activeindex + 1]
print(newcolumn)
features_df = features_df[newcolumn]

x_total = features_df.drop(["INDEX", "HeavyAtom", "ACTIVE", "rd_form"], axis = 1)
#x_train, x_validate, y_train, y_validate = train_test_split(train, y, test_size=0.2, random_state=42)

# display(train)
# display(x_train)
# display(x_validate)
# display(y_train)
# display(y_validate)

# clf = RandomForestClassifier(class_weight = "balanced").fit(x_train, y_train)
# preds = clf.predict(x_validate)
# fpr, tpr, thresholds = metrics.roc_curve(y_validate, preds)
# metrics.auc(fpr, tpr)
# AUC = 0.5197425681819635

['INDEX', 'NumAtoms', 'MolWt', 'HeavyAtom', 'AroRing', 'AmideBond', 'RotatableBond', 'SaturatedRing', 'AL_COO', 'Benzene', 'rd_form', 'fcpc0', 'fcpc1', 'fcpc2', 'fcpc3', 'fcpc4', 'fcpc5', 'fcpc6', 'fcpc7', 'fcpc8', 'fcpc9', 'fcpc10', 'fcpc11', 'fcpc12', 'fcpc13', 'fcpc14', 'fcpc15', 'fcpc16', 'fcpc17', 'fcpc18', 'fcpc19', 'fcpc20', 'fcpc21', 'fcpc22', 'fcpc23', 'fcpc24', 'fcpc25', 'fcpc26', 'fcpc27', 'fcpc28', 'fcpc29', 'fcpc30', 'fcpc31', 'fcpc32', 'fcpc33', 'fcpc34', 'fcpc35', 'fcpc36', 'fcpc37', 'fcpc38', 'fcpc39', 'fcpc40', 'fcpc41', 'fcpc42', 'fcpc43', 'fcpc44', 'fcpc45', 'fcpc46', 'fcpc47', 'fcpc48', 'fcpc49', 'fcpc50', 'fcpc51', 'fcpc52', 'fcpc53', 'fcpc54', 'fcpc55', 'fcpc56', 'fcpc57', 'fcpc58', 'fcpc59', 'fcpc60', 'fcpc61', 'fcpc62', 'fcpc63', 'fcpc64', 'fcpc65', 'fcpc66', 'fcpc67', 'fcpc68', 'fcpc69', 'fcpc70', 'fcpc71', 'fcpc72', 'fcpc73', 'fcpc74', 'fcpc75', 'fcpc76', 'fcpc77', 'fcpc78', 'fcpc79', 'fcpc80', 'fcpc81', 'fcpc82', 'fcpc83', 'fcpc84', 'fcpc85', 'fcpc86', 'fcpc8

# Data preparation

In [157]:
def norm(x_train, label_list):
    
    train = x_train.loc[:, label_list]
    #val = x_val.loc[:, label_list]
    imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
    scaler = MinMaxScaler()
    kbd = KBinsDiscretizer(n_bins=10, encode="ordinal")
    train = imp_mean.fit_transform(train)
    train = scaler.fit_transform(train)
    train = kbd.fit_transform(train)
    #val = imp_mean.transform(val)
    #val = scaler.transform(val)
    #val = kbd.transform(val)
    x_train.loc[:, label_list] = np.array(train)
    #x_val.loc[:, label_list] = np.array(val)
    return x_train


def prep(x_train):
    
    col_list = list(x_train.columns)
    x_train = norm(x_train, col_list)
    return x_train

# Apply data preparation

In [158]:
x_total = prep(x_total)

display(x_total)
#display(x_validate)





Unnamed: 0,NumAtoms,MolWt,AroRing,AmideBond,RotatableBond,SaturatedRing,AL_COO,Benzene,fcpc0,fcpc1,...,fcpc114,fcpc115,fcpc116,fcpc117,fcpc118,fcpc119,fcpc120,fcpc121,fcpc122,fcpc123
0,1.0,1.0,2.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,8.0,8.0,3.0,0.0,5.0,0.0,0.0,2.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,7.0,5.0,3.0,0.0,6.0,0.0,0.0,2.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,6.0,6.0,1.0,1.0,4.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,3.0,2.0,2.0,0.0,6.0,0.0,0.0,2.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
156253,7.0,8.0,3.0,1.0,5.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
156254,9.0,9.0,3.0,1.0,7.0,1.0,0.0,2.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
156255,4.0,3.0,2.0,0.0,2.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
156256,3.0,2.0,3.0,1.0,3.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Random tree forest

In [54]:
# clf = RandomForestClassifier(class_weight = "balanced").fit(x_train, y_train)
# preds = clf.predict(x_validate)
# fpr, tpr, thresholds = metrics.roc_curve(y_validate, preds)
# metrics.auc(fpr, tpr)
# AUC = 0.6426990749407495 

0.6426990749407495

# Random tree forest + k-fold

In [81]:
kf = KFold(n_splits=5)
from sklearn.model_selection import StratifiedKFold
skf = StratifiedKFold(n_splits=5)

#rf = RandomForestClassifier(n_estimators=100, class_weight = "balanced")
#cross_val_score(rf, x_train, y_train, cv=kf, scoring="roc_auc")

In [82]:
clf = RandomForestClassifier()

parameters = {
    "n_estimators": [200, 300, 400],
    "max_depth": [5, 10, 15],
    "random_state": [0]
}

grid = GridSearchCV(clf, param_grid=parameters, cv=skf, scoring="roc_auc").fit(x_total, y)

In [83]:
grid.best_score_

0.7585958910633003

In [84]:
grid.best_params_

{'max_depth': 10, 'n_estimators': 400, 'random_state': 0}

In [85]:
cv_results = grid.cv_results_
display(pd.DataFrame(cv_results).sort_values(by='rank_test_score'))

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_n_estimators,param_random_state,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
5,17.365747,0.109674,0.97189,0.007019,10,400,0,"{'max_depth': 10, 'n_estimators': 400, 'random...",0.772856,0.759384,0.746942,0.75423,0.759567,0.758596,0.008483,1
3,8.713499,0.163845,0.490063,0.003934,10,200,0,"{'max_depth': 10, 'n_estimators': 200, 'random...",0.773608,0.758398,0.746875,0.753095,0.760625,0.75852,0.008912,2
4,12.962109,0.210271,0.728553,0.003643,10,300,0,"{'max_depth': 10, 'n_estimators': 300, 'random...",0.77283,0.760018,0.746288,0.753745,0.759678,0.758512,0.008727,3
2,10.487834,0.123879,0.592655,0.012635,5,400,0,"{'max_depth': 5, 'n_estimators': 400, 'random_...",0.764216,0.767899,0.742286,0.738539,0.746499,0.751888,0.011897,4
1,7.907291,0.07731,0.441838,0.002407,5,300,0,"{'max_depth': 5, 'n_estimators': 300, 'random_...",0.763862,0.767853,0.742509,0.738255,0.746135,0.751723,0.011875,5
0,5.326684,0.117249,0.299859,0.0033,5,200,0,"{'max_depth': 5, 'n_estimators': 200, 'random_...",0.763775,0.767773,0.740932,0.737815,0.745577,0.751174,0.012239,6
8,20.200686,0.176259,1.25208,0.009907,15,400,0,"{'max_depth': 15, 'n_estimators': 400, 'random...",0.688763,0.674773,0.685518,0.704456,0.689147,0.688531,0.009512,7
7,15.303305,0.13892,0.952438,0.01741,15,300,0,"{'max_depth': 15, 'n_estimators': 300, 'random...",0.688856,0.671688,0.684814,0.706097,0.686368,0.687564,0.011011,8
6,10.205533,0.063688,0.636633,0.005089,15,200,0,"{'max_depth': 15, 'n_estimators': 200, 'random...",0.682612,0.670549,0.681859,0.706535,0.685447,0.685401,0.011727,9


In [None]:
df_rfc = pd.DataFrame(cv_results).sort_values(by='rank_test_score')
df_rfc.to_csv('df_rfc.csv')

In [67]:
# grid.best_score_ = 0.7648324191179492
# grid.best_params_ = {'max_depth': 10, 'n_estimators': 200, 'random_state': 0}
# grid.best_score_ = 0.7583838511187961
# {'max_depth': 10, 'n_estimators': 400, 'random_state': 0}

{'max_depth': 10, 'n_estimators': 200, 'random_state': 0}

# MLPClassifier

In [91]:


def mlp(x_total, y_total):
    # hyper parameter list
    max_iter = [10, 20, 30, 40, 50, 60]
    hidden_layer_sizes = [(10,), (15,), (20,), (25,), (30,) ]
    solver = ['sgd', 'adam']
    activation = ['relu', 'identity', 'logistic', 'tanh']
    learning_rate = ['constant', 'adaptive']
    param_grid = {'max_iter': max_iter, 'hidden_layer_sizes': hidden_layer_sizes, 'solver': solver, 'activation': activation, 'learning_rate': learning_rate}
    mlp = MLPClassifier()
    clf = GridSearchCV(estimator=mlp, param_grid=param_grid, cv=skf,
                       scoring='roc_auc', n_jobs=4)
    clf.fit(x_total, y_total)
    print("Best Score: ")
    print(clf.best_score_)
    print("Best Estimator: ")
    print(clf.best_estimator_)
    cv_results = clf.cv_results_
    display(pd.DataFrame(cv_results).sort_values(by='rank_test_score'))
    best_params = clf.best_params_
    model = MLPClassifier(**best_params)
    model.fit(x_total, y_total)

    #prediction = model.predict_proba(x_val)
    #auc = metrics.roc_auc_score(y_val, prediction[:, 1])
    #score = model.score(x_val, y_val)
    #print("auc:", auc)
    #print("score:", score)
    df_mlp = pd.DataFrame(cv_results).sort_values(by='rank_test_score')
    df_mlp.to_csv('df_mlp.csv')

    return model


mlpmodel = mlp(x_total, y)


#0.7655011685101009
# MLPClassifier(activation='tanh', hidden_layer_sizes=(25,), max_iter=40)









































































Best Score: 
0.7659960331861561
Best Estimator: 
MLPClassifier(activation='tanh', hidden_layer_sizes=(30,),
              learning_rate='adaptive', max_iter=50)


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_activation,param_hidden_layer_sizes,param_learning_rate,param_max_iter,param_solver,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
477,12.494058,1.307455,0.029570,0.001334,tanh,"(30,)",adaptive,50,adam,"{'activation': 'tanh', 'hidden_layer_sizes': (...",0.771861,0.777152,0.757890,0.754707,0.768371,0.765996,0.008457,1
459,9.834606,0.675684,0.029211,0.000635,tanh,"(30,)",constant,20,adam,"{'activation': 'tanh', 'hidden_layer_sizes': (...",0.771713,0.775710,0.755470,0.752964,0.769811,0.765133,0.009149,2
467,9.467026,0.465614,0.031260,0.003445,tanh,"(30,)",constant,60,adam,"{'activation': 'tanh', 'hidden_layer_sizes': (...",0.769591,0.777201,0.756734,0.753836,0.767141,0.764901,0.008573,3
431,9.539782,0.919173,0.025259,0.003982,tanh,"(20,)",adaptive,60,adam,"{'activation': 'tanh', 'hidden_layer_sizes': (...",0.771415,0.775769,0.756538,0.754972,0.765577,0.764854,0.008118,4
449,11.095641,1.496204,0.027993,0.001863,tanh,"(25,)",adaptive,30,adam,"{'activation': 'tanh', 'hidden_layer_sizes': (...",0.773088,0.775024,0.755706,0.752803,0.767514,0.764827,0.009024,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
300,3.807488,0.114951,0.020892,0.001104,logistic,"(20,)",adaptive,10,sgd,"{'activation': 'logistic', 'hidden_layer_sizes...",0.429785,0.524892,0.457165,0.570596,0.518596,0.500207,0.050426,476
276,3.700262,0.130195,0.020881,0.001624,logistic,"(15,)",adaptive,10,sgd,"{'activation': 'logistic', 'hidden_layer_sizes...",0.406546,0.458222,0.438894,0.492720,0.471288,0.453534,0.029308,477
264,4.668010,0.585382,0.019879,0.001046,logistic,"(15,)",constant,10,sgd,"{'activation': 'logistic', 'hidden_layer_sizes...",0.422970,0.398837,0.530305,0.439506,0.425352,0.443394,0.045380,478
240,3.164421,0.043514,0.017274,0.000935,logistic,"(10,)",constant,10,sgd,"{'activation': 'logistic', 'hidden_layer_sizes...",0.373677,0.432845,0.449950,0.428011,0.433519,0.423601,0.026038,479


In [160]:




test_df = pd.read_csv("test_smiles.csv")

smilet = SMILEActive()

test_feature_df = smilet.featureExtraction(test_df)
test_feature_df = get_smile(test_feature_df)
test_feature_df = fingerprint(test_feature_df)
x_test = test_feature_df.drop(["INDEX", "HeavyAtom", "rd_form"], axis = 1)
x_test = prep(x_test)

x_test.to_csv('test_features.csv')

print(x_test)



  data["fcpc" + str(fcpc)] = 0
  data["fcpc" + str(fcpc)] = 0
  data["fcpc" + str(fcpc)] = 0
  data["fcpc" + str(fcpc)] = 0
  data["fcpc" + str(fcpc)] = 0
  data["fcpc" + str(fcpc)] = 0
  data["fcpc" + str(fcpc)] = 0
  data["fcpc" + str(fcpc)] = 0
  data["fcpc" + str(fcpc)] = 0
  data["fcpc" + str(fcpc)] = 0
  data["fcpc" + str(fcpc)] = 0
  data["fcpc" + str(fcpc)] = 0
  data["fcpc" + str(fcpc)] = 0
  data["fcpc" + str(fcpc)] = 0
  data["fcpc" + str(fcpc)] = 0
  data["fcpc" + str(fcpc)] = 0
  data["fcpc" + str(fcpc)] = 0
  data["fcpc" + str(fcpc)] = 0
  data["fcpc" + str(fcpc)] = 0
  data["fcpc" + str(fcpc)] = 0
  data["fcpc" + str(fcpc)] = 0
  data["fcpc" + str(fcpc)] = 0
  data["fcpc" + str(fcpc)] = 0
  data["fcpc" + str(fcpc)] = 0
  data["fcpc" + str(fcpc)] = 0
  data["fcpc" + str(fcpc)] = 0
  data["fcpc" + str(fcpc)] = 0




       NumAtoms  MolWt  AroRing  AmideBond  RotatableBond  SaturatedRing  \
0           4.0    3.0      3.0        0.0            5.0            0.0   
1           5.0    7.0      3.0        1.0            6.0            0.0   
2           2.0    1.0      2.0        0.0            1.0            1.0   
3           4.0    3.0      1.0        2.0            4.0            0.0   
4           6.0    7.0      2.0        1.0            4.0            1.0   
...         ...    ...      ...        ...            ...            ...   
52081       0.0    0.0      1.0        1.0            2.0            0.0   
52082       0.0    1.0      1.0        0.0            6.0            0.0   
52083       9.0    8.0      3.0        1.0            7.0            0.0   
52084       7.0    7.0      2.0        2.0            3.0            0.0   
52085       2.0    2.0      1.0        0.0            3.0            1.0   

       AL_COO  Benzene  fcpc0  fcpc1  ...  fcpc114  fcpc115  fcpc116  fcpc117  \
0     

In [164]:
prediction = mlpmodel.predict_proba(x_test)
print(prediction)
result = pd.DataFrame(prediction, columns=['active=0', 'active=1'], index=list(x_test.index))
result.to_csv('final_results_mlp.csv')
mlpmodel

[[0.99463932 0.00536068]
 [0.99285977 0.00714023]
 [0.99618675 0.00381325]
 ...
 [0.98416372 0.01583628]
 [0.98810712 0.01189288]
 [0.99731748 0.00268252]]


MLPClassifier(activation='tanh', hidden_layer_sizes=(30,),
              learning_rate='adaptive', max_iter=50)

In [165]:
predictions_df = pd.read_csv("03.txt", header=None)
print(predictions_df.shape)
print(np.all((predictions_df.values >= 0) & (predictions_df.values <= 1)))

(52087, 1)
True


In [161]:
mlpmodel = MLPClassifier(activation='tanh', hidden_layer_sizes=(30,),
              learning_rate='adaptive', max_iter=50)
mlpmodel.fit(x_total, y)
mlpmodel.score(x_total, y)

modelr = RandomForestClassifier(max_depth = 10, n_estimators = 400, random_state = 0)
modelr.fit(x_total, y)



RandomForestClassifier(max_depth=10, n_estimators=400, random_state=0)

In [163]:
predictiontest = mlpmodel.predict_proba(x_test)
print(predictiontest.shape)
print(predictiontest)
print(mlpmodel.score(x_total, y))
print(mlpmodel.predict(x_test))
pr = mlpmodel.predict(x_test)
resultmlp = pd.DataFrame(pr, columns=[ 'ACTIVE'], index=list(x_test.index))
resultmlp.to_csv('final_resultsACTIVE.csv')


predictiontestr = modelr.predict_proba(x_test)
print(predictiontestr.shape)
print(predictiontestr)

predictiontestrr = modelr.predict(x_test)
print(predictiontestrr)

predictiontestmnb = mnbmodel.predict_proba(x_test)
print(predictiontestmnb.shape)
print(predictiontestmnb)

predictiontestbnb = bnbmodel.predict_proba(x_test)
print(predictiontestbnb.shape)
print(predictiontestbnb)



(52086, 2)
[[0.99463932 0.00536068]
 [0.99285977 0.00714023]
 [0.99618675 0.00381325]
 ...
 [0.98416372 0.01583628]
 [0.98810712 0.01189288]
 [0.99731748 0.00268252]]
0.988928566857377
[0. 0. 0. ... 0. 0. 0.]
(52086, 2)
[[0.99561917 0.00438083]
 [0.9934424  0.0065576 ]
 [0.99770089 0.00229911]
 ...
 [0.97972735 0.02027265]
 [0.99020888 0.00979112]
 [0.99738875 0.00261125]]
[0. 0. 0. ... 0. 0. 0.]
(52086, 2)
[[0.99206308 0.00793692]
 [0.99251035 0.00748965]
 [0.99020724 0.00979276]
 ...
 [0.98964367 0.01035633]
 [0.98928946 0.01071054]
 [0.99260548 0.00739452]]
(52086, 2)
[[9.73569525e-01 2.64304755e-02]
 [9.73569525e-01 2.64304755e-02]
 [9.99360620e-01 6.39379780e-04]
 ...
 [9.73569525e-01 2.64304755e-02]
 [9.91197844e-01 8.80215616e-03]
 [9.99312354e-01 6.87646418e-04]]


# MultinomialNB

In [120]:
from sklearn.naive_bayes import MultinomialNB

def mnb(x_total, y_total):
    # hyper parameter list
    alpha = [0, 1, 2]
    fit_prior = [True, False]
    #force_alpha = [True]
    param_grid = {'alpha': alpha, 'fit_prior': fit_prior}
    mnb = MultinomialNB()
    clf = GridSearchCV(estimator=mnb, param_grid=param_grid, cv=skf,
                       scoring='roc_auc', n_jobs=4)
    clf.fit(x_total, y_total)
    print("Best Score: ")
    print(clf.best_score_)
    print("Best Estimator: ")
    print(clf.best_estimator_)
    cv_results = clf.cv_results_
    display(pd.DataFrame(cv_results).sort_values(by='rank_test_score'))
    best_params = clf.best_params_
    model = MultinomialNB(**best_params)
    model.fit(x_total, y_total)

    #prediction = model.predict_proba(x_val)
    #auc = metrics.roc_auc_score(y_val, prediction[:, 1])
    #score = model.score(x_val, y_val)
    #print("auc:", auc)
    #print("score:", score)

    return model


mnbmodel = mnb(x_total, y)

print(x_total.columns)

#Best Score: 
#0.7495497632193935
#Best Estimator: 
#MultinomialNB(alpha=0)



Best Score: 
0.7495497632193935
Best Estimator: 
MultinomialNB(alpha=0)




Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_alpha,param_fit_prior,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.101642,0.003949,0.016784,0.002058,0,True,"{'alpha': 0, 'fit_prior': True}",0.752244,0.758346,0.737104,0.743091,0.756964,0.74955,0.0082,1
1,0.086269,0.021543,0.020428,0.009209,0,False,"{'alpha': 0, 'fit_prior': False}",0.752244,0.758346,0.737104,0.743091,0.756964,0.74955,0.0082,1
2,0.076401,0.026888,0.015495,0.001155,1,True,"{'alpha': 1, 'fit_prior': True}",0.747372,0.754115,0.733785,0.739245,0.753913,0.745686,0.008058,3
3,0.064576,0.024476,0.015454,0.000843,1,False,"{'alpha': 1, 'fit_prior': False}",0.747372,0.754115,0.733785,0.739245,0.753913,0.745686,0.008058,3
4,0.074315,0.024946,0.017559,0.002325,2,True,"{'alpha': 2, 'fit_prior': True}",0.741303,0.74834,0.729632,0.734432,0.75019,0.740779,0.007881,5
5,0.090318,0.007456,0.01639,0.003146,2,False,"{'alpha': 2, 'fit_prior': False}",0.741303,0.74834,0.729632,0.734432,0.75019,0.740779,0.007881,5


Index(['NumAtoms', 'MolWt', 'AroRing', 'AmideBond', 'RotatableBond',
       'SaturatedRing', 'AL_COO', 'Benzene', 'fcpc0', 'fcpc1',
       ...
       'fcpc114', 'fcpc115', 'fcpc116', 'fcpc117', 'fcpc118', 'fcpc119',
       'fcpc120', 'fcpc121', 'fcpc122', 'fcpc123'],
      dtype='object', length=132)




# BernoulliNB

In [119]:
from sklearn.naive_bayes import BernoulliNB

def bnb(x_total, y_total):
    # hyper parameter list
    alpha = [0, 1, 2]
    fit_prior = [True, False]
    #force_alpha = [True, False]
    binarize = [0.0, 1, 2]
    param_grid = {'alpha': alpha, 'fit_prior': fit_prior, 'binarize': binarize}
    bnb = BernoulliNB()
    clf = GridSearchCV(estimator=bnb, param_grid=param_grid, cv=skf,
                       scoring='roc_auc', n_jobs=4)
    clf.fit(x_total, y_total)
    print("Best Score: ")
    print(clf.best_score_)
    print("Best Estimator: ")
    print(clf.best_estimator_)
    cv_results = clf.cv_results_
    display(pd.DataFrame(cv_results).sort_values(by='rank_test_score'))
    best_params = clf.best_params_
    model = BernoulliNB(**best_params)
    model.fit(x_total, y_total)

    #prediction = model.predict_proba(x_val)
    #auc = metrics.roc_auc_score(y_val, prediction[:, 1])
    #score = model.score(x_val, y_val)
    #print("auc:", auc)
    #print("score:", score)

    return model


bnbmodel = bnb(x_total, y)

print(x_total.columns)


#Best Score: 
#0.6608025324979593
#Best Estimator: 
#BernoulliNB(alpha=2, binarize=2)



Best Score: 
0.6608025324979593
Best Estimator: 
BernoulliNB(alpha=2, binarize=2)


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_alpha,param_binarize,param_fit_prior,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
17,0.156093,0.001826,0.03076,0.00242,2,2.0,False,"{'alpha': 2, 'binarize': 2, 'fit_prior': False}",0.65257,0.67575,0.663236,0.654804,0.657653,0.660803,0.008283,1
16,0.164429,0.009525,0.035506,0.001764,2,2.0,True,"{'alpha': 2, 'binarize': 2, 'fit_prior': True}",0.65257,0.67575,0.663236,0.654804,0.657653,0.660803,0.008283,1
4,0.17262,0.005151,0.03212,0.003202,0,2.0,True,"{'alpha': 0, 'binarize': 2, 'fit_prior': True}",0.65257,0.67575,0.663236,0.654804,0.657499,0.660772,0.008295,3
5,0.169252,0.011375,0.032947,0.003529,0,2.0,False,"{'alpha': 0, 'binarize': 2, 'fit_prior': False}",0.65257,0.67575,0.663236,0.654804,0.657499,0.660772,0.008295,3
11,0.174434,0.008315,0.032909,0.001835,1,2.0,False,"{'alpha': 1, 'binarize': 2, 'fit_prior': False}",0.65257,0.67575,0.663236,0.654804,0.657499,0.660772,0.008295,3
10,0.172198,0.010355,0.031628,0.003267,1,2.0,True,"{'alpha': 1, 'binarize': 2, 'fit_prior': True}",0.65257,0.67575,0.663236,0.654804,0.657499,0.660772,0.008295,3
2,0.185737,0.004825,0.032594,0.001534,0,1.0,True,"{'alpha': 0, 'binarize': 1, 'fit_prior': True}",0.64597,0.664465,0.629696,0.62188,0.65591,0.643584,0.01586,7
3,0.176104,0.007259,0.033501,0.001494,0,1.0,False,"{'alpha': 0, 'binarize': 1, 'fit_prior': False}",0.64597,0.664465,0.629696,0.62188,0.65591,0.643584,0.01586,7
15,0.171688,0.012589,0.033472,0.002996,2,1.0,False,"{'alpha': 2, 'binarize': 1, 'fit_prior': False}",0.64597,0.664466,0.629696,0.621879,0.655907,0.643584,0.015859,9
14,0.170657,0.010498,0.033504,0.003354,2,1.0,True,"{'alpha': 2, 'binarize': 1, 'fit_prior': True}",0.64597,0.664466,0.629696,0.621879,0.655907,0.643584,0.015859,9


Index(['NumAtoms', 'MolWt', 'AroRing', 'AmideBond', 'RotatableBond',
       'SaturatedRing', 'AL_COO', 'Benzene', 'fcpc0', 'fcpc1',
       ...
       'fcpc114', 'fcpc115', 'fcpc116', 'fcpc117', 'fcpc118', 'fcpc119',
       'fcpc120', 'fcpc121', 'fcpc122', 'fcpc123'],
      dtype='object', length=132)
