In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import Descriptors
from rdkit.Chem import rdMolDescriptors
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import Draw
from rdkit.ML.Descriptors.MoleculeDescriptors import MolecularDescriptorCalculator

In [2]:
# Dataset-1: SMILES with permeability.
# Rows sharing the same SMILES string are collapsed into a single row that
# holds the mean of their numeric columns.
Dataset1_Smiles_P = pd.read_csv("dataset1_imputed_all.csv")
# numeric_only=True restricts the aggregation to numeric columns. Without it,
# pandas >= 2.0 raises TypeError if the file carries any non-numeric column
# other than 'Smiles'; older pandas dropped such columns implicitly.
Dataset1_Smiles_P = (
    Dataset1_Smiles_P
    .groupby('Smiles')
    .mean(numeric_only=True)
    .reset_index()
)
Dataset1_Smiles_P

In [4]:
# Dataset-1 fingerprints.
# Parse every SMILES string of dataset-1 into an RDKit molecule.
molecules = Dataset1_Smiles_P.Smiles.apply(Chem.MolFromSmiles)

# Chem.MolFromSmiles yields None for a string it cannot parse. Fail here and
# name the offending SMILES instead of letting the fingerprint call below die
# with an opaque argument error.
invalid_smiles = Dataset1_Smiles_P.Smiles[molecules.isna()]
if not invalid_smiles.empty:
    raise ValueError(
        "Could not parse SMILES in dataset-1: {}".format(invalid_smiles.tolist())
    )

# Morgan fingerprint with radius 3 for each molecule.
fp = molecules.apply(lambda m: AllChem.GetMorganFingerprint(m, radius=3))
# Per molecule: dict of the fingerprint's non-zero entries (key -> value).
fp_n = fp.apply(lambda m: m.GetNonzeroElements())

In [5]:
# Build the substructure dictionary from dataset-1: collect every fingerprint
# key that occurs in any molecule (duplicates included at this stage).
HashCode = [code for fingerprint in fp_n for code in fingerprint.keys()]

# De-duplicate. The list order is simply the iteration order of the set, and
# that order fixes the column position of each substructure downstream.
unique_set = set(HashCode)
unique_list = list(unique_set)

# Lookup table with two columns: 'index' (position in unique_list) and 0 (the key).
Corr_df = pd.DataFrame(unique_list).reset_index()

In [6]:
# Construct the dataset-1 input matrix: one row per molecule, one column per
# dictionary substructure, each cell holding the fingerprint value for that
# substructure (0 when the molecule does not contain it).

# Key -> column position, built once from the lookup table. Corr_df[0] holds
# the keys and Corr_df['index'] their positions; a dict lookup replaces a full
# boolean-mask scan of Corr_df for every single key.
column_of_hash = dict(zip(Corr_df[0], Corr_df['index']))

MY_finger = []
for polymer in fp_n:
    my_finger = [0] * len(unique_list)
    for key, count in polymer.items():
        my_finger[column_of_hash[key]] = count
    MY_finger.append(my_finger)

MY_finger_dataset_1 = pd.DataFrame(MY_finger)

In [7]:
# Keep only the most common substructures: a column survives when fewer than
# NumberOfZero molecules have a zero in it (47 columns for this dataset).
NumberOfZero = 300

# Number of zero entries per column.
Zero_Sum = MY_finger_dataset_1.eq(0).astype(int).sum()

# Labels of the columns that pass the threshold, computed once and reused.
frequent_columns = Zero_Sum[Zero_Sum < NumberOfZero].index
print(len(frequent_columns))

X_fingerprints = MY_finger_dataset_1[frequent_columns]
X_fingerprints

47


Unnamed: 0,298,380,423,619,724,799,866,984,1116,1160,...,2706,2728,2814,2818,2854,2857,2925,2984,3093,3155
0,0,0,0,1,2,0,0,0,0,2,...,0,0,0,0,3,0,2,2,0,13
1,0,0,0,0,2,0,0,0,0,2,...,0,0,0,0,3,0,2,2,0,13
2,0,0,0,0,2,0,0,0,0,2,...,0,0,0,0,0,0,4,2,0,14
3,0,0,0,1,4,0,0,0,0,2,...,0,0,0,0,3,0,0,2,0,8
4,1,0,0,1,4,0,0,0,0,2,...,0,2,0,0,8,0,0,2,0,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
348,0,6,0,0,0,0,0,4,1,0,...,4,0,4,2,8,6,0,0,0,10
349,2,3,0,2,0,0,0,4,1,0,...,4,6,0,2,2,0,0,0,0,18
350,2,5,0,0,0,0,0,4,1,0,...,4,2,5,2,2,1,0,0,2,16
351,2,5,0,0,0,0,0,4,1,0,...,4,4,2,2,4,3,0,0,0,16


In [9]:
# Persist the filtered dataset-1 fingerprint matrix to the working directory.
X_fingerprints.to_csv(path_or_buf='X_fingerprints.csv')

In [None]:
# Do the same with dataset-2 (no permeability), reusing the substructure
# dictionary and the column filter that were derived from dataset-1.
ID_noTg_pred = pd.read_csv("Dataset2_Smiles.csv")

# Dataset-2 fingerprints.
molecules_noTg_pred = ID_noTg_pred.Smiles.apply(Chem.MolFromSmiles)

# Chem.MolFromSmiles yields None for a string it cannot parse. Fail here and
# name the offending SMILES instead of letting the fingerprint call below die
# with an opaque argument error.
invalid_smiles_noTg_pred = ID_noTg_pred.Smiles[molecules_noTg_pred.isna()]
if not invalid_smiles_noTg_pred.empty:
    raise ValueError(
        "Could not parse SMILES in dataset-2: {}".format(invalid_smiles_noTg_pred.tolist())
    )

fp_noTg_pred = molecules_noTg_pred.apply(lambda m: AllChem.GetMorganFingerprint(m, radius=3))
fp_noTg_pred_n = fp_noTg_pred.apply(lambda m: m.GetNonzeroElements())

# Construct the dataset-2 input matrix.
# Key -> column position, built once from the lookup table. The original code
# rebuilt list(Corr_df[0]) and scanned Corr_df for every key; a dict gives the
# same answer with a single lookup.
column_of_hash = dict(zip(Corr_df[0], Corr_df['index']))

MY_finger = []
for polymer in fp_noTg_pred_n:
    my_finger = [0] * len(unique_list)
    for key, count in polymer.items():
        # Keys that are absent from the dataset-1 dictionary are ignored.
        column = column_of_hash.get(key)
        if column is not None:  # position 0 is valid, so compare against None
            my_finger[column] = count
    MY_finger.append(my_finger)

MY_finger_dataset_2 = pd.DataFrame(MY_finger)
# Apply the same column selection that was used for dataset-1.
X_dataset_2 = MY_finger_dataset_2[Zero_Sum[Zero_Sum < NumberOfZero].index]

In [10]:
# Get the RDKit descriptors from SMILES: one row per dataset-1 molecule, one
# column per entry of Descriptors.descList, in descList order.

# Each descList entry is a pair whose second element is the function that is
# called on the molecule. The column count is taken from the list itself
# instead of a hard-coded 208, so a descList of a different length no longer
# breaks the row assignment.
descriptor_functions = [entry[1] for entry in Descriptors.descList]

descriptor_rows = []
for smiles in Dataset1_Smiles_P['Smiles']:
    mol = Chem.MolFromSmiles(smiles)
    descriptor_rows.append([func(mol) for func in descriptor_functions])

# Build the frame once. Integer column labels and float dtype match what the
# former zero-initialised frame produced.
X_descriptors = pd.DataFrame(
    descriptor_rows,
    columns=range(len(descriptor_functions)),
    dtype=float,
)
X_descriptors

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,198,199,200,201,202,203,204,205,206,207
0,2.333429,0.164923,2.333429,0.164923,0.479509,310.440,288.264,310.172151,118.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2.402725,-1.216811,2.402725,0.169515,0.418236,326.515,304.339,326.149077,118.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2.314907,0.184489,2.314907,0.184489,0.557853,254.332,240.220,254.109550,94.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,9.420806,0.127855,9.420806,0.127855,0.771337,250.341,232.197,250.135765,96.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,6.449983,-1.769760,6.449983,0.125872,0.408536,364.605,332.349,364.222242,138.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
348,14.732718,-0.632621,14.732718,0.018773,0.190272,686.852,644.516,686.314458,260.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
349,13.238844,-0.568127,13.238844,0.058631,0.197023,592.607,568.415,592.163436,218.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
350,13.429798,-0.577844,13.429798,0.065072,0.163610,660.726,628.470,660.226037,246.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
351,13.560843,-0.600562,13.560843,0.078563,0.176060,672.737,640.481,672.226037,250.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
# Clean the descriptor matrix.
# 1) Drop every descriptor column that contains a missing value.
X_descriptors = X_descriptors.dropna(axis='columns')
# 2) Drop every descriptor column that is zero for all molecules.
has_nonzero = X_descriptors.ne(0).any(axis=0)
X_descriptors = X_descriptors.loc[:, has_nonzero]
X_descriptors

Unnamed: 0,0,1,2,3,4,5,6,7,8,14,...,178,180,187,188,189,196,198,200,205,206
0,2.333429,0.164923,2.333429,0.164923,0.479509,310.440,288.264,310.172151,118.0,0.625000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2.402725,-1.216811,2.402725,0.169515,0.418236,326.515,304.339,326.149077,118.0,0.625000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2.314907,0.184489,2.314907,0.184489,0.557853,254.332,240.220,254.109550,94.0,0.500000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,9.420806,0.127855,9.420806,0.127855,0.771337,250.341,232.197,250.135765,96.0,0.842105,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
4,6.449983,-1.769760,6.449983,0.125872,0.408536,364.605,332.349,364.222242,138.0,0.769231,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
348,14.732718,-0.632621,14.732718,0.018773,0.190272,686.852,644.516,686.314458,260.0,0.461538,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
349,13.238844,-0.568127,13.238844,0.058631,0.197023,592.607,568.415,592.163436,218.0,0.555556,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
350,13.429798,-0.577844,13.429798,0.065072,0.163610,660.726,628.470,660.226037,246.0,0.600000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
351,13.560843,-0.600562,13.560843,0.078563,0.176060,672.737,640.481,672.226037,250.0,0.549020,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
# Persist the cleaned dataset-1 descriptor matrix to the working directory.
X_descriptors.to_csv(path_or_buf='X_descriptors.csv')