In [None]:
import pandas as pd
import torch
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import sys
import os
import pickle

from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import rdmolops

from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split

In [None]:
# Location and file names of the input workbooks read by the loading cell.
data_folder = 'raw_data/'
data_source = '1 Supplemental Excel data.xlsx'
data_source_non = "Non-PBT.xlsx"
# Column names of the exclusion flags; rows are kept below where the flag equals 0.
robcmr = 'Excluded in robustness check 2 - CMR'
robpbt = 'Excluded in robustness check 2 - PBT/vPvB'
# Name of the SMILES column.
sm = 'SMILES'

In [None]:
# Use the GPU when torch can see one, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Main workbook: second sheet (sheet_name=1), column names on the second row (header=1).
chems_data = pd.read_excel(data_folder+data_source, sheet_name=1, header=1)

# Additional compounds: keep only their SMILES and label every row 0.0 in
# 'PBT/vPvB'. The label column is derived from the rows actually read, so it
# always lines up with the SMILES column (the previous hard-coded length of
# 481 produced NaN-padded rows whenever the workbook size differed).
non_pbt = pd.read_excel(data_folder+data_source_non)[['SMILES']].assign(**{"PBT/vPvB": 0.0})


In [None]:
fp_cols = ['SMILES']

# 'CM' is used below; when the workbook does not already provide it, derive it
# as "C or M" (this is the derivation that was previously commented out here).
if 'CM' not in chems_data.columns:
    chems_data['CM'] = np.where((chems_data['C'] == 1) | (chems_data['M'] == 1), 1, 0)

# Rows that are not flagged for exclusion by the respective robustness check.
keep_cmr = chems_data[robcmr] == 0
keep_pbt = chems_data[robpbt] == 0

chems_ext = chems_data.loc[keep_cmr, fp_cols+['CMR']]

chems_maccs = chems_data.loc[keep_pbt, fp_cols+['PBT/vPvB']]

# ignore_index gives the stacked frame unique row labels. Without it the
# labels of `non_pbt` (0..n-1) can repeat labels of `chems_maccs`, and a later
# label-based `drop` would then remove more than one row.
chems_maccs_non = pd.concat([chems_maccs, non_pbt], axis=0, ignore_index=True)

chems_cm = chems_data.loc[keep_cmr, fp_cols+['CM']]
chems_r = chems_data.loc[keep_cmr, fp_cols+['R']]

In [None]:
# Show (rows, columns) of the CMR, PBT/vPvB and extended PBT/vPvB frames.
# (chems_cm and chems_r are intentionally not listed, as before.)
for frame in (chems_ext, chems_maccs, chems_maccs_non):
    print(frame.shape)

In [None]:
# For every dataset print the shape of its positive (label == 1) subset and
# then of its negative (label == 0) subset.
# `chems_r` is selected with the columns ['SMILES', 'R'], so its label column
# is 'R' (indexing it with 'CMR' raised a KeyError).
for frame, label_col in (
    (chems_cm, 'CM'),
    (chems_r, 'R'),
    (chems_ext, 'CMR'),
    (chems_maccs, 'PBT/vPvB'),
    (chems_maccs_non, 'PBT/vPvB'),
):
    for label_value in (1, 0):
        print(frame.loc[frame[label_col] == label_value].shape)
# print(chems_fcfp.loc[chems_fcfp['ED']==1].shape)


In [None]:
# SMARTS definitions of the atom-level features; they are collected in
# `feature_list` and named, in the same order, in `label_list`.

# Carbon_NonPolar = "[C&!$(C=[O,N,P,S])&!$(C#N)]"
# Pattern labelled 'Hydrophobic' in label_list.
Hphobe = "[C&!$(C=[O,N,P,S])&!$(C#N),c,s,S&H0&v2,F,Cl,Br,I]"

# Pattern labelled 'HB acceptor'. The trailing backslashes continue the string
# literal on the next line, so the pattern is one string without line breaks.
Acceptor = "[$([O,S;H1;v2;!$(*-*=[O,N,P,S])]),\
$([O,S;H0;v2]),\
$([O,S;-]),\
$([N;v3;!$(N-*=[O,N,P,S])]),\
n&H0&+0,\
$([o,s;+0;!$([o,s]:n);!$([o,s]:c:n)])]"

# Pattern labelled 'HB donor' (same line-continuation layout as above).
Donor="[$([N;!H0;v3,v4&+1]),\
$([O,S;H1;+0]),\
n&H1&+0]"

# Pattern labelled 'Aromatic'.
Aromatic =   "[a]"

# Order matters: label_list below names the entries position by position.
feature_list = [Acceptor, Donor, Aromatic, Hphobe]
# feature_list = [Acceptor, Donor]

label_list   = ['HB acceptor', 'HB donor', 'Aromatic', 'Hydrophobic']
# tox_list_pbt = ['non-PBT/vPvB','PBT/vPvB']
# tox_list_cmr = ['non-CMR','CMR']

# Table whose 'SMARTS' column is matched against molecules further down
# (in maccs_som and create_representation4).
cyp_metabol_feats = pd.read_excel("output.xls")

In [None]:
# bonds=[]
# for mol_idx in range(chems_maccs.shape[0]):
#     mole = Chem.MolFromSmiles(chems_maccs['SMILES'].iloc[mol_idx])
#     A = Chem.GetAdjacencyMatrix(mole)
#     edge_list = (A != 0).nonzero()
#     for a,b in np.transpose(edge_list):
#         bonds.append(str(mole.GetBondBetweenAtoms(int(a),int(b)).GetBondType()))
# print(list(set(bonds)))

In [None]:
# Import RDKit's MACCSkeys module and print the module object itself
# (its repr), not any fingerprint.
from rdkit.Chem import MACCSkeys
print(MACCSkeys)

In [None]:
# Read one SMARTS pattern per line of MACCS_SMARTS.txt. For every line the
# pattern is the text between the first ":('" and the last "'," on that line.
# The file is opened in a `with` block so the handle is closed after reading.
with open("MACCS_SMARTS.txt", "r") as file1:
    maccs = [
        [idx, line.split(":('")[1].rsplit("',", 1)[0]]
        for idx, line in enumerate(file1)
    ]
maccs = pd.DataFrame(maccs, columns=["idx", "SMARTS"])
print(maccs)

In [None]:
def maccs_som(dataset, maccs_smarts, soms_smarts, feats_idx=None, label_col='PBT/vPvB'):
    """Count substructure features for every molecule in ``dataset``.

    The feature vector of a molecule has one entry per row of ``maccs_smarts``
    followed by one entry per row of ``soms_smarts``:

    * for a ``maccs_smarts`` pattern the entry is the total number of atoms
      over all matches of that pattern (patterns equal to ``'?'`` are skipped
      and stay 0);
    * for a ``soms_smarts`` pattern the entry is the number of matches.

    Parameters
    ----------
    dataset : DataFrame with a ``'SMILES'`` column and a ``label_col`` column.
    maccs_smarts, soms_smarts : DataFrames with a ``'SMARTS'`` column.
    feats_idx : optional array-like of feature positions to keep. When omitted,
        a module-level ``feats_idx`` is used if one exists (this keeps the
        behaviour of earlier notebook sessions); otherwise all features are
        returned, so the function also works on a fresh kernel.
    label_col : name of the label column copied into the output.

    Returns
    -------
    list of ``[smiles, label, 1-D float feature array]``, one per row.
    """
    if feats_idx is None:
        # Compatibility fallback for sessions that set a global `feats_idx`.
        feats_idx = globals().get('feats_idx')

    # Compile every pattern once instead of once per molecule.
    maccs_patterns = [
        None if smarts == '?' else Chem.MolFromSmarts(smarts)
        for smarts in maccs_smarts['SMARTS']
    ]
    som_patterns = [Chem.MolFromSmarts(smarts) for smarts in soms_smarts['SMARTS']]
    n_maccs = len(maccs_patterns)
    n_feats = n_maccs + len(som_patterns)

    ml_data = []
    for mol_idx in range(dataset.shape[0]):
        smiles = dataset['SMILES'].iloc[mol_idx]
        mcule = Chem.MolFromSmiles(smiles)
        mol_feats = np.zeros(n_feats)

        for i, pattern in enumerate(maccs_patterns):
            if pattern is not None:
                # Sum of match sizes, i.e. atoms covered counted per match.
                mol_feats[i] = sum(len(match) for match in mcule.GetSubstructMatches(pattern))

        for i, pattern in enumerate(som_patterns):
            mol_feats[n_maccs + i] = len(mcule.GetSubstructMatches(pattern))

        if feats_idx is not None:
            mol_feats = mol_feats[np.asarray(feats_idx)]
        ml_data.append([smiles, dataset[label_col].iloc[mol_idx], mol_feats])
    return ml_data

In [None]:
#robustness check good
# Start again from the two source frames so that re-running this cell (or a
# later copy of it) always yields the same result instead of dropping three
# more rows each time. ignore_index makes the row labels unique, so each drop
# below removes exactly the row at the given position.
chems_maccs_non = pd.concat([chems_maccs, non_pbt], axis=0, ignore_index=True)
for drop_pos in (506, 751, 794):
    # As before, each position refers to the frame left by the previous drop.
    chems_maccs_non = chems_maccs_non.drop(chems_maccs_non.index[drop_pos])

In [None]:
# One row per molecule; the three unnamed columns (0, 1, 2) hold the three
# items of each record returned by maccs_som.
dat = pd.DataFrame(maccs_som(chems_maccs_non, maccs, cyp_metabol_feats))

In [None]:
# Add up column 2 of `dat` over all rows with the built-in sum and print it.
print(sum(dat[2]))

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Features (column 2) as nested lists of ints, labels (column 1) as an array.
X = dat[2].values
y = dat[1].values
X = [[int(number) for number in group] for group in X]

rfc = RandomForestClassifier(random_state=42)
# 'auto' is no longer listed for max_features: for RandomForestClassifier it
# meant the same as 'sqrt' (so it only duplicated grid points) and recent
# scikit-learn releases reject it.
param_grid = {
    'n_estimators': [200, 500, 1000],
    'max_features': ['sqrt', 'log2'],
    'max_depth': [4, 5, 6, 7, 8, 12, 15],
    'criterion': ['gini', 'entropy']
}
# Exhaustive search over param_grid with 5-fold cross-validation.
CV_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv=5)
CV_rfc.fit(X, y)

In [None]:
# Parameter combination selected by the grid search above.
CV_rfc.best_params_

In [None]:
# NOTE(review): no cell above this one assigns X_test (it is first assigned in
# the cross-validation loops further down), so this fails on a fresh
# top-to-bottom run.
print(len(X_test))

In [None]:
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.svm import SVC

# Development / evaluation split used below. It is created here so the cell
# does not depend on X_train/X_test left behind by other cells.
# NOTE(review): split ratio and seed are a choice made in this cell — adjust
# them if a different protocol is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=0, stratify=y
)

# Set the parameters by cross-validation
tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-2, 1e-3, 1e-4, 1e-5],
                     'C': [1, 10, 100, 1000, 2000]},
                    {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]

scores = ['precision', 'recall']

# One grid search per metric, each scored with the macro-averaged variant.
for score in scores:
    print("# Tuning hyper-parameters for %s" % score)
    print()

    clf = GridSearchCV(
        SVC(), tuned_parameters, scoring='%s_macro' % score
    )
    clf.fit(X_train, y_train)

    print("Best parameters set found on development set:")
    print()
    print(clf.best_params_)
    print()
    print("Grid scores on development set:")
    print()
    means = clf.cv_results_['mean_test_score']
    stds = clf.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, clf.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r"
              % (mean, std * 2, params))
    print()

    print("Detailed classification report:")
    print()
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print()
    y_true, y_pred = y_test, clf.predict(X_test)
    print(classification_report(y_true, y_pred))
    print()
    print()

In [None]:
from sklearn.metrics import balanced_accuracy_score

# Features and labels do not change between repetitions, so they are built
# once here instead of inside the loop (as is the import above).
X = np.array([[int(number) for number in group] for group in dat[2].values])
y = dat[1].values

# 40 repetitions of stratified 10-fold CV; repetition i shuffles with seed i.
# One line is printed per repetition: mean and standard deviation of the
# balanced accuracy over the 10 folds.
for i in range(40):
    skf = StratifiedKFold(n_splits=10, random_state=i, shuffle=True)
    bac = []
    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        clf = SVC(C=1000, kernel='rbf', gamma=1e-4)
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        bac.append(balanced_accuracy_score(y_test, y_pred))
    print(np.mean(bac), np.std(bac))

In [None]:
from sklearn.metrics import balanced_accuracy_score

# Features and labels are the same for every repetition; build them once.
X = np.array([[int(number) for number in group] for group in dat[2].values])
y = dat[1].values

# 2 repetitions of stratified 10-fold CV with a random forest; prints mean and
# standard deviation of the balanced accuracy per repetition.
for i in range(2):
    skf = StratifiedKFold(n_splits=10, random_state=i, shuffle=True)
    bac = []
    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        # max_features='sqrt' is what 'auto' meant for a classifier; 'auto' is
        # rejected by recent scikit-learn releases. The forest is seeded with
        # the repetition number so the printed scores are reproducible.
        clf = RandomForestClassifier(criterion='gini', max_depth=15, max_features='sqrt',
                                     n_estimators=500, random_state=i)
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        bac.append(balanced_accuracy_score(y_test, y_pred))
    print(np.mean(bac), np.std(bac))

In [None]:
# Importances of whichever model `clf` currently refers to.
# NOTE(review): this needs `clf` to be a fitted estimator that exposes
# feature_importances_ (e.g. the random forest from the cell above).
print(clf.feature_importances_)

In [None]:
# NOTE(review): `feature_importances` is only created in the cell that follows
# this one, so this line fails on a fresh top-to-bottom run.
# NOTE(review): that frame is sorted with ascending=False, so [-50:-1] picks 49
# labels from the low-importance end and leaves out the very last one —
# confirm that this is the intended selection.
feats_idx = feature_importances.index[-50:-1]


In [None]:
import pandas as pd

# Single-column frame of clf's importances, ordered from largest to smallest;
# the row labels are the original feature positions.
importance_frame = pd.DataFrame(clf.feature_importances_, columns=['importance'])
feature_importances = importance_frame.sort_values('importance', ascending=False)

feature_importances.plot.barh(figsize=(15,50))
print(feature_importances.index)


In [None]:
# Print the SMARTS of hand-picked feature positions in two groups, each
# followed by the SOM pattern at combined position 191 (191-166 in the SOM table).
for key_idx in (86, 133, 158, 106):
    print(maccs['SMARTS'].iloc[key_idx])
print()
print("SOM", cyp_metabol_feats['SMARTS'].iloc[191-166])
print()
for key_idx in (41, 40, 49, 160, 140, 145, 100, 163, 157, 135, 159, 153):
    print(maccs['SMARTS'].iloc[key_idx])
print()
print("SOM", cyp_metabol_feats['SMARTS'].iloc[191-166])



In [None]:
# Importances of the fitted forest `clf` and their spread across its trees.
# (The per-tree values were previously read from an undefined name `forest`.)
# NOTE(review): `clf` must be a fitted tree ensemble here (it exposes
# estimators_), e.g. the random forest from the repeated-CV cell.
importances = clf.feature_importances_
std = np.std([tree.feature_importances_ for tree in clf.estimators_], axis=0)
# Feature positions ordered from most to least important.
indices = np.argsort(importances)[::-1]

# Print the feature ranking
print("Feature ranking:")

for f in range(X.shape[1]):
    print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))

# Plot the impurity-based feature importances of the forest
plt.figure()
plt.title("Feature importances")
plt.bar(range(X.shape[1]), importances[indices],
        color="r", yerr=std[indices], align="center")
plt.xticks(range(X.shape[1]), indices)
plt.xlim([-1, X.shape[1]])
plt.show()

In [None]:
from sklearn.metrics import balanced_accuracy_score

# One pass over the folds of the current `skf` with the fixed SVC settings;
# prints mean and standard deviation of the per-fold balanced accuracy.
X = np.array(X)
bac = []
for train_index, test_index in skf.split(X, y):
    X_train, y_train = X[train_index], y[train_index]
    X_test, y_test = X[test_index], y[test_index]
    clf = SVC(C=1000, kernel='rbf', gamma=1e-4)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    fold_score = balanced_accuracy_score(y_test, y_pred)
    bac.append(fold_score)
print(np.mean(bac), np.std(bac))

In [None]:
# print(tellen1)

In [None]:
# chems_maccs.loc[chems_maccs['PBT/vPvB']==1].shape

In [None]:
# mol_feats = np.zeros([320,1])
# print(mol_feats)


In [None]:

# tellen2 = np.zeros([71,1])
# for i in range(chems_maccs.loc[chems_maccs['PBT/vPvB']==0].shape[0]):
#     mcule = Chem.MolFromSmiles(chems_maccs.loc[chems_maccs['PBT/vPvB']==0]['SMILES'].iloc[i])

#     feat_cyp = np.zeros((len(list(mcule.GetAtoms())), 71))
#     for i in range(71):
#         cyp_idx_feats = mcule.GetSubstructMatches(Chem.MolFromSmarts(cyp_metabol_feats['SMARTS'].iloc[i]))
#         for c in cyp_idx_feats:
#             tellen2[c] +=1
# tellen2 = 100*(tellen2/381)
# tellen2 = np.zeros([166,1])
# for mol_idx in range(chems_maccs.loc[chems_maccs['PBT/vPvB']==0].shape[0]):
#     mcule = Chem.MolFromSmiles(chems_maccs.loc[chems_maccs['PBT/vPvB']==0]['SMILES'].iloc[mol_idx])

#     for i in range(166):
#         if maccs['SMARTS'].iloc[i] != '?':
#             cyp_idx_feats = mcule.GetSubstructMatches(Chem.MolFromSmarts(maccs['SMARTS'].iloc[i]))
#             for csub in cyp_idx_feats:
#                 for c in csub:
#                     tellen2[i] += 1

In [None]:
# tellen = pd.DataFrame(np.concatenate((tellen1,tellen2), axis=1))

# tellen.plot.barh(figsize=(15,50))

In [None]:
def create_representation1(smiles_data, features, dictionary_bonds, tox_label):
    """Build a bond-labelled adjacency matrix for every molecule.

    Parameters
    ----------
    smiles_data : DataFrame with a ``'SMILES'`` column and a ``tox_label`` column.
    features : not used by this representation.
    dictionary_bonds : maps the string form of an RDKit bond type
        (e.g. ``'SINGLE'``) to the integer stored for that bond.
    tox_label : name of the label column.

    Returns
    -------
    DataFrame with columns ``'matrix'`` (integer adjacency array whose non-zero
    entries are bond codes), ``'atoms'`` (list of atomic numbers) and ``'tox'``.

    Notes
    -----
    Uses ``nx.from_numpy_array`` / ``nx.to_numpy_array``; the ``*_matrix``
    variants no longer exist in NetworkX 3. ``'matrix'`` is therefore a plain
    2-D ``ndarray``.
    """
    matrices_dataset = []

    for mol_idx in range(smiles_data.shape[0]):
        #get molecule from SMILES
        mcule = Chem.MolFromSmiles(smiles_data['SMILES'].iloc[mol_idx])
        #adjacency matrix, molecule in graph and edge list
        adj_matrix = Chem.GetAdjacencyMatrix(mcule)
        mcule_graph = nx.from_numpy_array(adj_matrix)
        # lower triangle only, so every bond is visited once
        edge_list = (np.tril(adj_matrix) != 0).nonzero()

        #get label for each atom as list
        atom_labels = [atom.GetAtomicNum() for atom in mcule.GetAtoms()]

        #add edge label as weight to the graph representation, use dict of bonds
        for atom1, atom2 in np.transpose(edge_list):
            bond_str = str(mcule.GetBondBetweenAtoms(int(atom1), int(atom2)).GetBondType())
            mcule_graph[atom1][atom2]['weight'] = dictionary_bonds.get(bond_str)

        #add adjacency matrix with edge labels, atom labels and label to dataset
        adj_mat = nx.to_numpy_array(mcule_graph).astype(int)
        tox_lab = smiles_data[tox_label].iloc[mol_idx]
        matrices_dataset.append([adj_mat, atom_labels, tox_lab])

    return pd.DataFrame(matrices_dataset, columns=['matrix', 'atoms', 'tox'])

In [None]:
def create_representation2(smiles_data, features, dictionary_bonds, tox_label):
    """Adjacency matrices with one extra node per matched feature.

    On top of the bond-labelled molecular graph, every match of every SMARTS in
    ``features`` adds a new node that is connected to the first atom of the
    match. For the feature at position ``idx`` the connecting edge is weighted
    ``len(dictionary_bonds) + 1 + idx`` and the new node is labelled
    ``120 + idx``.

    Parameters
    ----------
    smiles_data : DataFrame with a ``'SMILES'`` column and a ``tox_label`` column.
    features : sequence of SMARTS strings. Any number of entries is accepted
        (the notebook passes four).
    dictionary_bonds : maps the string form of an RDKit bond type to its code.
    tox_label : name of the label column.

    Returns
    -------
    DataFrame with columns ``'matrix'``, ``'atoms'`` and ``'tox'``.

    Notes
    -----
    Uses ``nx.from_numpy_array`` / ``nx.to_numpy_array``; the ``*_matrix``
    variants no longer exist in NetworkX 3.
    """
    # Compile the feature patterns once; they are the same for every molecule.
    feature_patterns = [Chem.MolFromSmarts(smarts) for smarts in features]
    matrices_dataset = []

    for mol_idx in range(smiles_data.shape[0]):
        #get molecule from SMILES and feature locations list
        mcule = Chem.MolFromSmiles(smiles_data['SMILES'].iloc[mol_idx])
        feature_matches = [mcule.GetSubstructMatches(pattern) for pattern in feature_patterns]

        #adjacency matrix, molecule in graph and edge list
        adj_matrix = Chem.GetAdjacencyMatrix(mcule)
        mcule_graph = nx.from_numpy_array(adj_matrix)
        edge_list = (np.tril(adj_matrix) != 0).nonzero()

        #get label for each atom as list
        atom_labels = [atom.GetAtomicNum() for atom in mcule.GetAtoms()]

        #add edge label as weight to the graph representation, use dict of bonds
        for atom1, atom2 in np.transpose(edge_list):
            bond_str = str(mcule.GetBondBetweenAtoms(int(atom1), int(atom2)).GetBondType())
            mcule_graph[atom1][atom2]['weight'] = dictionary_bonds.get(bond_str)

        #get start of idx for new nodes
        node_count = len(mcule_graph.nodes)

        #add all new features as nodes
        for idx, feat_list in enumerate(feature_matches):
            for feat in feat_list:
                mcule_graph.add_node(node_count)
                mcule_graph.add_edge(feat[0], node_count, weight=(len(dictionary_bonds)+1+idx))
                #add the new nodes as numbers above the current periodic table
                atom_labels.append(120+idx)
                node_count += 1

        #add adjacency matrix with edge labels, atom labels and label to dataset
        adj_mat = nx.to_numpy_array(mcule_graph).astype(int)
        tox_lab = smiles_data[tox_label].iloc[mol_idx]
        matrices_dataset.append([adj_mat, atom_labels, tox_lab])

    return pd.DataFrame(matrices_dataset, columns=['matrix', 'atoms', 'tox'])

In [None]:
def create_representation3(smiles_data, features, dictionary_bonds, tox_label):
    """Adjacency matrices in which every bond becomes a node of its own.

    Each bond between two atoms is removed and replaced by a new node that is
    connected to both atoms. Both new edges carry the bond's code from
    ``dictionary_bonds`` as weight and the new node is labelled
    ``120 + bond code``.

    Parameters
    ----------
    smiles_data : DataFrame with a ``'SMILES'`` column and a ``tox_label`` column.
    features : not used by this representation.
    dictionary_bonds : maps the string form of an RDKit bond type to its code.
    tox_label : name of the label column.

    Returns
    -------
    DataFrame with columns ``'matrix'``, ``'atoms'`` and ``'tox'``.

    Notes
    -----
    Uses ``nx.from_numpy_array`` / ``nx.to_numpy_array``; the ``*_matrix``
    variants no longer exist in NetworkX 3.
    """
    matrices_dataset = []

    for mol_idx in range(smiles_data.shape[0]):
        #get molecule from SMILES
        mcule = Chem.MolFromSmiles(smiles_data['SMILES'].iloc[mol_idx])

        #adjacency matrix, molecule in graph and edge list
        adj_matrix = Chem.GetAdjacencyMatrix(mcule)
        mcule_graph = nx.from_numpy_array(adj_matrix)
        edge_list = (np.tril(adj_matrix) != 0).nonzero()

        #get label for each atom as list
        atom_labels = [atom.GetAtomicNum() for atom in mcule.GetAtoms()]

        #get start of idx for new nodes
        node_count = len(mcule_graph.nodes)

        #replace every bond by a node linked to both of its atoms
        for atom1, atom2 in np.transpose(edge_list):
            bond_str = str(mcule.GetBondBetweenAtoms(int(atom1), int(atom2)).GetBondType())
            bond_code = dictionary_bonds.get(bond_str)
            mcule_graph.remove_edge(atom1, atom2)
            mcule_graph.add_node(node_count)
            mcule_graph.add_edge(atom1, node_count, weight=bond_code)
            mcule_graph.add_edge(atom2, node_count, weight=bond_code)
            atom_labels.append(120+bond_code)
            node_count += 1

        #add adjacency matrix with edge labels, atom labels and label to dataset
        adj_mat = nx.to_numpy_array(mcule_graph).astype(int)
        tox_lab = smiles_data[tox_label].iloc[mol_idx]
        matrices_dataset.append([adj_mat, atom_labels, tox_lab])

    return pd.DataFrame(matrices_dataset, columns=['matrix', 'atoms', 'tox'])

In [None]:
def create_representation4(smiles_data, features, dictionary_bonds, tox_label, cyp_feats=None):
    """Bond-labelled adjacency matrices plus a per-atom SMARTS indicator matrix.

    Besides the outputs of the plain representation, every molecule gets a
    ``'cyps'`` array of shape (number of atoms, number of patterns): entry
    ``[a, p]`` is 1 when atom ``a`` is the first atom of a match of pattern
    ``p``, else 0.

    Parameters
    ----------
    smiles_data : DataFrame with a ``'SMILES'`` column and a ``tox_label`` column.
    features : not used by this representation.
    dictionary_bonds : maps the string form of an RDKit bond type to its code.
    tox_label : name of the label column.
    cyp_feats : optional DataFrame with a ``'SMARTS'`` column. Defaults to the
        module-level ``cyp_metabol_feats``.

    Returns
    -------
    DataFrame with columns ``'matrix'``, ``'atoms'``, ``'tox'`` and ``'cyps'``.

    Notes
    -----
    The width of ``'cyps'`` follows the number of patterns (it was fixed at 71
    before, which breaks for a longer pattern table). Uses
    ``nx.from_numpy_array`` / ``nx.to_numpy_array``; the ``*_matrix`` variants
    no longer exist in NetworkX 3.
    """
    if cyp_feats is None:
        cyp_feats = cyp_metabol_feats
    # Compile the patterns once; they are the same for every molecule.
    cyp_patterns = [Chem.MolFromSmarts(smarts) for smarts in cyp_feats['SMARTS']]
    matrices_dataset = []

    for mol_idx in range(smiles_data.shape[0]):
        #get molecule from SMILES
        mcule = Chem.MolFromSmiles(smiles_data['SMILES'].iloc[mol_idx])

        #adjacency matrix, molecule in graph and edge list
        adj_matrix = Chem.GetAdjacencyMatrix(mcule)
        mcule_graph = nx.from_numpy_array(adj_matrix)
        edge_list = (np.tril(adj_matrix) != 0).nonzero()

        #get label for each atom as list
        atom_labels = [atom.GetAtomicNum() for atom in mcule.GetAtoms()]

        #add edge label as weight to the graph representation, use dict of bonds
        for atom1, atom2 in np.transpose(edge_list):
            bond_str = str(mcule.GetBondBetweenAtoms(int(atom1), int(atom2)).GetBondType())
            mcule_graph[atom1][atom2]['weight'] = dictionary_bonds.get(bond_str)

        #one row per atom, one column per pattern
        feat_cyp = np.zeros((len(atom_labels), len(cyp_patterns)))
        for idx, pattern in enumerate(cyp_patterns):
            for feat in mcule.GetSubstructMatches(pattern):
                feat_cyp[feat[0]][idx] = 1

        #add adjacency matrix with edge labels, atom labels and label to dataset
        adj_mat = nx.to_numpy_array(mcule_graph).astype(int)
        tox_lab = smiles_data[tox_label].iloc[mol_idx]
        matrices_dataset.append([adj_mat, atom_labels, tox_lab, feat_cyp])

    return pd.DataFrame(matrices_dataset, columns=['matrix', 'atoms', 'tox', 'cyps'])

In [None]:
def prepare_onehotencoded(rep_dataset):
    """Replace atom labels by consecutive indices 0..k-1.

    The distinct labels found in ``rep_dataset['atoms']`` are sorted and each
    label is replaced by its position in that sorted order.

    Parameters
    ----------
    rep_dataset : DataFrame with an ``'atoms'`` column holding one list of
        labels per row. The column is overwritten with the index lists.

    Returns
    -------
    (rep_dataset, dict_onehot) where ``dict_onehot`` maps every index back to
    the label it stands for. For a dataset without any atoms the mapping is
    empty.
    """
    atom_lists = list(rep_dataset['atoms'].values)
    atom_set = sorted({atom for atoms in atom_lists for atom in atoms})

    # A single label -> index lookup gives the same result as replacing in two
    # passes through temporary values, without the risk of replacing twice.
    index_of = {atom: idx for idx, atom in enumerate(atom_set)}
    rep_dataset['atoms'] = [[index_of[atom] for atom in atoms] for atoms in atom_lists]

    dict_onehot = dict(enumerate(atom_set))
    return rep_dataset, dict_onehot

In [None]:
def transform_DGL(dataframe):
    """Turn a representation frame into a list of per-molecule dictionaries.

    Parameters
    ----------
    dataframe : DataFrame with columns ``'matrix'``, ``'atoms'`` (lists of
        non-negative integer indices) and ``'tox'``, optionally ``'cyps'``.
        The ``'matrix'`` and ``'atoms'`` columns are converted to torch
        tensors in place.

    Returns
    -------
    list of dict with the keys
        ``'num_atom'``  - number of entries in the molecule's atom list,
        ``'atom_type'`` - NumPy array, one one-hot row per atom (width is the
                          largest index in the whole frame plus one); when a
                          ``'cyps'`` column exists its array is appended
                          column-wise,
        ``'bond_type'`` - the molecule's matrix as a tensor,
        ``'label'``     - the molecule's ``'tox'`` value.
    """
    DGL_list = []
    #convert to correct conventions
    max_atoms = max([max(atoms) for atoms in dataframe['atoms']])
    # Keep the indices as NumPy arrays for the one-hot lookup below: indexing
    # a NumPy array with a one-element torch tensor is treated as a scalar
    # index and would yield a 1-D row for single-atom molecules.
    atom_indices = [np.asarray(atoms, dtype=int).reshape(-1) for atoms in dataframe['atoms']]
    dataframe['matrix'] = dataframe['matrix'].apply(lambda x: torch.tensor(x))
    dataframe['atoms']  = dataframe['atoms'].apply(lambda x: torch.tensor(x))
    one_hot = np.eye(max_atoms+1)
    has_cyps = 'cyps' in dataframe.columns
    #create list of graph info dictionaries
    for i in range(dataframe.shape[0]):
        mol_dict = {}
        mol_dict['num_atom']  = int(len(dataframe['atoms'].iloc[i]))
        onehot_atoms = one_hot[atom_indices[i]]
        if has_cyps:
            onehot_cyps = dataframe['cyps'].iloc[i]
            mol_dict['atom_type'] = np.concatenate((onehot_atoms, onehot_cyps), axis=1)
        else:
            mol_dict['atom_type'] = onehot_atoms
        mol_dict['bond_type'] = dataframe['matrix'].iloc[i]
        mol_dict['label']     = dataframe['tox'].iloc[i]
        DGL_list.append(mol_dict)
    return DGL_list

In [None]:
# bond_dict = {'SINGLE':1, 'DOUBLE':2, 'TRIPLE':3, 'AROMATIC':4}

# MUT_Rep1 = create_representation1(mutag, feature_list, bond_dict, 'PBT/vPvB')
# MUT_Rep1, PBT_Rep1_dict = prepare_onehotencoded(PBT_Rep1)
# MUT_Rep1 = transform_DGL(PBT_Rep1)

# MUT_Rep2 = create_representation2(chems_maccs, feature_list, bond_dict, 'PBT/vPvB')
# MUT_Rep2, PBT_Rep2_dict = prepare_onehotencoded(PBT_Rep2)
# MUT_Rep2 = transform_DGL(PBT_Rep2)

# MUT_Rep3 = create_representation3(chems_maccs, feature_list, bond_dict, 'PBT/vPvB')
# MUT_Rep3, PBT_Rep3_dict = prepare_onehotencoded(PBT_Rep3)
# MUT_Rep3 = transform_DGL(PBT_Rep3)

# MUT_Rep4 = create_representation4(chems_maccs, feature_list, bond_dict, 'PBT/vPvB')
# MUT_Rep4, PBT_Rep4_dict = prepare_onehotencoded(PBT_Rep4)
# MUT_Rep4 = transform_DGL(PBT_Rep4)

In [None]:
# One bar per entry of tellen1. (len(tellen1.reshape(1,-1)) is always 1, so
# the previous call supplied a single x position for all values.)
# NOTE(review): no live cell in this notebook defines `tellen1`; it has to be
# provided before this cell can run.
plt.bar(np.arange(tellen1.size), tellen1.ravel())

In [None]:
# Print tellen1 and tellen2 side by side, one position per line.
for i in range(len(tellen1)):
    left, right = tellen1[i], tellen2[i]
    print(left, right)

In [None]:
# Integer code stored for each RDKit bond type.
bond_dict = dict(SINGLE=1, DOUBLE=2, TRIPLE=3, AROMATIC=4)

# For each of the four representations of the 'CM' data: build the frame,
# re-index its atom labels (keeping the index -> label mapping in *_dict),
# then convert it to the list-of-dicts form.
CM_Rep1, CM_Rep1_dict = prepare_onehotencoded(
    create_representation1(chems_cm, feature_list, bond_dict, 'CM'))
CM_Rep1 = transform_DGL(CM_Rep1)

CM_Rep2, CM_Rep2_dict = prepare_onehotencoded(
    create_representation2(chems_cm, feature_list, bond_dict, 'CM'))
CM_Rep2 = transform_DGL(CM_Rep2)

CM_Rep3, CM_Rep3_dict = prepare_onehotencoded(
    create_representation3(chems_cm, feature_list, bond_dict, 'CM'))
CM_Rep3 = transform_DGL(CM_Rep3)

CM_Rep4, CM_Rep4_dict = prepare_onehotencoded(
    create_representation4(chems_cm, feature_list, bond_dict, 'CM'))
CM_Rep4 = transform_DGL(CM_Rep4)

In [None]:
# Integer code stored for each RDKit bond type.
bond_dict = {'SINGLE':1, 'DOUBLE':2, 'TRIPLE':3, 'AROMATIC':4}

# `chems_r` is selected with the columns ['SMILES', 'R'], so 'R' is the label
# column to read (asking for 'CMR' raised a KeyError on this frame).
R_Rep1 = create_representation1(chems_r, feature_list, bond_dict, 'R')
R_Rep1, R_Rep1_dict = prepare_onehotencoded(R_Rep1)
R_Rep1 = transform_DGL(R_Rep1)

R_Rep2 = create_representation2(chems_r, feature_list, bond_dict, 'R')
R_Rep2, R_Rep2_dict = prepare_onehotencoded(R_Rep2)
R_Rep2 = transform_DGL(R_Rep2)

R_Rep3 = create_representation3(chems_r, feature_list, bond_dict, 'R')
R_Rep3, R_Rep3_dict = prepare_onehotencoded(R_Rep3)
R_Rep3 = transform_DGL(R_Rep3)

R_Rep4 = create_representation4(chems_r, feature_list, bond_dict, 'R')
R_Rep4, R_Rep4_dict = prepare_onehotencoded(R_Rep4)
R_Rep4 = transform_DGL(R_Rep4)

In [None]:
# Integer code stored for each RDKit bond type.
bond_dict = dict(SINGLE=1, DOUBLE=2, TRIPLE=3, AROMATIC=4)


# Four representations of the PBT/vPvB data: build, re-index atom labels
# (mapping kept in *_dict), convert to the list-of-dicts form.
PBT_Rep1, PBT_Rep1_dict = prepare_onehotencoded(
    create_representation1(chems_maccs, feature_list, bond_dict, 'PBT/vPvB'))
PBT_Rep1 = transform_DGL(PBT_Rep1)

PBT_Rep2, PBT_Rep2_dict = prepare_onehotencoded(
    create_representation2(chems_maccs, feature_list, bond_dict, 'PBT/vPvB'))
PBT_Rep2 = transform_DGL(PBT_Rep2)

PBT_Rep3, PBT_Rep3_dict = prepare_onehotencoded(
    create_representation3(chems_maccs, feature_list, bond_dict, 'PBT/vPvB'))
PBT_Rep3 = transform_DGL(PBT_Rep3)

PBT_Rep4, PBT_Rep4_dict = prepare_onehotencoded(
    create_representation4(chems_maccs, feature_list, bond_dict, 'PBT/vPvB'))
PBT_Rep4 = transform_DGL(PBT_Rep4)

# The same four steps for the CMR data.
CMR_Rep1, CMR_Rep1_dict = prepare_onehotencoded(
    create_representation1(chems_ext, feature_list, bond_dict, 'CMR'))
CMR_Rep1 = transform_DGL(CMR_Rep1)

CMR_Rep2, CMR_Rep2_dict = prepare_onehotencoded(
    create_representation2(chems_ext, feature_list, bond_dict, 'CMR'))
CMR_Rep2 = transform_DGL(CMR_Rep2)

CMR_Rep3, CMR_Rep3_dict = prepare_onehotencoded(
    create_representation3(chems_ext, feature_list, bond_dict, 'CMR'))
CMR_Rep3 = transform_DGL(CMR_Rep3)

CMR_Rep4, CMR_Rep4_dict = prepare_onehotencoded(
    create_representation4(chems_ext, feature_list, bond_dict, 'CMR'))
CMR_Rep4 = transform_DGL(CMR_Rep4)

In [None]:
#robustness check good
# Start again from the two source frames so that this cell gives the same
# result however often it (or the identical cell further up) has been run;
# dropping from the already reduced frame removed three additional rows.
# ignore_index makes the row labels unique, so each drop below removes exactly
# the row at the given position.
chems_maccs_non = pd.concat([chems_maccs, non_pbt], axis=0, ignore_index=True)
for drop_pos in (506, 751, 794):
    # As before, each position refers to the frame left by the previous drop.
    chems_maccs_non = chems_maccs_non.drop(chems_maccs_non.index[drop_pos])

In [None]:
# Integer code stored for each RDKit bond type.
bond_dict = dict(SINGLE=1, DOUBLE=2, TRIPLE=3, AROMATIC=4)

# Four representations of the extended PBT/vPvB data: build, re-index atom
# labels (mapping kept in *_dictn), convert to the list-of-dicts form.
PBT_Rep1n, PBT_Rep1_dictn = prepare_onehotencoded(
    create_representation1(chems_maccs_non, feature_list, bond_dict, 'PBT/vPvB'))
PBT_Rep1n = transform_DGL(PBT_Rep1n)

PBT_Rep2n, PBT_Rep2_dictn = prepare_onehotencoded(
    create_representation2(chems_maccs_non, feature_list, bond_dict, 'PBT/vPvB'))
PBT_Rep2n = transform_DGL(PBT_Rep2n)

PBT_Rep3n, PBT_Rep3_dictn = prepare_onehotencoded(
    create_representation3(chems_maccs_non, feature_list, bond_dict, 'PBT/vPvB'))
PBT_Rep3n = transform_DGL(PBT_Rep3n)

PBT_Rep4n, PBT_Rep4_dictn = prepare_onehotencoded(
    create_representation4(chems_maccs_non, feature_list, bond_dict, 'PBT/vPvB'))
PBT_Rep4n = transform_DGL(PBT_Rep4n)

In [None]:
from collections import Counter
def data_desc(dataset, label):
    """Print atom-type statistics for the graphs carrying ``label``.

    ``dataset`` is indexed by position; every entry provides ``'label'``,
    ``'bond_type'`` and ``'atom_type'``. For the matching entries the non-zero
    bond values and the column indices of the non-zero ``'atom_type'`` entries
    are tallied. Printed are the tallies of atom columns 11, 12, 13 and 14
    (one per line) and then the sorted list of all atom columns seen.
    Returns None.
    """
    bond_counter = Counter()
    atom_counter = Counter()
    # Largest / smallest per-graph counts (only used by the disabled prints).
    max_b = max_a = 0
    min_b = min_a = 100000
    for pos in range(len(dataset)):
        graph = dataset[pos]
        if graph['label'] != label:
            continue
        bond_matrix = np.array(graph['bond_type'])
        bond_values = bond_matrix[bond_matrix.nonzero()]
        atom_columns = graph['atom_type'].nonzero()[1]
        bond_counter.update(bond_values)
        atom_counter.update(atom_columns)
        lenb, lena = len(bond_values), len(atom_columns)
        max_b, min_b = max(max_b, lenb), min(min_b, lenb)
        max_a, min_a = max(max_a, lena), min(min_a, lena)
#     print('b',(sum(list(bond_counter.values()))/2))
#     print('a',sum(list(atom_counter.values())))
#     print("a", max_a)
#     print("a", min_a)
#     print(bond_counter)
    for atom_column in (11, 12, 13, 14):
        print(atom_counter[atom_column])
    print(sorted(list(atom_counter.keys())))

In [None]:
# Scratch check: print the matches of each of the first 71 patterns in
# cyp_metabol_feats for the molecule `mcule`.
# NOTE(review): `mcule` is only ever assigned inside functions in this
# notebook, so at cell level it is undefined on a fresh run.
feat_cyp = np.zeros((len(list(mcule.GetAtoms())), 71))
print(feat_cyp[0])
for i in range(71):
    cyp_idx_feats = mcule.GetSubstructMatches(Chem.MolFromSmarts(cyp_metabol_feats['SMARTS'].iloc[i]))
    print(cyp_idx_feats)


In [None]:
from collections import Counter
def prior_desc(dataset, label, check):
    """Count, per checked atom column, the graphs of ``label`` that contain it.

    Parameters
    ----------
    dataset : sequence indexed by position; every entry provides ``'label'``
        and a 2-D ``'atom_type'`` array.
    label : only entries whose ``'label'`` equals this value are counted.
    check : sequence of column indices to look for. Any length is accepted
        (the notebook passes four).

    Returns
    -------
    list with one count per entry of ``check``: the number of matching graphs
    in which that column index occurs among the non-zero ``'atom_type'``
    columns.
    """
    check_list = [0] * len(check)
    for i in range(len(dataset)):
        if dataset[i]['label'] == label:
            atoms = dataset[i]['atom_type'].nonzero()[1]
            for pos, atom_column in enumerate(check):
                if atom_column in atoms:
                    check_list[pos] += 1
    return check_list

In [None]:
print(PBT_Rep2_dictn)

# data_desc(CMR_Rep2, 0)
# print(sum(chems_ext['CMR']==0))
# print()
# data_desc(CMR_Rep2, 1)
# print(sum(chems_ext['CMR']==1))
# print()

# For class 0 and then class 1: percentage of that class's molecules whose
# graph contains atom column 11, 12, 13 and 14 respectively. The two groups of
# four lines are separated by an empty line.
for class_label in (0, 1):
    if class_label == 1:
        print("")
    class_total = sum(chems_maccs_non['PBT/vPvB']==class_label)
    for hits in prior_desc(PBT_Rep2n, class_label, [11,12,13,14]):
        print("%.2f%%" % float(100*(hits/class_total)))

# data_desc(PBT_Rep2n, 0)
# print(sum(chems_maccs_non['PBT/vPvB']==0))
# print()
# data_desc(PBT_Rep2n, 1)
# [Acceptor, Donor, Aromatic, Hphobe]
# print(sum(chems_maccs_non['PBT/vPvB']==1))

In [None]:
# Prints a fixed list of numbers.
# NOTE(review): nothing in this notebook says what these numbers refer to.
print([196, 203, 295, 495, 685, 870, 878])

In [None]:
# Print the SMILES found at the listed row positions of chems_maccs_non.
# selected_positions = [469, 53, 95, 136, 94, 231, 916, 359, 518]
selected_positions = [19, 81, 23, 92, 22, 921, 25]
for smiles in chems_maccs_non['SMILES'].iloc[selected_positions]:
    print(smiles)

In [None]:
# Print the SMILES at two groups of row positions of chems_ext, with an empty
# line between the groups.
first_group = [365, 271, 210, 110, 304]
second_group = [237, 463, 466, 238, 280]
for smiles in chems_ext['SMILES'].iloc[first_group]:
    print(smiles)
print()
for smiles in chems_ext['SMILES'].iloc[second_group]:
    print(smiles)

### print(chems_maccs)

In [None]:
# Write every CM / R representation and its index -> label mapping to
# data/<name>.pickle.
# Each file now receives the object named in its file name; previously the
# CM_Rep2/3/4 files were all written with CM_Rep1. Files are opened in a
# `with` block so they are flushed and closed straight away.
cm_r_outputs = {
    "CM_Rep1": CM_Rep1,
    "CM_Rep2": CM_Rep2,
    "CM_Rep3": CM_Rep3,
    "CM_Rep4": CM_Rep4,
    "CM_Rep1_dict": CM_Rep1_dict,
    "CM_Rep2_dict": CM_Rep2_dict,
    "CM_Rep3_dict": CM_Rep3_dict,
    "CM_Rep4_dict": CM_Rep4_dict,
    "R_Rep1": R_Rep1,
    "R_Rep2": R_Rep2,
    "R_Rep3": R_Rep3,
    "R_Rep4": R_Rep4,
    "R_Rep1_dict": R_Rep1_dict,
    "R_Rep2_dict": R_Rep2_dict,
    "R_Rep3_dict": R_Rep3_dict,
    "R_Rep4_dict": R_Rep4_dict,
}
for stem, payload in cm_r_outputs.items():
    with open("data/" + stem + ".pickle", "wb") as handle:
        pickle.dump(payload, handle)

In [None]:
# Write the extended PBT/vPvB representations and their index -> label
# mappings to data/<name>.pickle. Files are opened in a `with` block so they
# are flushed and closed straight away.
pbtn_outputs = {
    "PBT_Rep1n": PBT_Rep1n,
    "PBT_Rep2n": PBT_Rep2n,
    "PBT_Rep3n": PBT_Rep3n,
    "PBT_Rep4n": PBT_Rep4n,
    "PBT_Rep1_dictn": PBT_Rep1_dictn,
    "PBT_Rep2_dictn": PBT_Rep2_dictn,
    "PBT_Rep3_dictn": PBT_Rep3_dictn,
    "PBT_Rep4_dictn": PBT_Rep4_dictn,
}
for stem, payload in pbtn_outputs.items():
    with open("data/" + stem + ".pickle", "wb") as handle:
        pickle.dump(payload, handle)

In [None]:
# Show the indices present in each mapping: PBT representations 1-4 first,
# then CMR representations 1-4.
for mapping in (
    PBT_Rep1_dict, PBT_Rep2_dict, PBT_Rep3_dict, PBT_Rep4_dict,
    CMR_Rep1_dict, CMR_Rep2_dict, CMR_Rep3_dict, CMR_Rep4_dict,
):
    print(mapping.keys())

In [None]:
# Write the PBT/vPvB and CMR representations and their index -> label mappings
# to data/<name>.pickle. Files are opened in a `with` block so they are
# flushed and closed straight away.
pbt_cmr_outputs = {
    "PBT_Rep1": PBT_Rep1,
    "CMR_Rep1": CMR_Rep1,
    "PBT_Rep2": PBT_Rep2,
    "CMR_Rep2": CMR_Rep2,
    "PBT_Rep3": PBT_Rep3,
    "CMR_Rep3": CMR_Rep3,
    "PBT_Rep4": PBT_Rep4,
    "CMR_Rep4": CMR_Rep4,
    "PBT_Rep1_dict": PBT_Rep1_dict,
    "CMR_Rep1_dict": CMR_Rep1_dict,
    "PBT_Rep2_dict": PBT_Rep2_dict,
    "CMR_Rep2_dict": CMR_Rep2_dict,
    "PBT_Rep3_dict": PBT_Rep3_dict,
    "CMR_Rep3_dict": CMR_Rep3_dict,
    "PBT_Rep4_dict": PBT_Rep4_dict,
    "CMR_Rep4_dict": CMR_Rep4_dict,
}
for stem, payload in pbt_cmr_outputs.items():
    with open("data/" + stem + ".pickle", "wb") as handle:
        pickle.dump(payload, handle)

In [None]:
# for i in false_idx:
#     if chems_maccs.iloc[i]['PBT/vPvB'] == 0:
#         print(chems_maccs.iloc[i]['SMILES'])
# # print(PBT_Rep1[32])

In [None]:
# false_idx = [70, 76, 125, 126, 128, 173, 203, 206, 209, 214, 244, 280, 310, 313, 317]

In [None]:
# chems_maccs.iloc[16]

In [None]:
# print(list(chems_ext['CMR']))

In [None]:
# cmr_list = [214, 262, 342, 357, 87, 196, 468, 469, 806, 114, 470, 247, 256, 280, 352, 356, 482, 784, 25, 198, 303, 323, 360, 752, 78, 131, 138, 178, 249, 345, 164, 272, 348, 350, 825, 80, 88, 161, 181, 235, 298, 312, 490, 512, 783, 849, 192, 234, 760, 817, 836, 0, 238, 240, 291, 337, 885]
# for s in list(np.array(chems_ext['SMILES'].tolist())[cmr_list]):
#     print(s)

In [None]:
# for s in list(np.array(chems_maccs['SMILES'].tolist())[[244, 76, 317, 125, 126, 209, 173, 203, 70, 128, 481, 214, 310, 206, 313, 676, 280]]):
#     print(s)

In [None]:
# print(gr_dict.items())

In [None]:
# Bare list literal (shown as the cell output only). The same numbers are used
# as row positions into chems_maccs_non in an earlier cell.
[19, 81, 23, 92, 22, 921, 25]

In [None]:
# NOTE(review): at cell level `mcule_graph` and `gr_dict` are first assigned in
# the plotting cells below, so this cell fails on a fresh top-to-bottom run.
print(mcule_graph.nodes)
print(gr_dict)

In [None]:
# Colours for 17 nodes: 'r' for node numbers 0-7, 'g' for 8-16.
color_map = []
for node in range(17):
    color_map.append('r' if node < 8 else 'g')

In [None]:
plt.figure(figsize=(7,7))
# gr=29
# gr = 6

# Position of the molecule to draw.
gr = 27
# nx.from_numpy_array replaces nx.from_numpy_matrix, which no longer exists in
# NetworkX 3.
mcule_graph = nx.from_numpy_array(CMR_Rep1[gr]['bond_type'].numpy())
pos = nx.spring_layout(mcule_graph)
# Column index of the non-zero entry of every atom row = the atom's type index.
atoms1 = CMR_Rep1[gr]['atom_type'].nonzero()[1]
gr_dict = {}
for n, idx in enumerate(mcule_graph.nodes()):
    gr_dict[n] = atoms1[idx]
nx.draw_networkx_labels(mcule_graph, pos, labels=gr_dict)
# nx.draw_networkx_edges(mcule_graph, pos)
labels = nx.get_edge_attributes(mcule_graph,'weight')
nx.draw_networkx_edge_labels(mcule_graph,pos,edge_labels=labels)
# NOTE(review): color_map has a fixed length; confirm it matches the number of
# nodes of the graph being drawn.
nx.draw(mcule_graph, pos, node_color=color_map)
plt.show()

In [None]:
plt.figure(figsize=(7,7))
# nx.from_numpy_array replaces nx.from_numpy_matrix, which no longer exists in
# NetworkX 3. `gr` is the molecule position chosen in the previous plot cell.
mcule_graph = nx.from_numpy_array(CMR_Rep2[gr]['bond_type'].numpy())
pos = nx.spring_layout(mcule_graph)
# Column index of the non-zero entry of every atom row = the atom's type index.
atoms1 = CMR_Rep2[gr]['atom_type'].nonzero()[1]
gr_dict = {}
for n, idx in enumerate(mcule_graph.nodes()):
    gr_dict[n] = atoms1[idx]
nx.draw_networkx_labels(mcule_graph, pos,labels=gr_dict)
labels = nx.get_edge_attributes(mcule_graph,'weight')
nx.draw_networkx_edge_labels(mcule_graph,pos,edge_labels=labels)
# nx.draw_networkx_nodes(mcule_graph, pos, node_list=[0,1,2,3,4], node_color="r")
# nx.draw_networkx_edges(mcule_graph,pos)
# NOTE(review): color_map has a fixed length; confirm it matches the number of
# nodes of the graph being drawn.
nx.draw(mcule_graph, pos, node_color=color_map)
plt.show()

In [None]:
plt.figure(figsize=(7,7))
# nx.from_numpy_array replaces nx.from_numpy_matrix, which no longer exists in
# NetworkX 3. `gr` is the molecule position chosen in the first plot cell.
mcule_graph = nx.from_numpy_array(PBT_Rep3n[gr]['bond_type'].numpy())
pos = nx.spring_layout(mcule_graph)
# Column index of the non-zero entry of every atom row = the atom's type index.
atoms1 = PBT_Rep3n[gr]['atom_type'].nonzero()[1]
gr_dict = {}
for n, idx in enumerate(mcule_graph.nodes()):
    gr_dict[n] = atoms1[idx]
nx.draw_networkx_labels(mcule_graph, pos, labels=gr_dict)
labels = nx.get_edge_attributes(mcule_graph,'weight')
nx.draw_networkx_edge_labels(mcule_graph,pos,edge_labels=labels)
# NOTE(review): color_map has a fixed length; confirm it matches the number of
# nodes of the graph being drawn.
nx.draw(mcule_graph, pos, node_color=color_map)
plt.show()

In [None]:
# For PBT and then CMR: shape of the atom_type array of molecule `gr` in
# representation 1 and in representation 4, followed by the total of all
# entries of the representation-4 array.
for rep_plain, rep_cyp in ((PBT_Rep1, PBT_Rep4), (CMR_Rep1, CMR_Rep4)):
    print(rep_plain[gr]['atom_type'].shape)
    cyp_atom_type = rep_cyp[gr]['atom_type']
    print(cyp_atom_type.shape)
    print(sum(sum(cyp_atom_type)))