In [8]:
from rdkit import Chem
from rdkit.Chem import Draw
import os
import glob
import numpy as np
import pandas as pd
import PIL
from random import sample, seed
import csv
import pubchempy as pcp
import matplotlib.pyplot as plt
from itertools import chain
%matplotlib inline
from rdkit.Chem.SaltRemover import SaltRemover
from skmultilearn.cluster.igraph import IGraphLabelGraphClusterer
import igraph as ig

In [9]:
# stuff for exploring the classes
from skmultilearn.model_selection.measures import get_combination_wise_output_matrix
from skmultilearn.dataset import load_dataset
from collections import Counter
from skmultilearn.model_selection import iterative_train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from skmultilearn.cluster import LabelCooccurrenceGraphBuilder

# read pubchem CIDs from files

In [10]:
os.chdir("/home/jgmeyer2/drugclass/data/fromPubChem")

In [11]:
tables_list=glob.glob("*txt")
print(tables_list)
CID_dict = {}
cidslist = []
for table in tables_list:
    tableID = table.replace('.txt','')
    tableID = tableID.replace('../data/','')
    CID_dict[tableID] = []
    with open(table) as inf:
        for aline in inf.readlines():
            if aline.startswith('CID')==True:
                #cid = aline.replace('\n')
                CID_dict[tableID].append(aline.replace('\n','').replace('CID: ',''))
                

print(len(CID_dict[tableID]))
#CID_dict.keys()
print(CID_dict[tableID][0])

['gastrointestinal.txt', 'dermatologic.txt', 'urological.txt', 'cns.txt', 'antiinfective.txt', 'antineoplastic.txt', 'reproductivecontrol.txt', 'lipidregulating.txt', 'hematologic.txt', 'respiratorysystem.txt', 'cardio.txt', 'antiinflammatory.txt']
662
134715169


# get SMILES for CIDs

### keep things with smiles length <400

In [12]:
smiles_dict = {}

for key in CID_dict.keys():
    smiles_dict[key] = []
    prop_dict = pcp.get_properties('IsomericSMILES', CID_dict[key])
    for i in range(0, len(prop_dict)):
        if len(prop_dict[i]['IsomericSMILES'])<400:  #### only those under 200 char
            smiles_dict[key].append(prop_dict[i]['IsomericSMILES'])

                
print(prop_dict[i]["IsomericSMILES"])
print(len(smiles_dict[key]))

C1=CC(=C(C=C1C2C(C(=O)C3=C(C=C(C=C3O2)O)O)O)O)O
656


In [13]:
total=0
for key in CID_dict.keys():
    total+=len(smiles_dict[key])
    print(len(smiles_dict[key]))
print(total)

619
257
73
2025
3367
1868
264
228
540
396
1636
656
11929


# Remove salts

In [15]:
### one molecule in antiinfective is wrong
print(len(smiles_dict['antiinfective']))
smiles_dict['antiinfective'].remove('O=Cl=O')
print(len(smiles_dict['antiinfective']))

3367
3366


In [16]:
remover = SaltRemover()
cln_smiles_dict = {}

for key in smiles_dict.keys():
    tmplist = []
    print(key)
    for smiles in smiles_dict[key]:
        #print(smiles)
        tmplist.append(Chem.MolToSmiles(remover(Chem.MolFromSmiles(smiles))))
    cln_smiles_dict[key] = list(set(tmplist))

gastrointestinal
dermatologic
urological
cns
antiinfective
antineoplastic
reproductivecontrol
lipidregulating
hematologic
respiratorysystem
cardio
antiinflammatory


## do any overlap?


In [18]:
rm = []
for key1 in cln_smiles_dict.keys():
    print(key1+ '__________MAIN_len='+str(len(set(cln_smiles_dict[key1]))))
    for key2 in cln_smiles_dict.keys(): ### loop through the keys and check for overlap
        if key1 != key2: ### first key cannot equal second key
            rm.append(list(set(cln_smiles_dict[key1]) & set(cln_smiles_dict[key2])))
            print(str(key2)+'_'+str(len(list(set(cln_smiles_dict[key1]) & set(cln_smiles_dict[key2])))))

gastrointestinal__________MAIN_len=491
dermatologic_6
urological_0
cns_105
antiinfective_65
antineoplastic_23
reproductivecontrol_3
lipidregulating_12
hematologic_6
respiratorysystem_4
cardio_21
antiinflammatory_38
dermatologic__________MAIN_len=226
gastrointestinal_6
urological_0
cns_3
antiinfective_43
antineoplastic_36
reproductivecontrol_5
lipidregulating_0
hematologic_0
respiratorysystem_5
cardio_0
antiinflammatory_32
urological__________MAIN_len=48
gastrointestinal_0
dermatologic_0
cns_0
antiinfective_7
antineoplastic_0
reproductivecontrol_0
lipidregulating_0
hematologic_1
respiratorysystem_0
cardio_12
antiinflammatory_0
cns__________MAIN_len=1529
gastrointestinal_105
dermatologic_3
urological_0
antiinfective_25
antineoplastic_33
reproductivecontrol_5
lipidregulating_6
hematologic_12
respiratorysystem_82
cardio_157
antiinflammatory_29
antiinfective__________MAIN_len=2923
gastrointestinal_65
dermatologic_43
urological_7
cns_25
antineoplastic_320
reproductivecontrol_5
lipidregulatin

In [19]:
### how many chemicals/class pairs are there?
totalcount = 0
chems = [] 
for key in cln_smiles_dict.keys():
    totalcount += len(set(cln_smiles_dict[key]))
    chems += list(set(cln_smiles_dict[key]))
totalcount

9885

In [20]:
### how many unique chemicals are there?
unique_chems = list(set(chems))
print(len(unique_chems))
unique_chems.remove('')
print(len(unique_chems))
unique_chems

8337
8336


['CC(=O)NC1C(O)OC(CO)C(O)C1O',
 'CCC[C@@]1(CCc2ccccc2)CC(O)=C([C@H](CC)c2cccc(NS(=O)(=O)c3ccc(C(F)(F)F)cn3)c2)C(=O)O1',
 'CCCCC(C)C(=O)OC1C(C)C(CC)OC2(CC3CC(C/C=C(\\C)CC(C)/C=C/C=C4\\COC5C(O)C(C)=CC(C(=O)O3)C45O)O2)C1O',
 'COc1cc2c(c(OC)c1OC)-c1c(cc3c(c1OC)OCO3)C[C@H](C)[C@@](C)(O)C2',
 'CC(=O)N[C@@H](CS)C(=O)[O-]',
 'CC(=O)OCC(=O)C1CCC2C3CCC4CC(O)CCC4(C)C3C(=O)CC12C',
 'CC(=O)Nc1nnc(S(N)(=O)=O)s1',
 'O=C([O-])CCC/C=C\\C[C@H]1[C@H]2CC[C@H](C2)[C@@H]1NS(=O)(=O)c1ccccc1.O=C([O-])CCC/C=C\\C[C@H]1[C@H]2CC[C@H](C2)[C@@H]1NS(=O)(=O)c1ccccc1',
 'OCCN1CCN(CC/C=C2\\c3ccccc3Sc3ccc(Cl)cc32)CC1',
 'C[C@@H]1CCC/C=C/[C@H]2C[C@@H](O)C[C@@H]2[C@@H](O)/C=C\\C(=O)O1',
 'CCC=C(CCC)C(=O)O',
 'CCCCCCC(C)(C)c1cc(O)c2c(c1)OC(C)(C)[C@@H]1CC=C(CO)C[C@@H]21',
 'C[C@H]1[C@@H](O)[C@H](C)/C=C/C=C\\CC/C=C/C=C/C=C/C=C/C(O[C@@H]2O[C@@H](C)[C@H](O)[C@@H](N)[C@H]2O)CC2O[C@@](O)(CC(O)C(O)CCC(O)CC(O)CC(O)CC(=O)O[C@@H]1C)CC(O)C2C(=O)O',
 'CCCCCCCCCCCCCCCCCC[N+](C)(C)CCCCCCCCCCCCCCCCCC',
 'CCN=c1cc2oc3cc(NCC)c(C)cc3c(-c3cc

## Loop through the molecules and look for their membership in each set

In [21]:
### loop through the unique chems
# check if they are in the list under smiles_dict[key]
# if yes, then record key
multiclass_list = [] 
for x in unique_chems:
    tmpclasses = ''
    for key in cln_smiles_dict.keys():
        #print(x in cln_smiles_dict[key])
        if x in cln_smiles_dict[key]:
            tmpclasses+= str(key)
            tmpclasses += ' '
    print(tmpclasses)
    multiclass_list.append(tmpclasses)

dermatologic 
antiinfective 
antiinfective 
antineoplastic 
antiinfective respiratorysystem 
cns 
cns 
hematologic 
cns 
antiinfective 
cns 
gastrointestinal cns cardio 
antiinfective 
antiinfective 
antineoplastic 
antineoplastic 
antineoplastic 
lipidregulating 
cns 
hematologic 
lipidregulating 
antiinflammatory 
hematologic cardio 
cardio 
gastrointestinal 
hematologic 
cardio 
cns 
cardio 
antiinfective antineoplastic 
antiinflammatory 
cns 
antineoplastic 
antineoplastic 
antiinfective 
cns 
antiinfective 
hematologic 
antiinfective 
antineoplastic 
antiinfective 
cardio 
antiinfective 
cardio 
cns 
hematologic cardio 
antiinflammatory 
antiinfective antineoplastic 
antiinfective 
cardio 
antiinfective 
antiinfective 
cardio 
antiinfective 
respiratorysystem 
respiratorysystem 
cns 
cns 
antiinfective antineoplastic 
respiratorysystem 
cns 
cns 
antiinfective 
dermatologic antiinfective antineoplastic 
antineoplastic 
hematologic cardio 
cardio 
antiinfective antineoplastic antii

antineoplastic 
hematologic cardio 
cns 
cardio 
gastrointestinal cns 
antiinfective 
urological 
cns 
antiinfective 
antineoplastic 
antiinfective 
respiratorysystem cardio 
antiinflammatory 
antineoplastic 
gastrointestinal 
reproductivecontrol 
antiinfective 
hematologic 
antiinfective 
cardio 
cardio 
cns 
lipidregulating 
cns 
antiinfective 
antiinfective 
antineoplastic 
dermatologic antineoplastic 
cns 
antiinfective 
cns 
antiinfective 
antiinfective 
cns 
hematologic 
cardio 
antiinfective antineoplastic 
antiinfective 
antiinfective 
lipidregulating 
cns 
antiinfective 
antiinfective 
cns respiratorysystem 
cardio 
antiinfective 
antiinfective 
antineoplastic reproductivecontrol 
gastrointestinal 
antineoplastic 
cns 
antiinflammatory 
antiinfective 
antineoplastic 
antiinfective 
hematologic 
cns respiratorysystem 
antiinfective 
antiinfective 
antiinfective 
cardio 
antiinfective 
antiinfective 
antiinfective 
lipidregulating 
gastrointestinal 
antiinfective 
antineoplastic

gastrointestinal 
cns cardio 
cns antiinfective 
gastrointestinal 
antiinfective 
dermatologic 
antiinfective 
antiinfective 
cardio 
dermatologic 
cns cardio 
gastrointestinal cns 
antiinfective 
respiratorysystem cardio 
cns 
cardio 
cns cardio 
cns respiratorysystem cardio 
antiinfective 
cardio 
antineoplastic 
cns antiinfective 
respiratorysystem antiinflammatory 
antiinfective 
lipidregulating 
lipidregulating 
antiinfective 
cns 
antineoplastic 
dermatologic 
antiinflammatory 
gastrointestinal 
cardio 
cns 
hematologic 
antiinfective 
antiinfective 
antineoplastic 
hematologic 
respiratorysystem 
antiinfective 
antiinfective 
antiinfective 
antiinfective antineoplastic 
antiinfective 
cns 
gastrointestinal 
antiinfective 
cns 
antiinfective 
antiinfective 
antiinfective 
antineoplastic hematologic cardio 
reproductivecontrol 
antineoplastic antiinflammatory 
gastrointestinal 
cns reproductivecontrol 
antiinfective 
antiinfective 
antineoplastic 
antiinfective 
cns 
hematologic 


antiinfective 
antiinfective 
gastrointestinal antineoplastic 
antiinfective 
reproductivecontrol 
cns 
cns 
antiinfective 
gastrointestinal 
cns 
antiinfective 
antiinflammatory 
reproductivecontrol cardio 
dermatologic 
antiinflammatory 
antiinfective antineoplastic antiinflammatory 
antineoplastic 
antiinfective antineoplastic 
antiinfective 
reproductivecontrol 
hematologic 
cardio 
antiinfective 
antiinfective 
hematologic 
cns 
antiinfective 
antiinfective 
gastrointestinal 
cns 
cns 
antineoplastic 
antiinfective 
cardio 
cns 
antiinfective lipidregulating 
antiinfective antineoplastic 
gastrointestinal cns 
cardio 
antiinfective 
cns 
cns respiratorysystem 
antineoplastic 
antiinflammatory 
antiinfective 
cardio 
antiinfective 
antiinfective 
antiinfective 
antiinfective 
antiinfective antineoplastic 
antineoplastic 
antiinfective 
cns 
cns 
antiinfective 
antiinfective 
antineoplastic 
hematologic 
cardio 
gastrointestinal 
antiinfective 
cns 
antiinfective 
hematologic 
cardi

antiinfective 
antiinfective 
cns antiinfective 
antineoplastic 
antiinfective 
antineoplastic 
cardio 
lipidregulating 
cns 
cns 
hematologic cardio 
antineoplastic 
antiinfective 
antiinfective 
antineoplastic 
antiinfective 
antiinflammatory 
lipidregulating 
antiinfective 
cardio 
cns 
dermatologic 
cardio 
antiinfective 
antineoplastic 
cns 
antiinfective antineoplastic 
antineoplastic 
gastrointestinal 
cns 
antiinfective 
antiinfective 
cns antineoplastic 
antiinflammatory 
lipidregulating 
antiinfective 
cardio 
gastrointestinal 
cns 
antiinfective 
antiinflammatory 
antineoplastic 
antineoplastic cardio 
antiinfective 
cns 
antiinfective respiratorysystem antiinflammatory 
cns 
antineoplastic 
antiinfective 
antiinfective 
antineoplastic 
dermatologic 
cardio 
antiinfective 
gastrointestinal 
antiinfective 
antiinfective 
cardio 
cns 
antiinfective 
antiinfective antineoplastic 
cns 
gastrointestinal 
cardio 
antiinfective antineoplastic antiinflammatory 
antiinfective 
hemato

cns 
antiinfective 
antiinfective respiratorysystem antiinflammatory 
antineoplastic 
antiinfective 
antineoplastic 
cns 
antiinfective 
cns hematologic 
antineoplastic 
cardio 
cns 
antiinfective 
gastrointestinal 
antiinfective 
antiinfective 
cns 
cardio 
antiinfective 
antiinfective 
reproductivecontrol 
antiinfective 
hematologic 
antiinfective 
antineoplastic 
antineoplastic 
gastrointestinal antiinfective 
antiinfective 
cardio 
cns cardio 
gastrointestinal 
antiinfective 
cns 
antineoplastic 
antiinfective 
antiinfective 
antiinfective 
antiinflammatory 
antineoplastic cardio 
cns 
hematologic 
antiinfective 
cns 
antiinfective 
antiinfective 
antiinfective 
gastrointestinal cns antiinfective antineoplastic cardio 
antiinfective 
respiratorysystem cardio 
gastrointestinal hematologic cardio antiinflammatory 
cns 
cardio 
antiinfective 
gastrointestinal antiinflammatory 
antiinfective 
lipidregulating 
gastrointestinal antiinfective 
antiinfective 
antiinfective 
cardio 
cardio 

In [22]:
print(len(unique_chems))
print(len(multiclass_list))

8336
8336


# make PNGs of chemicals in unique_chems

In [23]:
# write the files in order from the SMILES dictionary
PATH = '/home/jgmeyer2/drugclass/multiclass_data/pics/'
n = 0
for tmpchem in unique_chems:
    #print(tmpchem)
    Draw.MolToFile(Chem.MolFromSmiles(str(tmpchem)), fileName=PATH+'/'+str(n)+'.png', size=(500, 500), kekulize=True, wedgeBonds=True, imageType="png")
    n +=1

### make pandas dataframe of (1) file name, (2) classes, and (3) SMILES

In [24]:
pngnums = list(range(0, len(unique_chems)))
pngnames = ['pics/'+str(x) for x in pngnums]

In [25]:
df = pd.DataFrame(list(zip(pngnames, multiclass_list, unique_chems)), columns=["image_name", "tags","smiles"])

In [350]:
### randomly take 5% of those
#seed(2148944145)
#testindexes = sample(pngnums, round(len(pngnums)*0.05))

In [351]:
testdf = df.iloc[testindexes]

In [352]:
traindf = df.drop(testindexes)

In [56]:
len(df)

8336

In [354]:
len(testdf)

414

In [355]:
len(traindf)

7861

In [356]:
#testdf.to_csv("/home/jgmeyer2/drugclass/multiclass_data/testdf.csv", index = False)

In [357]:
#traindf.to_csv("/home/jgmeyer2/drugclass/multiclass_data/traindf.csv", index=False)

In [55]:
df['tags'] = [x.strip(' ') for x in df['tags'].tolist()]

In [57]:
df['tags'].tolist()

['dermatologic',
 'antiinfective',
 'antiinfective',
 'antineoplastic',
 'antiinfective respiratorysystem',
 'cns',
 'cns',
 'hematologic',
 'cns',
 'antiinfective',
 'cns',
 'gastrointestinal cns cardio',
 'antiinfective',
 'antiinfective',
 'antineoplastic',
 'antineoplastic',
 'antineoplastic',
 'lipidregulating',
 'cns',
 'hematologic',
 'lipidregulating',
 'antiinflammatory',
 'hematologic cardio',
 'cardio',
 'gastrointestinal',
 'hematologic',
 'cardio',
 'cns',
 'cardio',
 'antiinfective antineoplastic',
 'antiinflammatory',
 'cns',
 'antineoplastic',
 'antineoplastic',
 'antiinfective',
 'cns',
 'antiinfective',
 'hematologic',
 'antiinfective',
 'antineoplastic',
 'antiinfective',
 'cardio',
 'antiinfective',
 'cardio',
 'cns',
 'hematologic cardio',
 'antiinflammatory',
 'antiinfective antineoplastic',
 'antiinfective',
 'cardio',
 'antiinfective',
 'antiinfective',
 'cardio',
 'antiinfective',
 'respiratorysystem',
 'respiratorysystem',
 'cns',
 'cns',
 'antiinfective antin

In [58]:
df.to_csv("/home/jgmeyer2/drugclass/multiclass_data/all_chem_df.csv", index=False)