## Get Pubchem drug feautures

1. finding corresponding Pubchem ids for the drugs 
2. call Pubchem to get chemical properties of the drugs
3. Preprocess text Drug description from the original datasets
4. Preprocess some text characteristics from PubChem properties

In [1]:
import pandas as pd
import numpy as np
import os
#pip install PubChemPy
import pubchempy as pcp
import re
from pubchempy import Compound
import warnings
warnings.filterwarnings("ignore")
import time
import tqdm


In [4]:
# from GDSC database
drug_features = pd.read_csv("GDSC2_drug_features.csv")
drug_features.set_index("DRUG_ID", inplace= True)
drug_features.head()

Unnamed: 0_level_0,Drug_Name,Target,Target_Pathway,PubChem
DRUG_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1834,743380,-,Unclassified,-
1854,MN-64,"TNKS1, TNKS2",WNT signaling,2802462
1909,Venetoclax,BCL2,Apoptosis regulation,49846579
1913,AGI-5198,IDH1 (R132H),Metabolism,56645356
1915,AZD3759,EGFR,EGFR signaling,78209992


## Part 1: Get drug features from PubChempy

In [5]:
%%time

for drug_id in drug_features.index: 
    drug_name = drug_features.loc[drug_id, "Drug_Name"]
    deriv = pcp.get_compounds(drug_name, 'name')
    drug_features.loc[drug_id, "deriv_found"] = len(deriv)
    try:
        drug_features.loc[drug_id, "PubChem_ID"]= re.findall(r'\((.*?)\)', str(deriv))
    except:
        if len(deriv)>1:
            drug_features.loc[drug_id, "PubChem_ID"]= str([np.int(x) for x in re.findall(r'\((.*?)\)', str(deriv))]).strip("[").strip("]")
        else:
            drug_features.loc[drug_id, "PubChem_ID"]= 0

CPU times: user 8.23 s, sys: 1.19 s, total: 9.42 s
Wall time: 7min 41s


In [6]:
drug_features.to_csv("GDSC2_drug_features_PubChem.csv", index=False)

### Manual matching for drugs with missing or mutiple data

In [7]:
# example
error_names_dict={"Lestauritinib": "Lestaurtinib"}
error_name = "Lestauritinib"

# correct the search results
error_drug_index = drug_features[drug_features["Drug_Name"]==error_name].index
correct_drug_name = error_names_dict[error_name]
drug_features.loc[error_drug_index, "Drug_Name"] = correct_drug_name

In [8]:
manual_corrections = {
    "Bleomycin (50 uM)":{"pubchem_id" : 5460769,
               "reference" : "https://www.cancerrxgene.org/compounds"},
    
    "Picolinici-acid": {"pubchem_id" : 1018,
               "reference" : "https://www.cancerrxgene.org/compounds"},
    
    "Nutlin-3a (-)": {"pubchem_id" : 11433190 ,
               "reference" : "https://www.cancerrxgene.org/compounds"},
    
    "KRAS (G12C) Inhibitor-12" : {"pubchem_id" : 73555129,
               "reference" : "https://www.cancerrxgene.org/compounds"},
    
    "BMS-345541": {"pubchem_id" : 9926054,
               "reference" : "https://www.cancerrxgene.org/compounds"},
    
    "Staurosporine": {"pubchem_id" : 44259,
               "reference" : "https://pubchem.ncbi.nlm.nih.gov/compound/44259"},
    
    "GW441756": {"pubchem_id" : 9943465,
               "reference" : "https://www.cancerrxgene.org/compounds"},
    
    "EHT-1864": {"pubchem_id" : 9938202,
               "reference" : "https://www.cancerrxgene.org/compounds"},
    
    "GSK-LSD1": { "pubchem_id": 71522234, 
                  "reference": "https://pubchem.ncbi.nlm.nih.gov/compound/71522234" },
    
    "MK-8776": { "pubchem_id": 16224745, 
                         "reference": "https://www.cancerrxgene.org/compounds"},
    
    "SL0101": {"pubchem_id": 10459196, 
                   "reference": "https://www.cancerrxgene.org/compounds"},
    
    "BIBR-1532": {"pubchem_id": 9927531, 
                "reference": "https://www.cancerrxgene.org/compounds"},
    
    "Dacarbazine": { "pubchem_id": 135398738, 
                   "reference": "https://pubchem.ncbi.nlm.nih.gov/compound/135398738"},
    
    "Sinularin": {"pubchem_id": 5477029, 
                  "reference": "https://pubchem.ncbi.nlm.nih.gov/compound/5477029"},
    
    "Bleomycin": {"pubchem_id": 5360373, 
                  "reference": "https://pubchem.ncbi.nlm.nih.gov/compound/Bleomycin"},
    
    "Dihydrorotenone": { "pubchem_id": 243725, 
                        "reference": "https://pubchem.ncbi.nlm.nih.gov/compound/243725"},
    
    "Vinorelbine": { "pubchem_id": 5311497, 
               "reference": "https://www.cancerrxgene.org/compounds"},
    
    "Vinblastine": {"pubchem_id": 6710780, 
                   "reference": "https://www.cancerrxgene.org/compounds"},
    
    "Obatoclax Mesylate": { "pubchem_id": 16681698, 
                   "reference": "https://pubchem.ncbi.nlm.nih.gov/compound/16681698"},
    
    "Vincristine" : {"pubchem_id": 5978, 
                   "reference": "https://pubchem.ncbi.nlm.nih.gov/compound/5978"},
    
    "Romidepsin": {"pubchem_id": 3425, 
                   "reference": "https://pubchem.ncbi.nlm.nih.gov/compound/Romidepsin"},
    
    "Temsirolimus": {"pubchem_id": 6918289, 
                   "reference": "https://www.cancerrxgene.org/compounds"},
    
    "Bleomycin": {"pubchem_id": 5360373, 
                   "reference": "https://www.drugbank.ca/drugs/DB00290"},
    
    "Vinblastine": {"pubchem_id": 13342, 
                   "reference": "https://www.drugbank.ca/drugs/DB00570"},
    
    
    "THZ-2-102-1" : {"pubchem_id": 146011539, 
                   "reference": "Katjusa Koler's suggestion"},
    
    "THZ-2-49" : {"pubchem_id": 78357763 , 
                   "reference": ["https://www.cancerrxgene.org/compounds", 
                                "https://www.medchemexpress.com/THZ2.html",
                                "https://pubchem.ncbi.nlm.nih.gov/compound/78357763"]},
    
    "QL-XII-47": {"pubchem_id": 71748056, 
                   "reference": "https://lincs.hms.harvard.edu/db/sm/10077-101-1/"},
    
    "BMS-345541" : {"pubchem_id": 9813758, 
                   "reference": ""},
    
    "Temsirolimus" : {"pubchem_id": 23724530, 
                   "reference": "https://www.drugbank.ca/drugs/DB06287"},
           
    }
    

for drug_name in manual_corrections:
    drug_index = drug_features[drug_features["Drug_Name"]==drug_name].index
    drug_features.loc[drug_index, "deriv_found"] = 1
    drug_features.loc[drug_index, "PubChem_ID"]= manual_corrections[drug_name]["pubchem_id"]



In [9]:
drugs_with_pubchem_id = drug_features[drug_features["PubChem_ID"]!=0].index
print("All drugs: %d, With known PubChem_Id: %d" % (drug_features.shape[0], len(drugs_with_pubchem_id)))
#54 without known PubChem_Id

All drugs: 297, With known PubChem_Id: 243


In [10]:
with open("drugs_with_pubchem_id_NEW.txt", 'w') as f:
    for s in drugs_with_pubchem_id:
        f.write(str(s) + '\n')

## Getting properties by PubChem API

In [11]:
%%time
# for i, PubChem_id in tqdm(list(enumerate(drug_features["PubChem_ID"].values))):
for count,PubChem_id in enumerate(drug_features["PubChem_ID"]):
    print(PubChem_id)
    try:
        #drug_index = drug_features[drug_features["PubChem_ID"]==PubChem_id].index
        drug_index = drug_features["PubChem_ID"][count:count+1].index
        #print((drug_features["PubChem_ID"]==PubChem_id).sum())
        print(PubChem_id)
        print(drug_index)
        
        c = Compound.from_cid(PubChem_id)
        #print(c.molecular_weight)
        
        drug_features.loc[drug_index, "molecular_weight"] = c.molecular_weight
        #print(drug_features.loc[drug_index, "molecular_weight"])
        #break
        drug_features.loc[drug_index, "elements"] = str(set(c.elements)).strip("{").strip("}")
        
        bonds = [int(str(i).split(",")[-1].strip(")")) for i in c.bonds]
        drug_features.loc[drug_index, "2bonds"] = bonds.count(2)
        drug_features.loc[drug_index, "3bonds"] = bonds.count(3)

        drug_features.loc[drug_index, "xlogp"] = c.xlogp
        drug_features.loc[drug_index, "formal_charge"] = c.charge
    
        drug_features.loc[drug_index, "surface_area"] = c.tpsa

        drug_features.loc[drug_index, "complexity"] = c.complexity

        drug_features.loc[drug_index, "h_bond_donor_count"] = c.h_bond_donor_count

        drug_features.loc[drug_index, "h_bond_acceptor_count"] = c.h_bond_acceptor_count

        drug_features.loc[drug_index, "rotatable_bond_count"] = c.rotatable_bond_count

        drug_features.loc[drug_index, "heavy_atom_count"] = c.heavy_atom_count

        drug_features.loc[drug_index, "atom_stereo_count"] = c.atom_stereo_count

        drug_features.loc[drug_index, "defined_atom_stereo_count"] = c.defined_atom_stereo_count

        drug_features.loc[drug_index, "undefined_atom_stereo_count"] = c.undefined_atom_stereo_count

        drug_features.loc[drug_index, "bond_stereo_count"] = c.bond_stereo_count

        drug_features.loc[drug_index, "covalent_unit_count"] = c.covalent_unit_count
        drug_features.loc[drug_index, "molecular_formula"] = c.molecular_formula

        drug_features.loc[drug_index, "canonical_smiles"] = c.canonical_smiles

        drug_features.loc[drug_index, "inchi_string"] = c.inchi

        drug_features.loc[drug_index, "inchi_key"] = c.inchikey
        #print(drug_features.head(15))
        #print(drug_features.loc[drug_index, "molecular_weight"])
        #break
    except:
        print("Weid stuff")
        #break
        pass

# # fingerprint
# # Raw padded and hex-encoded fingerprint, as returned by the PUG REST API.

# # cactvs_fingerprint
# # PubChem CACTVS fingerprint.
# # Each bit in the fingerprint represents the presence or absence of one of 881 chemical substructures.
# # More information at ftp://ftp.ncbi.nlm.nih.gov/pubchem/specifications/pubchem_fingerprints.txt


0.0
0.0
Int64Index([1834], dtype='int64', name='DRUG_ID')
Weid stuff
['2802462']
['2802462']
Int64Index([1854], dtype='int64', name='DRUG_ID')
['49846579']
['49846579']
Int64Index([1909], dtype='int64', name='DRUG_ID')
['56645356']
['56645356']
Int64Index([1913], dtype='int64', name='DRUG_ID')
['78209992']
['78209992']
Int64Index([1915], dtype='int64', name='DRUG_ID')
['9825149']
['9825149']
Int64Index([1932], dtype='int64', name='DRUG_ID')
['46931953']
['46931953']
Int64Index([1933], dtype='int64', name='DRUG_ID')
['71297207']
['71297207']
Int64Index([1997], dtype='int64', name='DRUG_ID')
0
0
Int64Index([2011], dtype='int64', name='DRUG_ID')
Weid stuff
['46224516']
['46224516']
Int64Index([2038], dtype='int64', name='DRUG_ID')
['42642645']
['42642645']
Int64Index([2040], dtype='int64', name='DRUG_ID')
['5284616']
['5284616']
Int64Index([1084], dtype='int64', name='DRUG_ID')
['52918385']
['52918385']
Int64Index([1131], dtype='int64', name='DRUG_ID')
['135565082']
['135565082']
Int64Ind

In [12]:
drug_features.head(15)

Unnamed: 0_level_0,Drug_Name,Target,Target_Pathway,PubChem,deriv_found,PubChem_ID,molecular_weight,elements,2bonds,3bonds,xlogp,formal_charge,surface_area,complexity,h_bond_donor_count,h_bond_acceptor_count,rotatable_bond_count,heavy_atom_count,atom_stereo_count,defined_atom_stereo_count,undefined_atom_stereo_count,bond_stereo_count,covalent_unit_count,molecular_formula,canonical_smiles,inchi_string,inchi_key
DRUG_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1
1834,743380,-,Unclassified,-,0.0,0.0,,,,,,,,,,,,,,,,,,,,,
1854,MN-64,"TNKS1, TNKS2",WNT signaling,2802462,1.0,[2802462],264.3,"'H', 'O', 'C'",8.0,0.0,4.7,0.0,26.3,388.0,0.0,2.0,2.0,20.0,0.0,0.0,0.0,0.0,1.0,C18H16O2,CC(C)C1=CC=C(C=C1)C2=CC(=O)C3=CC=CC=C3O2,InChI=1S/C18H16O2/c1-12(2)13-7-9-14(10-8-13)18...,PYTOHIUBXSJKQH-UHFFFAOYSA-N
1909,Venetoclax,BCL2,Apoptosis regulation,49846579,1.0,[49846579],868.4,"'C', 'S', 'N', 'O', 'H', 'Cl'",18.0,0.0,8.2,0.0,183.0,1640.0,3.0,11.0,12.0,61.0,0.0,0.0,0.0,0.0,1.0,C45H50ClN7O7S,CC1(CCC(=C(C1)C2=CC=C(C=C2)Cl)CN3CCN(CC3)C4=CC...,InChI=1S/C45H50ClN7O7S/c1-45(2)15-11-33(39(26-...,LQBVNQSMGBZMKD-UHFFFAOYSA-N
1913,AGI-5198,IDH1 (R132H),Metabolism,56645356,1.0,[56645356],462.6,"'C', 'N', 'O', 'F', 'H'",10.0,0.0,4.9,0.0,67.2,686.0,1.0,4.0,7.0,34.0,1.0,0.0,1.0,0.0,1.0,C27H31FN4O2,CC1=CC=CC=C1C(C(=O)NC2CCCCC2)N(C3=CC(=CC=C3)F)...,InChI=1S/C27H31FN4O2/c1-19-9-6-7-14-24(19)26(2...,FNYGWXSATBUBER-UHFFFAOYSA-N
1915,AZD3759,EGFR,EGFR signaling,78209992,1.0,[78209992],459.9,"'C', 'N', 'O', 'F', 'H', 'Cl'",9.0,0.0,4.1,0.0,79.8,649.0,1.0,8.0,5.0,32.0,1.0,1.0,0.0,0.0,1.0,C22H23ClFN5O3,CC1CN(CCN1C(=O)OC2=C(C=C3C(=C2)C(=NC=N3)NC4=C(...,InChI=1S/C22H23ClFN5O3/c1-13-11-28(2)7-8-29(13...,MXDSJQHFFDGFDK-CYBMUJFWSA-N
1932,NVP-ADW742,IGF1R,IGF1R signaling,9825149,1.0,[9825149],453.6,"'H', 'N', 'O', 'C'",10.0,0.0,4.6,0.0,69.2,645.0,1.0,5.0,7.0,34.0,0.0,0.0,0.0,0.0,1.0,C28H31N5O,C1CCN(C1)CC2CC(C2)N3C=C(C4=C(N=CN=C43)N)C5=CC(...,InChI=1S/C28H31N5O/c29-27-26-25(22-9-6-10-24(1...,LSFLAQVDISHMNB-UHFFFAOYSA-N
1933,P22077,"USP7, USP47",Protein stability and degradation,46931953,1.0,[46931953],315.3,"'C', 'S', 'N', 'O', 'F', 'H'",7.0,0.0,4.1,0.0,116.0,393.0,0.0,7.0,3.0,20.0,0.0,0.0,0.0,0.0,1.0,C12H7F2NO3S2,CC(=O)C1=CC(=C(S1)SC2=C(C=C(C=C2)F)F)[N+](=O)[O-],InChI=1S/C12H7F2NO3S2/c1-6(16)11-5-9(15(17)18)...,RMAMGGNACJHXHO-UHFFFAOYSA-N
1997,WEHI-539,BCL-XL,Apoptosis regulation,71297207,1.0,[71297207],583.7,"'C', 'S', 'N', 'O', 'H'",14.0,0.0,4.4,0.0,179.0,903.0,3.0,10.0,10.0,41.0,0.0,0.0,0.0,1.0,1.0,C31H29N5O3S2,C1CC2=C(C=C(C=C2)C3=NC(=C(S3)CCCOC4=CC=C(C=C4)...,InChI=1S/C31H29N5O3S2/c32-18-19-10-14-22(15-11...,JKMWZKPAXZBYEH-JWHWKPFMSA-N
2011,ICL-SIRT078,SIR2,Other,-,0.0,0,,,,,,,,,,,,,,,,,,,,,
2038,UNC0638,"G9A, GLP",Chromatin histone methylation,-,1.0,[46224516],509.7,"'H', 'N', 'O', 'C'",5.0,0.0,6.3,0.0,62.8,660.0,1.0,7.0,10.0,37.0,0.0,0.0,0.0,0.0,1.0,C30H47N5O2,CC(C)N1CCC(CC1)NC2=NC(=NC3=CC(=C(C=C32)OC)OCCC...,InChI=1S/C30H47N5O2/c1-22(2)35-17-12-24(13-18-...,QOECJCJVIMVJGX-UHFFFAOYSA-N


In [13]:
len(manual_corrections)

25

In [14]:
drugs_with_pubchem_id = drug_features[drug_features["PubChem_ID"]!=0].index
print("All drugs: %d, With known PubChem_Id: %d" % (drug_features.shape[0], len(drugs_with_pubchem_id)))

All drugs: 297, With known PubChem_Id: 243


In [15]:
drug_features.to_csv('drug_features_GDSC2.csv', index=False)

## Preprocessing Text PubChem characteristics

### Presence of some elements (11 elements)

In [16]:
%%time
all_elements = list(set(drug_features["elements"].str.split(",", expand=True).fillna(0).values.flatten())- set([0," 'C'", "'C'", " 'H'"]))
all_elements

elements_in_drugs= list(set([atom.strip(" ").strip("'") for atom in all_elements]))
exceptions =[]
for drug_index in drug_features.index:
    compound_elements = drug_features.loc[drug_index, "elements"]
    print(compound_elements)
    try:
        for i, atom in list(enumerate(elements_in_drugs)):
            if atom in compound_elements:
                drug_features.loc[drug_index, atom] = 1
                print(atom, "Yes")
            else:
                drug_features.loc[drug_index, atom] = 0
                print(atom, "No")
    except:
        exceptions.append(drug_index)
        drug_features.loc[drug_index, atom] = 0

print("Exceptions:", drug_features.loc[exceptions, :].shape[0])
print("Elements in drugs:", len(elements_in_drugs), elements_in_drugs)

nan
'H', 'O', 'C'
P No
Br No
B No
Pt No
S No
N No
Cl No
O Yes
F No
H Yes
I No
'C', 'S', 'N', 'O', 'H', 'Cl'
P No
Br No
B No
Pt No
S Yes
N Yes
Cl Yes
O Yes
F No
H Yes
I No
'C', 'N', 'O', 'F', 'H'
P No
Br No
B No
Pt No
S No
N Yes
Cl No
O Yes
F Yes
H Yes
I No
'C', 'N', 'O', 'F', 'H', 'Cl'
P No
Br No
B No
Pt No
S No
N Yes
Cl Yes
O Yes
F Yes
H Yes
I No
'H', 'N', 'O', 'C'
P No
Br No
B No
Pt No
S No
N Yes
Cl No
O Yes
F No
H Yes
I No
'C', 'S', 'N', 'O', 'F', 'H'
P No
Br No
B No
Pt No
S Yes
N Yes
Cl No
O Yes
F Yes
H Yes
I No
'C', 'S', 'N', 'O', 'H'
P No
Br No
B No
Pt No
S Yes
N Yes
Cl No
O Yes
F No
H Yes
I No
nan
'H', 'N', 'O', 'C'
P No
Br No
B No
Pt No
S No
N Yes
Cl No
O Yes
F No
H Yes
I No
'C', 'N', 'O', 'F', 'H'
P No
Br No
B No
Pt No
S No
N Yes
Cl No
O Yes
F Yes
H Yes
I No
'H', 'N', 'O', 'C'
P No
Br No
B No
Pt No
S No
N Yes
Cl No
O Yes
F No
H Yes
I No
'H', 'N', 'O', 'C'
P No
Br No
B No
Pt No
S No
N Yes
Cl No
O Yes
F No
H Yes
I No
'C', 'N', 'O', 'F', 'H'
P No
Br No
B No
Pt No
S No
N Yes
Cl No

In [17]:
drug_features["Br"].value_counts()

0.0    233
1.0      8
Name: Br, dtype: int64

In [18]:
drug_features

Unnamed: 0_level_0,Drug_Name,Target,Target_Pathway,PubChem,deriv_found,PubChem_ID,molecular_weight,elements,2bonds,3bonds,xlogp,formal_charge,surface_area,complexity,h_bond_donor_count,h_bond_acceptor_count,rotatable_bond_count,heavy_atom_count,atom_stereo_count,defined_atom_stereo_count,undefined_atom_stereo_count,bond_stereo_count,covalent_unit_count,molecular_formula,canonical_smiles,inchi_string,inchi_key,P,Br,B,Pt,S,N,Cl,O,F,H,I
DRUG_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1
1834,743380,-,Unclassified,-,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,0.0,,,,,,,,,,
1854,MN-64,"TNKS1, TNKS2",WNT signaling,2802462,1.0,[2802462],264.3,"'H', 'O', 'C'",8.0,0.0,4.7,0.0,26.3,388.0,0.0,2.0,2.0,20.0,0.0,0.0,0.0,0.0,1.0,C18H16O2,CC(C)C1=CC=C(C=C1)C2=CC(=O)C3=CC=CC=C3O2,InChI=1S/C18H16O2/c1-12(2)13-7-9-14(10-8-13)18...,PYTOHIUBXSJKQH-UHFFFAOYSA-N,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
1909,Venetoclax,BCL2,Apoptosis regulation,49846579,1.0,[49846579],868.4,"'C', 'S', 'N', 'O', 'H', 'Cl'",18.0,0.0,8.2,0.0,183.0,1640.0,3.0,11.0,12.0,61.0,0.0,0.0,0.0,0.0,1.0,C45H50ClN7O7S,CC1(CCC(=C(C1)C2=CC=C(C=C2)Cl)CN3CCN(CC3)C4=CC...,InChI=1S/C45H50ClN7O7S/c1-45(2)15-11-33(39(26-...,LQBVNQSMGBZMKD-UHFFFAOYSA-N,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0
1913,AGI-5198,IDH1 (R132H),Metabolism,56645356,1.0,[56645356],462.6,"'C', 'N', 'O', 'F', 'H'",10.0,0.0,4.9,0.0,67.2,686.0,1.0,4.0,7.0,34.0,1.0,0.0,1.0,0.0,1.0,C27H31FN4O2,CC1=CC=CC=C1C(C(=O)NC2CCCCC2)N(C3=CC(=CC=C3)F)...,InChI=1S/C27H31FN4O2/c1-19-9-6-7-14-24(19)26(2...,FNYGWXSATBUBER-UHFFFAOYSA-N,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0
1915,AZD3759,EGFR,EGFR signaling,78209992,1.0,[78209992],459.9,"'C', 'N', 'O', 'F', 'H', 'Cl'",9.0,0.0,4.1,0.0,79.8,649.0,1.0,8.0,5.0,32.0,1.0,1.0,0.0,0.0,1.0,C22H23ClFN5O3,CC1CN(CCN1C(=O)OC2=C(C=C3C(=C2)C(=NC=N3)NC4=C(...,InChI=1S/C22H23ClFN5O3/c1-13-11-28(2)7-8-29(13...,MXDSJQHFFDGFDK-CYBMUJFWSA-N,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2156,5-azacytidine,DNA methyltransferases,Other,-,1.0,[9444],244.20,"'H', 'N', 'O', 'C'",3.0,0.0,-2.2,0.0,141.0,384.0,4.0,5.0,2.0,17.0,4.0,4.0,0.0,0.0,1.0,C8H12N4O5,C1=NC(=NC(=O)N1C2C(C(C(O2)CO)O)O)N,InChI=1S/C8H12N4O5/c9-7-10-2-12(8(16)11-7)6-5(...,NMUSYJAQQFHJEW-KVTDHHQDSA-N,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
2362,THR-103,Mutant RAS,PI3K/MTOR signaling,,0.0,0,,,,,,,,,,,,,,,,,,,,,,0.0,,,,,,,,,,
1030,KU-55933,ATM,Genome integrity,5278396,1.0,[5278396],395.5,"'C', 'S', 'N', 'O', 'H'",9.0,0.0,3.9,0.0,89.4,643.0,0.0,6.0,2.0,27.0,0.0,0.0,0.0,0.0,1.0,C21H17NO3S2,C1COCCN1C2=CC(=O)C=C(O2)C3=C4C(=CC=C3)SC5=CC=C...,InChI=1S/C21H17NO3S2/c23-14-12-16(25-20(13-14)...,XRKYMMUGXMWDAO-UHFFFAOYSA-N,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0
1129,PF-4708671,S6K1,PI3K/MTOR signaling,51371303,1.0,[51371303],390.4,"'N', 'F', 'H', 'C'",7.0,0.0,3.2,0.0,60.9,510.0,1.0,8.0,4.0,28.0,0.0,0.0,0.0,0.0,1.0,C19H21F3N6,CCC1=CN=CN=C1N2CCN(CC2)CC3=NC4=C(N3)C=C(C=C4)C...,InChI=1S/C19H21F3N6/c1-2-13-10-23-12-24-18(13)...,FBLPQCAQRNSVHB-UHFFFAOYSA-N,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0


In [19]:
drug_features.to_csv("GDSC2_drug_features_with_pubchem_properties.csv")

### Write PubChem names

In [38]:
PubChem_features = ["molecular_weight","2bonds", "3bonds", "xlogp", "formal_charge", 
    "surface_area", "complexity", "h_bond_donor_count", 
    "h_bond_acceptor_count", "rotatable_bond_count",
    "heavy_atom_count", "atom_stereo_count", "defined_atom_stereo_count",
    "undefined_atom_stereo_count", "bond_stereo_count", "covalent_unit_count",
    'B', 'I', 'Br', 'Cl', 'O', 'N', 'F', 'P', 'S', 'Pt']

with open("X_PubChem_properties.txt", 'w') as f:
    for s in PubChem_features:
        f.write(str(s) + '\n')

print("Number of PubChem features:", len(PubChem_features))

Number of PubChem features: 26


## Part 2: Preprocessing Drugs description from original data

In this section, we are going to have some dumnies columns for Target and Target_Pathway

Converting of Target Pathway resulted in 26 new columns

It is also worth considering elements columns and that deleting columns with C and H which are present in all the compounds

### Dumnies for Target (229) and Target_Pathway (23)

In [21]:
drug_features.head(3)

Unnamed: 0_level_0,Drug_Name,Target,Target_Pathway,PubChem,deriv_found,PubChem_ID,molecular_weight,elements,2bonds,3bonds,xlogp,formal_charge,surface_area,complexity,h_bond_donor_count,h_bond_acceptor_count,rotatable_bond_count,heavy_atom_count,atom_stereo_count,defined_atom_stereo_count,undefined_atom_stereo_count,bond_stereo_count,covalent_unit_count,molecular_formula,canonical_smiles,inchi_string,inchi_key,P,Br,B,Pt,S,N,Cl,O,F,H,I
DRUG_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1
1834,743380,-,Unclassified,-,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,0.0,,,,,,,,,,
1854,MN-64,"TNKS1, TNKS2",WNT signaling,2802462,1.0,[2802462],264.3,"'H', 'O', 'C'",8.0,0.0,4.7,0.0,26.3,388.0,0.0,2.0,2.0,20.0,0.0,0.0,0.0,0.0,1.0,C18H16O2,CC(C)C1=CC=C(C=C1)C2=CC(=O)C3=CC=CC=C3O2,InChI=1S/C18H16O2/c1-12(2)13-7-9-14(10-8-13)18...,PYTOHIUBXSJKQH-UHFFFAOYSA-N,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
1909,Venetoclax,BCL2,Apoptosis regulation,49846579,1.0,[49846579],868.4,"'C', 'S', 'N', 'O', 'H', 'Cl'",18.0,0.0,8.2,0.0,183.0,1640.0,3.0,11.0,12.0,61.0,0.0,0.0,0.0,0.0,1.0,C45H50ClN7O7S,CC1(CCC(=C(C1)C2=CC=C(C=C2)Cl)CN3CCN(CC3)C4=CC...,InChI=1S/C45H50ClN7O7S/c1-45(2)15-11-33(39(26-...,LQBVNQSMGBZMKD-UHFFFAOYSA-N,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0


In [22]:
drug_features.dtypes

Drug_Name                       object
Target                          object
Target_Pathway                  object
PubChem                         object
deriv_found                    float64
PubChem_ID                      object
molecular_weight                object
elements                        object
2bonds                         float64
3bonds                         float64
xlogp                          float64
formal_charge                  float64
surface_area                   float64
complexity                     float64
h_bond_donor_count             float64
h_bond_acceptor_count          float64
rotatable_bond_count           float64
heavy_atom_count               float64
atom_stereo_count              float64
defined_atom_stereo_count      float64
undefined_atom_stereo_count    float64
bond_stereo_count              float64
covalent_unit_count            float64
molecular_formula               object
canonical_smiles                object
inchi_string             

In [23]:
targets = ""
for x in drug_features.Target.values:
    targets = targets + ", " + str(x)
targets = list(set(targets.split(", ")[1:]))
print("Number of targets:", len(targets))



Number of targets: 242


In [24]:
df_target = pd.DataFrame(data = np.int32(np.zeros([drug_features.shape[0], len(targets)])), 
                         index = drug_features.index, 
                         columns = targets)

In [25]:
df_target

Unnamed: 0_level_0,MTORC2,NAMPT,AR,KDM1,Antimetabolite,ROS1,RSK,BCL-W,DNA methyltransferases,CDK5,CAPN1,WIP1,IAP,ACVR1B,IKK-2,ADRA1A,FLT3,USP47,JNK2,JNK,PI3Kgamma,UAF1,BCL-XL,SYK,ERBB2,FGFR2,Antimetabolite (DNA & RNA),NTRK1,G-quadruplex stabiliser,SIR2,CECR2,ERK2,KDM4E,BIRC5,ROCK2,DOT1L,AKT,-,FLT2,IR,...,BAZ2A,IAP2,FAK1,IRAK4,HDAC2,PERK,PI3K (class 1),Inflammatory related,ROCK1,IIb,CHEK2,BCL-B,KDM3A,Mutant RAS,IKK,RAC2,Ephrins,ADRB1,IIa,MTORC1,JAK2,PDK1 (PDPK1),PLK2,Broad spectrum kinase inhibitor,KDR,HDAC1,ATM,Proteasome,LRRK2,Pyrimidine synthesis inhibitor,BAZ2B,BRAF,TP53,PAK1,PORCN,TGFBR1,NTRK2,BRD4,ACVR1C,TBK1
DRUG_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
1834,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1854,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1909,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1913,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1915,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2156,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2362,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1030,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1129,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [26]:
for index in drug_features.index:
    targets_i = str(drug_features.loc[index, 'Target']).split(', ')
    df_target.loc[index, targets_i]=1
df_target.shape



(297, 242)

In [27]:
df_target

Unnamed: 0_level_0,MTORC2,NAMPT,AR,KDM1,Antimetabolite,ROS1,RSK,BCL-W,DNA methyltransferases,CDK5,CAPN1,WIP1,IAP,ACVR1B,IKK-2,ADRA1A,FLT3,USP47,JNK2,JNK,PI3Kgamma,UAF1,BCL-XL,SYK,ERBB2,FGFR2,Antimetabolite (DNA & RNA),NTRK1,G-quadruplex stabiliser,SIR2,CECR2,ERK2,KDM4E,BIRC5,ROCK2,DOT1L,AKT,-,FLT2,IR,...,BAZ2A,IAP2,FAK1,IRAK4,HDAC2,PERK,PI3K (class 1),Inflammatory related,ROCK1,IIb,CHEK2,BCL-B,KDM3A,Mutant RAS,IKK,RAC2,Ephrins,ADRB1,IIa,MTORC1,JAK2,PDK1 (PDPK1),PLK2,Broad spectrum kinase inhibitor,KDR,HDAC1,ATM,Proteasome,LRRK2,Pyrimidine synthesis inhibitor,BAZ2B,BRAF,TP53,PAK1,PORCN,TGFBR1,NTRK2,BRD4,ACVR1C,TBK1
DRUG_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
1834,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1854,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1909,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1913,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1915,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2156,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2362,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1030,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1129,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [28]:
print("Number of unique pathways:", drug_features["Target_Pathway"].nunique())

df_target_target_pathway = pd.concat([df_target, pd.get_dummies(drug_features["Target_Pathway"])], axis=1)
df_target_target_pathway.shape

Number of unique pathways: 25


(297, 267)

In [29]:
df_target_target_pathway.to_csv("GDSC2_target_target_pathway_df.csv", index=False)

In [30]:
# only pathway
df_target_pathway = pd.get_dummies(drug_features["Target_Pathway"])
df_target_pathway 

Unnamed: 0_level_0,-,ABL signaling,Apoptosis regulation,Cell cycle,Chromatin histone acetylation,Chromatin histone methylation,Chromatin other,Cytoskeleton,DNA replication,EGFR signaling,ERK MAPK signaling,Genome integrity,Hormone-related,IGF1R signaling,JNK and p38 signaling,Metabolism,Mitosis,Other,"Other, kinases",PI3K/MTOR signaling,Protein stability and degradation,RTK signaling,Unclassified,WNT signaling,p53 pathway
DRUG_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1
1834,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
1854,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
1909,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1913,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
1915,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2156,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
2362,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
1030,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1129,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0


In [31]:
df_target_pathway.to_csv('GDSC2_target_pathway_df.csv')

### Write names of uniques Targets and Target_Pathway

In [32]:
with open("X_features_Targets.txt", 'w') as f:
    for s in targets:
        f.write(str(s) + '\n')
        
with open("X_features_Target_Pathway.txt", 'w') as f:
    for s in drug_features["Target_Pathway"].unique():
        f.write(str(s) + '\n')   

In [33]:
df_target_target_pathway.reset_index()

Unnamed: 0,DRUG_ID,MTORC2,NAMPT,AR,KDM1,Antimetabolite,ROS1,RSK,BCL-W,DNA methyltransferases,CDK5,CAPN1,WIP1,IAP,ACVR1B,IKK-2,ADRA1A,FLT3,USP47,JNK2,JNK,PI3Kgamma,UAF1,BCL-XL,SYK,ERBB2,FGFR2,Antimetabolite (DNA & RNA),NTRK1,G-quadruplex stabiliser,SIR2,CECR2,ERK2,KDM4E,BIRC5,ROCK2,DOT1L,AKT,-,FLT2,...,HDAC1,ATM,Proteasome,LRRK2,Pyrimidine synthesis inhibitor,BAZ2B,BRAF,TP53,PAK1,PORCN,TGFBR1,NTRK2,BRD4,ACVR1C,TBK1,-.1,ABL signaling,Apoptosis regulation,Cell cycle,Chromatin histone acetylation,Chromatin histone methylation,Chromatin other,Cytoskeleton,DNA replication,EGFR signaling,ERK MAPK signaling,Genome integrity,Hormone-related,IGF1R signaling,JNK and p38 signaling,Metabolism,Mitosis,Other,"Other, kinases",PI3K/MTOR signaling,Protein stability and degradation,RTK signaling,Unclassified,WNT signaling,p53 pathway
0,1834,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
1,1854,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
2,1909,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,1913,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
4,1915,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
292,2156,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
293,2362,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
294,1030,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
295,1129,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0


In [34]:
df_target_pathway.reset_index()

Unnamed: 0,DRUG_ID,-,ABL signaling,Apoptosis regulation,Cell cycle,Chromatin histone acetylation,Chromatin histone methylation,Chromatin other,Cytoskeleton,DNA replication,EGFR signaling,ERK MAPK signaling,Genome integrity,Hormone-related,IGF1R signaling,JNK and p38 signaling,Metabolism,Mitosis,Other,"Other, kinases",PI3K/MTOR signaling,Protein stability and degradation,RTK signaling,Unclassified,WNT signaling,p53 pathway
0,1834,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
1,1854,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
2,1909,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,1913,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
4,1915,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
292,2156,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
293,2362,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
294,1030,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
295,1129,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0


In [35]:
with open("X_features_cancer_cell_lines.txt", 'r') as f:
    X_cancer_cell_lines = [line.rstrip('\n') for line in f]

In [36]:
print("Final Features: \n")
print("Cell lines (CCL) features:", len(X_cancer_cell_lines))
print("PubChem drug features:", len(PubChem_features))
print("Drug description features - Targets: %d, Target_Pathway: %d" % (len(targets), drug_features["Target_Pathway"].nunique()))

Final Features: 

Cell lines (CCL) features: 1073
PubChem drug features: 26
Drug description features - Targets: 242, Target_Pathway: 25


In [37]:
all_elements

["'Pt'",
 " 'Cl'",
 " 'I'",
 " 'O'",
 " 'Br'",
 "'N'",
 " 'S'",
 " 'F'",
 "'B'",
 " 'Pt'",
 "'Br'",
 "'H'",
 " 'N'",
 "'P'"]