# ExCAPEDB v1 fix

The original release https://zenodo.org/record/173258#.XECnKVUzZlY contains a number of broken SMILES.

In [60]:
import os,path
import datetime
import urllib.request
import logging
from logging.config import fileConfig
from  rdkit  import  Chem
fileConfig('logging.ini')
%run settings.py
logger= logging.getLogger()
logger.debug('Started at %s \t%s',os.name, datetime.datetime.now())

2019-01-17 19:33:53,553  DEBUG    Started at nt 	2019-01-17 19:33:53.553325


In [118]:
import pandas as pd

class ExCAPEDB():
    def __init__(self,path):
        self.zenodo = "https://zenodo.org/record/173258/files/pubchem.chembl.dataset4publication_inchi_smiles.tsv.xz?download=1"
        self.path = path
        self.file = "{}/pubchem.chembl.dataset4publication_inchi_smiles.tsv.xz".format(path)
        self.id_tag='Original_Entry_ID'
        self.db_tag='DB'
        self.smi_tag='SMILES'
        self.id_index  = None
        self.db_index  = None
        self.smi_index = None
        self.smiles_errors = {}

    def getSmiles(self,line):
        if line is None:
            return None
        return line[self.smi_index]
    
    def getDB(self,line):
        if line is None:
            return None
        db = line[self.db_index]
        if db.startswith("pubchem"):
            db="pubchem"
        return db    
    
    def getIdentifier(self,line):
        if line is None:
            return None
        return line[self.id_index]    
    
    def read(self,process=None,_max_records=100):
        import lzma

        header=[]
        r=0
        with lzma.open(excapedb.file, mode='rt') as file:
            prev_line=None
            for line in file:
                line = line.strip().split("\t")
                if r==0:
                    header=line
                    self.id_index=line.index(self.id_tag)
                    self.db_index=line.index(self.db_tag)
                    self.smi_index=line.index(self.smi_tag)
                    print(line)
                else:    
                    if process is None:
                        print(line[self.id_index])
                        print(line[self.db_index])
                        print(line[self.smi_index])
                    else:
                        process(r,line,prev_line)
                    prev_line=line    
                    if _max_records>0 and r>_max_records:
                        break
                r=r+1        
        
        
    def check_smiles(self,num,line,prev_line=None):
        is_error=False
        _id = self.getIdentifier(line)
        _db = self.getDB(line)
        #we have checked this one already
        if (_id == self.getIdentifier(prev_line)):
            return
        
        try:
            if _id in self.smiles_errors[_db]:
                return
        except:
            pass

        try:
            smiles = self.getSmiles(line)
            m = Chem.MolFromSmiles(smiles)
            if m == None :
                is_error=True
        except ValueError as e:
            is_error=True
            
        if is_error:    
            if not _db in self.smiles_errors:
                self.smiles_errors[_db] = []    
            
            self.smiles_errors[self.getDB(line)].append(_id)  
            
    def error_file(self,db):
        return "{}/errors_{}.txt".format(self.path,db)
        
    def write_errors(self):
        for db in self.smiles_errors:
            tmp=pd.DataFrame({"id" : self.smiles_errors[db]})
            file = self.error_file(db)
            print(file)
            tmp.to_csv(file,index=False,sep="\t")
            
    def read_errors(self):
        for db in ["pubchem","chembl20"]:
            file = self.error_file(db)
            tmp = pd.read_csv(file,sep="\t")            
            self.smiles_errors[db]=tmp["id"].values
    #print(excapedb.getDB(line))
    #print(excapedb.getIdentifier(line))
    #print(excapedb.getSmiles(line))        
#if os.path.isfile(excapedb[""]) 

excapedb = ExCAPEDB(local_path)
if not os.path.isfile(excapedb.file):
    urllib.request.urlretrieve(excapedb.zenodo, excapedb.file)
print(excapedb.file)

G://ChemicalData/EXCAPE/excapedb_fix/pubchem.chembl.dataset4publication_inchi_smiles.tsv.xz


In [98]:
def process(num,line,prev_line=None):
    smiles = excapedb.getSmiles(line)
    print(smiles)
            
    #print(excapedb.getDB(line))
    #print(excapedb.getIdentifier(line))
    #print(excapedb.getSmiles(line))

def retrieve(num,line,prev_line=None):
    url=None
    if "excapedb.getDB(line)" == "chembl20":
        url="https://www.ebi.ac.uk/chembl/api/data/molecule/{}.sdf".format(excapedb.getIdentifier(line))
    else:
        url="https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/{}/record/sdf".format(excapedb.getIdentifier(line))
        
excapedb.read(excapedb.check_smiles,_max_records=-1)
excapedb.write_errors()

['Ambit_InchiKey', 'Original_Entry_ID', 'Entrez_ID', 'Activity_Flag', 'pXC50', 'DB', 'Original_Assay_ID', 'Tax_ID', 'Gene_Symbol', 'Ortholog_Group', 'InChI', 'SMILES']


In [120]:
print(excapedb.smiles_errors)


{'pubchem': array([  664065,  1479464, 11834578, ..., 16031493, 57398070, 22519912],
      dtype=int64), 'chembl20': array(['CHEMBL1498877', 'CHEMBL1598851', 'CHEMBL1428547', ...,
       'CHEMBL1889571', 'CHEMBL2047019', 'CHEMBL1714078'], dtype=object)}


In [119]:
excapedb.read_errors()