## Índice
[OBTER SEQUENCIAS](#OBTER-SEQUENCIAS)\
[OBTER SMILES](#OBTER-SMILES)

In [1]:
import pandas as pd
import requests
import os

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [2]:
data = pd.read_table('data/sabio_rk/sabioExport.tsv')

In [3]:
print(data.shape)
print(30*'-')
print(data.isna().sum())
print(30*'-')
print(data.isnull().sum())

(9951, 23)
------------------------------
Enzyme Variant                    0
Enzymename                        0
EntryID                           0
Reaction                          0
ECNumber                          0
UniProtKB_AC                    715
Organism                          0
Temperature                       0
pH                                0
Inhibitor                      8607
KineticMechanismType              0
parameter.name                    0
parameter.type                    0
parameter.associatedSpecies    2114
parameter.startValue           2801
parameter.endValue             9198
parameter.standardDeviation       0
parameter.unit                    0
Product                           0
PubMedID                          0
Publication                       0
SabioReactionID                   1
Substrate                         1
dtype: int64
------------------------------
Enzyme Variant                    0
Enzymename                        0
EntryID       

In [4]:
(data['parameter.name'].nunique())

58

In [5]:
data['parameter.name'] = data['parameter.name'].astype(str).str.lower()
data['parameter.name'].nunique()

55

In [6]:
# Primeiro: separa só o que interessa pro pivot
parametros = data[['EntryID', 'parameter.name', 'parameter.startValue']]

# Faz o pivot normalmente
pivoted_params = (
    parametros.pivot_table(
        index='EntryID',
        columns='parameter.name',
        values='parameter.startValue',
        aggfunc='first'
    )
    .reset_index()
)

In [7]:
pivoted_params.head()

parameter.name,EntryID,a,act,b,c,co,e,i,i1,i2,ic50,ic50_clavulanic_acid,ic50_sulbactam,ic50_tazobactam,ic_50,k,k1,k2,k2_k_2,k3,k4,k_2,k_inact,kcat,kcat/km,kcat_a,kcat_b,kcat_c,kcat_d,kcat_ki,kcat_kinact,kcat_km,kd,ki,ki_lim,kinact,kinact_ki,kistar,km,kma,kmax,kmb,kmc,kmd,kminus1,kminus2,ks,ksi,m,s,v,vmax,vmax_km,zn
0,247,,,,,,2e-08,,,,,,,,,,,,,,,,,5.9,1.6e-05,,,,,,,,,,,,,,3.6e-05,,,,,,,,,,,2e-06,,,,
1,248,,,,,,2e-08,,,,,,,,,,,,,,,,,11.7,6e-05,,,,,,,,,,,,,,1.9e-05,,,,,,,,,,,2e-06,,,,
2,249,,,,,,2e-08,,,,,,,,,,,,,,,,,3.7,4e-06,,,,,,,,,,,,,,9.6e-05,,,,,,,,,,,2e-06,,,,
3,250,,,,,,2e-08,,,,,,,,,,,,,,,,,5.0,5e-06,,,,,,,,,,,,,,9.7e-05,,,,,,,,,,,2e-06,,,,
4,251,,,,,,2e-08,,,,,,,,,,,,,,,,,11.5,890.0,,,,,,,,,,,,,,1e-06,,,,,,,,,,,2e-06,,,,


In [8]:
pivoted_params.shape

(2337, 54)

In [9]:
# Depois junta de volta com as outras informações (pH, temperatura etc.)
# Usamos 'first' pra não duplicar caso tenha múltiplas medições por EntryID
meta = data.groupby('EntryID', as_index=False).agg({
    'EntryID': 'first',
    'pH': 'first',
    'Temperature': 'first',
    # 'kcat_km': 'first',
    # 'kcat': 'first',
    # 'km': 'first',
    # 'kcat/km': 'first',
    'Organism': 'first',
    'Enzyme Variant': 'first',
    'Enzymename': 'first',
    'Substrate': 'first',
    'Reaction': 'first',
    'Product': 'first',
    'PubMedID': 'first',
    'Publication': 'first',
    'SabioReactionID': 'first',
    'UniProtKB_AC':'first',
    'parameter.associatedSpecies':'first',
})

# Junta os dados pivoteados com as demais informações
df_final = meta.merge(pivoted_params, on='EntryID', how='left')

In [10]:
df_final.isna().sum()

EntryID                           0
pH                                0
Temperature                       0
Organism                          0
Enzyme Variant                    0
Enzymename                        0
Substrate                         1
Reaction                          0
Product                           0
PubMedID                          0
Publication                       0
SabioReactionID                   1
UniProtKB_AC                    213
parameter.associatedSpecies      12
a                              1677
act                            2336
b                              2331
c                              2154
co                             2321
e                              2041
i                              2261
i1                             2307
i2                             2307
ic50                           2202
ic50_clavulanic_acid           2334
ic50_sulbactam                 2334
ic50_tazobactam                2334
ic_50                       

In [11]:
df_final = df_final[['EntryID',
                    'pH',
                    'Temperature',
                    'kcat_km',
                    'kcat',
                    'km',
                    'kcat/km',
                    'Organism',
                    'Enzyme Variant',
                    'Enzymename',
                    'Substrate',
                    'Reaction',
                    'Product',
                    'PubMedID',
                    'Publication',
                    'SabioReactionID',
                    'UniProtKB_AC',
                    'parameter.associatedSpecies']]

In [12]:
df_final.isna().sum()

EntryID                           0
pH                                0
Temperature                       0
kcat_km                         870
kcat                            739
km                              618
kcat/km                        2299
Organism                          0
Enzyme Variant                    0
Enzymename                        0
Substrate                         1
Reaction                          0
Product                           0
PubMedID                          0
Publication                       0
SabioReactionID                   1
UniProtKB_AC                    213
parameter.associatedSpecies      12
dtype: int64

In [13]:
df_final[(df_final['kcat'].isna()) & (df_final['km'].isna())].shape

(544, 18)

In [103]:
df_final.to_csv('data/sabio_rk/composite_data_sabiork.csv')

In [14]:
df_final.shape

(2338, 18)

### XML

In [92]:
d = 'data/sabio_rk/xml2'
os.makedirs(d, exist_ok=True)

In [None]:
# DONE
for entry_id in data.EntryID:
    url = f"https://sabiork.h-its.org/sabioRestWebServices/kineticLaws/{entry_id}"
    params = {"format": "sbml"}
    response = requests.get(url, params=params)
    if response.status_code == 200:
        with open(f"{d}/{entry_id}.xml", "w", encoding="utf-8") as f:
            f.write(response.text)
    else:
        print(f"Erro no ID {entry_id}: {response.status_code}")

### Extração

In [90]:
import xml.etree.ElementTree as ET

#path = f"{d}/247.xml"
XML_DIR = d

# Namespaces usados no SBML
NS = {
    'sbml': 'http://www.sbml.org/sbml/level3/version1/core',
    'rdf': 'http://www.w3.org/1999/02/22-rdf-syntax-ns#',
    'sbrk': 'http://sabiork.h-its.org',
}

def parse_sbml(filepath):
    """Lê um arquivo SBML do SABIO-RK e extrai parâmetros principais"""
    tree = ET.parse(filepath)
    root = tree.getroot()

    # Identificação básica
    entry_id = os.path.basename(filepath).replace(".xml", "")
    model = root.find(".//sbml:model", NS)
    model_name = model.attrib.get("name") if model is not None else None

    # --- ENZIMA ---
    enzyme_name, uniprot_id = None, None
    for species in root.findall(".//sbml:species", NS):
        name = species.attrib.get("name", "")
        if "lactamase" in name.lower():
            enzyme_name = name
            # procurar UniprotID
            for li in species.findall(".//rdf:li", NS):
                link = li.attrib.get("rdf:resource", "")
                if "uniprot" in link:
                    uniprot_id = link.split("/")[-1]
                    break

    # --- SUBSTRATO ---
    substrate_name = None
    for species in root.findall(".//sbml:species", NS):
        name = species.attrib.get("name", "")
        if "enzyme" not in name.lower():
            # pega o primeiro que não for enzima
            substrate_name = name
            break

    # --- PARÂMETROS CINÉTICOS ---
    kcat = km = kcat_km = None
    for param in root.findall(".//sbml:localParameter", NS):
        pname = param.attrib.get("name", "").lower()
        value = float(param.attrib.get("value", "nan"))
        if "kcatdivkm" in pname or "kcat/km" in pname:
            kcat_km = value
        elif pname.startswith("km"):
            km = value
        elif pname.startswith("kcat"):
            kcat = value

    # --- CONDIÇÕES EXPERIMENTAIS ---
    temperature = None
    ph = None
    for val in root.findall(".//sbrk:startValueTemperature", NS):
        try:
            temperature = float(val.text)
        except:
            pass
    for val in root.findall(".//sbrk:startValuepH", NS):
        try:
            ph = float(val.text)
        except:
            pass

    # --- REFERÊNCIA (PubMed ID) ---
    pubmed_id = None
    for li in root.findall(".//rdf:li", NS):
        link = li.attrib.get("rdf:resource", "")
        if "pubmed" in link:
            pubmed_id = link.split("/")[-1]
            break

    return {
        "entry_id": entry_id,
        "model_name": model_name,
        "enzyme_name": enzyme_name,
        "UniProtKB_AC": uniprot_id,
        "substrate_name": substrate_name,
        "kcat": kcat,
        "Km": km,
        "kcat/Km": kcat_km,
        "temperature_C": temperature,
        "pH": ph,
        "pubmed_id": pubmed_id,
    }

# --- Ler todos os XMLs ---
data = []
for fname in os.listdir(XML_DIR):
    if fname.endswith(".xml"):
        filepath = os.path.join(XML_DIR, fname)
        try:
            info = parse_sbml(filepath)
            data.append(info)
        except Exception as e:
            print(f"Erro ao processar {fname}: {e}")

df = pd.DataFrame(data)

In [91]:
df

Unnamed: 0,entry_id,model_name,enzyme_name,UniProtKB_AC,substrate_name,kcat,Km,kcat/Km,temperature_C,pH,pubmed_id
0,10292,SABIOmdl10Nov2025755,beta-lactamase(Enzyme) mutant H99N CcrA3,,H2O,0.0018,0.000150,,25.0,7.2,
1,10293,SABIOmdl10Nov2025755,beta-lactamase(Enzyme) mutant C181S CcrA3,,H2O,0.6500,0.000170,,25.0,7.2,
2,10294,SABIOmdl10Nov2025755,beta-lactamase(Enzyme) mutant D103N CcrA3,,H2O,0.0770,0.000510,,25.0,7.2,
3,10295,SABIOmdl10Nov2025755,beta-lactamase(Enzyme) mutant delta 46-51 CcrA3,,Penicillin G,0.0024,0.000630,,25.0,7.2,
4,10296,SABIOmdl10Nov2025755,beta-lactamase(Enzyme) mutant C104R CcrA3,,Penicillin G,0.6300,0.000085,,25.0,7.2,
...,...,...,...,...,...,...,...,...,...,...,...
1486,9708,SABIOmdl10Nov2025755,beta-lactamase(Enzyme) mutant D120S,,Cefoxitin,6400.0000,0.000053,,25.0,7.0,
1487,9709,SABIOmdl10Nov2025755,beta-lactamase(Enzyme) mutant D120S,,Cephalothin,8000.0000,0.000046,,25.0,7.0,
1488,9710,SABIOmdl10Nov2025755,beta-lactamase(Enzyme) mutant D120S,,Nitrocefin,0.0280,0.000103,,25.0,7.0,
1489,9711,SABIOmdl10Nov2025755,beta-lactamase(Enzyme) mutant D120S,,H2O,,,,25.0,7.0,


In [62]:
# print('Qt amostras: ', df.shape[0])
# print('kcat E km nulos: ', df[(df['kcat'].isnull() & (df['Km'].isnull()))].shape[0])
# print('kcat NULO e km NÃO nulo: ', df[~(df['Km'].isnull()) & (df['kcat'].isnull())].shape[0])
# print('km NULO e kcat NÃO nulo: ', df[(df['Km'].isnull()) & ~(df['kcat'].isnull())].shape[0])
# print('kcat OU km nulos: ', 169+74)
df.to_csv('data/sabio_rk/dados_sabiork_xml.csv')

In [57]:
df.isnull().sum()

entry_id             0
model_name           0
enzyme_name          3
uniprot_id        1491
substrate_name       0
kcat               279
Km                 184
kcat/Km           1463
temperature_C       34
pH                  63
pubmed_id         1491
dtype: int64

#### A retirar: 280
* 243 sem kcat ou km
* 34 + 63 - 32 = 65 sem pH ou temperature_C
* 3 enzimas que não dá pra pegar outras informações porque não tem nome
#### Informações a obter
* Sequência das enzimas
* Classificação das enzimas
* SMILES dos substratos

In [66]:
#sequencia
def get_uniprot_from_pubmed(pubmed_id):
    # 1. Buscar o título do artigo
    pubmed_url = f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi"
    params = {"db": "pubmed", "id": pubmed_id, "retmode": "json"}
    r = requests.get(pubmed_url, params=params)
    title = r.json()['result'][str(pubmed_id)]['title']

    # 2. Tentar localizar em UniProt
    query = f'"{title}" beta-lactamase'
    url = f"https://rest.uniprot.org/uniprotkb/search?query={query}&format=json&fields=accession,protein_name,sequence"
    r2 = requests.get(url)
    data = r2.json()

    if data.get('results'):
        first_hit = data['results'][0]
        acc = first_hit['primaryAccession']
        seq = first_hit['sequence']['value']
        return acc, seq
    return None, None

# smiles
def get_smiles(substrate_name):
    url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/{substrate_name}/property/CanonicalSMILES/JSON"
    r = requests.get(url)
    if r.status_code == 200:
        try:
            return r.json()['PropertyTable']['Properties'][0]['CanonicalSMILES']
        except:
            return None
    return None

#classe
def classify_beta_lactamase(protein_name):
    name = protein_name.upper()
    if "CLASS A" in name or any(tag in name for tag in ["TEM", "CTX", "SHV", "PER", "KPC"]):
        return "A"
    if "CLASS B" in name or any(tag in name for tag in ["NDM", "VIM", "IMP", "CphA"]):
        return "B"
    if "CLASS C" in name or "AMP" in name:
        return "C"
    if "CLASS D" in name or any(tag in name for tag in ["OXA"]):
        return "D"
    return None


In [None]:
seqs = []
for pmd_id in df["pubmed_id"]:
    acc, seq = get_uniprot_from_pubmed(pmd_id)
    seqs.append({"pubmed_id": pmd_id, "uniprot": acc, "sequence": seq})


In [88]:
df['kcat/Km'] = df['kcat'] / df['Km']
df['kcat/Km'].isna().sum()

np.int64(353)

In [89]:
df

Unnamed: 0,entry_id,model_name,enzyme_name,uniprot_id,substrate_name,kcat,Km,kcat/Km,temperature_C,pH,pubmed_id
0,10292,SABIOmdl10Nov2025755,beta-lactamase(Enzyme) mutant H99N CcrA3,,H2O,0.0018,0.000150,1.200000e+01,25.0,7.2,
1,10293,SABIOmdl10Nov2025755,beta-lactamase(Enzyme) mutant C181S CcrA3,,H2O,0.6500,0.000170,3.823529e+03,25.0,7.2,
2,10294,SABIOmdl10Nov2025755,beta-lactamase(Enzyme) mutant D103N CcrA3,,H2O,0.0770,0.000510,1.509804e+02,25.0,7.2,
3,10295,SABIOmdl10Nov2025755,beta-lactamase(Enzyme) mutant delta 46-51 CcrA3,,Penicillin G,0.0024,0.000630,3.809524e+00,25.0,7.2,
4,10296,SABIOmdl10Nov2025755,beta-lactamase(Enzyme) mutant C104R CcrA3,,Penicillin G,0.6300,0.000085,7.411765e+03,25.0,7.2,
...,...,...,...,...,...,...,...,...,...,...,...
1486,9708,SABIOmdl10Nov2025755,beta-lactamase(Enzyme) mutant D120S,,Cefoxitin,6400.0000,0.000053,1.207547e+08,25.0,7.0,
1487,9709,SABIOmdl10Nov2025755,beta-lactamase(Enzyme) mutant D120S,,Cephalothin,8000.0000,0.000046,1.739130e+08,25.0,7.0,
1488,9710,SABIOmdl10Nov2025755,beta-lactamase(Enzyme) mutant D120S,,Nitrocefin,0.0280,0.000103,2.718447e+02,25.0,7.0,
1489,9711,SABIOmdl10Nov2025755,beta-lactamase(Enzyme) mutant D120S,,H2O,,,,25.0,7.0,


# OBTER SEQUENCIAS

In [13]:
data_rk = pd.read_csv('data/sabio_rk/composite_data_sabiork.csv')

In [85]:
data_rk.isna().sum()

Unnamed: 0                        0
EntryID                           0
pH                                0
Temperature                       0
kcat_km                         870
kcat                            739
km                              618
kcat/km                        2299
Organism                          0
Enzyme Variant                    0
Enzymename                        0
Substrate                         1
Reaction                          0
Product                           0
PubMedID                          0
Publication                       0
SabioReactionID                   1
UniProtKB_AC                    213
parameter.associatedSpecies      12
dtype: int64

In [86]:
data_rk.nunique()

Unnamed: 0                     2338
EntryID                        2338
pH                               30
Temperature                      11
kcat_km                         664
kcat                            692
km                              591
kcat/km                          37
Organism                         37
Enzyme Variant                  198
Enzymename                        1
Substrate                       177
Reaction                        105
Product                          10
PubMedID                         89
Publication                      90
SabioReactionID                 105
UniProtKB_AC                     72
parameter.associatedSpecies     150
dtype: int64

In [87]:
pd.read_csv('data/sabio_rk/dados_sabiork_xml.csv').isna().sum()

Unnamed: 0           0
entry_id             0
model_name           0
enzyme_name          3
uniprot_id        1491
substrate_name       0
kcat               279
Km                 184
kcat/Km           1463
temperature_C       34
pH                  63
pubmed_id         1491
dtype: int64

In [14]:
df = data_rk.copy()

uniprot_ids = df['UniProtKB_AC'].dropna().unique()

def get_uniprot_sequence(uniprot_id):
    url = f"https://www.ebi.ac.uk/proteins/api/proteins/{uniprot_id}"
    try:
        response = requests.get(url)
        if response.status_code == 200:
            data = response.json()
            return data['sequence']['sequence']
    except:
        return None

sequences = {}
for uniprot_id in uniprot_ids:
    seq = get_uniprot_sequence(uniprot_id)
    if seq:
        sequences[uniprot_id] = seq

In [89]:
# df['sequence'] = df['UniProtKB_AC'].map(sequences)
# df.to_csv('data/sabio_rk/composite_data_sabiork.csv')

# OBTER SMILES

In [71]:
[linha for linha in df['Substrate'] if 'H2O' not in str(linha)]

['Substituted beta-amino acid',
 'Substituted beta-amino acid',
 'Substituted beta-amino acid',
 'Substituted beta-amino acid',
 'Substituted beta-amino acid',
 'Substituted beta-amino acid',
 nan]

In [4]:
import urllib.parse
from time import sleep

# separar os componentes
def separate_substrate_components(compound_string):
    """Separa o substrato principal do solvente/co-substrato"""
    if pd.isna(compound_string):
        return None, None
    
    compounds = [c.strip() for c in str(compound_string).split(';')]
    
    # Identifica H2O e outros solventes
    solvents = []
    main_substrates = []
    
    for comp in compounds:
        comp_lower = comp.lower()
        if comp_lower in ['h2o', 'water', 'h₂o']:
            solvents.append(comp)
        else:
            main_substrates.append(comp)
    
    # Retorna o principal substrato e os solventes
    main_substrate = main_substrates[0] if main_substrates else None
    solvent = ';'.join(solvents) if solvents else None
    
    return main_substrate, solvent

In [31]:
df[['main_substrate', 'solvent']] = df['Substrate'].apply(
    lambda x: pd.Series(separate_substrate_components(x))
)

In [17]:
df.drop(['Unnamed: 0','Unnamed: 0.1'], axis=1, inplace=True)
df.head()

Unnamed: 0,EntryID,pH,Temperature,kcat_km,kcat,km,kcat/km,Organism,Enzyme Variant,Enzymename,Substrate,Reaction,Product,PubMedID,Publication,SabioReactionID,UniProtKB_AC,parameter.associatedSpecies,sequence
0,247,7.4,22.0,,5.9,3.6e-05,1.6e-05,Treponema pallidum,wildtype,beta-lactamase,Penicillin G;H2O,Penicillin G + H2O = Substituted beta-amino acid,Substituted beta-amino acid,14747460,"Cha JY, Ishiwata A, Mobashery S: A novel beta-...",4016.0,P29723,Penicillin G,MKVKYALLSAGALQLLVVGCGSSHHETHYGYATLSYADYWAGELGQ...
1,248,7.4,22.0,,11.7,1.9e-05,6e-05,Treponema pallidum,wildtype,beta-lactamase,H2O;Carbenicillin,H2O + Carbenicillin = Substituted beta-amino acid,Substituted beta-amino acid,14747460,"Cha JY, Ishiwata A, Mobashery S: A novel beta-...",6598.0,P29723,Carbenicillin,MKVKYALLSAGALQLLVVGCGSSHHETHYGYATLSYADYWAGELGQ...
2,249,7.4,22.0,,3.7,9.6e-05,4e-06,Treponema pallidum,wildtype,beta-lactamase,Oxacillin;H2O,H2O + Oxacillin = Substituted beta-amino acid,Substituted beta-amino acid,14747460,"Cha JY, Ishiwata A, Mobashery S: A novel beta-...",6579.0,P29723,Oxacillin,MKVKYALLSAGALQLLVVGCGSSHHETHYGYATLSYADYWAGELGQ...
3,250,7.4,22.0,,5.0,9.7e-05,5e-06,Treponema pallidum,wildtype,beta-lactamase,Cloxacillin;H2O,H2O + Cloxacillin = Substituted beta-amino acid,Substituted beta-amino acid,14747460,"Cha JY, Ishiwata A, Mobashery S: A novel beta-...",6596.0,P29723,Cloxacillin,MKVKYALLSAGALQLLVVGCGSSHHETHYGYATLSYADYWAGELGQ...
4,251,7.4,22.0,,11.5,1e-06,890.0,Treponema pallidum,wildtype,beta-lactamase,H2O;6-alpha-Hydroxymethylpenicillanate,H2O + 6-alpha-Hydroxymethylpenicillanate = Sub...,Substituted beta-amino acid,14747460,"Cha JY, Ishiwata A, Mobashery S: A novel beta-...",6610.0,P29723,6-alpha-Hydroxymethylpenicillanate,MKVKYALLSAGALQLLVVGCGSSHHETHYGYATLSYADYWAGELGQ...


In [18]:
df['parameter.associatedSpecies'].nunique()

150

In [19]:
import requests
import pandas as pd
import urllib.parse
from time import sleep

def obter_smiles_substratos(coluna_substratos):
    """
    Obtém SMILES para uma coluna de substratos usando múltiplas fontes
    """
    def buscar_pubchem(nome_composto):
        """Busca SMILES no PubChem"""
        try:
            encoded_name = urllib.parse.quote(str(nome_composto))
            url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/{encoded_name}/property/CanonicalSMILES/JSON"
            response = requests.get(url, timeout=30)
            
            if response.status_code == 200:
                data = response.json()
                if ('PropertyTable' in data and 
                    'Properties' in data['PropertyTable'] and 
                    len(data['PropertyTable']['Properties']) > 0 and 
                    'CanonicalSMILES' in data['PropertyTable']['Properties'][0]):
                    return data['PropertyTable']['Properties'][0]['CanonicalSMILES']
            return None
        except:
            return None
    
    def buscar_chebi(nome_composto):
        """Busca SMILES no CheBI"""
        try:
            search_url = f"https://www.ebi.ac.uk/chebi/searchId.do?searchString={urllib.parse.quote(str(nome_composto))}"
            response = requests.get(search_url, timeout=30)
            
            if response.status_code == 200:
                import re
                chebi_matches = re.findall(r'CHEBI:\d+', response.text)
                if chebi_matches:
                    chebi_id = chebi_matches[0]
                    smiles_url = f"https://www.ebi.ac.uk/chebi/api/proxy/chebi/entity/{chebi_id}/smiles"
                    smiles_response = requests.get(smiles_url, timeout=30)
                    
                    if smiles_response.status_code == 200 and smiles_response.text.strip():
                        return smiles_response.text.strip()
            return None
        except:
            return None
    
    def buscar_cirpy(nome_composto):
        """Busca SMILES usando CIRPy"""
        try:
            import cirpy
            result = cirpy.resolve(str(nome_composto), 'smiles')
            if result:
                return result[0] if isinstance(result, list) else result
            return None
        except:
            return None
    
    # Mapeamento de fallback para antibióticos beta-lactâmicos comuns
    mapeamento_fallback = {
        'penicillin g': 'CC1(C(N2C(S1)C(C2=O)NC(=O)CC3=CC=CC=C3)C(=O)O)C',
        'benzylpenicillin': 'CC1(C(N2C(S1)C(C2=O)NC(=O)CC3=CC=CC=C3)C(=O)O)C',
        'ampicillin': 'CC1(C(N2C(S1)C(C2=O)NC(=O)C(C3=CC=CC=C3)N)C(=O)O)C',
        'amoxicillin': 'CC1(C(N2C(S1)C(C2=O)NC(=O)C(C3=CC=C(C=C3)O)N)C(=O)O)C',
        'cefotaxime': 'COC1=CC(=CC=C1OC)C2=NC3=C(C(=O)O2)C(CNC(=O)N)(C(S3)COC(=O)C)O',
        'ceftazidime': 'CC(C)(C(=O)O)OC1C2N(C1=O)C(=C(CS2)C(=O)NC3C4C(C(C3N)C(=O)NNC(=O)C5=CC=CC=C5)(C4O)O)C(=O)O'
    }
    
    resultados = []
    substratos_unicos = {}
    
    # Primeiro agrupa substratos únicos para evitar buscas duplicadas
    for substrato in coluna_substratos:
        if pd.isna(substrato):
            resultados.append(None)
        else:
            substrato_clean = str(substrato).strip().lower()
            if substrato_clean not in substratos_unicos:
                substratos_unicos[substrato_clean] = None
            resultados.append(substrato_clean)
    
    # Busca SMILES para cada substrato único
    print(f"Buscando SMILES para {len(substratos_unicos)} substratos únicos...")
    
    for i, (substrato, _) in enumerate(substratos_unicos.items()):
        if i % 10 == 0:
            print(f"Progresso: {i}/{len(substratos_unicos)}")
        
        smiles = None
        
        # 1. Tenta mapeamento de fallback
        for key, value in mapeamento_fallback.items():
            if key in substrato:
                smiles = value
                break
        
        # 2. Tenta PubChem
        if not smiles:
            smiles = buscar_pubchem(substrato)
            sleep(0.5)
        
        # 3. Tenta CheBI
        if not smiles:
            smiles = buscar_chebi(substrato)
            sleep(0.5)
        
        # 4. Tenta CIRPy
        if not smiles:
            try:
                smiles = buscar_cirpy(substrato)
            except:
                pass
        
        substratos_unicos[substrato] = smiles
    
    # Mapeia de volta para a lista original
    smiles_final = [substratos_unicos[substrato] if substrato else None for substrato in resultados]
    
    return smiles_final

In [32]:
df_s = obter_smiles_substratos(df['main_substrate'])

Buscando SMILES para 105 substratos únicos...
Progresso: 0/105
Progresso: 10/105
Progresso: 20/105
Progresso: 30/105
Progresso: 40/105
Progresso: 50/105
Progresso: 60/105
Progresso: 70/105
Progresso: 80/105
Progresso: 90/105
Progresso: 100/105


In [33]:
df['smiles'] = df_s
df.head() #617

Unnamed: 0,EntryID,pH,Temperature,kcat_km,kcat,km,kcat/km,Organism,Enzyme Variant,Enzymename,Substrate,Reaction,Product,PubMedID,Publication,SabioReactionID,UniProtKB_AC,parameter.associatedSpecies,sequence,smiles,main_substrate,solvent
0,247,7.4,22.0,,5.9,3.6e-05,1.6e-05,Treponema pallidum,wildtype,beta-lactamase,Penicillin G;H2O,Penicillin G + H2O = Substituted beta-amino acid,Substituted beta-amino acid,14747460,"Cha JY, Ishiwata A, Mobashery S: A novel beta-...",4016.0,P29723,Penicillin G,MKVKYALLSAGALQLLVVGCGSSHHETHYGYATLSYADYWAGELGQ...,CC1(C(N2C(S1)C(C2=O)NC(=O)CC3=CC=CC=C3)C(=O)O)C,Penicillin G,H2O
1,248,7.4,22.0,,11.7,1.9e-05,6e-05,Treponema pallidum,wildtype,beta-lactamase,H2O;Carbenicillin,H2O + Carbenicillin = Substituted beta-amino acid,Substituted beta-amino acid,14747460,"Cha JY, Ishiwata A, Mobashery S: A novel beta-...",6598.0,P29723,Carbenicillin,MKVKYALLSAGALQLLVVGCGSSHHETHYGYATLSYADYWAGELGQ...,,Carbenicillin,H2O
2,249,7.4,22.0,,3.7,9.6e-05,4e-06,Treponema pallidum,wildtype,beta-lactamase,Oxacillin;H2O,H2O + Oxacillin = Substituted beta-amino acid,Substituted beta-amino acid,14747460,"Cha JY, Ishiwata A, Mobashery S: A novel beta-...",6579.0,P29723,Oxacillin,MKVKYALLSAGALQLLVVGCGSSHHETHYGYATLSYADYWAGELGQ...,,Oxacillin,H2O
3,250,7.4,22.0,,5.0,9.7e-05,5e-06,Treponema pallidum,wildtype,beta-lactamase,Cloxacillin;H2O,H2O + Cloxacillin = Substituted beta-amino acid,Substituted beta-amino acid,14747460,"Cha JY, Ishiwata A, Mobashery S: A novel beta-...",6596.0,P29723,Cloxacillin,MKVKYALLSAGALQLLVVGCGSSHHETHYGYATLSYADYWAGELGQ...,,Cloxacillin,H2O
4,251,7.4,22.0,,11.5,1e-06,890.0,Treponema pallidum,wildtype,beta-lactamase,H2O;6-alpha-Hydroxymethylpenicillanate,H2O + 6-alpha-Hydroxymethylpenicillanate = Sub...,Substituted beta-amino acid,14747460,"Cha JY, Ishiwata A, Mobashery S: A novel beta-...",6610.0,P29723,6-alpha-Hydroxymethylpenicillanate,MKVKYALLSAGALQLLVVGCGSSHHETHYGYATLSYADYWAGELGQ...,,6-alpha-Hydroxymethylpenicillanate,H2O


In [37]:
df[df['smiles'].isna()]['parameter.associatedSpecies'].unique()

array(['Carbenicillin', 'Oxacillin', 'Cloxacillin',
       '6-alpha-Hydroxymethylpenicillanate',
       '6-beta-Hydroxymethylpenicillanate',
       '6-alpha-(1-Hydroxyethyl)-penicillanate',
       '6-beta-(1-Hydroxyethyl)-penicillanate', 'Enzyme',
       '6-beta-(1-Hydroxy-1-methylethyl)-penicillanate', 'Bocillin FL',
       'Imipenem', 'Biapenem', 'Nitrocefin', 'Cephalothin', 'Meropenem',
       'Cefuroxime',
       '3-[2-(2-Aminothiazol-4-yl)-2-(Z)-methoxyiminoacetylglycyl]oxybenzoic acid',
       '3-[(Phenylacetyl)glycyl]oxybenzoic acid', 'Cefotaxime',
       'Dexamethasone-Cephem-Methotrexate', 'Piperacillin',
       'Cephaloridine', 'Cefdinir', 'Cefcapene', 'beta-Lactam',
       'Sulbactam', 'Cefaclor', 'Cefoxitin', 'Tazobactam', 'BRL42715',
       '1,10-Phenanthroline', 'Cefazolin', 'Cephapirin', 'Cefamandole',
       'Cephaloridin', 'Cephalexin', 'Cephaloglycin',
       '6-beta-Iodopenicillanic acid', 'Moxalactam', 'Cephazoline',
       'Cephalosporin C', 'Loracarbef', 'Mecillin

In [None]:
import pandas as pd

def explodir_coluna_multivalorada(dataset, nome_coluna, separador=','):
    """
    Explode uma coluna multivalorada em linhas individuais
    
    Parameters:
    dataset: DataFrame
    nome_coluna: str - nome da coluna com valores múltiplos
    separador: str - separador dos valores (padrão: ',')
    
    Returns:
    DataFrame com uma linha para cada valor
    """
    # Faz uma cópia para não modificar o original
    df = dataset.copy()
    
    # Converte para string e divide os valores
    df[nome_coluna] = df[nome_coluna].astype(str)
    df[nome_coluna] = df[nome_coluna].str.split(separador)
    
    # Explode a coluna multivalorada
    df_explodido = df.explode(nome_coluna)
    
    # Remove espaços em branco
    df_explodido[nome_coluna] = df_explodido[nome_coluna].str.strip()
    
    # Remove valores vazios
    df_explodido = df_explodido[df_explodido[nome_coluna] != '']
    
    return df_explodido.reset_index(drop=True)

In [54]:
df.isna().sum()

EntryID                           1
pH                                1
Temperature                       1
kcat_km                         871
kcat                            740
km                              619
kcat/km                        2300
Organism                          1
Enzyme Variant                    1
Enzymename                        1
Substrate                         1
Reaction                          1
Product                           1
PubMedID                          1
Publication                       1
SabioReactionID                   2
UniProtKB_AC                    214
parameter.associatedSpecies      13
sequence                        214
smiles                         1722
main_substrate                    2
solvent                           8
dtype: int64