### Batch ClassyFire

In [1]:
from pybatchclassyfire import *
import pandas as pd
import csv
import time
import json
from pandas import json_normalize
import numpy as np
import logging
from rdkit.Chem.inchi import InchiToInchiKey

DEBUG:requests_cache.backends:Initializing backend: None demo_cache_pybatch
DEBUG:requests_cache.backends.base:Initializing SQLitePickleDict with serializer: <requests_cache.serializers.pipeline.SerializerPipeline object at 0x7fd28a2a9a30>
DEBUG:requests_cache.backends.sqlite:Opening connection to /home/hsilva/Tese/pybatchclassyfire/notebook/demo_cache_pybatch.sqlite:responses
DEBUG:requests_cache.backends.base:Initializing SQLiteDict with serializer: <requests_cache.serializers.pipeline.SerializerPipeline object at 0x7fd28a2a9a30>
DEBUG:requests_cache.backends.sqlite:Opening connection to /home/hsilva/Tese/pybatchclassyfire/notebook/demo_cache_pybatch.sqlite:redirects
INFO:rdkit:Enabling RDKit 2021.03.4 jupyter extensions


## Data loading

In [None]:
# Load the full compound database from a local pickle file.
# NOTE(review): unpickling can execute arbitrary code - only load files you trust.
db = pd.read_pickle('~/Tese/AllDB.pkl')

## Retrieve ClassyFire classifications

This first step looks up each InChIKey among the structures that have already been classified and are served by the GNPS ClassyFire proxy.

In [None]:
# Switch for using the GNPS proxy instead of the ClassyFire server.
# NOTE(review): no cell visible in this notebook reads this flag - confirm it is still needed.
gnps_proxy = True

In [None]:
# Endpoints and request settings for the ClassyFire queries.
# NOTE(review): none of these names is referenced by a cell visible in this
# notebook; the helpers imported from pybatchclassyfire may define their own
# values - confirm before relying on them.
url = 'http://classyfire.wishartlab.com'        # ClassyFire server
proxy_url = 'https://gnps-classyfire.ucsd.edu'  # GNPS proxy
chunk_size = 1000       # presumably structures per submitted query - TODO confirm
sleep_interval = 12     # presumably seconds between requests - TODO confirm
return_format = "json"

The cell below uses get_classifications_cf_mod(), a slightly modified version of the original get_classification() that takes a list object as input.

In [None]:
# Silence the per-request DEBUG chatter of the HTTP stack so that only the
# progress messages printed below appear in the cell output.
for noisy_logger in ("urllib3", "requests", "requests_cache"):
    logging.getLogger(noisy_logger).setLevel(logging.WARNING)

# InChIKeys to classify. The original cell used `all_inchi_keys` without it
# being defined anywhere in the notebook, so it failed with a NameError on a
# fresh kernel.
# NOTE(review): deriving the list from the unique, non-null `InChIKey` column
# of `db` (loaded in the "Data loading" cell) is an assumption - confirm this
# is the intended input.
all_inchi_keys = list(db['InChIKey'].dropna().drop_duplicates())

# History of resolved counts, one entry per pass. It is seeded with two zeros
# so that the [-2] and [-3] look-backs in the stop condition are valid from
# the very first pass.
resolved_ik_number_list = [0, 0]
total_inchikey_number = len(all_inchi_keys)

while True:

    start_time = time.time()

    print('%s inchikey to resolve' % total_inchikey_number )
    # Helper imported from pybatchclassyfire; presumably it writes its results
    # to all_json.json, which is read back below - TODO confirm. The meaning of
    # the second argument (1) is defined by that helper.
    get_classifications_cf_mod(all_inchi_keys, 1)

    # Clean the result file in place (same input and output path) so that
    # json_normalize() can process it.
    cleanse('all_json.json', 'all_json.json')

    with open("all_json.json") as json_file:
        jsondic = json.load(json_file)

    # One row per InChIKey; the row count is therefore the number of
    # distinct resolved InChIKeys.
    df = json_normalize(jsondic).drop_duplicates('inchikey')
    resolved_ik_number = len(df)
    resolved_ik_number_list.append( resolved_ik_number )
    print('%s resolved inchikeys' % resolved_ik_number )
    print("done in --- %s seconds ---" % (time.time() - start_time))

    # Stop when the count dropped compared with the previous pass, or when it
    # equals the count recorded two passes earlier (no further progress).
    dropped = resolved_ik_number_list[-1] < resolved_ik_number_list[-2]
    stalled = resolved_ik_number_list[-1] == resolved_ik_number_list[-3]
    if dropped or stalled:
        break

We then use the cleanse() function to remove unclassified structures directly from the JSON; otherwise the file cannot be processed by the json_normalize() function.
To remove null entries from json inputs and output cleaned file we define the cleanse() function. Slightly adapted from https://stackoverflow.com/a/50531943

In [None]:
# Write a cleaned copy of the raw results to a separate file, leaving
# all_json.json untouched (cleanse() is imported from pybatchclassyfire).
cleanse('all_json.json', 'all_json_cleaned.json')

We now load this cleaned json file

In [None]:
# Parse the cleaned ClassyFire results into Python objects.
with open("all_json_cleaned.json") as cleaned_file:
    jsondic = json.load(cleaned_file)

And normalize the output as a dataframe

In [None]:
# Flatten the nested JSON records into a flat DataFrame; nested keys become
# dotted column names such as 'kingdom.name'.
flattened_classified_json = json_normalize(jsondic)

And have a peek at this new df

In [None]:
# Keep a single row per InChIKey, print a summary of the table, then persist
# it so the join step below can pick it up.
flattened_classified_json = flattened_classified_json.drop_duplicates(subset='inchikey')
flattened_classified_json.info()
flattened_classified_json.to_pickle(
    '~/Tese/ClassyFire/classyfire_gnps_results.pkl'
)

## Join classyfire results

Join these results with the previous ones

In [None]:
# Add the GNPS results to the cumulative ClassyFire table.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0; with default arguments it stacks the rows the same way.
classyfire_join = pd.concat([
    pd.read_pickle('~/Tese/ClassyFire/cf_allraw.pkl'),
    pd.read_pickle('~/Tese/ClassyFire/classyfire_gnps_results.pkl'),
])

# info() prints its own report and returns None, so it is called directly
# rather than wrapped in print(), which emitted a stray "None" line.
classyfire_join.info()
# Drop fully empty rows, keep the first row per InChIKey, and renumber the index.
classyfire_join = (
    classyfire_join
    .dropna(how='all')
    .drop_duplicates('inchikey')
    .reset_index(drop=True)
)
classyfire_join.info()
# Overwrites the cumulative table that was read at the top of this cell.
classyfire_join.to_pickle('~/Tese/ClassyFire/cf_allraw.pkl')

We now want to output the unclassified InChI's

## Get unclassed compounds

Load Database and results from classyfire

In [83]:
# Structural identifiers of every database entry, with the 'InChI=' and
# 'InChIKey=' prefixes added. The ClassyFire output reports InChIKeys in the
# prefixed form, which is what the merge in the next cell joins on.
all_db = pd.read_pickle('~/Tese/AllDB.pkl')[['InChI', 'InChIKey', 'SMILES']]
all_db = all_db.assign(
    InChI='InChI=' + all_db['InChI'],
    InChIKey='InChIKey=' + all_db['InChIKey'],
)
# Only the columns needed to tell classified entries from unclassified ones.
classyfire_df = pd.read_pickle('~/Tese/ClassyFire/cf_allraw.pkl')[
    ['inchikey', 'smiles', 'kingdom.name']
]

Join both Dataframes by InChIKey and check for entries with no structural information

In [84]:
# Left join: every database entry is kept, and the ClassyFire columns are NaN
# where no classification was found for its InChIKey.
df_merged = all_db.merge(
    classyfire_df, how='left', left_on='InChIKey', right_on='inchikey'
)
# Sanity check: list entries that have no InChI (no structural information).
missing_structure = df_merged['InChI'].isna()
print(df_merged[missing_structure])

# Entries without a kingdom are treated as unclassified; keep one per InChIKey.
is_unclassed = df_merged['kingdom.name'].isna()
df_merged_unclassed = df_merged[is_unclassed].drop_duplicates('InChIKey')
df_merged_unclassed.info()
# Tab-separated export, read back by batch_query() in the next section.
unclassed_identifiers = df_merged_unclassed[['InChI', 'InChIKey']]
unclassed_identifiers.to_csv('unclassed.tsv', sep='\t', encoding="utf-8")

Empty DataFrame
Columns: [InChI, InChIKey, SMILES, inchikey, smiles, kingdom.name]
Index: []
<class 'pandas.core.frame.DataFrame'>
Int64Index: 69 entries, 5615 to 290089
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   InChI         69 non-null     object
 1   InChIKey      69 non-null     object
 2   SMILES        67 non-null     object
 3   inchikey      69 non-null     object
 4   smiles        0 non-null      object
 5   kingdom.name  0 non-null      object
dtypes: object(6)
memory usage: 3.8+ KB


## Classifying the unclassified (POST request)

In [85]:
# Submit the InChI column of the exported TSV to ClassyFire; batch_query()
# (from pybatchclassyfire) returns the ids of the submitted queries.
query_ids = batch_query('unclassed.tsv', 'InChI', dialect='excel-tab')
print(query_ids)
# Persist the ids, one per line, so results can be fetched in a later session.
with open('query_list.txt', 'w') as id_file:
    id_file.writelines(str(query_id) + '\n' for query_id in query_ids)

DEBUG:requests_cache.cache_control:Cache directives from request headers: {}
DEBUG:urllib3.connectionpool:http://classyfire.wishartlab.com:80 "POST /queries.json HTTP/1.1" 201 None
DEBUG:requests_cache.session:Pre-cache checks for response from http://classyfire.wishartlab.com/queries.json: {'disabled cache': False, 'disabled method': True, 'disabled status': True, 'disabled by filter': False, 'disabled by headers or expiration params': False}
DEBUG:requests_cache.session:Skipping cache write for URL: http://classyfire.wishartlab.com/queries.json


1 queries submitted to ClassyFire API
[7068115]


In [86]:
# Reload the saved query ids (as strings), dropping the trailing newlines.
with open('query_list.txt', 'r') as id_file:
    query_ids = [line.strip() for line in id_file]
query_ids

['7068115']

The status can also be checked manually at the following address (just change the query id)
http://classyfire.wishartlab.com/queries/3879356.json?page=1

These requests_cache settings allow retrying when the ClassyFire server returns a 429 (or other) error, which mostly happens when too many requests are made. Since this behaviour seems to be random, relying on a fixed time.sleep is not safe enough.

In [None]:
# Fetch before opening the output file. Opening with "w" truncates the file
# immediately, so doing it first (as before) left an empty
# batch_query_classification.json behind whenever the fetch failed or was
# interrupted.
batch_results = get_results_multientry_multipage_patient(query_ids, return_format="json")

# Serialise straight into the file handle.
with open("batch_query_classification.json", "w") as results_file:
    json.dump(batch_results, results_file)

## Outputs standardization 

Now we will standardize the json output of classyfire get_entity() and the one of get_results_multipage_patient()

For the output of the get_results_multipage_patient() we first load the json as a dataframe and remove identities with an empty identifier value

In [24]:
# Parse the saved batch-query results into Python objects.
with open("batch_query_classification.json") as results_file:
    jsondic_inchi = json.load(results_file)

The json_normalize function is used to flatten the nested JSON structure.
Beware: the meta = ['id'] field can sometimes raise an error. Remove it if you don't need it.

In [25]:
# One row per record found under each query's 'entities' list; the query's
# 'label' value is repeated on every row as a metadata column.
normalized_df_inchi = json_normalize(
    jsondic_inchi,
    record_path='entities',
    meta=['label'],
)

In [26]:
# Preview the first rows of the flattened results.
normalized_df_inchi.head()

Unnamed: 0,identifier,smiles,inchikey,intermediate_nodes,alternative_parents,molecular_framework,substituents,description,external_descriptors,ancestors,...,subclass.description,subclass.chemont_id,subclass.url,direct_parent.name,direct_parent.description,direct_parent.chemont_id,direct_parent.url,subclass,report,label
0,Q7068026-11,CCCCCC\C=C/CCCCCCCC(=O)O[C@H]1CC(C)(C)C(=[C@@]...,InChIKey=XHONLVDYAICKMI-CTCVEWKCSA-N,"[{'name': 'Carotenoids', 'description': 'Organ...","[{'name': 'Oxepanes', 'description': 'Compound...",Aliphatic heteropolycyclic compounds,"[Xanthophyll, Oxepane, Fatty acid ester, Fatty...",This compound belongs to the class of organic ...,[],"[Acryloyl compounds, Alcohols and polyols, Alp...",...,Terpenoid molecules containing 10 consecutivel...,CHEMONTID:0001554,http://classyfire.wishartlab.com/tax_nodes/C00...,Xanthophylls,Carotenoids containing an oxygenated carotene ...,CHEMONTID:0001410,http://classyfire.wishartlab.com/tax_nodes/C00...,,,pyclassyfire
1,Q7068026-12,CCCCCC\C=C/CCCCCCCC(=O)O[C@H]1CC(C)(C)C(=[C@@]...,InChIKey=RMCGPKXYXZGYBD-YXZLPVEGSA-N,"[{'name': 'Carotenoids', 'description': 'Organ...","[{'name': 'Lineolic acids and derivatives', 'd...",Aliphatic heteropolycyclic compounds,"[Xanthophyll, Octadecanoid, Oxepane, Fatty aci...",This compound belongs to the class of organic ...,[],"[Acryloyl compounds, Alcohols and polyols, Alp...",...,Terpenoid molecules containing 10 consecutivel...,CHEMONTID:0001554,http://classyfire.wishartlab.com/tax_nodes/C00...,Xanthophylls,Carotenoids containing an oxygenated carotene ...,CHEMONTID:0001410,http://classyfire.wishartlab.com/tax_nodes/C00...,,,pyclassyfire
2,Q7068026-13,CCCCCCCC\C=C\CCCCCCCC(=O)O[C@@H]1C[C@@]2(C)O[C...,InChIKey=NIJXVFJGRRKJAV-RFBZYDAGSA-N,"[{'name': 'Carotenoids', 'description': 'Organ...","[{'name': 'Oxepanes', 'description': 'Compound...",Aliphatic heteropolycyclic compounds,"[Xanthophyll, Oxepane, Fatty acid ester, Fatty...",This compound belongs to the class of organic ...,[],"[Acryloyl compounds, Alcohols and polyols, Alp...",...,Terpenoid molecules containing 10 consecutivel...,CHEMONTID:0001554,http://classyfire.wishartlab.com/tax_nodes/C00...,Xanthophylls,Carotenoids containing an oxygenated carotene ...,CHEMONTID:0001410,http://classyfire.wishartlab.com/tax_nodes/C00...,,,pyclassyfire
3,Q7068026-14,CCCCCC\C=C/CCCCCCCC(=O)O[C@@H]1C[C@@]2(C)O[C@@...,InChIKey=FUSVDWMWPOQCIO-JPMRSYBPSA-N,"[{'name': 'Carotenoids', 'description': 'Organ...","[{'name': 'Oxepanes', 'description': 'Compound...",Aliphatic heteropolycyclic compounds,"[Xanthophyll, Oxepane, Fatty acid ester, Fatty...",This compound belongs to the class of organic ...,[],"[Acryloyl compounds, Alcohols and polyols, Alp...",...,Terpenoid molecules containing 10 consecutivel...,CHEMONTID:0001554,http://classyfire.wishartlab.com/tax_nodes/C00...,Xanthophylls,Carotenoids containing an oxygenated carotene ...,CHEMONTID:0001410,http://classyfire.wishartlab.com/tax_nodes/C00...,,,pyclassyfire
4,Q7068026-15,CCCCCC\C=C/CCCCCCCC(=O)O[C@H]1CC(C)(C)C(=[C@@]...,InChIKey=FMGAMESTTXTHIR-FHZDASMESA-N,"[{'name': 'Carotenoids', 'description': 'Organ...","[{'name': 'Lineolic acids and derivatives', 'd...",Aliphatic heteropolycyclic compounds,"[Xanthophyll, Octadecanoid, Oxepane, Fatty aci...",This compound belongs to the class of organic ...,[],"[Acryloyl compounds, Alcohols and polyols, Alp...",...,Terpenoid molecules containing 10 consecutivel...,CHEMONTID:0001554,http://classyfire.wishartlab.com/tax_nodes/C00...,Xanthophylls,Carotenoids containing an oxygenated carotene ...,CHEMONTID:0001410,http://classyfire.wishartlab.com/tax_nodes/C00...,,,pyclassyfire


And now we remove rows for which no identifier is returned

In [27]:
# Keep only the rows that actually carry an identifier.
normalized_df_inchi_nona = normalized_df_inchi.dropna(subset=['identifier'])

In [28]:
# Display the filtered table.
# NOTE(review): this renders the whole frame; consider .head() if it grows.
normalized_df_inchi_nona

Unnamed: 0,identifier,smiles,inchikey,intermediate_nodes,alternative_parents,molecular_framework,substituents,description,external_descriptors,ancestors,...,subclass.description,subclass.chemont_id,subclass.url,direct_parent.name,direct_parent.description,direct_parent.chemont_id,direct_parent.url,subclass,report,label
0,Q7068026-11,CCCCCC\C=C/CCCCCCCC(=O)O[C@H]1CC(C)(C)C(=[C@@]...,InChIKey=XHONLVDYAICKMI-CTCVEWKCSA-N,"[{'name': 'Carotenoids', 'description': 'Organ...","[{'name': 'Oxepanes', 'description': 'Compound...",Aliphatic heteropolycyclic compounds,"[Xanthophyll, Oxepane, Fatty acid ester, Fatty...",This compound belongs to the class of organic ...,[],"[Acryloyl compounds, Alcohols and polyols, Alp...",...,Terpenoid molecules containing 10 consecutivel...,CHEMONTID:0001554,http://classyfire.wishartlab.com/tax_nodes/C00...,Xanthophylls,Carotenoids containing an oxygenated carotene ...,CHEMONTID:0001410,http://classyfire.wishartlab.com/tax_nodes/C00...,,,pyclassyfire
1,Q7068026-12,CCCCCC\C=C/CCCCCCCC(=O)O[C@H]1CC(C)(C)C(=[C@@]...,InChIKey=RMCGPKXYXZGYBD-YXZLPVEGSA-N,"[{'name': 'Carotenoids', 'description': 'Organ...","[{'name': 'Lineolic acids and derivatives', 'd...",Aliphatic heteropolycyclic compounds,"[Xanthophyll, Octadecanoid, Oxepane, Fatty aci...",This compound belongs to the class of organic ...,[],"[Acryloyl compounds, Alcohols and polyols, Alp...",...,Terpenoid molecules containing 10 consecutivel...,CHEMONTID:0001554,http://classyfire.wishartlab.com/tax_nodes/C00...,Xanthophylls,Carotenoids containing an oxygenated carotene ...,CHEMONTID:0001410,http://classyfire.wishartlab.com/tax_nodes/C00...,,,pyclassyfire
2,Q7068026-13,CCCCCCCC\C=C\CCCCCCCC(=O)O[C@@H]1C[C@@]2(C)O[C...,InChIKey=NIJXVFJGRRKJAV-RFBZYDAGSA-N,"[{'name': 'Carotenoids', 'description': 'Organ...","[{'name': 'Oxepanes', 'description': 'Compound...",Aliphatic heteropolycyclic compounds,"[Xanthophyll, Oxepane, Fatty acid ester, Fatty...",This compound belongs to the class of organic ...,[],"[Acryloyl compounds, Alcohols and polyols, Alp...",...,Terpenoid molecules containing 10 consecutivel...,CHEMONTID:0001554,http://classyfire.wishartlab.com/tax_nodes/C00...,Xanthophylls,Carotenoids containing an oxygenated carotene ...,CHEMONTID:0001410,http://classyfire.wishartlab.com/tax_nodes/C00...,,,pyclassyfire
3,Q7068026-14,CCCCCC\C=C/CCCCCCCC(=O)O[C@@H]1C[C@@]2(C)O[C@@...,InChIKey=FUSVDWMWPOQCIO-JPMRSYBPSA-N,"[{'name': 'Carotenoids', 'description': 'Organ...","[{'name': 'Oxepanes', 'description': 'Compound...",Aliphatic heteropolycyclic compounds,"[Xanthophyll, Oxepane, Fatty acid ester, Fatty...",This compound belongs to the class of organic ...,[],"[Acryloyl compounds, Alcohols and polyols, Alp...",...,Terpenoid molecules containing 10 consecutivel...,CHEMONTID:0001554,http://classyfire.wishartlab.com/tax_nodes/C00...,Xanthophylls,Carotenoids containing an oxygenated carotene ...,CHEMONTID:0001410,http://classyfire.wishartlab.com/tax_nodes/C00...,,,pyclassyfire
4,Q7068026-15,CCCCCC\C=C/CCCCCCCC(=O)O[C@H]1CC(C)(C)C(=[C@@]...,InChIKey=FMGAMESTTXTHIR-FHZDASMESA-N,"[{'name': 'Carotenoids', 'description': 'Organ...","[{'name': 'Lineolic acids and derivatives', 'd...",Aliphatic heteropolycyclic compounds,"[Xanthophyll, Octadecanoid, Oxepane, Fatty aci...",This compound belongs to the class of organic ...,[],"[Acryloyl compounds, Alcohols and polyols, Alp...",...,Terpenoid molecules containing 10 consecutivel...,CHEMONTID:0001554,http://classyfire.wishartlab.com/tax_nodes/C00...,Xanthophylls,Carotenoids containing an oxygenated carotene ...,CHEMONTID:0001410,http://classyfire.wishartlab.com/tax_nodes/C00...,,,pyclassyfire
5,Q7068026-16,CCCCCCCCCCCCCCCC(=O)O[C@H]1CC(C)(C)C(=[C@@]=C\...,InChIKey=XPOYCCSLZIKZGZ-TUWXWTLWSA-N,"[{'name': 'Carotenoids', 'description': 'Organ...","[{'name': 'Oxepanes', 'description': 'Compound...",Aliphatic heteropolycyclic compounds,"[Xanthophyll, Oxepane, Fatty acid ester, Fatty...",This compound belongs to the class of organic ...,[],"[Acryloyl compounds, Alcohols and polyols, Alp...",...,Terpenoid molecules containing 10 consecutivel...,CHEMONTID:0001554,http://classyfire.wishartlab.com/tax_nodes/C00...,Xanthophylls,Carotenoids containing an oxygenated carotene ...,CHEMONTID:0001410,http://classyfire.wishartlab.com/tax_nodes/C00...,,,pyclassyfire
6,Q7068026-17,CCCCCCCC\C=C\CCCCCCCC(=O)O[C@@H]1C[C@@]2(C)O[C...,InChIKey=CQRZBOJKOQJRSH-PKOBTPGCSA-N,"[{'name': 'Carotenoids', 'description': 'Organ...","[{'name': 'Oxepanes', 'description': 'Compound...",Aliphatic heteropolycyclic compounds,"[Xanthophyll, Oxepane, Fatty acid ester, Fatty...",This compound belongs to the class of organic ...,[],"[Acryloyl compounds, Alcohols and polyols, Alp...",...,Terpenoid molecules containing 10 consecutivel...,CHEMONTID:0001554,http://classyfire.wishartlab.com/tax_nodes/C00...,Xanthophylls,Carotenoids containing an oxygenated carotene ...,CHEMONTID:0001410,http://classyfire.wishartlab.com/tax_nodes/C00...,,,pyclassyfire
7,Q7068026-18,[H]C(=C=C1C(C)(C)C[C@@]([H])(C[C@@]1(C)O)OC(C)...,InChIKey=DYTLPXMYTFXUBJ-DOAYSRGSSA-N,[],"[{'name': 'Benzofurans', 'description': 'Organ...",Aliphatic heteropolycyclic compounds,"[Triterpenoid, Benzofuran, Tertiary alcohol, D...",This compound belongs to the class of organic ...,[],"[Alcohols and polyols, Benzofurans, Carbonyl c...",...,Terpene molecules containing six isoprene units.,CHEMONTID:0001553,http://classyfire.wishartlab.com/tax_nodes/C00...,Triterpenoids,Terpene molecules containing six isoprene units.,CHEMONTID:0001553,http://classyfire.wishartlab.com/tax_nodes/C00...,,,pyclassyfire
8,Q7068026-19,CC(=O)O[C@H]1CC(C)(C)C(=C=C\C(C)=C\C=C\C=C\C=C...,InChIKey=KQQRQKINDGGYIE-SKFWEWSPSA-N,[],"[{'name': 'Sesquiterpenoids', 'description': '...",Aliphatic heteropolycyclic compounds,"[Terpene lactone, Cyclofarsesane sesquiterpeno...",This compound belongs to the class of organic ...,[],"[Alcohols and polyols, Alpha,beta-unsaturated ...",...,Prenol lipids containing a lactone ring.,CHEMONTID:0001283,http://classyfire.wishartlab.com/tax_nodes/C00...,Terpene lactones,Prenol lipids containing a lactone ring.,CHEMONTID:0001283,http://classyfire.wishartlab.com/tax_nodes/C00...,,,pyclassyfire
9,Q7068026-20,CC[C@@H]1O[C@@H]2C[C@@H]3O[C@@H]3C[C@@H](O[C@@...,InChIKey=SNQLUCRWXUPRSU-ADULOXKHSA-N,[],"[{'name': 'Vinyl bromides', 'description': 'Vi...",Aliphatic heteropolycyclic compounds,"[Oxane, Oxacycle, Vinyl halide, Vinyl bromide,...",This compound belongs to the class of organic ...,[],[],...,,,,Oxanes,Compounds containing an oxane (tetrahydropyran...,CHEMONTID:0002012,http://classyfire.wishartlab.com/tax_nodes/C00...,,,pyclassyfire


In [29]:
# Persist the POST-request results; the join step below reads this file back.
normalized_df_inchi_nona.to_pickle('~/Tese/ClassyFire/classyfire_post_results.pkl')

## Join classyfire results

In [None]:
# Add the POST-request results to the cumulative ClassyFire table.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0; with default arguments it stacks the rows the same way.
classyfire_join = pd.concat([
    pd.read_pickle('~/Tese/ClassyFire/cf_allraw.pkl'),
    pd.read_pickle('~/Tese/ClassyFire/classyfire_post_results.pkl'),
])
# Drop fully empty rows, keep the first row per InChIKey, and renumber the index.
classyfire_join = (
    classyfire_join
    .dropna(how='all')
    .drop_duplicates('inchikey')
    .reset_index(drop=True)
)
classyfire_join.info()
# Overwrites the cumulative table that was read at the top of this cell.
classyfire_join.to_pickle('~/Tese/ClassyFire/cf_allraw.pkl')

## Get unclassed compounds

Load Database and results from classyfire

In [78]:
# Reload the database identifiers and prefix them with 'InChI=' / 'InChIKey='
# so that the InChIKey matches the prefixed form found in the ClassyFire output.
all_db = (
    pd.read_pickle('~/Tese/AllDB.pkl')[['InChI', 'InChIKey', 'SMILES']]
    .assign(
        InChI=lambda frame: 'InChI=' + frame['InChI'],
        InChIKey=lambda frame: 'InChIKey=' + frame['InChIKey'],
    )
)
# Reload the (now updated) cumulative results, restricted to the columns
# needed to detect unclassified entries.
classyfire_df = pd.read_pickle('~/Tese/ClassyFire/cf_allraw.pkl')[
    ['inchikey', 'smiles', 'kingdom.name']
]

Join both Dataframes by InChIKey and check for entries with no structural information

In [81]:
# Left join keeps every database entry; the ClassyFire columns stay NaN where
# the InChIKey has no classification.
df_merged = all_db.merge(
    classyfire_df, how='left', left_on='InChIKey', right_on='inchikey'
)
# Sanity check: entries lacking an InChI (no structural information).
missing_structure = df_merged['InChI'].isna()
print(df_merged[missing_structure])

# Unclassified = no kingdom assigned; one row per InChIKey, index renumbered.
df_merged_unclassed = (
    df_merged[df_merged['kingdom.name'].isna()]
    .drop_duplicates('InChIKey')
    .reset_index(drop=True)
)
df_merged_unclassed.info()
# Overwrite the TSV of still-unclassified structures.
unclassed_identifiers = df_merged_unclassed[['InChI', 'InChIKey']]
unclassed_identifiers.to_csv('unclassed.tsv', sep='\t', encoding="utf-8")

Empty DataFrame
Columns: [InChI, InChIKey, SMILES, inchikey, smiles, kingdom.name]
Index: []
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 69 entries, 0 to 68
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   InChI         69 non-null     object
 1   InChIKey      69 non-null     object
 2   SMILES        67 non-null     object
 3   inchikey      69 non-null     object
 4   smiles        0 non-null      object
 5   kingdom.name  0 non-null      object
dtypes: object(6)
memory usage: 3.4+ KB


## Different Classyfire input/output InChI 

Classyfire results are joined to the main Dataset by InChIKey. For ~400 compounds, the InChIKey in the results from Classyfire does not match with the InChIKey corresponding to the input InChI, which has the structural information.

In [12]:
# Input side: the structures that were submitted, read from a TSV whose first
# column is the index. 'identifier' is cast to str so it can be used as a
# join key further down.
input_inchis = (
    pd.read_csv('unclassed_nomatch.tsv', delimiter='\t', index_col=0)
    .astype({'identifier': str})
)

In [13]:
# Output side: identifier and InChIKey as returned by ClassyFire. The key is
# renamed so it cannot be confused with the input 'InChIKey' column after merging.
cf_output = (
    pd.read_pickle('~/Tese/ClassyFire/ik_no_match/cf_output_ik_nomatch.pkl')
    [['identifier', 'inchikey']]
    .rename(columns={'inchikey': 'InChIKey_output'})
)

In [15]:
# Full database including the per-source ID columns, with the same
# 'InChI=' / 'InChIKey=' prefixes as used for the other tables.
adb = pd.read_pickle('~/Tese/AllDB_w_IDs.pkl')
adb = adb.assign(
    InChI='InChI=' + adb['InChI'],
    InChIKey='InChIKey=' + adb['InChIKey'],
)

In [16]:
# Is this behaviour characteristic of a specific DB? No - the non-null counts
# of the per-database ID columns show that the affected entries come from
# several sources.
adb.merge(input_inchis[['InChI', 'InChIKey']], on='InChIKey', how='inner').info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 426 entries, 0 to 425
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   HMDB ID        118 non-null    object
 1   InChI_x        426 non-null    object
 2   InChIKey       426 non-null    object
 3   SMILES         399 non-null    object
 4   Lipid Maps ID  104 non-null    object
 5   KEGG ID        40 non-null     object
 6   ChEBI ID       213 non-null    object
 7   InChI_y        426 non-null    object
dtypes: object(8)
memory usage: 30.0+ KB


In [None]:
# Pair each ClassyFire output row with the input structure that shares its
# identifier, and save the input/output InChIKey correspondence table.
corresp_ik_in_out = cf_output.merge(input_inchis, on='identifier', how='inner')
corresp_ik_in_out.to_pickle(
    '~/Tese/ClassyFire/ik_no_match/ik_corresp_in_output.pkl'
)
corresp_ik_in_out

last 7068115 query id where I couldn't retrieve any compound classification

In [None]:
# Compare the number of input/output pairs with the number of submitted inputs.
print(len(corresp_ik_in_out))
print(len(input_inchis))

In [17]:
# Reload the ClassyFire output with all of its columns (this rebinds
# `cf_output`, which earlier held only two renamed columns) ...
cf_output = pd.read_pickle('~/Tese/ClassyFire/ik_no_match/cf_output_ik_nomatch.pkl')
# ... and the saved correspondence table, restricted to the key columns.
corresp_ik_in_out = pd.read_pickle('~/Tese/ClassyFire/ik_no_match/ik_corresp_in_output.pkl')[['InChIKey_output', 'InChIKey', 'InChI']]

In [None]:
# Replace the InChIKey reported by ClassyFire with the InChIKey of the input
# structure, so the results can be joined to the main dataset.
# Left join: output rows without a correspondence end up with NaN in the new
# 'inchikey' column.
merge = (
    cf_output
    .merge(corresp_ik_in_out, how='left',
           left_on='inchikey', right_on='InChIKey_output')
    .drop(columns=['inchikey', 'InChIKey_output', 'InChI'])
    .rename(columns={'InChIKey': 'inchikey'})
)
merge.to_pickle(
    '~/Tese/ClassyFire/ik_no_match/cf_output_ik_nomatch_changed_ik.pkl'
)

Join these results to the others

In [18]:
# Add the re-keyed "no match" results to the cumulative ClassyFire table.
classyfire_join = pd.read_pickle('~/Tese/ClassyFire/cf_allraw.pkl')
# info() prints its own report and returns None, so it is called directly
# rather than wrapped in print(), which emitted a stray "None" line.
classyfire_join.info()
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0; with default arguments it stacks the rows the same way.
classyfire_join = pd.concat([
    classyfire_join,
    pd.read_pickle('~/Tese/ClassyFire/ik_no_match/cf_output_ik_nomatch_changed_ik.pkl'),
])

classyfire_join.info()
# Drop fully empty rows, keep the first row per InChIKey, and renumber the index.
classyfire_join = (
    classyfire_join
    .dropna(how='all')
    .drop_duplicates('inchikey')
    .reset_index(drop=True)
)
classyfire_join.info()
# Overwrites the cumulative table that was read at the top of this cell.
classyfire_join.to_pickle('~/Tese/ClassyFire/cf_allraw.pkl')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 388251 entries, 0 to 388250
Data columns (total 40 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   smiles                     388251 non-null  object 
 1   inchikey                   388250 non-null  object 
 2   intermediate_nodes         388250 non-null  object 
 3   alternative_parents        388250 non-null  object 
 4   substituents               388250 non-null  object 
 5   description                388250 non-null  object 
 6   external_descriptors       388250 non-null  object 
 7   ancestors                  388250 non-null  object 
 8   predicted_chebi_terms      388250 non-null  object 
 9   predicted_lipidmaps_terms  388250 non-null  object 
 10  classification_version     388250 non-null  object 
 11  kingdom.name               388250 non-null  object 
 12  kingdom.description        388250 non-null  object 
 13  kingdom.chemont_id         38