# Crocodile Database Processing
Source: [NAChRDB](https://crocodile.ncbr.muni.cz/Apps/NAChRDB/index.html)

In [7]:
import json

import pandas as pd

In [11]:
file_path = 'entire_nachrdb.json'

# The dump contains non-ASCII text (e.g. Greek receptor-type labels), so decode
# it explicitly as UTF-8 rather than relying on the platform's default encoding.
with open(file_path, 'r', encoding='utf-8') as f:
    raw_data = json.load(f)

# Flatten the 'entries' list into one row per entry, carrying the top-level
# 'name', 'version' and 'id_in_croco' fields along as extra columns.
# json_normalize names nested fields with dotted paths
# (entries -> molecule -> molecule_name becomes 'molecule.molecule_name').
data_all = pd.json_normalize(raw_data,
                             record_path=['entries'],
                             meta=['name', 'version', 'id_in_croco'])

# Keep only the last path segment, so 'molecule.molecule_name' becomes
# 'molecule_name'. All later cells must use these short names.
data_all.columns = [col.split('.')[-1] for col in data_all.columns]

# Dropping the prefixes could make two different nested fields share a name;
# fail loudly here instead of silently producing ambiguous columns.
assert data_all.columns.is_unique, "column names collide after stripping prefixes"

print("Available columns:")
print(data_all.columns.tolist())

data_all

Available columns:
['id_in_nachr_db', 'version_in_nachr_db', 'molecule_type', 'molecule_name', 'molecule_name_in_db', 'molecule_source_db', 'molecule_id_in_source_db', 'molecule_organism', 'molecule_tissue', 'molecule_organ', 'molecule_chains', 'name', 'version', 'id_in_croco']


Unnamed: 0,id_in_nachr_db,version_in_nachr_db,molecule_type,molecule_name,molecule_name_in_db,molecule_source_db,molecule_id_in_source_db,molecule_organism,molecule_tissue,molecule_organ,molecule_chains,name,version,id_in_croco
0,PDB-PDB-00001,0.1.0,protein,gating movement in acetylcholine receptor ana...,gating movement in acetylcholine receptor ana...,PDB,4AQ5,Torpedo marmorata,electric organ,plasma membrane,"[{'chain_id': 1, 'chain_str_id': 'A', 'chain_t...",nAChR_Db,0.1.0,00001
1,PDB-PDB-00002,0.1.0,protein,gating movement in acetylcholine receptor ana...,gating movement in acetylcholine receptor ana...,PDB,4AQ9,Torpedo marmorata,electric organ,plasma membrane,"[{'chain_id': 1, 'chain_str_id': 'A', 'chain_t...",nAChR_Db,0.1.0,00001
2,PDB-PDB-00003,0.1.0,protein,x-ray structure of the human alpha4beta2 nico...,x-ray structure of the human alpha4beta2 nico...,PDB,5KXI,Homo sapiens,no_tissue_information,no_tissue_information,"[{'chain_id': 1, 'chain_str_id': 'A', 'chain_t...",nAChR_Db,0.1.0,00001
3,PDB-PDB-00004,0.1.0,protein,structure of the 2alpha3beta stiochiometry of...,structure of the 2alpha3beta stiochiometry of...,PDB,6CNJ,Homo sapiens,no_tissue_information,no_tissue_information,"[{'chain_id': 1, 'chain_str_id': 'A', 'chain_t...",nAChR_Db,0.1.0,00001
4,PDB-PDB-00005,0.1.0,protein,structure of the 3alpha2beta stiochiometry of...,structure of the 3alpha2beta stiochiometry of...,PDB,6CNK,Homo sapiens,no_tissue_information,no_tissue_information,"[{'chain_id': 1, 'chain_str_id': 'A', 'chain_t...",nAChR_Db,0.1.0,00001
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
120,UNIPROT-XML-00110,0.1.0,protein,ACH1_CAEBR,ACH1_CAEBR,UNIPROT,A8WQK3,Caenorhabditis briggsae,no_tissue_information,no_organ_information,"[{'chain_id': 1, 'chain_str_id': 'A', 'chain_t...",nAChR_Db,0.1.0,00001
121,UNIPROT-XML-00111,0.1.0,protein,ACH1_SCHGR,ACH1_SCHGR,UNIPROT,P23414,Schistocerca gregaria,no_tissue_information,no_organ_information,"[{'chain_id': 1, 'chain_str_id': 'A', 'chain_t...",nAChR_Db,0.1.0,00001
122,UNIPROT-XML-00112,0.1.0,protein,ACH91_ONCMY,ACH91_ONCMY,UNIPROT,Q8JFN7,Oncorhynchus mykiss,no_tissue_information,no_organ_information,"[{'chain_id': 1, 'chain_str_id': 'A', 'chain_t...",nAChR_Db,0.1.0,00001
123,UNIPROT-XML-00113,0.1.0,protein,ACH92_ONCMY,ACH92_ONCMY,UNIPROT,Q68RJ7,Oncorhynchus mykiss,no_tissue_information,no_organ_information,"[{'chain_id': 1, 'chain_str_id': 'A', 'chain_t...",nAChR_Db,0.1.0,00001


---

The `molecule_chains` column is **nested**: each cell holds a list of per-chain dictionaries. Inspect the first entry to understand its structure.

In [14]:
# Show the nested chain records stored for the first entry (row label 0).
data_all.loc[0, 'molecule_chains']

[{'chain_id': 1,
  'chain_str_id': 'A',
  'chain_type': 'Alpha',
  'long_description': 'acetylcholine receptor subunit alpha',
  'residues': [{'aa_code': 'S',
    'position': 1,
    'position_in_protein': 1,
    'annotations': [{'annotation_source_molecule': {'Database': 'PDB',
       'Database ID': '4AQ5',
       'Organism': 'Torpedo marmorata',
       'Subunit': 'Alpha',
       'Chain': 'A',
       'Residue': 'S',
       'Residue number': '1',
       'Residue number in protein': '1'},
      'annotation_id': 'S1_001',
      'annotation_evidence_type': 'Direct',
      'annotation_receptor_type': 'αβδαγ',
      'annotation_result_type': '1',
      'annotation_type': 'Direct/same_species',
      'annotation_lit_id': '119',
      'annotation_context': 'Might be a part of a charge transfer network involved in the nAChR gating',
      'annotation_curated_context': '',
      'annotation_literature': {'first_author': 'Chareshneu',
       'corr_author': 'Koca',
       'year': '2019',
       'l

## Filtering

#### Example:

In [None]:
# Column names lost their 'molecule.' prefix when the frame was flattened, so
# the short names ('molecule_name', 'molecule_type') must be used here.
# Case-insensitive substring match; rows with a missing name are excluded (na=False).
name_matches = data_all['molecule_name'].str.contains('acetylcholine', case=False, na=False)
filtered_data_all = data_all[name_matches]

print("\nFiltered Entry Results:")
print(filtered_data_all[['id_in_nachr_db', 'molecule_type', 'molecule_name']].head())

# Optional export of the filtered rows; uncomment to write the CSV.
# filtered_data_all.to_csv('cleaned_database.csv', index=False)