In [1]:
import pandas as pd
import json

# 1. Load the JSON file
try:
    with open('entire_nachrdb.json', 'r') as f:
        nachr_data = json.load(f)
except FileNotFoundError:
    print("Error: 'entire_nachr.json' not found. Check the file path.")
    exit()

# Initialize an empty list to store the flattened records
all_annotations = []
entries_data = nachr_data.get('entries',)

# 2. Extract residues level data
for entry in entries_data:
    # Safely get molecule information
    molecule_info = entry.get('molecule', {})
    
    # Check for non-human data early (Torpedo marmorata is common but generally not useful for human VEP model training)
    organism = molecule_info.get('molecule_organism', 'UNKNOWN')
    if organism not in ['Homo sapiens', 'Human', 'H. sapiens']: # Expand this list if you decide to include other mammalian data later
        continue 
        
    # Store essential context from the top level
    context = {
        'Organism': organism,
        'Source_Database': molecule_info.get('molecule_source_db'),
        'Source_ID': molecule_info.get('molecule_id_in_source_db'),
    }
    
    for chain in molecule_info.get('molecule_chains',):
        chain_info = {
            'Subunit_Type': chain.get('chain_type'),
        }
        
        for residue in chain.get('residues',):
            wt_aa = residue.get('aa_code')
            position = residue.get('position_in_protein')
            
            # Extract detailed annotations for this residue
            for annotation in residue.get('annotations',):
                lit_info = annotation.get('annotation_literature', {})
                
                # --- The Core Variant Data ---
                # NOTE: The NAChRDB format primarily stores annotations about the WILD-TYPE residue.
                # To get mutation data (WT -> Mut), you often need to find specific variants in the annotations.
                # The provided snippet example is primarily wild-type residue annotations ('S1', 'E4').
                
                # We prioritize the core data points needed for training
                record = {
                    # IDENTITY FIELDS (for consistency/merging)
                    'WT_AA': wt_aa,
                    'Position': position,
                    'Mutation_Code_NAChRDB': f"{wt_aa}{position}", # E.g., L247 (Wild-Type)
                    'Receptor_Type_Annotated': annotation.get('annotation_receptor_type'),
                    'Subunit_Type': chain_info,
                    'Organism': context['Organism'],
                    
                    # FUNCTIONAL/CONTEXTUAL FIELDS (The most valuable data)
                    'Functional_Context_Summary': annotation.get('annotation_context'), # This is the crucial descriptive summary
                    'Functional_Result_Type': annotation.get('annotation_result_type'), # Often a categorical label/code
                    'Evidence_Type': annotation.get('annotation_evidence_type'),
                    'Experimental_Study_Type': lit_info.get('study_type'), # e.g., Computational, Experimental 
                    'Source_DOI_or_Link': lit_info.get('link'),
                    'Source_Year': lit_info.get('year'),
                }
                all_annotations.append(record)

# 3. Create the final DataFrame
df_human_nachr = pd.DataFrame(all_annotations)
print(f"Total relevant records extracted: {len(df_human_nachr)}")
print("\n--- Processed Human NAChRDB Data Head ---")
print(df_human_nachr.head())

Total relevant records extracted: 25919

--- Processed Human NAChRDB Data Head ---
  WT_AA  Position Mutation_Code_NAChRDB Receptor_Type_Annotated  \
0     A         8                    A8                   αβδαγ   
1     A         8                    A8                   αβδαγ   
2     E        11                   E11                   αβδαγ   
3     E        11                   E11                   αβδαγ   
4     F        19                   F19                   αβδαγ   

                  Subunit_Type      Organism  \
0  {'Subunit_Type': 'Alpha 4'}  Homo sapiens   
1  {'Subunit_Type': 'Alpha 4'}  Homo sapiens   
2  {'Subunit_Type': 'Alpha 4'}  Homo sapiens   
3  {'Subunit_Type': 'Alpha 4'}  Homo sapiens   
4  {'Subunit_Type': 'Alpha 4'}  Homo sapiens   

                          Functional_Context_Summary Functional_Result_Type  \
0  Might be a part of a charge transfer network i...                      1   
1  Might be a part of a charge transfer network i...               

In [9]:
# Filter for rows where the context mentions "gating"
gating_data = df[df['Annotation_Context'].str.contains("gating", case=False, na=False)]
display(gating_data[['PDB_ID', 'Residue', 'Annotation_Context']].head())

Unnamed: 0,PDB_ID,Residue,Annotation_Context
0,4AQ5,S1,Might be a part of a charge transfer network i...
1,4AQ5,S1,Might be a part of a charge transfer network i...
2,4AQ5,E4,Might be a part of a charge transfer network i...
3,4AQ5,E4,Might be a part of a charge transfer network i...
7,4AQ5,E13,Might be a part of a charge transfer network i...


In [10]:
# Filter for "channel"
channel_residues = df[df['Annotation_Context'].str.contains("channel", case=False, na=False)]
display(channel_residues[['PDB_ID', 'Chain', 'Residue', 'Annotation_Context']].head())

Unnamed: 0,PDB_ID,Chain,Residue,Annotation_Context
4,4AQ5,A,L12,Predicted to line the nAChR channel
5,4AQ5,A,L12,Predicted to line the nAChR channel
6,4AQ5,A,L12,Predicted to line the nAChR channel
11,4AQ5,A,K17,Predicted to line the nAChR channel
12,4AQ5,A,K17,Predicted to line the nAChR channel


In [11]:
# Count the types of evidence
evidence_counts = df['Evidence_Type'].value_counts()
print(evidence_counts)

Evidence_Type
Inferred based on alignment    140844
Direct                           2587
Name: count, dtype: int64


In [4]:
import pandas as pd
import json

def json_to_csv_converter(json_filepath, csv_filepath):
    
    try:
        # Load the JSON file into a pandas DataFrame
        # pd.read_json is smart enough to handle most standard JSON formats
        df = pd.read_json(json_filepath)
        
        # Check if any column contains nested JSON/dictionaries
        # If the JSON is nested (e.g., {"user": {"name": "Alice"}}), 
        # you might need to flatten it using pd.json_normalize first.
        # This is a common step for complex JSON.
        
        # Save the DataFrame to a CSV file
        # index=False prevents pandas from writing the row index numbers as a column
        df.to_csv(csv_filepath, index=False)
        
        print(f"✅ Successfully converted '{json_filepath}' to '{csv_filepath}'")
        
    except ValueError as e:
        print(f" Error during conversion: {e}")
        print("\nTip: If your JSON is deeply nested, you may need to use `pd.json_normalize()` to flatten the data before calling `to_csv()`.")

# --- Replace 'input.json' and 'output.csv' with your file names ---
json_to_csv_converter('entire_nachrdb.json', 'output.csv')

✅ Successfully converted 'entire_nachrdb.json' to 'output.csv'
