In [2]:
import json
import re
from pathlib import Path

import pandas as pd

Specify the path to the input table and the pipeline output folder.

In [3]:
# Folder holding the info JSON files, kept both as a string and as a Path.
pipeline_output_path = './output/info_jsons/'
output_folder = Path(pipeline_output_path)
# CSV table that the rest of the notebook reads with pandas.
table_path = './table.csv'

Get the list of JSON files in the output folder.

In [12]:
# glob() yields entries in a filesystem-dependent order; sort the result so that
# positional lookups such as json_filelist[0] select the same file on every run.
# .resolve() is used to get the absolute path
json_filelist = sorted(f.resolve() for f in output_folder.glob('*.json'))

Load each JSON file and extract its UniProt ID. Use it to create a mapping from UniProt ID to the JSON file path.

In [13]:
def get_uniprot_id_from_json(json_file: Path) -> str:
    """Return the query UniProt ID stored in an info JSON file.

    Args:
        json_file: Path of the JSON file to read.

    Returns:
        The value stored under the top-level ``'query_uniprot_id'`` key.

    Raises:
        KeyError: If the file has no ``'query_uniprot_id'`` key.
    """
    # Read as UTF-8 explicitly: open() otherwise falls back to the platform's
    # locale encoding, which is not UTF-8 everywhere.
    with open(json_file, 'r', encoding='utf-8') as f:
        json_dict = json.load(f)
    return json_dict['query_uniprot_id']

# Map each query UniProt ID to the JSON file it was read from.
json_file_map = {}
for file in json_filelist:
    # Parse each file once and reuse the ID for both the log line and the map key.
    uniprot_id = get_uniprot_id_from_json(file)
    print(uniprot_id, file)
    json_file_map[uniprot_id] = file

A0A1U9X8D2 /Users/jackson/Dropbox (MIT)/work/07-SLiM_bioinformatics/04-orthoDB_local_orthogroup_creation/examples/ex2_table_with_uniprot_ids/output/info_jsons/9606_0_001c7b_Vertebrata_1567973at7742_info.json
Q8TC90 /Users/jackson/Dropbox (MIT)/work/07-SLiM_bioinformatics/04-orthoDB_local_orthogroup_creation/examples/ex2_table_with_uniprot_ids/output/info_jsons/9606_0_002f40_Vertebrata_869863at7742_info.json


Map the JSON file paths back onto the table.

In [14]:
# Read the CSV at table_path into a DataFrame and preview its first rows.
table = pd.read_csv(table_path)
table.head(n=5)

Unnamed: 0,Uniprotid,hit_sequence
0,A6ND36,FPGPPRY
1,C9J302,FPTPPNY
2,Q8TC90,KNDDEEE
3,A0A1U9X8D2,FRHLLEY


In [15]:
# Look up each row's UniProt ID in json_file_map; IDs that are not keys of the
# map come back as NaN.
ortholog_json_paths = table['Uniprotid'].map(json_file_map)
table['ortholog group json'] = ortholog_json_paths
table.head()

Unnamed: 0,Uniprotid,hit_sequence,ortholog group json
0,A6ND36,FPGPPRY,
1,C9J302,FPTPPNY,
2,Q8TC90,KNDDEEE,/Users/jackson/Dropbox (MIT)/work/07-SLiM_bioi...
3,A0A1U9X8D2,FRHLLEY,/Users/jackson/Dropbox (MIT)/work/07-SLiM_bioi...


Notice that it failed to find 2 of the UniProt IDs in the database. That's because they are not in the sample dataset that I created for this repo.

---

You can easily add more information to the table from the JSON files.<br>
Let's try adding the number of sequences in the final ortholog group and the `odb_gene_id` for each protein.

In [18]:
# Load the first JSON file in the list and print its top-level keys, one per line.
with open(json_filelist[0]) as handle:
    test_json = json.load(handle)
for field_name in test_json:
    print(field_name)

query_odb_gene_id
query_sequence_str
ogid
oglevel
sequences
sequences_filtered
sequences_ldos
sequences_clustered_ldos
cdhit_command
query_uniprot_id
processing params


There are a lot of ways to do this, but let's just repeat what we did above for the sake of time.<br>
In a real situation, you would probably want to think about how to do this in a more efficient way.

In [29]:
def get_odb_gene_id_from_json(json_file: Path) -> str:
    """Return the query gene ID stored in an info JSON file.

    Args:
        json_file: Path of the JSON file to read.

    Returns:
        The value stored under the top-level ``'query_odb_gene_id'`` key.

    Raises:
        KeyError: If the file has no ``'query_odb_gene_id'`` key.
    """
    # Read as UTF-8 explicitly: open() otherwise falls back to the platform's
    # locale encoding, which is not UTF-8 everywhere.
    with open(json_file, 'r', encoding='utf-8') as f:
        json_dict = json.load(f)
    return json_dict['query_odb_gene_id']

def get_n_clustered_ldos_from_json(json_file: Path) -> int:
    """Return the number of entries under ``'sequences_clustered_ldos'``.

    Args:
        json_file: Path of the JSON file to read.

    Returns:
        ``len()`` of the value stored under the top-level
        ``'sequences_clustered_ldos'`` key (an ``int``, not a ``str``).

    Raises:
        KeyError: If the file has no ``'sequences_clustered_ldos'`` key.
    """
    # Read as UTF-8 explicitly: open() otherwise falls back to the platform's
    # locale encoding, which is not UTF-8 everywhere.
    with open(json_file, 'r', encoding='utf-8') as f:
        json_dict = json.load(f)
    return len(json_dict['sequences_clustered_ldos'])

In [30]:
# Build one lookup per JSON-derived field, both keyed like json_file_map.
id_map = {}
n_clustered_ldos_map = {}
for uniprot_id, json_path in json_file_map.items():
    id_map[uniprot_id] = get_odb_gene_id_from_json(json_path)
    n_clustered_ldos_map[uniprot_id] = get_n_clustered_ldos_from_json(json_path)

# Attach both fields to the table through the 'Uniprotid' column.
uniprot_ids = table['Uniprotid']
table['odb gene id'] = uniprot_ids.map(id_map)
table['n clustered ldos'] = uniprot_ids.map(n_clustered_ldos_map)

In [31]:
# Preview the first rows of the table.
table.head(n=5)

Unnamed: 0,Uniprotid,hit_sequence,ortholog group json,odb gene id,n clustered ldos
0,A6ND36,FPGPPRY,,,
1,C9J302,FPTPPNY,,,
2,Q8TC90,KNDDEEE,/Users/jackson/Dropbox (MIT)/work/07-SLiM_bioi...,9606_0:002f40,143.0
3,A0A1U9X8D2,FRHLLEY,/Users/jackson/Dropbox (MIT)/work/07-SLiM_bioi...,9606_0:001c7b,119.0


In [32]:
# Write the table to a CSV file, leaving out the DataFrame index.
results_path = 'table_with_results.csv'
table.to_csv(results_path, index=False)