In [1]:
import pandas as pd
from pathlib import Path
import json
import seaborn as sns
import matplotlib.pyplot as plt
import re

def open_ost(ost_path:Path):
    if not ost_path.exists():
        return -1, -1, -1, -1, -1, -1 ,-1
    with open(ost_path) as json_data:
        score_json = json.load(json_data)
    lddt = score_json["lddt"] if "lddt" in score_json else 0
    bb_lddt = score_json["bb_lddt"] if "bb_lddt" in score_json else 0
    tm_score = score_json["tm_score"] if "tm_score" in score_json else 0
    inconsistent_residues = score_json["inconsistent_residues"] if "inconsistent_residues" in score_json else -1
    length = len(score_json["local_lddt"]) if "local_lddt" in score_json else -1
    model_bad_bonds = len(score_json["reference_bad_bonds"]) if "reference_bad_bonds" in score_json else -1
    model_bad_angles = len(score_json["reference_bad_angles"]) if "reference_bad_angles" in score_json else -1
    return lddt, bb_lddt, tm_score, inconsistent_residues, length, model_bad_bonds, model_bad_angles

In [6]:
scores = []
for protein_path in Path("protein_output_03032025").glob("*"):
    print(protein_path)
    if not (protein_path/"strucs").is_dir():
        continue
    protein=protein_path.name
    for medoid_path in (protein_path/"strucs").glob("*inverse_folded_dir"):
        print(medoid_path)
        medoid = str(medoid_path.name)[len("medoid_")]
        for score_file in medoid_path.glob("*6217._ost.json"):
            print(score_file)
            lddt, bb_lddt, tm_score, inconsistent_residues, length, model_bad_bonds, model_bad_angles = open_ost(score_file)
            score_name = str(score_file.name)
            rank = int(re.split("_rank_",score_name)[1][:3])
            model = int(re.split("_ptm_model_",score_name)[1][0])
            curr_entry = {"protein":protein, "medoid": medoid, "rank":rank, "model":model, "lddt": lddt,"lddt_bb":bb_lddt,  "tm": tm_score, "inconsistent_residues": inconsistent_residues, "length": length,  "bad_bonds":  model_bad_bonds,  "bad_angles":  model_bad_angles}
            scores.append(curr_entry)

protein_output_03032025/2jmrA
protein_output_03032025/2jmrA/strucs/medoid_0_inverse_folded_dir
protein_output_03032025/2jmrA/strucs/medoid_0_inverse_folded_dir/prot_mpnn_unrelaxed_rank_005_alphafold2_ptm_model_3_seed_6217._ost.json
protein_output_03032025/2jmrA/strucs/medoid_0_inverse_folded_dir/prot_mpnn_unrelaxed_rank_002_alphafold2_ptm_model_2_seed_6217._ost.json
protein_output_03032025/2jmrA/strucs/medoid_0_inverse_folded_dir/prot_mpnn_unrelaxed_rank_001_alphafold2_ptm_model_2_seed_6217._ost.json
protein_output_03032025/2jmrA/strucs/medoid_0_inverse_folded_dir/prot_mpnn_unrelaxed_rank_001_alphafold2_ptm_model_1_seed_6217._ost.json
protein_output_03032025/2jmrA/strucs/medoid_0_inverse_folded_dir/prot_mpnn_unrelaxed_rank_002_alphafold2_ptm_model_1_seed_6217._ost.json
protein_output_03032025/2jmrA/strucs/medoid_0_inverse_folded_dir/prot_mpnn_unrelaxed_rank_003_alphafold2_ptm_model_5_seed_6217._ost.json
protein_output_03032025/2jmrA/strucs/medoid_0_inverse_folded_dir/prot_mpnn_unrelaxe

In [19]:
pd.DataFrame(scores).sort_values("lddt", ascending=False).head(50)

Unnamed: 0,protein,medoid,rank,model,lddt,lddt_bb,tm,inconsistent_residues,length,bad_bonds,bad_angles
203,1mbyA,0,1,3,0.563,0.933,0.578,[],12,45,52
202,1mbyA,0,2,1,0.545,0.849,0.477,[],12,45,52
158,2kb8A,1,1,3,0.523,0.815,0.441,[],21,76,86
141,2kb8A,0,1,3,0.489,0.714,0.452,[],21,76,85
155,2kb8A,1,4,5,0.489,0.739,0.332,[],21,76,86
157,2kb8A,1,2,1,0.474,0.701,0.332,[],21,76,86
204,1mbyA,1,3,1,0.467,0.6,0.324,[],12,46,53
140,2kb8A,0,2,1,0.458,0.638,0.395,[],21,76,85
208,1mbyA,1,1,3,0.456,0.57,0.305,[],12,46,53
142,2kb8A,0,3,5,0.448,0.628,0.392,[],21,76,85
