In [1]:
import pandas as pd
from pathlib import Path
import json

parent_path = Path("/data/jgut/msa-tests")
df = pd.read_csv(parent_path/"porter_data.csv", header=None)

def open_ost(ost_path:Path):
    if not ost_path.exists():
        return -1, -1, -1, -1, -1, -1 ,-1
    with open(ost_path) as json_data:
        score_json = json.load(json_data)
    lddt = score_json["lddt"] if "lddt" in score_json else 0
    rmsd = score_json["rmsd"] if "rmsd" in score_json else 100
    bb_lddt = score_json["bb_lddt"] if "bb_lddt" in score_json else 0
    tm_score = score_json["tm_score"] if "tm_score" in score_json else 0
    inconsistent_residues = score_json["inconsistent_residues"] if "inconsistent_residues" in score_json else -1
    length = len(score_json["local_lddt"]) if "local_lddt" in score_json else -1
    model_bad_bonds = len(score_json["reference_bad_bonds"]) if "reference_bad_bonds" in score_json else -1
    model_bad_angles = len(score_json["reference_bad_angles"]) if "reference_bad_angles" in score_json else -1
    return lddt, bb_lddt, tm_score, rmsd, inconsistent_residues, length, model_bad_bonds, model_bad_angles

In [2]:
scores = []
for it, row in list(df.iterrows()):
    struc_a = row[0]
    struc_b = row[1]
    case_name = struc_a+struc_b
    case_path = parent_path/"aaa_porter_all_models/porter_all_models"/case_name
    for comparison in [struc_a, struc_b]:
        comparison_path = case_path/f"{comparison}_prot_dir"
        for file_path in comparison_path.glob("*.json"):
            print(file_path)
            lddt, bb_lddt, tm_score, rmsd, inconsistent_residues, length, model_bad_bonds, model_bad_angles = open_ost(file_path)
            if lddt>0:
                rank = int(str(file_path).split("rank_")[1][0:3])
                model = int(str(file_path).split("_model_")[1][0])
                curr_entry = {"struc":comparison, "both": case_name, "rank":rank, "model":model, "lddt": lddt,"lddt_bb":bb_lddt,  "tm": tm_score, "rmsd": rmsd, "inconsistent_residues": inconsistent_residues, "length": length,  "bad_bonds":  model_bad_bonds,  "bad_angles":  model_bad_angles, "repacked":False}
                scores.append(curr_entry)
        comparison_packed = case_path/f"{struc_a}_prot_dir_packed"
        for file_path in comparison_packed.glob("*.json"):
            print(file_path)
            lddt, bb_lddt, tm_score, rmsd, inconsistent_residues, length, model_bad_bonds, model_bad_angles = open_ost(file_path)
            if lddt>0:
                rank = int(str(file_path).split("rank_")[1][0:3])
                model = int(str(file_path).split("_model_")[1][0])
                curr_entry = {"struc":comparison, "both": case_name, "rank":rank, "model":model, "lddt": lddt,"lddt_bb":bb_lddt,  "tm": tm_score, "rmsd": rmsd, "inconsistent_residues": inconsistent_residues, "length": length,  "bad_bonds":  model_bad_bonds,  "bad_angles":  model_bad_angles, "repacked":True}
                scores.append(curr_entry)

/data/jgut/msa-tests/aaa_porter_all_models/porter_all_models/7ahlE4yhdG/7ahlE_prot_dir/7ahlE_conf_unrelaxed_rank_004_alphafold2_ptm_model_2_seed_6217_score.json
/data/jgut/msa-tests/aaa_porter_all_models/porter_all_models/7ahlE4yhdG/7ahlE_prot_dir/7ahlE_conf_unrelaxed_rank_002_alphafold2_ptm_model_4_seed_6217_score.json
/data/jgut/msa-tests/aaa_porter_all_models/porter_all_models/7ahlE4yhdG/7ahlE_prot_dir/7ahlE_conf_unrelaxed_rank_001_alphafold2_ptm_model_5_seed_6217_score.json
/data/jgut/msa-tests/aaa_porter_all_models/porter_all_models/7ahlE4yhdG/7ahlE_prot_dir/7ahlE_conf_unrelaxed_rank_005_alphafold2_ptm_model_1_seed_6217_score.json
/data/jgut/msa-tests/aaa_porter_all_models/porter_all_models/7ahlE4yhdG/7ahlE_prot_dir/7ahlE_conf_unrelaxed_rank_003_alphafold2_ptm_model_3_seed_6217_score.json
/data/jgut/msa-tests/aaa_porter_all_models/porter_all_models/7ahlE4yhdG/7ahlE_prot_dir_packed/7ahlE_conf_unrelaxed_rank_004_alphafold2_ptm_model_2_seed_6217_score.json
/data/jgut/msa-tests/aaa_po

In [3]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
scores = pd.DataFrame(scores)
sec_struc_df = pd.read_csv(parent_path/"sec_struc.csv")
sec_struc_df["struc"] = sec_struc_df["pdb_path"].str.slice(18,23)
sec_struc_df = sec_struc_df.set_index("struc")
scores = scores.join(sec_struc_df[["-_total", "-_rel"]], on="struc")

In [4]:
scores_max = scores.groupby("struc").max()
print(f'All {len(scores["struc"].unique())}')
print(f'Passed {len(scores_max[sum([scores_max["tm"]>=0.5])>0])}')

All 180
Passed 148


In [5]:
all_strucs = [struc for struc in df[0]]+[struc for struc in df[1]]
set(all_strucs) - set(scores["struc"].unique())

set()

In [6]:
scores = []
for it, row in list(df.iterrows()):
    struc_a = row[0]
    struc_b = row[1]
    case_name = struc_a+struc_b
    case_path = parent_path/"aaa_porter_all_models/porter_all_models"/case_name
    for comparison in [struc_a, struc_b]:
        comparison_path = case_path/f"{comparison}_prot_dir"
        for file_path in comparison_path.glob("*.json"):
            print(file_path)
            lddt, bb_lddt, tm_score, rmsd, inconsistent_residues, length, model_bad_bonds, model_bad_angles = open_ost(file_path)
            if lddt>0:
                rank = int(str(file_path).split("rank_")[1][0:3])
                model = int(str(file_path).split("_model_")[1][0])
                curr_entry = {"struc":comparison, "both": case_name, "rank":rank, "model":model, "lddt": lddt,"lddt_bb":bb_lddt,  "tm": tm_score, "rmsd": rmsd, "inconsistent_residues": inconsistent_residues, "length": length,  "bad_bonds":  model_bad_bonds,  "bad_angles":  model_bad_angles, "repacked":False}
                scores.append(curr_entry)
        comparison_packed = case_path/f"{struc_a}_prot_dir_packed"
        for file_path in comparison_packed.glob("*.json"):
            print(file_path)
            lddt, bb_lddt, tm_score, rmsd, inconsistent_residues, length, model_bad_bonds, model_bad_angles = open_ost(file_path)
            if lddt>0:
                rank = int(str(file_path).split("rank_")[1][0:3])
                model = int(str(file_path).split("_model_")[1][0])
                curr_entry = {"struc":comparison, "both": case_name, "rank":rank, "model":model, "lddt": lddt,"lddt_bb":bb_lddt,  "tm": tm_score, "rmsd": rmsd, "inconsistent_residues": inconsistent_residues, "length": length,  "bad_bonds":  model_bad_bonds,  "bad_angles":  model_bad_angles, "repacked":True}
                scores.append(curr_entry)
scores = pd.DataFrame(scores)
sec_struc_df = pd.read_csv(parent_path/"sec_struc.csv")
sec_struc_df["struc"] = sec_struc_df["pdb_path"].str.slice(18,23)
sec_struc_df = sec_struc_df.set_index("struc")
scores = scores.join(sec_struc_df[["-_total", "-_rel"]], on="struc")

/data/jgut/msa-tests/aaa_porter_all_models/porter_all_models/7ahlE4yhdG/7ahlE_prot_dir/7ahlE_conf_unrelaxed_rank_004_alphafold2_ptm_model_2_seed_6217_score.json
/data/jgut/msa-tests/aaa_porter_all_models/porter_all_models/7ahlE4yhdG/7ahlE_prot_dir/7ahlE_conf_unrelaxed_rank_002_alphafold2_ptm_model_4_seed_6217_score.json
/data/jgut/msa-tests/aaa_porter_all_models/porter_all_models/7ahlE4yhdG/7ahlE_prot_dir/7ahlE_conf_unrelaxed_rank_001_alphafold2_ptm_model_5_seed_6217_score.json
/data/jgut/msa-tests/aaa_porter_all_models/porter_all_models/7ahlE4yhdG/7ahlE_prot_dir/7ahlE_conf_unrelaxed_rank_005_alphafold2_ptm_model_1_seed_6217_score.json
/data/jgut/msa-tests/aaa_porter_all_models/porter_all_models/7ahlE4yhdG/7ahlE_prot_dir/7ahlE_conf_unrelaxed_rank_003_alphafold2_ptm_model_3_seed_6217_score.json
/data/jgut/msa-tests/aaa_porter_all_models/porter_all_models/7ahlE4yhdG/7ahlE_prot_dir_packed/7ahlE_conf_unrelaxed_rank_004_alphafold2_ptm_model_2_seed_6217_score.json
/data/jgut/msa-tests/aaa_po

In [7]:
set(aui) - set(scores["struc"].unique())

NameError: name 'aui' is not defined

In [8]:
good_tm = scores_max[(scores_max["tm"]>=0.5) & (scores_max["lddt"]<0.7) & (scores_max["lddt_bb"]<0.7)][["lddt", "lddt_bb", "tm", "rmsd", "-_total", "-_rel"]]
print(f"the size of this list is {len(good_tm)}")
good_tm

the size of this list is 16


Unnamed: 0_level_0,lddt,lddt_bb,tm,rmsd,-_total,-_rel
struc,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1nqdA,0.216,0.43,0.681,9.961,17.0,0.53125
1nqjB,0.351,0.628,0.681,9.961,23.0,0.69697
1xjtA,0.37,0.552,0.828,2.927,16.0,0.533333
2ougC,0.262,0.528,0.603,8.484,4.0,0.097561
2wcdX,0.354,0.551,0.627,6.236,4.0,0.235294
3njqA,0.382,0.613,0.854,2.848,12.0,0.48
3qy2A,0.035,0.033,0.525,0.064,3.0,1.0
3zwgN,0.353,0.598,0.668,2.875,9.0,0.6
4hddA,0.221,0.429,0.505,15.658,19.0,0.76
4jphB,0.235,0.52,0.518,5.679,9.0,0.409091


In [9]:
good_lddt = scores_max[(scores_max["tm"]<0.5) & ((scores_max["lddt"]>=0.7) | (scores_max["lddt_bb"]>=0.7) )][["lddt", "lddt_bb", "tm", "rmsd", "-_total", "-_rel"]]
print(f"the size of this list is {len(good_lddt)}")
good_lddt

the size of this list is 25


Unnamed: 0_level_0,lddt,lddt_bb,tm,rmsd,-_total,-_rel
struc,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1mnmA,0.362,0.83,0.247,3.597,,
1mnmB,0.361,0.86,0.333,4.838,,
1xntA,0.763,0.861,0.467,0.916,230.0,0.434783
2jmrA,0.799,0.946,0.398,10.219,320.0,0.64
2k0qA,0.635,0.803,0.316,3.906,220.0,0.478261
2kb8A,0.742,0.878,0.382,12.221,150.0,0.178571
2kkwA,0.659,0.733,0.358,38.724,1224.0,0.26087
2lejA,0.68,0.71,0.245,5.902,180.0,0.6
2lelA,0.612,0.803,0.316,4.471,280.0,0.608696
2lv1A,0.679,0.71,0.253,5.902,260.0,0.866667


In [10]:
scores_max[(scores_max["tm"]<0.5) & (scores_max["lddt"]<0.7) & (scores_max["lddt_bb"]<0.7)].index

Index(['1mbyA', '2axzA', '2n0aD', '2naoF', '2nntA', '4fu4C', '5k5gA'], dtype='object', name='struc')

In [6]:
scores_max_pack = scores[scores["repacked"]].groupby("struc").max()
scores_max_no_pack = scores[~scores["repacked"]].groupby("struc").max()
print(f'Passed repacked {len(scores_max_pack[sum([scores_max_pack["tm"]>=0.5]+ [scores_max_pack["lddt"]>=0.7]+[scores_max_pack["lddt_bb"]>=0.7])>0])}')
print(f'Passed normal {len(scores_max_no_pack[sum([scores_max_no_pack["tm"]>=0.5]+ [scores_max_no_pack["lddt"]>=0.7]+[scores_max_no_pack["lddt_bb"]>=0.7])>0])}')

Passed repacked 162
Passed normal 164


In [7]:
for rank in range(1,6):
    scores_max_pack = scores[scores["repacked"] & scores["rank"].eq(rank)]
    scores_max_no_pack = scores[~scores["repacked"] & scores["rank"].eq(rank)]
    print(f'Rank {rank}: Passed repacked {len(scores_max_pack[sum([scores_max_pack["tm"]>=0.5]+ [scores_max_pack["lddt"]>=0.7]+[scores_max_pack["lddt_bb"]>=0.7])>0])}')
    print(f'Rank {rank}: Passed normal {len(scores_max_no_pack[sum([scores_max_no_pack["tm"]>=0.5]+ [scores_max_no_pack["lddt"]>=0.7]+[scores_max_no_pack["lddt_bb"]>=0.7])>0])}')

Rank 1: Passed repacked 156
Rank 1: Passed normal 158
Rank 2: Passed repacked 156
Rank 2: Passed normal 157
Rank 3: Passed repacked 154
Rank 3: Passed normal 156
Rank 4: Passed repacked 152
Rank 4: Passed normal 157
Rank 5: Passed repacked 154
Rank 5: Passed normal 156


In [8]:
for model in range(1,6):
    scores_max_pack = scores[scores["repacked"] & scores["model"].eq(model)]
    scores_max_no_pack = scores[~scores["repacked"] & scores["model"].eq(model)]
    print(f'Model {model}: Passed repacked {len(scores_max_pack[sum([scores_max_pack["tm"]>=0.5]+ [scores_max_pack["lddt"]>=0.7]+[scores_max_pack["lddt_bb"]>=0.7])>0])}')
    print(f'Model {model}: Passed normal {len(scores_max_no_pack[sum([scores_max_no_pack["tm"]>=0.5]+ [scores_max_no_pack["lddt"]>=0.7]+[scores_max_no_pack["lddt_bb"]>=0.7])>0])}')

Model 1: Passed repacked 152
Model 1: Passed normal 155
Model 2: Passed repacked 154
Model 2: Passed normal 156
Model 3: Passed repacked 154
Model 3: Passed normal 154
Model 4: Passed repacked 156
Model 4: Passed normal 159
Model 5: Passed repacked 156
Model 5: Passed normal 160


In [10]:
scores_max

Unnamed: 0_level_0,rank,model,lddt,lddt_bb,tm,inconsistent_residues,length,bad_bonds,bad_angles,repacked,-_total,-_rel
struc,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1ceeB,5,5,0.775,0.917,0.515,[],59,21,12,True,380.0,0.791667
1dzlA,5,5,0.773,0.882,0.525,[],473,52,23,True,17.0,0.5
1eboE,5,5,0.82,0.976,0.907,[],111,54,26,True,2.0,0.035714
1g2cF,5,5,0.841,0.994,0.884,[],36,12,2,True,6.0,0.166667
1h38D,5,5,0.897,0.949,0.528,[],857,12,1,True,13.0,0.8125
1htmB,5,5,0.832,0.945,0.833,[],175,71,39,True,10.0,0.105263
1iytA,5,5,0.791,0.888,0.551,[],42,37,26,True,40.0,0.142857
1jfkA,5,5,0.815,0.873,0.547,[],134,6,1,True,19.0,0.218391
1jtiB,5,5,0.873,0.933,0.921,[],383,17,14,True,16.0,0.307692
1k0nA,5,5,0.928,0.97,0.947,[],226,17,4,True,18.0,0.428571


In [12]:
scores.to_csv(parent_path/"af2_and_dssp.csv")