In [5]:
import pandas as pd
import re

In [8]:
def load_confidence_data(path, expected_n_models=5):
    """Parse the per-model confidence metrics out of a single prediction log file.

    The file is scanned line by line for two kinds of lines:

    * a fragment-name line, e.g.
      ``2024-05-04 12:32:21,588 Query 1/1: ftsZ1copies_10-316_ftsZ_166-195 (length 337)``
    * a confidence line, e.g.
      ``2023-05-26 20:03:23,691 rank_001_alphafold2_ptm_model_1_seed_000 pLDDT=93 pTM=0.801 ipTM=0.257``

    Each confidence line becomes one row, labelled with the most recently seen
    fragment name and its residue range.

    Parameters
    ----------
    path : str or path-like
        Log file to parse.
    expected_n_models : int, default 5
        Number of confidence lines the log must contain.

    Returns
    -------
    pandas.DataFrame
        One row per confidence line with columns ``fragment_name``, ``rank``,
        ``fragment_start_aa``, ``fragment_center_aa``, ``fragment_end_aa``,
        ``plddt``, ``ptm`` and ``iptm``.

    Raises
    ------
    ValueError
        If a confidence line appears before any fragment-name line, or if the
        number of confidence lines differs from ``expected_n_models``.
    """
    # e.g. will match 2024-05-04 12:32:21,588 Query 1/1: ftsZ1copies_10-316_ftsZ_166-195 (length 337)
    fragment_name_pat = (
        r"Query 1\/1: ((.+)(\d)copies_(\d+)-(\d+)_(.+)_(\d+)-(\d+)) \(length (\d+)\)"
    )

    # A float with an optional exponent. Without the exponent part, a value
    # logged in scientific notation (e.g. "ipTM=6.5e-05") would match only its
    # mantissa and be recorded, silently, as 6.5.
    number = r"[+-]?[0-9]*[.]?[0-9]+(?:[eE][+-]?[0-9]+)?"

    # e.g. will match 2023-05-26 20:03:23,691 rank_001_alphafold2_ptm_model_1_seed_000 pLDDT=93 pTM=0.801 ipTM=0.257
    confidence_pat = (
        r"rank_00(\d)_alphafold2_ptm_model_\d_seed_\d{3}"
        r" pLDDT=(" + number + r") pTM=(" + number + r") ipTM=(" + number + r")"
    )

    fragment_name_re = re.compile(fragment_name_pat)
    confidence_re = re.compile(confidence_pat)

    confidence_dict = {
        "fragment_name": [],
        "rank": [],
        "fragment_start_aa": [],
        "fragment_center_aa": [],
        "fragment_end_aa": [],
        "plddt": [],
        "ptm": [],
        "iptm": [],
    }

    fragment_name_match_count, conf_match_count = 0, 0
    with open(path, "r") as file:
        for line in file:
            # search for fragment name
            match = fragment_name_re.search(line)
            if match is not None:
                fragment_name_match_count += 1
                name = match[1]
                # groups 7 and 8 are the residue range at the end of the name
                start, end = int(match[7]), int(match[8])
                center = (start + end) / 2
                continue

            # search for confidence data
            match = confidence_re.search(line)
            if match is None:
                continue

            # name/start/center/end are only bound once a fragment line was seen
            if fragment_name_match_count == 0:
                raise ValueError(
                    f"Fragment name should come before the confidence metrics\nfragment pattern: {fragment_name_pat}\nline:{line}\nfile:{path}"
                )
            conf_match_count += 1

            confidence_dict["fragment_name"].append(name)
            confidence_dict["rank"].append(int(match.group(1)))
            confidence_dict["fragment_start_aa"].append(start)
            confidence_dict["fragment_center_aa"].append(center)
            confidence_dict["fragment_end_aa"].append(end)
            confidence_dict["plddt"].append(float(match.group(2)))
            confidence_dict["ptm"].append(float(match.group(3)))
            confidence_dict["iptm"].append(float(match.group(4)))

    # The number of fragment-name lines is reported but deliberately not enforced.
    if conf_match_count != expected_n_models:
        raise ValueError(
            f"Expected to find {expected_n_models} lines with confidence metrics and 1 with the fragment name, "
            f"instead found ({conf_match_count},{fragment_name_match_count}) when searching {path}"
        )

    return pd.DataFrame(confidence_dict)


In [9]:
# Try the parser on a single log file from the sample_test directory; as the
# cell's last expression, the returned DataFrame is what gets displayed below.
# NOTE(review): absolute, machine-specific path — this cell will not run on
# another machine as written.
load_confidence_data('/home/jch/Documents/data/fragfold_test/sample_test/0f/18672f71f9ae940c01b6aa6aa83909/log_file_2.txt')

Unnamed: 0,fragment_name,rank,fragment_start_aa,fragment_center_aa,fragment_end_aa,plddt,ptm,iptm
0,ftsZ1copies_10-316_ftsZ_260-289,1,260,274.5,289,92.5,0.907,0.83
1,ftsZ1copies_10-316_ftsZ_260-289,2,260,274.5,289,92.4,0.895,0.789
2,ftsZ1copies_10-316_ftsZ_260-289,3,260,274.5,289,89.6,0.878,0.775
3,ftsZ1copies_10-316_ftsZ_260-289,4,260,274.5,289,92.1,0.9,0.754
4,ftsZ1copies_10-316_ftsZ_260-289,5,260,274.5,289,90.1,0.874,0.745


In [10]:
# Try the parser on a single log file from the tnfa_test_2024-10-30 run; as the
# cell's last expression, the returned DataFrame is what gets displayed below.
# NOTE(review): absolute, machine-specific path — this cell will not run on
# another machine as written.
load_confidence_data('/home/jch/Documents/data/fragfold_test/tnfa_test_2024-10-30/testrun/f1/69ca830eb55e6e67eaefb080a096b1/log_file_100.txt')

Unnamed: 0,fragment_name,rank,fragment_start_aa,fragment_center_aa,fragment_end_aa,plddt,ptm,iptm
0,P01375_TNFA_HUMAN_77_2331copies_1-157_P19438_T...,1,50,64.5,79,89.1,0.74,0.116
1,P01375_TNFA_HUMAN_77_2331copies_1-157_P19438_T...,2,50,64.5,79,86.4,0.732,0.104
2,P01375_TNFA_HUMAN_77_2331copies_1-157_P19438_T...,3,50,64.5,79,88.9,0.733,0.0969
3,P01375_TNFA_HUMAN_77_2331copies_1-157_P19438_T...,4,50,64.5,79,87.0,0.726,0.0836
4,P01375_TNFA_HUMAN_77_2331copies_1-157_P19438_T...,5,50,64.5,79,89.4,0.732,0.0688
