In [15]:
import Bio.PDB
from Bio.SubsMat.MatrixInfo import blosum62

def calculate_recovery_rate_w_blosum(pdb_file_1, pdb_file_ref, chain_id, start_pos, end_pos):
    # Load the PDB structures
    parser = Bio.PDB.PDBParser(QUIET=True)
    structure_1 = parser.get_structure('structure_1', pdb_file_1)
    structure_ref = parser.get_structure('structure_ref', pdb_file_ref)
    
    # Extract the specific chain from both structures
    chain_1 = structure_1[0][chain_id]
    chain_ref = structure_ref[0][chain_id]
    
    # Initialize variables for the score and count
    total_score = 0
    gt_score = 0
    position_count = 0
    
    # Iterate through the specified residue positions
    for pos in range(start_pos, end_pos + 1):
        try:
            residue_1 = chain_1[pos]
            residue_ref = chain_ref[pos]
            aa_1 = residue_1.get_resname()
            aa_ref = residue_ref.get_resname()
            
            # Convert residue names to one-letter codes
            aa_1 = Bio.PDB.Polypeptide.three_to_one(aa_1)
            aa_ref = Bio.PDB.Polypeptide.three_to_one(aa_ref)
            
            # Calculate the BLOSUM score
            pair = (aa_1, aa_ref)
            if pair in blosum62:
                score = blosum62[pair]
            elif (aa_ref, aa_1) in blosum62:  # Check the reverse pair
                score = blosum62[(aa_ref, aa_1)]
            else:
                score = 0  # No score available for this pair
            
            # Accumulate the score
            total_score += score
            gt_score += blosum62[(aa_ref, aa_ref)]
            position_count += 1
            
        except KeyError:
            # If the residue is not present in one of the structures, skip
            continue
    
    # Calculate the recovery rate
    if position_count == 0:
        return 0  # To avoid division by zero
    recovery_rate = total_score / gt_score
    print(f"Total Score: {total_score}")
    print(f"GT Score: {gt_score}")
    print(f"Position Count: {position_count}")
    return recovery_rate


# Example usage
folder = '/hanchenchen/ab_opt/AbDesign/test_results_20240725/codesign_single/0-5xku_C_B_A/H_CDR3'
chain_id = 'C'
start = 95
end = 102

recovery_rate = calculate_recovery_rate_w_blosum(
    folder + '/0000.pdb', 
    folder + '/REF1.pdb', 
    chain_id, 
    start, 
    end,
    )
print(f"Recovery Rate: {recovery_rate:.5f}")



Total Score: 10
GT Score: 42
Position Count: 8
Recovery Rate: 0.23810


In [16]:
import Bio.PDB
from Bio.SubsMat.MatrixInfo import blosum62

def calculate_recovery_rate(pdb_file_1, pdb_file_ref, chain_id, start_pos, end_pos):
    # Load the PDB structures
    parser = Bio.PDB.PDBParser(QUIET=True)
    structure_1 = parser.get_structure('structure_1', pdb_file_1)
    structure_ref = parser.get_structure('structure_ref', pdb_file_ref)
    
    # Extract the specific chain from both structures
    chain_1 = structure_1[0][chain_id]
    chain_ref = structure_ref[0][chain_id]
    
    # Initialize variables for the score and count
    total_score = 0
    gt_score = 0
    position_count = 0
    
    # Iterate through the specified residue positions
    for pos in range(start_pos, end_pos + 1):
        try:
            residue_1 = chain_1[pos]
            residue_ref = chain_ref[pos]
            aa_1 = residue_1.get_resname()
            aa_ref = residue_ref.get_resname()
            
            # Convert residue names to one-letter codes
            aa_1 = Bio.PDB.Polypeptide.three_to_one(aa_1)
            aa_ref = Bio.PDB.Polypeptide.three_to_one(aa_ref)
            
            # Calculate the BLOSUM score\
            score = int(aa_1 == aa_ref)
            # Accumulate the score
            total_score += score
            gt_score += 1
            position_count += 1
            
        except KeyError:
            # If the residue is not present in one of the structures, skip
            continue
    
    # Calculate the recovery rate
    if position_count == 0:
        return 0  # To avoid division by zero
    recovery_rate = total_score / gt_score
    print(f"Total Score: {total_score}")
    print(f"GT Score: {gt_score}")
    print(f"Position Count: {position_count}")
    return recovery_rate


# Example usage
folder = '/hanchenchen/ab_opt/AbDesign/test_results_20240725/codesign_single/0-5xku_C_B_A/H_CDR3'
chain_id = 'C'
start = 95
end = 102

recovery_rate = calculate_recovery_rate(
    folder + '/0000.pdb', 
    folder + '/REF1.pdb', 
    chain_id, 
    start, 
    end,
    )
print(f"Recovery Rate: {recovery_rate:.5f}")



Total Score: 3
GT Score: 8
Position Count: 8
Recovery Rate: 0.37500


In [17]:
import glob
import tqdm
import pandas as pd
import json

pdb_list = []
recovery_rate_list = []
recovery_rate_w_blosum_list = []
for directory in tqdm.tqdm(glob.glob('/hanchenchen/ab_opt/AbDesign/test_results/codesign_single/*/H_CDR3')):
    metadata_path = directory.replace('H_CDR3', 'metadata.json')
    metadata = json.load(open(metadata_path))
    assert len(metadata['items']) == 1
    chain_id = metadata['items'][0]["residue_first"][0]
    start = metadata['items'][0]["residue_first"][1]
    end = metadata['items'][0]["residue_last"][1]
    recovery_rate = calculate_recovery_rate(
        directory + '/0000.pdb', 
        directory + '/REF1.pdb', 
        chain_id, 
        start, 
        end,
        )
    recovery_rate_w_blosum = calculate_recovery_rate_w_blosum(
        directory + '/0000.pdb', 
        directory + '/REF1.pdb', 
        chain_id, 
        start, 
        end,
        )
    pdb_list.append(directory.split('/')[-2])
    recovery_rate_list.append(recovery_rate)
    recovery_rate_w_blosum_list.append(recovery_rate_w_blosum)

df = pd.DataFrame({
    'pdb': pdb_list, 
    'recovery_rate': recovery_rate_list,
    'recovery_rate_w_blosum': recovery_rate_w_blosum_list,
    })
df

  5%|▌         | 1/19 [00:00<00:03,  4.96it/s]

Total Score: 4
GT Score: 8
Position Count: 8
Total Score: 16
GT Score: 47
Position Count: 8


 11%|█         | 2/19 [00:00<00:05,  3.10it/s]

Total Score: 2
GT Score: 8
Position Count: 8
Total Score: 3
GT Score: 46
Position Count: 8


 16%|█▌        | 3/19 [00:00<00:03,  4.39it/s]

Total Score: 2
GT Score: 8
Position Count: 8
Total Score: 6
GT Score: 36
Position Count: 8


 21%|██        | 4/19 [00:01<00:03,  3.90it/s]

Total Score: 4
GT Score: 8
Position Count: 8
Total Score: 15
GT Score: 51
Position Count: 8
Total Score: 1
GT Score: 8
Position Count: 8


 26%|██▋       | 5/19 [00:01<00:03,  4.20it/s]

Total Score: -6
GT Score: 44
Position Count: 8


 32%|███▏      | 6/19 [00:02<00:06,  2.02it/s]

Total Score: 4
GT Score: 8
Position Count: 8
Total Score: 16
GT Score: 47
Position Count: 8
Total Score: 1
GT Score: 7
Position Count: 7
Total Score: -4
GT Score: 39
Position Count: 7


 42%|████▏     | 8/19 [00:02<00:03,  2.94it/s]

Total Score: 2
GT Score: 8
Position Count: 8
Total Score: 3
GT Score: 51
Position Count: 8
Total Score: 4
GT Score: 8
Position Count: 8


 47%|████▋     | 9/19 [00:02<00:03,  3.16it/s]

Total Score: 22
GT Score: 46
Position Count: 8
Total Score: 6
GT Score: 8
Position Count: 8


 58%|█████▊    | 11/19 [00:03<00:02,  3.73it/s]

Total Score: 28
GT Score: 46
Position Count: 8
Total Score: 4
GT Score: 8
Position Count: 8
Total Score: 24
GT Score: 46
Position Count: 8


 63%|██████▎   | 12/19 [00:03<00:02,  3.42it/s]

Total Score: 1
GT Score: 8
Position Count: 8
Total Score: -6
GT Score: 44
Position Count: 8


 68%|██████▊   | 13/19 [00:04<00:02,  2.94it/s]

Total Score: 3
GT Score: 8
Position Count: 8
Total Score: 10
GT Score: 42
Position Count: 8
Total Score: 4
GT Score: 8
Position Count: 8


 79%|███████▉  | 15/19 [00:04<00:00,  4.27it/s]

Total Score: 15
GT Score: 51
Position Count: 8
Total Score: 4
GT Score: 8
Position Count: 8
Total Score: 16
GT Score: 47
Position Count: 8


 84%|████████▍ | 16/19 [00:04<00:00,  3.20it/s]

Total Score: 3
GT Score: 8
Position Count: 8
Total Score: 13
GT Score: 46
Position Count: 8
Total Score: 4
GT Score: 8
Position Count: 8
Total Score: 15
GT Score: 47
Position Count: 8


100%|██████████| 19/19 [00:05<00:00,  3.54it/s]

Total Score: 5
GT Score: 8
Position Count: 8
Total Score: 24
GT Score: 41
Position Count: 8
Total Score: 5
GT Score: 8
Position Count: 8
Total Score: 22
GT Score: 41
Position Count: 8





Unnamed: 0,pdb,recovery_rate,recovery_rate_w_blosum
0,4-5tlk_B_A_X,0.5,0.340426
1,7-5w9h_H_I_G,0.25,0.065217
2,10-7bwj_H_L_E,0.25,0.166667
3,18-5tlk_D_C_X,0.5,0.294118
4,17-7che_A_B_R,0.125,-0.136364
5,8-5tlj_B_A_X,0.5,0.340426
6,13-8ds5_C_B_A,0.142857,-0.102564
7,5-5tlj_D_C_X,0.25,0.058824
8,12-5w9h_B_C_A,0.5,0.478261
9,2-7chf_H_L_R,0.75,0.608696


In [19]:
df[['recovery_rate', 'recovery_rate_w_blosum']].mean()

TypeError: Could not convert ['4-5tlk_B_A_X7-5w9h_H_I_G10-7bwj_H_L_E18-5tlk_D_C_X17-7che_A_B_R8-5tlj_B_A_X13-8ds5_C_B_A5-5tlj_D_C_X12-5w9h_B_C_A2-7chf_H_L_R11-7d6i_B_C_A1-7chf_A_B_R0-5xku_C_B_A16-5tlk_H_G_Y6-5tlk_F_E_Y15-5w9h_E_F_D9-5tl5_H_L_A3-7che_H_L_R14-7chb_H_L_R'] to numeric

In [18]:
df.to_csv('/hanchenchen/ab_opt/rebuttal_exps/6_recovery_rate_w_blosum/recovery_rate_w_blosum.csv', index=False)