In [2]:
from Bio import PDB
from Bio.SeqUtils import seq1
import numpy as np
import os

def extract_sequence(pdb_file, chain_id, start, end):
    parser = PDB.PDBParser(QUIET=True)
    structure = parser.get_structure('X', pdb_file)
    chain = structure[0][chain_id]
    sequence = ""
    for residue in chain:
        res_id = residue.get_id()[1]
        if start <= res_id <= end:
            sequence += seq1(residue.get_resname())
    return sequence

def sequence_identity(seq1, seq2):
    length = min(len(seq1), len(seq2))
    matches = sum(a == b for a, b in zip(seq1[:length], seq2[:length]))
    return matches / length

def pairwise_identity_matrix(folder, chain_id, start, end):
    pdb_files = [os.path.join(folder, f) for f in os.listdir(folder) if f.endswith('.pdb') and not f.startswith('REF') and not f.endswith('_patch.pdb')]
    num_files = len(pdb_files)
    identity_matrix = np.zeros((num_files, num_files))

    sequences = [extract_sequence(pdb_file, chain_id, start, end) for pdb_file in pdb_files]

    for i in range(num_files):
        for j in range(i+1, num_files):
            identity_matrix[i, j] = sequence_identity(sequences[i], sequences[j])
    print(sequences)
    upper_triangle_mean = np.mean(identity_matrix[np.triu_indices(num_files, k=1)])
    return upper_triangle_mean


# Example usage
folder = '/hanchenchen/ab_opt/AbDesign/test_results_20240725/codesign_single/0-5xku_C_B_A/H_CDR3'
chain_id = 'C'
start = 95
end = 102

average_identity = pairwise_identity_matrix(folder, chain_id, start, end)
print(f'Average pairwise sequence identity (upper triangle): {average_identity:.4f}')



['GGNTITGSGTSGLDV', 'GGNTITGSGTSGLDV', 'GGNTITGSGTSGLDV', 'GGNTITGSGTSGLDV', 'GGNTITGSGTSGLDV', 'GGNTITGSGTSGLDV', 'GGNTITGSGTSGLDV', 'GGNTITGSGTSGLDV', 'GGNTITGSGTSGLDV', 'GGNTITGSGTSGLDV']
Average pairwise sequence identity (upper triangle): 1.0000


In [5]:
import glob
import tqdm
import pandas as pd
import json

pdb_list = []
diversity_list = []
for directory in tqdm.tqdm(glob.glob('/hanchenchen/ab_opt/AbDesign/test_results/codesign_single/*/H_CDR3')):
    metadata_path = directory.replace('H_CDR3', 'metadata.json')
    metadata = json.load(open(metadata_path))
    assert len(metadata['items']) == 1
    chain_id = metadata['items'][0]["residue_first"][0]
    start = metadata['items'][0]["residue_first"][1]
    end = metadata['items'][0]["residue_last"][1]
    diversity = pairwise_identity_matrix(directory, chain_id, start, end)
    pdb_list.append(directory.split('/')[-2])
    diversity_list.append(diversity)

df = pd.DataFrame({'pdb': pdb_list, 'averaged_indentity': diversity_list})
df

  5%|▌         | 1/19 [00:00<00:08,  2.03it/s]

['SGYTYDLAMDY', 'SGYTYDLAMDY', 'SGYTYDLAMDY', 'SGYTYDLAMDY', 'SGYTYDLAMDY', 'SGYTYDLAMDY', 'SGYTYDLAMDY', 'SGYTYDLAMDY', 'SGYTYDLAMDY', 'SGYTYDLAMDY']


 11%|█         | 2/19 [00:01<00:15,  1.09it/s]

['EGTGGYTGAAMDY', 'EGTGGYTGAAMDY', 'EGTGGYTGAAMDY', 'EGTGGYTGAAMDY', 'EGTGGYTGAAMDY', 'EGTGGYTGAAMDY', 'EGTGGYTGAAMDY', 'EGTGGYTGAAMDY', 'EGTGGYTGAAMDY', 'EGTGGYTGAAMDY']


 16%|█▌        | 3/19 [00:01<00:09,  1.60it/s]

['GPLEYTGAGAYYYNWFDI', 'GPLEYTGAGAYYYNWFDI', 'GPLEYTGAGAYYYNWFDI', 'GPLEYTGAGAYYYNWFDI', 'GPLEYTGAGAYYYNWFDI', 'GPLEYTGAGAYYYNWFDI', 'GPLEYTGAGAYYYNWFDI', 'GPLEYTGAGAYYYNWFDI', 'GPLEYTGAGAYYYNWFDI', 'GPLEYTGAGAYYYNWFDI']


 21%|██        | 4/19 [00:02<00:07,  1.97it/s]

['DIGWRFDY', 'DIGWRFDY', 'DIGWRFDY', 'DIGWRFDY', 'DIGWRFDY', 'DIGWRFDY', 'DIGWRFDY', 'DIGWRFDY', 'DIGWRFDY', 'DIGWRFDY']


 26%|██▋       | 5/19 [00:02<00:06,  2.11it/s]

['AELGDGSSVYYYGLDV', 'AELGDGSSVYYYGLDV', 'AELGDGSSVYYYGLDV', 'AELGDGSSVYYYGLDV', 'AELGDGSSVYYYGLDV', 'AELGDGSSVYYYGLDV', 'AELGDGSSVYYYGLDV', 'AELGDGSSVYYYGLDV', 'AELGDGSSVYYYGLDV', 'AELGDGSSVYYYGLDV']


 32%|███▏      | 6/19 [00:03<00:05,  2.42it/s]

['SGYTYDLAMDY', 'SGYTYDLAMDY', 'SGYTYDLAMDY', 'SGYTYDLAMDY', 'SGYTYDLAMDY', 'SGYTYDLAMDY', 'SGYTYDLAMDY', 'SGYTYDLAMDY', 'SGYTYDLAMDY', 'SGYTYDLAMDY']


 37%|███▋      | 7/19 [00:03<00:06,  1.95it/s]

['GYYIFDI', 'GYYIFDI', 'GYYIFDI', 'GYYIFDI', 'GYYIFDI', 'GYYIFDI', 'GYYIFDI', 'GYYIFDI', 'GYYIFDI', 'GYYIFDI']


 42%|████▏     | 8/19 [00:04<00:07,  1.41it/s]

['VVGTGMDV', 'VVGTGMDV', 'VVGTGMDV', 'VGGTGMDV', 'VGGTGMDV', 'VVGTGMDV', 'VVGTGMDV', 'VVGTGMDV', 'VVGTGMDV', 'VGGTGMDV']


 47%|████▋     | 9/19 [00:05<00:06,  1.50it/s]

['EGYYGYGGAAFDY', 'EGYYGYGGAAFDY', 'EGYYGYGGAAFDY', 'EGYYGYGGAAFDY', 'EGYYGYGGAAFDY', 'EGYYGYGGAAFDY', 'EGYYGYGGAAFDY', 'EGYYGYGGAAFDY', 'EGYYGYGGAAFDY', 'EGYYGYGGAAFDY']


 53%|█████▎    | 10/19 [00:05<00:05,  1.66it/s]

['DLVVYGMDV', 'DLVVYGMDV', 'DLQVYGMDV', 'DLVVYGMDV', 'DLVVYGMDV', 'DLVVYGMDV', 'DLVVYGMDV', 'DLQVYGMDV', 'DLVVYGMDV', 'DLVVYGMDV']


 58%|█████▊    | 11/19 [00:06<00:05,  1.54it/s]

['LWWLSGYFDI', 'LWWLSGYFDI', 'LWWLSGYFDI', 'LWWLSGYFDI', 'LWWLSGYFDI', 'LWWLSGYFDI', 'LWWLSGYFDI', 'LWWLSGYFDI', 'LWWLSGYFDI', 'LWWLSGYFDI']


 63%|██████▎   | 12/19 [00:07<00:05,  1.40it/s]

['AELGDGSSVYYYGLDV', 'AELGDGSSVYYYGLDV', 'AELGDGSSVYYYGLDV', 'AELGDGSSVYYYGLDV', 'AELGDGSSVYYYGLDV', 'AELGDGSSVYYYGLDV', 'AELGDGSSVYYYGLDV', 'AELGDGSSVYYYGLDV', 'AELGDGSSVYYYGLDV', 'AELGDGSSVYYYGLDV']


 68%|██████▊   | 13/19 [00:08<00:04,  1.48it/s]

['GGNTITGSGTSGLDV', 'GGNTITGSGTSGLDV', 'GGNTITGSGTSGLDV', 'GGNTITGSGTSGLDV', 'GGNTITGSGTSGLDV', 'GGNTITGSGTSGLDV', 'GGNTITGSGTSGLDV', 'GGNTITGSGTSGLDV', 'GGNTITGSGTSGLDV', 'GGNTITGSGTSGLDV']


 74%|███████▎  | 14/19 [00:08<00:02,  1.79it/s]

['DIGWRFDY', 'DIGWRFDY', 'DIGWRFDY', 'DIGWRFDY', 'DIGWRFDY', 'DIGWRFDY', 'DIGWRFDY', 'DIGWRFDY', 'DIGWRFDY', 'DIGWRFDY']


 79%|███████▉  | 15/19 [00:09<00:02,  1.73it/s]

['SGYTYDLAMDY', 'SGYTYDLAMDY', 'SGYTYDLAMDY', 'SGYTYDLAMDY', 'SGYTYDLAMDY', 'SGYTYDLAMDY', 'SGYTYDLAMDY', 'SGYTYDLAMDY', 'SGYTYDLAMDY', 'SGYTYDLAMDY']


 84%|████████▍ | 16/19 [00:09<00:01,  1.58it/s]

['EGTYGYAGAAFDY', 'EGTYGYAGAAFDY', 'EGTYGYAGAAFDY', 'EGTYGYAGAAFDY', 'EGTYGYAGAAFDY', 'EGTYGYAGAAFDY', 'EGTYGYAGAAFDY', 'EGTYGYAGAAFDY', 'EGTYGYAGAAFDY', 'EGTYGYAGAAFDY']


 89%|████████▉ | 17/19 [00:10<00:01,  1.90it/s]

['SYYTRDLAMDY', 'SYYTRDLAMDY', 'SYYTRDLAMDY', 'SYYTRDLAMDY', 'SYYTRDLAMDY', 'SYYTRDLAMDY', 'SYYTRDLAMDY', 'SYYTRDLAMDY', 'SYYTRDLAMDY', 'SYYTRDLAMDY']


 95%|█████████▍| 18/19 [00:10<00:00,  2.15it/s]

['DLAGGGGLDV', 'DLAGGGGLDV', 'DLAGGGGLDV', 'DLAGGGGLDV', 'DLAGGGGLDV', 'DLAGGGGLDV', 'DLAGGGGLDV', 'DLAGGGGLDV', 'DLAGGGGLDV', 'DLAGGGGLDV']


100%|██████████| 19/19 [00:10<00:00,  1.75it/s]

['DLAGYGGLDV', 'DLAGYGGLDV', 'DLAGYGGLDV', 'DLAGYGGLDV', 'DLAGYGGLDV', 'DLAGYGGLDV', 'DLAGYGGLDV', 'DLAGYGGLDV', 'DLAGYGGLDV', 'DLAGYGGLDV']





Unnamed: 0,pdb,averaged_indentity
0,4-5tlk_B_A_X,1.0
1,7-5w9h_H_I_G,1.0
2,10-7bwj_H_L_E,1.0
3,18-5tlk_D_C_X,1.0
4,17-7che_A_B_R,1.0
5,8-5tlj_B_A_X,1.0
6,13-8ds5_C_B_A,1.0
7,5-5tlj_D_C_X,0.941667
8,12-5w9h_B_C_A,1.0
9,2-7chf_H_L_R,0.960494


In [6]:
df.to_csv('/hanchenchen/ab_opt/rebuttal_exps/2_diversity/results/averaged_upper_triangle_identity.csv', index=False)