In [101]:
from functools import lru_cache


@lru_cache(maxsize=8)
def _load_structure(pdb_id):
    """Fetch `pdb_id` from the RCSB in MMTF format and return its first model.

    The result is cached per PDB ID, so calling `get_contacts_of` for many
    base positions of the same entry downloads and parses the file only once.
    The cached atom array is shared between calls and must not be modified
    in place; the selections made from it below use boolean-mask indexing.
    """
    mmtf_file = mmtf.MMTFFile.read(rcsb.fetch(pdb_id, "mmtf"))
    return mmtf.get_structure(mmtf_file, model=1)


def _contact_labels(protein, cell_list, radius):
    """Label the residues of `protein` having an atom within `radius` of the cell list's atoms.

    Returns a list of strings such as ``"Arg35"`` (capitalized residue name
    followed by the residue ID), ordered by ascending residue ID.
    """
    # One row per protein atom; each row holds the indices of the nearby
    # cell-list atoms, padded with -1 where there are fewer hits
    contacts = cell_list.get_atoms(protein.coord, radius=radius)
    # Only retain protein atoms in contact with at least one DNA atom
    has_contact = (contacts != -1).any(axis=1)

    labels = []
    # np.unique removes duplicate residue IDs and returns them sorted
    for res_id in np.unique(protein.res_id[has_contact]):
        res_name = protein.res_name[protein.res_id == res_id][0]
        labels.append(res_name.capitalize() + str(res_id))
    return labels


def get_contacts_of(pdb_id, i, THRESHOLD_DISTANCE = 4.0):
    """Find the protein residues in contact with one DNA base pair.

    The base pair consists of residue `i` on chain "Z" and residue
    ``19 - (i - 2)`` on chain "X".  A protein residue counts as a contact
    if any of its atoms lies within `THRESHOLD_DISTANCE` of any atom of
    either of the two nucleotides.

    Parameters
    ----------
    pdb_id : str
        PDB ID of the structure to fetch from the RCSB.
    i : int
        Residue ID of the base on chain "Z".
    THRESHOLD_DISTANCE : float, optional
        Maximum atom-atom distance that is considered a contact.

    Returns
    -------
    base : str
        Residue name and residue ID of the chain "Z" nucleotide, e.g.
        ``"DA2"``.  If that nucleotide is absent from the structure, the
        label of the chain "X" nucleotide is returned instead.
    contacts_A, contacts_B : list of str
        Labels (e.g. ``"Arg35"``) of the contacting residues of protein
        chains "A" and "B", each ordered by ascending residue ID.

    Raises
    ------
    ValueError
        If neither nucleotide of the base pair is present in the structure.
    """
    structure = _load_structure(pdb_id)

    base_id = i
    # Residue on the complementary strand paired with `i`.
    # NOTE(review): the constants hard-code the strand numbering of one
    # particular duplex -- confirm before using with another PDB entry.
    comp_base_id = 19 - (i - 2)

    # Select the two nucleotides of the base pair, one from each strand
    on_strand_z = (structure.chain_id == "Z") & (structure.res_id == base_id)
    on_strand_x = (structure.chain_id == "X") & (structure.res_id == comp_base_id)
    dna = structure[on_strand_z | on_strand_x]
    if dna.array_length() == 0:
        raise ValueError(
            f"Structure '{pdb_id}' has neither residue {base_id} on chain Z "
            f"nor residue {comp_base_id} on chain X"
        )

    # Protein chains without hetero atoms (ligands, water, ...).
    # The two chains are evaluated independently of each other, so they
    # are not required to contain the same number of residues.
    protein_A = structure[(structure.chain_id == "A") & ~structure.hetero]
    protein_B = structure[(structure.chain_id == "B") & ~structure.hetero]

    # The returned label describes the chain "Z" nucleotide; fall back to
    # the chain "X" nucleotide if the former is missing
    label_atoms = dna[dna.chain_id == "Z"]
    if label_atoms.array_length() == 0:
        label_atoms = dna
    base = f"{label_atoms.res_name[0]}{label_atoms.res_id[0]}"

    # Fast identification of contacts via a cell list:
    # The cell list is initialized with the coordinates of the DNA
    # and later queried with the atom coordinates of each protein chain
    cell_list = struc.CellList(dna, cell_size=THRESHOLD_DISTANCE)

    contacts_A = _contact_labels(protein_A, cell_list, THRESHOLD_DISTANCE)
    contacts_B = _contact_labels(protein_B, cell_list, THRESHOLD_DISTANCE)

    return base, contacts_A, contacts_B

In [102]:
# Collect, for every base position of interest, the base label and the
# contacting residues of both protein chains into column-wise lists
columns = ('Base', 'contacts of chain A', 'contacts of chain B')
data = {column: [] for column in columns}
for i in range(2, 18):
    base, A, B = get_contacts_of("5jub", i)
    # The return order of get_contacts_of matches the column order
    for column, value in zip(columns, (base, A, B)):
        data[column].append(value)

In [103]:
import pandas as pd
# One row per base; the dictionary keys become the column headers
df = pd.DataFrame.from_dict(data)
df

Unnamed: 0,Base,contacts of chain A,contacts of chain B
0,DA2,[],[]
1,DG3,[],[Lys42]
2,DT4,[],"[Arg35, Ile38, Lys42]"
3,DG5,[],[Arg35]
4,DA6,[],"[Thr33, Arg35, Arg39]"
5,DC7,[],"[Glu31, Leu32, Thr33, Gln36, Arg39, Arg51]"
6,DA8,[],"[Gln36, Arg39, Ile40, Ser45, Leu46, Pro47, Ser..."
7,DT9,[],"[Gln36, Glu44, Ser45, Leu46, Ser48]"
8,DA10,[],[]
9,DT11,[],[]


In [104]:
# Write the contact table to disk, leaving out the positional row index
output_path = 'contacts.csv'
df.to_csv(output_path, index=False)