### Plan: Project interaction data onto the structure
Work with TEM-1 beta-lactamase.

#### Plan:
- Load all contact data and key each contact by its residue numbers.
- Identify all TEM-1 residues that should be looked at.
- Convert TEM-1 residue numbers to MSA residue numbers.
- Convert all contact matrices to MSA-scored matrices.
- For each TEM-1 residue, identify its contacts and how frequently each one is conserved.
- Output results.


- Can add filtering etc... at a later date

In [1]:
from typing import Any
import re
import pandas as pd
import numpy as np

In [2]:
"""
Set of functions to make pymol projections of the contacts data results.

"""
from pathlib import Path

def project_pymol_res_res_scores(
    res_res_scores: dict[tuple[int, int], float],
    out_file: str,
    res_res_colours: dict[tuple[int, int], str]
) -> None:
    """
    Write out a PyMOL compatible python script to project residue-residue scores.
    The scores will be depicted as cylinders between each interacting residue pair.
    The cylinder radius is the score of the pair and the cylinder colour is
    looked up in ``res_res_colours``.

    Parameters
    ----------
    res_res_scores : dict[tuple[int, int], float]
        Keys are the residue pair and each value is their associated scores.
        Key formatting is a tuple of 2 integers, residue number 1 and residue number 2.
        The value is written out unchanged as the cylinder radius.

    out_file: str
        Save the file to this path.

    res_res_colours: dict[tuple[int, int], str]
        Colour to use for each residue pair (e.g. "red"); the value is written
        verbatim after ``color=``. Must contain every key of ``res_res_scores``.
        Will need refactoring later.

    Raises
    ------
    KeyError
        If a residue pair in ``res_res_scores`` has no entry in ``res_res_colours``.
    """
    # Collect the pieces in a list and join once, rather than growing a string.
    # Header of output file.
    out_file_lines = [
        "# You can run me in several ways, perhaps the easiest way is to:\n",
        "# 1. Load the PDB file of your system in PyMOL.\n",
        "# 2. Download and run the draw_links.py script.\n",
        "# It can be obtained from: ",
        "# http://pldserver1.biochem.queensu.ca/~rlc/work/pymol/draw_links.py \n",
        "# 3. Type: @[FILE_NAME.py] in the command line.\n",
        "# 4. Make sure the .py files are in the same directory as the pdb.\n",
    ]

    # Main section: one draw_links command per residue pair.
    for (res1, res2), cylinder_radii in res_res_scores.items():
        color = res_res_colours[(res1, res2)]
        out_file_lines.append(
            f"draw_links selection1=resi {res1}, "
            f"selection2=resi {res2}, "
            f"color={color}, "
            f"radius={cylinder_radii} \n"
        )

    # Finally, group all cylinders made together,
    # (easier for user to handle in PyMOL)
    out_file_lines.append("group All_Cylinders, link*\n")

    # Save file. Plain write mode is enough, the file is never read back here.
    out_file_safe = Path(out_file)
    with open(out_file_safe, "w", encoding="utf-8") as file_out:
        file_out.write("".join(out_file_lines))
    print(f"The file: {out_file_safe} was written to disk.")


def rescale_scores(
    input_dict: dict[tuple[int, int], float], max_value: float = 0.5
) -> dict[tuple[int, int], float]:
    """
    Rescale a dictionary containing per residue or residue-residue scores/counts etc..

    Every value is divided by ``max(values) / max_value`` and rounded to 4 decimal
    places, so the largest input value maps onto ``max_value``.

    Parameters
    ----------
    input_dict : dict[tuple[int, int], float]
        Keys are the residue pairs, values are the associated value for the pair.
        Key formatting is a tuple of 2 integers, residue number 1 and residue number 2.

    max_value : float
        Rescale the values so that this is the max value in the returned dictionary
        Default = 0.5, good for PyMOL residue-residue connection representation.

    Returns
    ----------
    dict[tuple[int, int], float]
        Rescaled scores, has the same formatting as the input dictionary.
        An empty input gives an empty dictionary. If the largest input value is 0
        there is nothing to scale by, so the values are returned rounded but
        otherwise unchanged.

    Raises
    ------
    ZeroDivisionError
        If ``max_value`` is 0 while the largest input value is not.
    """
    # Guard: max() of an empty sequence would raise ValueError.
    if not input_dict:
        return {}

    max_strength = max(input_dict.values())

    # Guard: a zero maximum would give a zero scale factor and a division by zero.
    if max_strength == 0:
        return {key: round(curr_value, 4) for key, curr_value in input_dict.items()}

    scale_factor = max_strength / max_value
    return {
        key: round(curr_value / scale_factor, 4)
        for key, curr_value in input_dict.items()
    }


#### Load the structure names and MSA data, and convert each protein's contacts to MSA numbering

In [3]:
# Read one structure name per line, stripping trailing whitespace/newlines.
with open(r"../structure_names.txt") as file:
    structure_names = [line.rstrip() for line in file]
# Drop 1BSG for now. NOTE: list.remove raises ValueError if the name is not in the file.
structure_names.remove("1BSG_Sabla") # temporary
# Bare expression: displays the remaining names as the cell output.
structure_names

['1BTL_tem1', '1BUE_NmcA', '1BZA_Toho1', '1E25_Per1', '3BLM_blaZ']

In [4]:
# MSA data
msa_data = np.genfromtxt(r"../pos_ranking_nostar.dat", names=True, dtype=None, encoding="utf-8")
msa_df = pd.DataFrame.from_records(msa_data)
msa_columns = list(msa_df.columns)
msa_columns

['pos_scr', '1BSG', '1BTL', '1BUE', '1BZA', '1E25', '3BLM']

In [5]:
def pdb_to_msa_indexs(
    msa_sequence: list[str],
    to_convert: list[tuple[int, int]],
) -> list[tuple[int, int]]:
    """
    Convert residue pairs from a protein's own numbering to MSA numbering.

    The protein ("pdb") number of a residue is taken to be its 1-based rank among
    the non-gap entries of ``msa_sequence``; its MSA number is the 1-based position
    of that entry in ``msa_sequence``.
    NOTE(review): this assumes the residue numbers in ``to_convert`` run
    sequentially from 1 with no offset or insertions - confirm against the
    contact files.

    Parameters
    ----------
    msa_sequence : list[str]
        The aligned sequence of one protein, one entry per MSA position.
        Gaps are marked with "-".

    to_convert : list[tuple[int, int]]
        Residue pairs in the protein's own numbering.

    Returns
    ----------
    list[tuple[int, int]]
        The same pairs, in the same order, expressed in MSA numbering.

    Raises
    ------
    KeyError
        If a residue number in ``to_convert`` is larger than the number of
        non-gap entries in ``msa_sequence`` (or smaller than 1).
    """
    # Build the lookup: protein residue number -> MSA position.
    index_pdb_msa = {}
    curr_pdb_numb = 0
    for curr_msa_number, msa_residue in enumerate(msa_sequence, start=1):
        if msa_residue == "-":
            # Gaps consume an MSA position but no residue of the protein.
            continue
        curr_pdb_numb += 1
        index_pdb_msa[curr_pdb_numb] = curr_msa_number

    return [
        (index_pdb_msa[pdb_res1], index_pdb_msa[pdb_res2])
        for pdb_res1, pdb_res2 in to_convert
    ]

In [6]:
# Matches every run of digits in a contact column label.
residue_number_pattern = re.compile(r"\d+")

raw_contacts = {}
for protein in structure_names:
    df = pd.read_csv(f"../{protein}_contacts.csv")

    # Each column label is expected to hold exactly two integers:
    # the residue numbers of the interacting pair.
    pdb_pairs = []
    for column_label in df.columns:
        first_res, second_res = residue_number_pattern.findall(column_label)
        pdb_pairs.append((int(first_res), int(second_res)))

    # The MSA file labels proteins by their 4 character ID only.
    protein_name = protein[:4]

    # Store the contacts of this protein in MSA numbering.
    raw_contacts[protein_name] = pdb_to_msa_indexs(
        msa_sequence=list(msa_df[protein_name]),
        to_convert=pdb_pairs,
    )

In [7]:
# Inspect the contact pairs stored for 1BTL (as returned by pdb_to_msa_indexs).
print(raw_contacts['1BTL'])

[(1, 4), (1, 5), (1, 22), (1, 24), (1, 34), (1, 284), (2, 5), (2, 6), (2, 34), (3, 6), (3, 8), (3, 276), (3, 283), (3, 284), (4, 8), (4, 9), (4, 22), (4, 24), (4, 273), (4, 276), (4, 277), (4, 284), (5, 9), (5, 10), (5, 22), (5, 34), (5, 35), (5, 36), (6, 10), (6, 11), (8, 11), (8, 12), (8, 269), (8, 272), (8, 273), (8, 276), (9, 12), (9, 13), (9, 20), (9, 22), (9, 36), (9, 273), (10, 13), (10, 14), (10, 36), (11, 14), (11, 15), (11, 269), (12, 15), (12, 16), (12, 20), (12, 269), (12, 273), (13, 16), (13, 17), (13, 18), (13, 19), (13, 20), (13, 36), (13, 37), (14, 17), (15, 260), (15, 264), (15, 268), (15, 269), (16, 20), (16, 255), (16, 258), (16, 259), (16, 260), (16, 264), (16, 265), (16, 269), (17, 257), (17, 258), (17, 259), (18, 255), (18, 256), (18, 257), (18, 258), (19, 37), (19, 40), (19, 41), (19, 42), (19, 254), (19, 255), (19, 256), (20, 36), (20, 37), (20, 42), (20, 253), (20, 254), (20, 255), (20, 273), (21, 36), (21, 37), (21, 42), (21, 167), (21, 251), (21, 253), (21, 2

### Edits here.
The above contacts are not MSA indexed, so fixing this now. 

#### Figure out which MSA positions TEM-1 and the other proteins actually occupy (i.e. are not gaps)

In [8]:
all_msa_res_numbs = {}
for protein in msa_columns[2:]: # TODO - change to 1 later, once 1BSG included.
    # Keep the 1-based MSA positions at which this protein has a residue (not a gap).
    all_msa_res_numbs[protein] = [
        msa_position
        for msa_position, symbol in enumerate(msa_df[protein], start=1)
        if symbol != "-"
    ]

print(all_msa_res_numbs["1BTL"])

[1, 2, 3, 4, 5, 6, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 81, 82, 83, 84, 85, 86, 87, 88, 89, 92, 93, 94, 95, 96, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 222, 227, 228, 229, 230, 231, 232, 233, 234,

### Count all contacts, mark if found in TEM1 or not.

Tem1 = 1BTL

In [9]:
# Display the protein keys of both dictionaries side by side, to compare them.
raw_contacts.keys(), all_msa_res_numbs.keys()

(dict_keys(['1BTL', '1BUE', '1BZA', '1E25', '3BLM']),
 dict_keys(['1BTL', '1BUE', '1BZA', '1E25', '3BLM']))

In [10]:
key_protein = "1BTL"

# MSA positions occupied by the key protein; a set makes each membership test O(1).
key_protein_residues = set(all_msa_res_numbs[key_protein])

# Collect every unique contact (regardless of residue order) seen in any protein,
# keeping only those that the key protein could form, i.e. both MSA positions
# are occupied in the key protein. First-seen order is preserved in the list;
# the set only serves as a fast "already recorded" lookup.
all_possible_contacts = []
recorded_contacts = set()
for contacts in raw_contacts.values():
    for msa_res1, msa_res2 in contacts:
        if (msa_res1, msa_res2) in recorded_contacts or (msa_res2, msa_res1) in recorded_contacts:
            continue
        # check if possible for projecting protein to make this contact:
        if msa_res1 in key_protein_residues and msa_res2 in key_protein_residues:
            all_possible_contacts.append((msa_res1, msa_res2))
            recorded_contacts.add((msa_res1, msa_res2))

print(len(all_possible_contacts), all_possible_contacts)

1368 [(1, 4), (1, 5), (1, 22), (1, 24), (1, 34), (1, 284), (2, 5), (2, 6), (2, 34), (3, 6), (3, 8), (3, 276), (3, 283), (3, 284), (4, 8), (4, 9), (4, 22), (4, 24), (4, 273), (4, 276), (4, 277), (4, 284), (5, 9), (5, 10), (5, 22), (5, 34), (5, 35), (5, 36), (6, 10), (6, 11), (8, 11), (8, 12), (8, 269), (8, 272), (8, 273), (8, 276), (9, 12), (9, 13), (9, 20), (9, 22), (9, 36), (9, 273), (10, 13), (10, 14), (10, 36), (11, 14), (11, 15), (11, 269), (12, 15), (12, 16), (12, 20), (12, 269), (12, 273), (13, 16), (13, 17), (13, 18), (13, 19), (13, 20), (13, 36), (13, 37), (14, 17), (15, 260), (15, 264), (15, 268), (15, 269), (16, 20), (16, 255), (16, 258), (16, 259), (16, 260), (16, 264), (16, 265), (16, 269), (17, 257), (17, 258), (17, 259), (18, 255), (18, 256), (18, 257), (18, 258), (19, 37), (19, 40), (19, 41), (19, 42), (19, 254), (19, 255), (19, 256), (20, 36), (20, 37), (20, 42), (20, 253), (20, 254), (20, 255), (20, 273), (21, 36), (21, 37), (21, 42), (21, 167), (21, 251), (21, 253), (

In [11]:
# Set copies of the per-protein data so that each membership test below is O(1)
# instead of a scan through a list.
contact_sets = {protein: set(prot_contacts) for protein, prot_contacts in raw_contacts.items()}
residue_sets = {protein: set(residues) for protein, residues in all_msa_res_numbs.items()}

# For every candidate contact count, over all proteins, how often it is
#   found        - the contact is present (in either residue order),
#   not_possible - at least one of the two MSA positions is a gap in that protein,
#   not_found    - both positions are occupied but the contact is absent.
contact_information = {}
for msa_res1, msa_res2 in all_possible_contacts:

    found, not_found, not_possible = 0, 0, 0
    present_in_key = False
    for protein, prot_contacts in contact_sets.items():

        # check if interaction in this one.
        if (msa_res1, msa_res2) in prot_contacts or (msa_res2, msa_res1) in prot_contacts:
            found += 1
            if protein == key_protein:
                present_in_key = True

        # check to see if possible to form the interaction.
        elif (msa_res1 not in residue_sets[protein]) or (msa_res2 not in residue_sets[protein]):
            not_possible += 1

        else:
            not_found += 1

    # Occupancy only considers the proteins that could form the contact.
    occupancy = found / (found + not_found)
    contact_information[msa_res1, msa_res2] = {"occupancy": occupancy, "found": found, "not_found": not_found,
                                                "not_possible": not_possible, "present in key": present_in_key}
print(contact_information)

{(1, 4): {'occupancy': 0.5, 'found': 2, 'not_found': 2, 'not_possible': 1, 'present in key': True}, (1, 5): {'occupancy': 1.0, 'found': 4, 'not_found': 0, 'not_possible': 1, 'present in key': True}, (1, 22): {'occupancy': 0.25, 'found': 1, 'not_found': 3, 'not_possible': 1, 'present in key': True}, (1, 24): {'occupancy': 0.5, 'found': 2, 'not_found': 2, 'not_possible': 1, 'present in key': True}, (1, 34): {'occupancy': 1.0, 'found': 1, 'not_found': 0, 'not_possible': 4, 'present in key': True}, (1, 284): {'occupancy': 0.5, 'found': 2, 'not_found': 2, 'not_possible': 1, 'present in key': True}, (2, 5): {'occupancy': 1.0, 'found': 4, 'not_found': 0, 'not_possible': 1, 'present in key': True}, (2, 6): {'occupancy': 1.0, 'found': 4, 'not_found': 0, 'not_possible': 1, 'present in key': True}, (2, 34): {'occupancy': 1.0, 'found': 1, 'not_found': 0, 'not_possible': 4, 'present in key': True}, (3, 6): {'occupancy': 1.0, 'found': 4, 'not_found': 0, 'not_possible': 1, 'present in key': True}, (3

### Convert from MSA to pdb numbering

In [12]:
def indexing_msa_to_pdb(
    msa_sequence: list[str],
    to_convert: dict[tuple[int, int], Any]
) -> dict[tuple[int, int], Any]:
    """
    Re-key a dictionary of residue pairs from MSA numbering to protein numbering.

    The protein ("pdb") number of a residue is its 1-based rank among the non-gap
    entries of ``msa_sequence``; the MSA number is the 1-based position of the
    entry in ``msa_sequence``. Gaps are marked with "-".

    Parameters
    ----------
    msa_sequence : list[str]
        The aligned sequence of one protein, one entry per MSA position.

    to_convert : dict[tuple[int, int], Any]
        Keys are residue pairs in MSA numbering, values are carried over untouched.

    Returns
    ----------
    dict[tuple[int, int], Any]
        Same values and ordering as ``to_convert``, with keys in protein numbering.
        A KeyError is raised if a key refers to an MSA position that is a gap
        (or lies outside ``msa_sequence``).
    """
    # MSA positions at which this protein actually has a residue, in order.
    occupied_columns = (
        column
        for column, symbol in enumerate(msa_sequence, start=1)
        if symbol != "-"
    )
    # The n-th occupied MSA position corresponds to residue n of the protein.
    msa_to_pdb = {
        column: rank for rank, column in enumerate(occupied_columns, start=1)
    }

    renumbered = {}
    for (first_msa, second_msa), payload in to_convert.items():
        renumbered[(msa_to_pdb[first_msa], msa_to_pdb[second_msa])] = payload
    return renumbered

In [13]:
# Convert the contact keys from MSA numbering back to the numbering of the key protein.
# key_protein is used instead of a hard-coded ID so this stays in sync with the
# protein the contacts were filtered against.
pdb_contact_information = indexing_msa_to_pdb(msa_sequence=list(msa_df[key_protein]), to_convert=contact_information)

In [14]:
# Compare the contact keys before (MSA numbering) and after (protein numbering) conversion.
print(contact_information.keys())
print(pdb_contact_information.keys())

dict_keys([(1, 4), (1, 5), (1, 22), (1, 24), (1, 34), (1, 284), (2, 5), (2, 6), (2, 34), (3, 6), (3, 8), (3, 276), (3, 283), (3, 284), (4, 8), (4, 9), (4, 22), (4, 24), (4, 273), (4, 276), (4, 277), (4, 284), (5, 9), (5, 10), (5, 22), (5, 34), (5, 35), (5, 36), (6, 10), (6, 11), (8, 11), (8, 12), (8, 269), (8, 272), (8, 273), (8, 276), (9, 12), (9, 13), (9, 20), (9, 22), (9, 36), (9, 273), (10, 13), (10, 14), (10, 36), (11, 14), (11, 15), (11, 269), (12, 15), (12, 16), (12, 20), (12, 269), (12, 273), (13, 16), (13, 17), (13, 18), (13, 19), (13, 20), (13, 36), (13, 37), (14, 17), (15, 260), (15, 264), (15, 268), (15, 269), (16, 20), (16, 255), (16, 258), (16, 259), (16, 260), (16, 264), (16, 265), (16, 269), (17, 257), (17, 258), (17, 259), (18, 255), (18, 256), (18, 257), (18, 258), (19, 37), (19, 40), (19, 41), (19, 42), (19, 254), (19, 255), (19, 256), (20, 36), (20, 37), (20, 42), (20, 253), (20, 254), (20, 255), (20, 273), (21, 36), (21, 37), (21, 42), (21, 167), (21, 251), (21, 25

### Pymol Projections

In [15]:
# prep the contact data above for projections. 
res_res_scores, res_res_colours = {}, {}
for contact, contact_info in pdb_contact_information.items():
    res_res_scores[contact] = contact_info["occupancy"]

    if contact_info["present in key"]:
        res_res_colours[contact] = "red"
    else: 
        res_res_colours[contact] = "blue"

res_res_scores_scaled = rescale_scores(res_res_scores)

In [16]:
# Inspect the rescaled scores and the colour assigned to each contact.
print(res_res_scores_scaled)
print(res_res_colours)

{(1, 4): 0.25, (1, 5): 0.5, (1, 21): 0.125, (1, 23): 0.25, (1, 33): 0.5, (1, 262): 0.25, (2, 5): 0.5, (2, 6): 0.5, (2, 33): 0.5, (3, 6): 0.5, (3, 7): 0.375, (3, 258): 0.125, (3, 261): 0.25, (3, 262): 0.25, (4, 7): 0.5, (4, 8): 0.375, (4, 21): 0.125, (4, 23): 0.125, (4, 255): 0.125, (4, 258): 0.375, (4, 259): 0.25, (4, 262): 0.375, (5, 8): 0.5, (5, 9): 0.375, (5, 21): 0.125, (5, 33): 0.5, (5, 34): 0.375, (5, 35): 0.375, (6, 9): 0.4, (6, 10): 0.4, (7, 10): 0.5, (7, 11): 0.5, (7, 251): 0.1, (7, 254): 0.2, (7, 255): 0.1, (7, 258): 0.4, (8, 11): 0.5, (8, 12): 0.5, (8, 19): 0.4, (8, 21): 0.5, (8, 35): 0.5, (8, 255): 0.5, (9, 12): 0.5, (9, 13): 0.5, (9, 35): 0.5, (10, 13): 0.5, (10, 14): 0.4, (10, 251): 0.1, (11, 14): 0.4, (11, 15): 0.5, (11, 19): 0.4, (11, 251): 0.5, (11, 255): 0.5, (12, 15): 0.5, (12, 16): 0.4, (12, 17): 0.4, (12, 18): 0.4, (12, 19): 0.5, (12, 35): 0.5, (12, 36): 0.4, (13, 16): 0.4, (14, 243): 0.2, (14, 247): 0.1, (14, 250): 0.1, (14, 251): 0.2, (15, 19): 0.2, (15, 238): 0.

In [17]:
# Write the PyMOL script; the file name is relative, so it lands in the current working directory.
project_pymol_res_res_scores(
    res_res_scores=res_res_scores_scaled,
    out_file="pymol_TEM1_contact_preservation.py",
    res_res_colours=res_res_colours
)

The file: pymol_TEM1_contact_preservation.py was written to disk.
