### Plan: Project interaction data onto the structure
Work with TEM-1 beta-lactamase.

#### Plan:
- Load all contact data and index the contacts by residue number
- Identify all TEM-1 residues that should be looked at.
- Convert TEM1 residues to msa residues
- Convert all contact matrices to msa scored matrices
- For each TEM-1 residue, identify its contacts and how frequently each one is conserved...
- Output results.


- Can add filtering etc... at a later date

In [1]:
from typing import Any
import re
import pandas as pd
import numpy as np

from tools_proj.utils import open_many_single_frame_contacts_files
from tools_proj.sequences import seq_align_file_to_sequences
from tools_proj.msa_conversion import add_msa_numbering_to_interaction_data

In [2]:
# Alignment file handed to seq_align_file_to_sequences below.
MSA_FILE = r"../../contact_analysis/multi_structure_test/bettaLac.ali"
# Folder handed to open_many_single_frame_contacts_files below as folder_path.
CRYSTAL_CONTACTS_FOLDER = r"../../network_analysis/crystal_structure_contacts/raw_contacts/"

In [None]:
# Read the same alignment file twice through the project helper, once with
# output_msa_style=True and once with output_msa_style=False.
# NOTE(review): presumably the MSA-style output keeps gap characters while the
# other is the plain sequence - confirm against tools_proj.sequences.
msa_seqs = seq_align_file_to_sequences(alignment_file=MSA_FILE, output_msa_style=True)
# NOTE(review): prot_seqs is not referenced by any other visible cell.
prot_seqs = seq_align_file_to_sequences(alignment_file=MSA_FILE, output_msa_style=False)
print(msa_seqs)

In [None]:
# Call the project helper on the crystal-structure contacts folder, passing the
# keys of msa_seqs as the protein names to look for.
# NOTE(review): the structure of the returned object is not visible here -
# check tools_proj.utils before relying on it.
all_protein_contacts = open_many_single_frame_contacts_files(
    folder_path=CRYSTAL_CONTACTS_FOLDER, protein_names=list(msa_seqs.keys())
)
print(all_protein_contacts)

## OLD BELOW

#### Figure out which residues on the msa tem1 and other proteins have 

In [None]:
# Read one structure name per line, dropping trailing whitespace/newlines.
with open(r"../structure_names.txt") as names_file:
    structure_names = list(map(str.rstrip, names_file))

# Temporary: leave this structure out of the analysis for now.
structure_names.remove("1BSG_Sabla")

# Display the remaining names as the cell output.
structure_names

In [None]:
# MSA data
# Parse the MSA table: names=True takes the field names from the header row and
# dtype=None lets numpy infer each column's type from its contents.
msa_data = np.genfromtxt(r"../pos_ranking_nostar.dat", names=True, dtype=None, encoding="utf-8")
# One DataFrame column per field of the structured array.
msa_df = pd.DataFrame.from_records(msa_data)
# Column names are reused below to look up per-protein sequences.
msa_columns = list(msa_df.columns)
# Display the column names as the cell output.
msa_columns

In [None]:
def pdb_to_msa_indexs(
    msa_sequence: list[str],
    to_convert: list[tuple[int, int]],
) -> list[tuple[int, int]]:
    """
    Convert residue pairs from sequential PDB numbering to MSA numbering.

    Both numberings are 1-based. Every entry of ``msa_sequence`` takes up one
    MSA position, but only entries that are not the gap character ``"-"``
    advance the PDB residue counter.

    NOTE(review): this treats the structure's residues as numbered 1, 2, 3, ...
    in alignment order, with no offset or missing residues - confirm that this
    matches the numbering used in the contact files.

    Args:
        msa_sequence: One entry per MSA position for a single protein, with
            ``"-"`` marking a gap.
        to_convert: Pairs of PDB residue numbers.

    Returns:
        A list of ``(msa_res1, msa_res2)`` tuples, in the same order and
        orientation as ``to_convert``.

    Raises:
        KeyError: If a residue number in ``to_convert`` does not correspond to
            any non-gap entry of ``msa_sequence``.
    """
    # Build the PDB number -> MSA position lookup in a single pass.
    pdb_to_msa = {}
    pdb_number = 0
    for msa_number, msa_residue in enumerate(msa_sequence, start=1):
        if msa_residue != "-":
            pdb_number += 1
            pdb_to_msa[pdb_number] = msa_number

    converted_list = []
    for pdb_res1, pdb_res2 in to_convert:
        try:
            new_label = (pdb_to_msa[pdb_res1], pdb_to_msa[pdb_res2])
        except KeyError as err:
            # Same exception type as a bare lookup, but say which contact failed.
            raise KeyError(
                f"PDB residue {err.args[0]} in contact ({pdb_res1}, {pdb_res2}) "
                f"has no MSA position; the sequence has {pdb_number} residues"
            ) from err
        converted_list.append(new_label)

    return converted_list

In [None]:
# For every structure, collect its contacts as residue pairs in MSA numbering.
raw_contacts = {}
for protein in structure_names:
    # Only the column labels of the contacts table are used; each label names
    # one interaction.
    df = pd.read_csv(f"../{protein}_contacts.csv")
    interactions = list(df.columns)

    # Every label must contain exactly two runs of digits, which are read as
    # the two residue numbers of the pair (unpacking fails otherwise).
    # TODO - refactor (an earlier draft only handled labels containing "Hbond").
    pdb_pairs = []
    for interaction in interactions:
        first_res, second_res = map(int, re.findall(r"\d+", interaction))
        pdb_pairs.append((first_res, second_res))

    # The MSA table names its columns by the first four characters only.
    protein_name = protein[:4]

    # Renumber from PDB to MSA positions and store under the short name.
    raw_contacts[protein_name] = pdb_to_msa_indexs(
        msa_sequence=list(msa_df[protein_name]),
        to_convert=pdb_pairs,
    )

In [None]:
# Spot-check the contact pairs stored for 1BTL.
print(raw_contacts['1BTL'])

### Edits here.
The above contacts are not MSA indexed, so fixing this now. 

#### Figure out at which MSA positions TEM-1 and the other proteins have a residue (i.e. not a gap)

In [None]:
# For each protein column of the MSA table, list the 1-based MSA positions at
# which that protein has a residue rather than a gap ("-").
all_msa_res_numbs = {}
for protein in msa_columns[2:]:  # TODO - change to 1 later, once 1BSG included.
    all_msa_res_numbs[protein] = [
        msa_position
        for msa_position, residue in enumerate(list(msa_df[protein]), start=1)
        if residue != "-"
    ]

print(all_msa_res_numbs["1BTL"])

### Count all contacts, mark if found in TEM1 or not.

Tem1 = 1BTL

In [None]:
# Display the protein names keyed in each dictionary side by side.
raw_contacts.keys(), all_msa_res_numbs.keys()

In [None]:
key_protein = "1BTL"

# MSA positions the key protein occupies, held as a set so that each
# membership test below is O(1) instead of a scan through a list.
key_residues = set(all_msa_res_numbs[key_protein])

# Collect every distinct contact seen in any protein, in order of first
# appearance, keeping only those whose two positions both exist in the key
# protein. (a, b) and (b, a) count as the same contact.
all_possible_contacts = []
seen_pairs = set()  # orientation-independent mirror of all_possible_contacts
for contacts in raw_contacts.values():
    for msa_res1, msa_res2 in contacts:
        pair = frozenset((msa_res1, msa_res2))
        if pair in seen_pairs:
            continue
        # check if possible for projecting protein to make this contact:
        if msa_res1 in key_residues and msa_res2 in key_residues:
            seen_pairs.add(pair)
            all_possible_contacts.append((msa_res1, msa_res2))

print(len(all_possible_contacts), all_possible_contacts)

In [None]:
# For every candidate contact, count across all proteins how often it is
# formed, absent although possible, or impossible because a position is missing.
contact_information = {}
for msa_res1, msa_res2 in all_possible_contacts:
    forward = (msa_res1, msa_res2)
    backward = (msa_res2, msa_res1)

    found = not_found = not_possible = 0
    present_in_key = False

    for protein, prot_contacts in raw_contacts.items():
        # The contact counts as formed in either orientation.
        if forward in prot_contacts or backward in prot_contacts:
            found += 1
            if protein == key_protein:
                present_in_key = True
            continue

        # Not formed: decide whether this protein could have formed it at all,
        # i.e. whether it has a residue at both MSA positions.
        residues_present = all_msa_res_numbs[protein]
        if msa_res1 in residues_present and msa_res2 in residues_present:
            not_found += 1
        else:
            not_possible += 1

    # Occupancy ignores proteins in which the contact is impossible. The
    # denominator is non-zero as long as the contact occurs in at least one
    # protein of raw_contacts.
    contact_information[msa_res1, msa_res2] = {
        "occupancy": found / (found + not_found),
        "found": found,
        "not_found": not_found,
        "not_possible": not_possible,
        "present in key": present_in_key,
    }
print(contact_information)

### Convert from MSA to pdb numbering

In [None]:
def indexing_msa_to_pdb(
    msa_sequence: list[str],
    to_convert: dict[tuple[int, int], Any]
) -> dict[tuple[int, int], Any]:
    """
    Re-key a contact dictionary from MSA numbering to sequential PDB numbering.

    Both numberings are 1-based. Every entry of ``msa_sequence`` takes up one
    MSA position, but only entries that are not the gap character ``"-"``
    receive a PDB residue number (1, 2, 3, ... in order of appearance).

    NOTE(review): this treats the structure's residues as numbered from 1 with
    no offset or missing residues - confirm that this matches the numbering
    used in the structure file.

    Args:
        msa_sequence: One entry per MSA position for a single protein, with
            ``"-"`` marking a gap.
        to_convert: Mapping keyed by ``(msa_res1, msa_res2)`` pairs; the values
            are carried over unchanged.

    Returns:
        A new dictionary with the same values, keyed by the corresponding
        ``(pdb_res1, pdb_res2)`` pairs.

    Raises:
        KeyError: If a key refers to an MSA position that is a gap in
            ``msa_sequence`` or lies outside it.
    """
    # Build the MSA position -> PDB number lookup in a single pass.
    msa_to_pdb = {}
    pdb_number = 0
    for msa_number, msa_residue in enumerate(msa_sequence, start=1):
        if msa_residue != "-":
            pdb_number += 1
            msa_to_pdb[msa_number] = pdb_number

    converted_dict = {}
    for (msa_res1, msa_res2), content in to_convert.items():
        try:
            new_label = (msa_to_pdb[msa_res1], msa_to_pdb[msa_res2])
        except KeyError as err:
            # Same exception type as a bare lookup, but say which contact failed.
            raise KeyError(
                f"MSA position {err.args[0]} in contact ({msa_res1}, {msa_res2}) "
                "is a gap in, or outside of, the given sequence"
            ) from err
        converted_dict[new_label] = content

    return converted_dict

In [None]:
# Re-key the contact statistics from MSA positions to the sequential residue
# numbering derived from the 1BTL column of the MSA table.
pdb_contact_information = indexing_msa_to_pdb(msa_sequence=list(msa_df["1BTL"]), to_convert=contact_information)

In [None]:
# Print the contact keys before and after renumbering for a visual check.
print(contact_information.keys())
print(pdb_contact_information.keys())

### Pymol Projections

In [None]:
# prep the contact data above for projections. 
res_res_scores, res_res_colours = {}, {}
for contact, contact_info in pdb_contact_information.items():
    res_res_scores[contact] = contact_info["occupancy"]

    if contact_info["present in key"]:
        res_res_colours[contact] = "red"
    else: 
        res_res_colours[contact] = "blue"

res_res_scores_scaled = rescale_scores(res_res_scores)

In [None]:
# Inspect the rescaled scores and the colour assigned to each residue pair.
print(res_res_scores_scaled)
print(res_res_colours)

In [None]:
# Hand the rescaled scores and per-pair colours to the projection helper, with
# a .py file name as out_file.
# NOTE(review): project_pymol_res_res_scores is not imported or defined in any
# visible cell - confirm where it comes from before running this cell.
project_pymol_res_res_scores(
    res_res_scores=res_res_scores_scaled,
    out_file="pymol_TEM1_contact_preservation.py",
    res_res_colours=res_res_colours
)