In [None]:
"""
This Notebook has snippets of scripts for each protein to calculate the pairwise distances between pairs
we detected as potential epistatic pairs.
In order to do that, one needs to first align the sequence to the sequence in the 3D structure to match the positions
then the pairwise distances can be taken.

At the end, a Wilcoxon test is ran comparing the pariwise distance distribution between the pairs we detected
and all pairwise distances. The Wilcoxon test is done with R.

So the general steps are:
1- Loading the structure that corresponds to the protein we are working on
2- Extracting the sequence from the protein and aligning our sequence to that and map positions
3- Loading the pairs with the significant p-value (we need their positions)
4- Calculating pairwise distances for these pairs and for all possible pairs in the structure
5- Wilcoxon test to check if the distances between the pairs we detected is significantly smaller than the total
distribution

Note: There will be repetetive codes in each snippet for each protein, I did not want to separate them in functions
or join them, to make each snipper separate, in case one wants to copy it for just one protein, or modefy it for
to be used on another protein
"""


"""
I think the calculations for all the distances is wrong, definitely need to check this again
"""

In [1]:
import sys
import os
import random
from Bio.PDB import *
from Bio import pairwise2
from Bio.pairwise2 import format_alignment
from Bio.Align import substitution_matrices
from sw_algorithm import sw_cpp
import subprocess

In [2]:
def wilcoxon_R(epi_distances, all_distances, current_seq):
    # This function outputs an R script to calculate Wilcoxon test
    # runs the R script in a subprocess then deletes intermediate files and keeps the final value
    # in tmp_wilcoxon.txt
    # This script can be ran after running any of the previous snippets because it only needs the epi_distances dict
    # and the all_distances dict
    r_script = f"""
    # Read in the data
    options(warn=-1)

    distances_file <- scan("tmp.txt", what="", sep="\n")

    epi_distances = as.double(strsplit(distances_file[1], ", ")[[1]])
    all_distances = as.double(strsplit(distances_file[2], ", ")[[1]])

    w_test <- wilcox.test(epi_distances, all_distances)
    print("These values are for {current_seq}")
    print(paste0("The welcox test statistic is: ", w_test$statistic, " and the p-value is: ", w_test$p.value))
    """

    out_r_script =  open("tmp.r", "w")
    out_r_script.write(r_script)
    out_r_script.close()

    out_file = open("tmp.txt", "w")
    out_file.write(str(list(epi_distances.values()))[1:-1] + "\n")
    out_file.write(str(list(all_distances.values()))[1:-1] + "\n")
    out_file.close()
    # print(wilcoxon(all_epi_distances_sampled, all_distances_sampled, zero_method="wilcox"))
    subprocess.call("Rscript tmp.r > tmp_wilcoxon.txt", shell=True)

    subprocess.call("rm tmp.r", shell=True)
    subprocess.call("rm tmp.txt", shell=True)

In [3]:
blosum62 = substitution_matrices.load("BLOSUM62")
# The structures we used (bilogical assemblies pre-downloaded)
structures = ["1NN2_N2.pdb1", "1RUZ_H1.pdb1", "2VIU_H3.pdb1", "3BEQ_N1.pdb1", 
              "5C7K_HIV1_subtype_b.pdb1", "5C7K_HIV1_subtype_b.pdb1", "6MYY_HIV1_subtype_c.pdb1"]

protein_letters_3to1 = {'ALA': 'A', 'CYS': 'C', 'ASP': 'D', 'GLU': 'E',
'PHE': 'F', 'GLY': 'G', 'HIS': 'H', 'ILE': 'I', 'LYS': 'K', 'LEU': 'L',
'MET': 'M', 'ASN': 'N', 'PRO': 'P', 'GLN': 'Q', 'ARG': 'R', 'SER': 'S',
'THR': 'T', 'VAL': 'V', 'TRP': 'W', 'TYR': 'Y'}

# The sequences to align to the structures
my_seq_1nn2 = "MNPNQKIITIGSVSLTIATVCFLMQTAILVTTVTLHFKQYECDSPASNQVMPCEPIIIERNITEIVYLNNTTIEKEICPKVVEYRNWSKPQCQITGFAPFSKDNSIRLSAGGDIWVTREPYVSCDHGKCYQFALGQGTTLDNKHSNDTIHDRIPHRTLLMNELGVPFHLGTRQVCIAWSSSSCHDGKAWLHVCITGDDKNATASFIYDGRLVDSIGSWSQNILRTQESECVCINGTCTVVMTDGSASGRADTRILFIEEGKIVHISPLSGSAQHVEECSCYPRYPGVRCICRDNWKGSNRPVVDINMEDYSIDSSYVCSGLVGDTPRNDDRSSNSNCRNPNNERGNQGVKGWAFDNGDDVWMGRTISKDLRSGYETFKVIGGWSTPNSKSQINRQVIVDSDNRSGYSGIFSVEGKSCINRCFYVELIRGRKQEARVWWTSNSIVVFCGTSGTYGTGSWPDGANINFMPI"
my_seq_1ruz = "MEARLLVLLCAFAATNADTICIGYHANNSTDTVDTVLEKNVTVTHSVNLLEDSHNGKLCKLKGIAPLQLGKCNIAGWLLGNPECDLLLTASSWSYIVETSNSENGTCYPGDFIDYEELREQLSSVSSFEKFEIFPKTSSWPNHETTGVTAACSYAGASSFYRNLLWLTKKGSSYPKLSKSYVNNKGKEVLVLWGVHHPPTGTDQQSLYQNADAYVSVGSSKYNRRFTPEIAARPKVRDQAGRMNYYWTLLEPGDTITFEATGNLIAPWYAFALNRGSGSGIITSDAPVHDCNTKCQTPHGAINSSLPFQNIHPVTIGECPKYVRSTKLRMATGLRNIPSIQSRGLFGAIAGFIEGGWTGMIDGWYGYHHQNEQGSGYAADQKSTQNAIDGITNKVNSVIEKMNTQFTAVGKEFNNLERRIENLNKKVDDGFLDIWTYNAELLVLLENERTLDFHDSNVRNLYEKVKSQLKNNAKEIGNGCFEFYHKCDDACMESVRNGTYDYPKYSEESKLNREEIDGVKLESMGVYQILAIYSTVASSLVLLVSLGAISFWMCSNGSLQCRICI"
my_seq_2viu = "MKTIIALSYIFCLALGQDLPGNDNSTATLCLGHHAVPNGTLVKTITDDQIEVTNATELVQSSSTGKICNNPHRILDGIDCTLIDALLGDPHCDVFQNETWDLFVERSKAFSNCYPYDVPDYASLRSLVASSGTLEFITEGFTWTGVTQNGGSNACKRGPGSGFFSRLNWLTKSGSTYPVLNVTMPNNDNFDKLYIWGIHHPSTNQEQTSLYVQASGRVTVSTRRSQQTIIPNIGSRPWVRGLSSRISIYWTIVKPGDVLVINSNGNLIAPRGYFKMRTGKSSIMRSDAPIDTCISECITPNGSIPNDKPFQNVNKITYGACPKYVKQNTLKLATGMRNVPEKQTRGLFGAIAGFIENGWEGMIDGWYGFRHQNSEGTGQAADLKSTQAAIDQINGKLNRVIEKTNEKFHQIEKEFSEVEGRIQDLEKYVEDTKIDLWSYNAELLVALENQHTIDLTDSEMNKLFEKTRRQLRENAEEMGNGCFKIYHKCDNACIESIRNGTYDHDVYRDEALNNRFQIKGVELKSGYKDWILWISFAISCFLLCVVLLGFIMWACQRGNIRCNICI"
my_seq_3beq = "MNPNQKIITIGSISIAIGIISLMLQIGNIISIWASHSIQTGSQNNTGICNQRIITYENSTWVNHTYVNINNTNVVAGEDKTSVTLAGNSSLCSISGWAIYTKDNSIRIGSKGDVFVIREPFISCSHLECRTFFLTQGALLNDKHSNGTVKDRSPYRALMSCPLGEAPSPYNSKFESVAWSASACHDGMGWLTIGISGPDNGAVAVLKYNGIITGTIKSWKKQILRTQESECVCMNGSCFTIMTDGPSNKAASYKIFKIEKGKVTKSIELNAPNFHYEECSCYPDTGIVMCVCRDNWHGSNRPWVSFNQNLDYQIGYICSGVFGDNPRPEDGEGSCNPVTVDGANGVKGFSYKYDNGVWIGRTKSNRLRKGFEMIWDPNGWTNTDSDFSVKQDVVAITDWSGYSGSFVQHPELTGLDCIRPCFWVELVRGLPRENTTIWTSGSSISFCGVNSDTANWSWPDGAELPFTIDK"
my_seq_5c7k_A = "MRVKGIQMNSLRWGMLGWVTVYYGVPVWKDAETTLFCASDAKAYDAEVHNIWATHACVPTDPNPQEINLNVTEEFNMWKNNMVEQMHTDIISLWDQGLKPCVKLTPLCVTLDCTNCSYNVTKVSSLFYKLDVVQYRLINCNTSAITQACPKVTFEPIPIHYCAPAGFAILKCKDEKFNGTGLCKNVSTVQCTHGIKPVVSTQLLLNGSLAEEVRIRSENITNNAKNIIVQLASPVTINCIRPNNNTRGPGAYIIGEIRQAHCNVSEWNSTLQKVANQLFSGGDLEITTHSFNCGGEFFYCNTSGLFLQCRIKQIINMWQRAGQAIYAPPIPGVIRCKSNITGLILTRDETFRPGGGDMRDNWRSELYKYKVVKIEPIGVAPTRAKRRVVEREKRAIGAVFIGFLGAAGSTMGAASVTLTVQARQLLSGIVQQQSNLLRAIEAQQHLLKLTVWGIKQLQARVLAVERYLKDQQLLGIWGCSGKLICTTNVPWNSSWSNKSEIWNMTWLQWDKEVSNYTQIIYTLIEESQNQQEKNEQDLLALDKWASLWNWFNISQWLWYIKIFIIIVGGLIGLRIVFAVLSVINRVRQGYSPLSFQTPPGLDRPGRIEEEGGEQDRGRSIRLVSGFLALAWDDLRSLCLFSYHRLRDFILIATRTVEGWESLKYLGNLLVYWGRELKISAINLCDTIAIAVAGWTDRVIELGQRLCRAIHIPRRIRQGFERALL"
my_seq_5c7k = "MRVMGIQMNCWRWGMLGWVTVYYGVPVWKDAETTLFCASDAKAYDTEVHNTWATHACVPTDPNPQEIQLNVTEKFNMWKNNMVEQMHEDIISLWDQSLKPCVKLTPLCVTLECKNCSYNMTKVYSLFYRLDIVPYRLINCNTSAITQACPKVSFEPIPIHYCAPAGFAILKCNDIDFNGTGLCKNVSTVQCTHGIRPVVSTQLLLNGSLAEEVRIRSENITNNAKTIIVQLDQPVNITCMRPNNNTRGPGAYIIGNIREAHCNVSDWNNTLQKVATQLFSGGDLEITTHSFNCEGEFFYCNTSGLFLPCRIKQIINMWQRAGQAIYAPPIPGVIQCQSNITGLLLTRDETFRPGGGNMRDNWRSELYKYKVVKIEPLGVAPTTARRRVVEREKRALGALFIGFLGAAGSTMGAASVTLTVQARQLLSGIVQQQNNLLRAIEAQQHLLKLTVWGIKQLQARVLAVERYLHDQQLLGIWGCSGKLICTTNVPWNDSRSNKSDIWNMTWMQWDEEISNYTQIIYNLLEKSQNQQEKNEQELLALDKWASLWNWFNISNWLWYIRIFIIIVGGLIGLRIVFAVLSIINRVRQGYSPLSFQTPPRLDRPGRTEEEGGEQGRDRSIRLVNGFLALAWEDLRSLCLWSYHLLRDFILVAARTVEGWEGLKYLGNILLYWARELKISATSLLDTIAIVVAGWTDRAIEITQNICRGIHIPRRIRQGFERALL"
my_seq_6myy = "MRAKGIQRNWWIWGILGWVTVYYGVPVWKEAKTTLFCASDAKGYDTEVHNVWATHACVPTDPNPQELVLNVTENFNMWENDMVDQMHQDIISLWDQSLKPCVKLTPLCVTLNCKNCSFNMTKVHALFYRLDVPYRLINCNTSAITQACPKVTFDPIPIHYCAPAGYAILKCNNKTFNGTGPCTKVSTVQCTHGIKPVVSTQLLLNGSLAEEIIIRSKNITDNTKTIIVHLNESVEINCTRPNNNTRGPGTYIIGDIREAHCNISAWNKTLHELSKKLFTGGDLEITTHSFNCRGEFFYCNTTQLFISCRIKQIINMWQEVGRAMYAPPIRGMITCRSNITGILLTRDEIFRPGGGDMRDNWRSELYKYKVVEINPLGIAPTAKRRVVEREKRALGAVFLGFLGAAGSTMGAASITLTVQARQLLSGIVQQQSNLLRAIEAQQHMLQLTVWGIKQLQTRVLALERYLRDQQLLGIWGCSGKLICTTNVPWNSSWSNKSNIWNMTWMQWDREINNYTDTIYRLLEESQNQQEQNEKDLLALDKWQNLWNWFNISNWLWYIKIFIMIIGGLIGLRIIFAVLSIVNRVRQGYSPLSFQTPPRPGGLGGIEEEGGEQDKDRSIRLVSGFLSLAWDDLRSLCLFSYRQLRDLILIAARAAEGWEILKYLGSLVQYWGLELKKSAISLLDTIAIAVAEGTDRIIEVIQRIWEIINIPRRIRQGFEAALQ"

my_seqs = [my_seq_1nn2, my_seq_1ruz, my_seq_2viu, my_seq_3beq, my_seq_5c7k_A, my_seq_5c7k, my_seq_6myy]

# The files with p-values, output from our tool and filtered with filter_p_values.py
p_values = ["n2_all_prot_pvalues_output_filtered.tsv",
            "h1_all_prot_pvalue_output_filtered.tsv",
            "h3_all_prot_pvalues_output_filtered.tsv",
            "n1_all_prot_pvalues_output_filtered.tsv", 
            "hiv1_extant_subclass_a_pvalues_output_filtered.tsv",
            "hiv1_extant_subclass_b_pvalues_output_filtered.tsv",
            "hiv1_extant_subclass_c_pvalues_output_filtered.tsv"]


In [4]:
# For 1NN2 (N2)
parser = PDBParser()
current_seq = 0  # this index is then using with structures, my_seqs, p_values
structure = parser.get_structure(structures[current_seq], structures[current_seq])  # parsing structure

# generating a dict with structure info (model_id:model) and inside each model we have (chain_id:chain)
structure_info = dict()
for model in structure:
    model_id = model.get_full_id()[1]
    structure_info[model_id] = dict()
    for chain in model.get_chains():
        chain_id = chain.get_full_id()[2]
        structure_info[model_id][chain_id] = {"residues":[], "seq":""}
        for residue in chain.get_residues():
            if residue.get_id()[0] == " ":
                structure_info[model_id][chain_id]["residues"].append(residue)
                structure_info[model_id][chain_id]["seq"] += protein_letters_3to1[residue.resname]

# Aligning the sequence to chain A (Asymmetric unit)
alignments = pairwise2.align.globaldx(my_seqs[current_seq], structure_info[0]["A"]["seq"], blosum62)
# Putting all residues in a list
all_residues = structure_info[0]["A"]["residues"]

# matches here is a dictionary that matches a certain aa in the sequence to a residue in the structure
# this is needed because the residue ids in the structure is not the same as the cannonical sequence
# and this mapping is needed
alignment = alignments[0]
seq_pos = -1
residue_pos = -1
matches = dict()
for idx in range(len(alignment[0])):
    if alignment[0][idx] != "-":
        seq_pos += 1
    if alignment[1][idx] != "-":
        residue_pos += 1
    if (alignment[0][idx] != "-") and (alignment[1][idx] != "-"):
        assert alignment[0][idx] == my_seqs[current_seq][seq_pos]
        matches[seq_pos] = (seq_pos, alignment[0][idx], residue_pos,
                        protein_letters_3to1[all_residues[residue_pos].resname])

# Loading the epistatic pairs
epi_pairs = []
with open(p_values[current_seq], "r") as in_file:
    next(in_file)
    for l in in_file:
        l = l.split("\t")
        # first and second column of that table
        pos1 = int(l[0])
        pos2 = int(l[1])
        if pos1 > pos2:
            epi_pairs.append((pos1, pos2))
        else:
            epi_pairs.append((pos2, pos1))


# for each pair, we calculate the distance accross models and keep the smallest one
chain = "A"
epi_distances = dict()
for pos1, pos2 in epi_pairs:
    if (pos1 in matches) and (pos2 in matches):  # checking if that position is present in the structure
        distance = 2000
        for model in structure_info.values():
            c_dist = structure_info[0][chain]["residues"][matches[pos1][2]]["CA"] - model[chain]["residues"][matches[pos2][2]]["CA"]
            if c_dist < distance:
                distance = c_dist
        epi_distances[(pos1, pos2)] = distance

# getting all pairwise distances
all_distances = dict()
for residue1 in structure_info[0][chain]["residues"]:
    distance = 2000
    for model in structure_info.values():
        for residue2 in model[chain]["residues"]:
            if residue1["CA"] - residue2["CA"] < distance:
                distance = residue1["CA"] - residue2["CA"]
                all_distances[(residue1._id[1], residue2._id[1])] = distance
                
wilcoxon_R(epi_distances, all_distances, structures[current_seq])



In [5]:
# For 1RUZ (H1)
parser = PDBParser()
current_seq = 1
structure = parser.get_structure(structures[current_seq], structures[current_seq])

# generating a dict with structure info (model_id:model) and inside each model we have (chain_id:chain)
structure_info = dict()
for model in structure:
    model_id = model.get_full_id()[1]
    structure_info[model_id] = dict()
    for chain in model.get_chains():
        chain_id = chain.get_full_id()[2]
        structure_info[model_id][chain_id] = {"residues":[], "seq":""}
        for residue in chain.get_residues():
            if residue.get_id()[0] == " ":
                structure_info[model_id][chain_id]["residues"].append(residue)
                structure_info[model_id][chain_id]["seq"] += protein_letters_3to1[residue.resname]
                
# we need to merge some chains because it's the sequence is split into different chains
# chains HI, JK, and LM have the same sequence
merging = ["HI", "JK", "LM"]
for k in merging:
    structure_info[0][k] = dict()
    structure_info[0][k]["seq"] = structure_info[0][k[0]]["seq"] + structure_info[0][k[1]]["seq"]
    structure_info[0][k]["residues"] = structure_info[0][k[0]]["residues"] + structure_info[0][k[1]]["residues"]


alignments = pairwise2.align.globaldx(my_seqs[current_seq], structure_info[0]["HI"]["seq"], blosum62)
all_residues = structure_info[0]["HI"]["residues"]

# matches here is a dictionary that matches a certain aa in the sequence to a residue in the structure
# this is needed because the residue ids in the structure is not the same as the cannonical sequence
# and this mapping is needed
alignment = alignments[0]
seq_pos = -1
residue_pos = -1
matches = dict()
for idx in range(len(alignment[0])):
    if alignment[0][idx] != "-":
        seq_pos += 1
    if alignment[1][idx] != "-":
        residue_pos += 1
    if (alignment[0][idx] != "-") and (alignment[1][idx] != "-"):
        assert alignment[0][idx] == my_seqs[current_seq][seq_pos]
        matches[seq_pos] = (seq_pos, alignment[0][idx], residue_pos,
                        protein_letters_3to1[all_residues[residue_pos].resname])

        
# Loading the epistatic pairs
epi_pairs = []
with open(p_values[current_seq], "r") as in_file:
    next(in_file)
    for l in in_file:
        l = l.split("\t")
        # first and second column of that table
        pos1 = int(l[0])
        pos2 = int(l[1])
        if pos1 > pos2:
            epi_pairs.append((pos1, pos2))
        else:
            epi_pairs.append((pos2, pos1))
            

# There's only 1 model in this structure
# but we need to check the distances between HI-JK and HI-LM and take the smaller
epi_distances = dict()
for pos1, pos2 in epi_pairs:
    if (pos1 in matches) and (pos2 in matches):
        distance = 2000
        for chain in ["HI", "JK", "LM"]:
            c_dist = structure_info[0]["HI"]["residues"][matches[pos1][2]]["CA"] - structure_info[0][chain]["residues"][matches[pos2][2]]["CA"]
            if c_dist < distance:
                distance = c_dist
        epi_distances[(pos1, pos2)] = distance
 
all_distances = dict()
for residue1 in structure_info[0]["HI"]["residues"]:
    distance = 2000
    for chain in ["HI", "JK", "LM"]:
        for residue2 in structure_info[0][chain]["residues"]:
            if residue1["CA"] - residue2["CA"] < distance:
                distance = residue1["CA"] - residue2["CA"]
                all_distances[(residue1._id[1], residue2._id[1])] = distance


wilcoxon_R(epi_distances, all_distances, structures[current_seq])



In [6]:
# For 2VIU (H3)
parser = PDBParser()
current_seq = 2
structure = parser.get_structure(structures[current_seq], structures[current_seq])

# generating a dict with structure info (model_id:model) and inside each model we have (chain_id:chain)
structure_info = dict()
for model in structure:
    model_id = model.get_full_id()[1]
    structure_info[model_id] = dict()
    for chain in model.get_chains():
        chain_id = chain.get_full_id()[2]
        structure_info[model_id][chain_id] = {"residues":[], "seq":""}
        for residue in chain.get_residues():
            if residue.get_id()[0] == " ":
                structure_info[model_id][chain_id]["residues"].append(residue)
                structure_info[model_id][chain_id]["seq"] += protein_letters_3to1[residue.resname]
                
# Mergin chains A and B then aligning to that
for model in structure_info.keys():
    structure_info[model]["AB"] = dict()
    structure_info[model]["AB"]["seq"] = structure_info[model]["A"]["seq"] + structure_info[model]["B"]["seq"]
    structure_info[model]["AB"]["residues"] = structure_info[model]["A"]["residues"] + structure_info[model]["B"]["residues"]


alignments = pairwise2.align.globaldx(my_seqs[current_seq], structure_info[0]["AB"]["seq"], blosum62)
all_residues = structure_info[0]["AB"]["residues"]

# matches here is a dictionary that matches a certain aa in the sequence to a residue in the structure
# this is needed because the residue ids in the structure is not the same as the cannonical sequence
# and this mapping is needed
alignment = alignments[0]
seq_pos = -1
residue_pos = -1
matches = dict()
for idx in range(len(alignment[0])):
    if alignment[0][idx] != "-":
        seq_pos += 1
    if alignment[1][idx] != "-":
        residue_pos += 1
    if (alignment[0][idx] != "-") and (alignment[1][idx] != "-"):
        assert alignment[0][idx] == my_seqs[current_seq][seq_pos]
        matches[seq_pos] = (seq_pos, alignment[0][idx], residue_pos,
                        protein_letters_3to1[all_residues[residue_pos].resname])

        
# Loading the epistatic pairs
epi_pairs = []
with open(p_values[current_seq], "r") as in_file:
    next(in_file)
    for l in in_file:
        l = l.split("\t")
        # first and second column of that table
        pos1 = int(l[0])
        pos2 = int(l[1])
        if pos1 > pos2:
            epi_pairs.append((pos1, pos2))
        else:
            epi_pairs.append((pos2, pos1))
            

# we need to check the distances accross models between residues in AB
chain = "AB"
epi_distances = dict()
for pos1, pos2 in epi_pairs:
    if (pos1 in matches) and (pos2 in matches):
        distance = 2000
        for model in structure_info.values():
            c_dist = structure_info[0][chain]["residues"][matches[pos1][2]]["CA"] - model[chain]["residues"][matches[pos2][2]]["CA"]
            if c_dist < distance:
                distance = c_dist
        epi_distances[(pos1, pos2)] = distance
                
all_distances = dict()
for residue1 in structure_info[0][chain]["residues"]:
    distance = 2000
    for model in structure_info.values():
        for residue2 in model[chain]["residues"]:
            if residue1["CA"] - residue2["CA"] < distance:
                all_distances[(residue1._id[1], residue2._id[1])] = residue1["CA"] - residue2["CA"]

                
wilcoxon_R(epi_distances, all_distances, structures[current_seq])



In [8]:
# For 3BEQ (N1)
parser = PDBParser()
current_seq = 3
structure = parser.get_structure(structures[current_seq], structures[current_seq])

# generating a dict with structure info (model_id:model) and inside each model we have (chain_id:chain)
structure_info = dict()
for model in structure:
    model_id = model.get_full_id()[1]
    structure_info[model_id] = dict()
    for chain in model.get_chains():
        chain_id = chain.get_full_id()[2]
        structure_info[model_id][chain_id] = {"residues":[], "seq":""}
        for residue in chain.get_residues():
            if residue.get_id()[0] == " ":
                structure_info[model_id][chain_id]["residues"].append(residue)
                structure_info[model_id][chain_id]["seq"] += protein_letters_3to1[residue.resname]


# I need to consider both chains A and B, but they are the same so I can align to only one
alignments = pairwise2.align.globaldx(my_seqs[current_seq], structure_info[0]["A"]["seq"], blosum62)
# However all_residues will be different for A and B
all_residues = structure_info[0]["A"]["residues"]

print(format_alignment(*alignments[0]))

# matches here is a dictionary that matches a certain aa in the sequence to a residue in the structure
# this is needed because the residue ids in the structure is not the same as the cannonical sequence
# and this mapping is needed
alignment = alignments[0]
seq_pos = -1
residue_pos = -1
matches = dict()
for idx in range(len(alignment[0])):
    if alignment[0][idx] != "-":
        seq_pos += 1
    if alignment[1][idx] != "-":
        residue_pos += 1
    if (alignment[0][idx] != "-") and (alignment[1][idx] != "-"):
        assert alignment[0][idx] == my_seqs[current_seq][seq_pos]
        matches[seq_pos] = (seq_pos, alignment[0][idx], residue_pos,
                        protein_letters_3to1[all_residues[residue_pos].resname])

        
# Loading the epistatic pairs
epi_pairs = []
with open(p_values[current_seq], "r") as in_file:
    next(in_file)
    for l in in_file:
        l = l.split("\t")
        # first and second column of that table
        pos1 = int(l[0])
        pos2 = int(l[1])
        if pos1 > pos2:
            epi_pairs.append((pos1, pos2))
        else:
            epi_pairs.append((pos2, pos1))
            
            
            
# There will be 4 comparisons, between A and A in first, A and B in the first
# between A and A in the second model and between A and B in the second model

# same model
epi_distances = dict()
for pos1, pos2 in epi_pairs:
    if (pos1 in matches) and (pos2 in matches):
        # now only in first model
        distance = 2000
        for chain in ["A", "B"]:
            c_dist = structure_info[0]["A"]["residues"][matches[pos1][2]]["CA"] - structure_info[0][chain]["residues"][matches[pos2][2]]["CA"]
            if c_dist < distance:
                distance = c_dist
                
        epi_distances[(pos1, pos2)] = distance

# checking the distance between A in first model with A and B in second model
for pos1, pos2 in epi_distances.keys():
    for chain in ["A", "B"]:
        c_dist = structure_info[0]["A"]["residues"][matches[pos1][2]]["CA"] - structure_info[1][chain]["residues"][matches[pos2][2]]["CA"]
        if c_dist < epi_distances[(pos1, pos2)]:
            epi_distances[(pos1, pos2)] = c_dist
            
# distances between A and B in first model
all_distances = dict()
for residue1 in structure_info[0]["A"]["residues"]:
    distance = 2000
    for chain in ["A", "B"]:
        for residue2 in structure_info[0][chain]["residues"]:
            if residue1["CA"] - residue2["CA"] < distance:
                distance = residue1["CA"] - residue2["CA"]
                all_distances[(residue1._id[1], residue2._id[1])] = residue1["CA"] - residue2["CA"]

# distances between A and A,B in second model
for residue1 in structure_info[0]["A"]["residues"]:
    for chain in ["A", "B"]:
        for residue2 in structure_info[1][chain]["residues"]:
            distance = residue1["CA"] - residue2["CA"]
            if distance < all_distances[(residue1._id[1], residue2._id[1])]:
                all_distances[(residue1._id[1], residue2._id[1])] = distance

print(epi_distances)
wilcoxon_R(epi_distances, all_distances, structures[current_seq])



MNPNQKIITIGSISIAIGIISLMLQIGNIISIWASHSIQTGSQNNTGICNQRIITYENSTWVNHTYVNINNTNVVAGEDKTSVTLAGNSSLCS-ISGWAIYTKDNS-IRIGSKGDVFVIREPFISCSHLECRTFFLTQGALLNDKHSNGTVKDRSPYRA-LMSCPLGEAPSPYNSKFESVAWSASACHDGMGWLTIGISGPDNGAVAVLKYNGIITG-TIKSWKKQ--ILRTQESECV-CMNGSCFTIMTDGPSN-KAASYKIF-KIEKGKVTKSIELNAPNFHYEECSCYPDTGI-VMCVCRDNWHGSNRPWVSFNQNLDYQIGYICSGVFGDNPRPE-DGE-GSCN-PVTV-DGANGVKGFSYKYDNGVWIGRTKS-NRL-RK-GFEMIWDPNGWTN-TDSD-FSVKQDVVAITDWSGYSGSFVQHPELTGLDCIRPCFWVELVRGL-PRENTTIWTSGSSISFCGVNSDTAN--WSWPDGAELPFTIDK
                  .|   |                                                           |  ||||||  |||||||.|||  |||||||||||||||||||||||||||||||||||||||||||||||||||  |||||.|||||||||.||||||||||||||||||||||||||||||||||||||||  ||||| .   |||||||||  |.|||||||||||||| . |||||  |||||||||||||||||.||||||||||||  |||||||||||||||||||.|||||||||||||||||||||  ||  |||  ||.  .||||.||||..|||||||||||| .   |  ||||||||||||  |||  |||.||.||||||||||||||||||||||||.||||||||.||  |.|| ||||||||||||||||||    |||||||||||.|  
--------------

KeyError: (83, 84)

In [10]:
# For 5C7K (HIV1 Subtype A and B)
parser = PDBParser()
current_seq = 5
# The same script can be used for Subtype B, just changing the current_seq to 5
structure = parser.get_structure(structures[current_seq], structures[current_seq])

# generating a dict with structure info (model_id:model) and inside each model we have (chain_id:chain)
structure_info = dict()
for model in structure:
    model_id = model.get_full_id()[1]
    structure_info[model_id] = dict()
    for chain in model.get_chains():
        chain_id = chain.get_full_id()[2]
        structure_info[model_id][chain_id] = {"residues":[], "seq":""}
        for residue in chain.get_residues():
            if residue.get_id()[0] == " ":
                structure_info[model_id][chain_id]["residues"].append(residue)
                structure_info[model_id][chain_id]["seq"] += protein_letters_3to1[residue.resname]


# chains A, B, E, and F are Homo Sapiens, but chains C and D are HIV
for model in structure_info.keys():
    structure_info[model]["CD"] = dict()
    structure_info[model]["CD"]["seq"] = structure_info[model]["C"]["seq"] + structure_info[model]["D"]["seq"]
    structure_info[model]["CD"]["residues"] = structure_info[model]["C"]["residues"] + structure_info[model]["D"]["residues"]
    print(structure_info[model]["CD"]["seq"])
    
# aligning to the concatenation of C and D
alignments = pairwise2.align.globaldx(my_seqs[current_seq], structure_info[0]["CD"]["seq"], blosum62)

# However all_residues will be different for A and B
all_residues = structure_info[0]["CD"]["residues"]

# matches here is a dictionary that matches a certain aa in the sequence to a residue in the structure
# this is needed because the residue ids in the structure is not the same as the cannonical sequence
# and this mapping is needed
alignment = alignments[0]
seq_pos = -1
residue_pos = -1
matches = dict()
for idx in range(len(alignment[0])):
    if alignment[0][idx] != "-":
        seq_pos += 1
    if alignment[1][idx] != "-":
        residue_pos += 1
    if (alignment[0][idx] != "-") and (alignment[1][idx] != "-"):
        assert alignment[0][idx] == my_seqs[current_seq][seq_pos]
        matches[seq_pos] = (seq_pos, alignment[0][idx], residue_pos,
                        protein_letters_3to1[all_residues[residue_pos].resname])

        
# Loading the epistatic pairs
epi_pairs = []
with open(p_values[current_seq], "r") as in_file:
    next(in_file)
    for l in in_file:
        l = l.split("\t")
        # first and second column of that table
        pos1 = int(l[0])
        pos2 = int(l[1])
        if pos1 > pos2:
            epi_pairs.append((pos1, pos2))
        else:
            epi_pairs.append((pos2, pos1))
            

# for each pair, we calculate the distance accross models and keep the smallest one
chain = "CD"
epi_distances = dict()
for pos1, pos2 in epi_pairs:
    if (pos1 in matches) and (pos2 in matches):
        distance = 2000
        for model in structure_info.values():
#             assert matches[pos1][1] == protein_letters_3to1[model[chain]["residues"][matches[pos1][2]].resname]
            c_dist = structure_info[0][chain]["residues"][matches[pos1][2]]["CA"] - model[chain]["residues"][matches[pos2][2]]["CA"]
            if c_dist < distance:
                distance = c_dist
        epi_distances[(pos1, pos2)] = distance

# getting all pairwise distances
all_distances = dict()
for residue1 in structure_info[0][chain]["residues"]:
    distance = 2000
    for model in structure_info.values():
        for residue2 in model[chain]["residues"]:
            if residue1["CA"] - residue2["CA"] < distance:
                distance = residue1["CA"] - residue2["CA"]
                all_distances[(residue1._id[1], residue2._id[1])] = distance


wilcoxon_R(epi_distances, all_distances, structures[current_seq])



RAENLWVTVYYGVPVWKDAETTLFCASDAKAYETEKHNVWATHACVPTDPNPQEIHLENVTEEFNMWKNNMVEQMHTDIISLWDQSLKPCVKLTPLCVTLQCTNVTDMRGELKNCSFNMTTELRDKKQKVYSLFYRLDVVQINNKEYRLINCNTSAITQACPKVSFEPIPIHYCAPAGFAILKCKDKKFNGTGPCPSVSTVQCTHGIKPVVSTQLLLNGSLAEEEVMIRSENITNNAKNILVQFNTPVQINCTRPNNNTRKSIRIGPGQAFYATGDIIGDIRQAHCNVSKATWNETLGKVVKQLRKHFGNNTIIRFANSSGGDLEVTTHSFNCGGEFFYCNTSGLFNSTWISNTSVNDSITLPCRIKQIINMWQRIGQAMYAPPIQGVIRCVSNITGLILTRDGGSTNSTTETFRPGGGDMRDNWRSELYKYKVVKIEPLGVAPTRCKRRFLGAAGSTMGAASMTLTVQARNLLKLTVWGIKQLQARVLAVERYLRDQQLLGIWGCSGKLICCTNVPWNSSWSNRNLSEIWDNMTWLQWDKEISNYTQIIYGLLEESQNQQEKNEQDLLALD
RAENLWVTVYYGVPVWKDAETTLFCASDAKAYETEKHNVWATHACVPTDPNPQEIHLENVTEEFNMWKNNMVEQMHTDIISLWDQSLKPCVKLTPLCVTLQCTNVTDMRGELKNCSFNMTTELRDKKQKVYSLFYRLDVVQINNKEYRLINCNTSAITQACPKVSFEPIPIHYCAPAGFAILKCKDKKFNGTGPCPSVSTVQCTHGIKPVVSTQLLLNGSLAEEEVMIRSENITNNAKNILVQFNTPVQINCTRPNNNTRKSIRIGPGQAFYATGDIIGDIRQAHCNVSKATWNETLGKVVKQLRKHFGNNTIIRFANSSGGDLEVTTHSFNCGGEFFYCNTSGLFNSTWISNTSVNDSITLPCRIKQIINMWQRIGQAMYAPPIQGVIRCVSNITGLILTRDGGSTNSTTETFRPGGGDMRDNWRS

In [11]:
# For 6MYY (HIV1 Subtype C)
parser = PDBParser()
current_seq = 6
# The same script can be used for Subtype B, just changing the current_seq to 5
structure = parser.get_structure(structures[current_seq], structures[current_seq])
# generating a dict with structure info (model_id:model) and inside each model we have (chain_id:chain)
structure_info = dict()
for model in structure:
    model_id = model.get_full_id()[1]
    structure_info[model_id] = dict()
    for chain in model.get_chains():
        chain_id = chain.get_full_id()[2]
        structure_info[model_id][chain_id] = {"residues":[], "seq":""}
        for residue in chain.get_residues():
            if residue.get_id()[0] == " ":
                structure_info[model_id][chain_id]["residues"].append(residue)
                structure_info[model_id][chain_id]["seq"] += protein_letters_3to1[residue.resname]

    
# aligning to A, because A, B, and E are the same
alignments = pairwise2.align.globaldx(my_seqs[current_seq], structure_info[0]["A"]["seq"], blosum62)

# However all_residues will be different for A and B
all_residues = structure_info[0]["A"]["residues"]

# matches here is a dictionary that matches a certain aa in the sequence to a residue in the structure
# this is needed because the residue ids in the structure is not the same as the cannonical sequence
# and this mapping is needed
alignment = alignments[0]
seq_pos = -1
residue_pos = -1
matches = dict()
for idx in range(len(alignment[0])):
    if alignment[0][idx] != "-":
        seq_pos += 1
    if alignment[1][idx] != "-":
        residue_pos += 1
    if (alignment[0][idx] != "-") and (alignment[1][idx] != "-"):
        assert alignment[0][idx] == my_seqs[current_seq][seq_pos]
        matches[seq_pos] = (seq_pos, alignment[0][idx], residue_pos,
                        protein_letters_3to1[all_residues[residue_pos].resname])

        
# Loading the epistatic pairs
epi_pairs = []
with open(p_values[current_seq], "r") as in_file:
    next(in_file)
    for l in in_file:
        l = l.split("\t")
        # first and second column of that table
        pos1 = int(l[0])
        pos2 = int(l[1])
        if pos1 > pos2:
            epi_pairs.append((pos1, pos2))
        else:
            epi_pairs.append((pos2, pos1))
            
# for each pair we calculate the distance between A
epi_distances = dict()
for pos1, pos2 in epi_pairs:
    if (pos1 in matches) and (pos2 in matches):
        distance = 2000
        for chain in ["A", "B", "E"]:
#         for chain in structure_info.values():
#             assert matches[pos1][1] == protein_letters_3to1[model[chain]["residues"][matches[pos1][2]].resname]
            c_dist = structure_info[0]["A"]["residues"][matches[pos1][2]]["CA"] - structure_info[0][chain]["residues"][matches[pos2][2]]["CA"]
            if c_dist < distance:
                distance = c_dist
        epi_distances[(pos1, pos2)] = distance

all_distances = dict()
for residue1 in structure_info[0]["A"]["residues"]:
    distance = 2000
    for chain in ["A", "B", "E"]:
        for residue2 in structure_info[0][chain]["residues"]:
            if residue1["CA"] - residue2["CA"] < distance:
                distance = residue1["CA"] - residue2["CA"]
                all_distances[(residue1._id[1], residue2._id[1])] = distance

                
wilcoxon_R(epi_distances, all_distances, structures[current_seq])



0