In [1]:
import sys
sys.path.append("../src/")
from pairwiseGlobalAlignment import pairwiseGlobalAlignment
import pandas as pd

In [2]:
# Import sequence alignment data: read the Table S7 training-data CSV into a
# DataFrame. The path is relative, so it is resolved against the kernel's
# current working directory.
trainingDataFeatures = pd.read_csv("../data/Table S7 CS training data with all features.csv")

In [3]:
# Check the shape of the imported data as (rows, columns). This is the cell's
# last expression, so the notebook displays its value.
trainingDataFeatures.shape

(3646, 1387)

"E13083.pdb" and "E13082.pdb" have the same surrounding 21mer sequences, so I'm going to exclude "E13082" to match the output from Table S7. For more information, see the "duplicate_21mers_fromS7.csv" table in this directory.

In [4]:
# Remove E13082 (see note above)
trainingDataFeatures = trainingDataFeatures[trainingDataFeatures["Protein ID"] != "E13082"]

# Get the unique 21mer sequences and their site labels from the SAME rows.
# Deduplicating the two columns independently only lines up by position when
# every 21mer maps to exactly one site (and vice versa); taking both lists from
# one deduplicated frame keeps kmerSequences[i] paired with
# siteByStructureLabels[i] by construction.
uniqueSitePairs = trainingDataFeatures[["Surrounding 21mer", "Site by Structure"]].drop_duplicates()

# Fail loudly if the mapping is not one-to-one: the two lists are consumed
# purely by position, so a silent mismatch would mislabel sequences.
assert uniqueSitePairs["Surrounding 21mer"].is_unique, "a 21mer maps to more than one site"
assert uniqueSitePairs["Site by Structure"].is_unique, "a site maps to more than one 21mer"

kmerSequences = list(uniqueSitePairs["Surrounding 21mer"])
siteByStructureLabels = list(uniqueSitePairs["Site by Structure"])

In [5]:
# Sanity check: there must be exactly one label per unique 21mer. Enforce it
# with an assertion so a full re-run stops here on a mismatch instead of
# depending on someone reading the cell output.
assert len(kmerSequences) == len(siteByStructureLabels), (
    "%d unique 21mers vs %d site labels" % (len(kmerSequences), len(siteByStructureLabels))
)

# Only a cell's last expression is displayed, so show the comparison result here.
len(kmerSequences) == len(siteByStructureLabels)

True

In [6]:
# Get pairwise global alignments of our kmer sequences.
# The second argument is a relative path string; presumably the directory that
# holds the scoring matrices -- TODO confirm against pairwiseGlobalAlignment in ../src/.
alignments = pairwiseGlobalAlignment(kmerSequences, "../matricies/")

In [7]:
# Wrap the first element of the alignment result in a DataFrame and label both
# axes with the site names so scores can be inspected by site pair.
alignmentsdf = pd.DataFrame(alignments[0])
alignmentsdf.columns = siteByStructureLabels

# Use the same labels as the row index, named "Site by Structure".
alignmentsdf = alignmentsdf.set_index(
    pd.Index(siteByStructureLabels, name="Site by Structure")
)

In [14]:
# Take a peek at the alignments: display the top-left 5x5 corner of the frame
alignmentsdf.iloc[0:5,0:5]

Unnamed: 0_level_0,2zjr_C.pdb_K116,2zjr_C.pdb_K129,2zjr_C.pdb_K87,2zjr_C.pdb_P18,2zjr_C.pdb_P20
Site by Structure,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2zjr_C.pdb_K116,26.0,-0.8,1.4,1.0,-0.4
2zjr_C.pdb_K129,-0.8,29.0,1.0,1.6,-0.8
2zjr_C.pdb_K87,1.4,1.0,31.8,1.2,1.6
2zjr_C.pdb_P18,1.0,1.6,1.2,29.8,17.2
2zjr_C.pdb_P20,-0.4,-0.8,1.6,17.2,28.8


In [9]:
# Grab the true alignments from the S7 table and put BOTH rows and columns in
# the order of siteByStructureLabels (the order used for alignmentsdf).
# One row per site: keep the first occurrence of each "Site by Structure".
trainingDataFeatures_unique = trainingDataFeatures.drop_duplicates("Site by Structure")

# Index by site label, then select rows and columns with the same label list.
# This makes the row order explicit instead of relying on drop_duplicates
# happening to return rows in the same order as the label list, and it avoids
# growing a DataFrame one column at a time. A label missing from either axis
# raises a KeyError.
trainingDataFeatures_ordered = (
    trainingDataFeatures_unique
    .set_index("Site by Structure")
    .loc[siteByStructureLabels, siteByStructureLabels]
)

In [17]:
# Take a peek at the alignments from the S7 table.
# Only the last expression of a cell is displayed, so print the shape first and
# leave the 5x5 corner as the final expression so that both are actually shown.
print(trainingDataFeatures_ordered.shape)
trainingDataFeatures_ordered.iloc[0:5, 0:5]

(979, 979)

In [32]:
import numpy as np

# Compare the S7 table scores with the Python alignment scores element-wise.
# Both frames are converted to plain float arrays so the subtraction is purely
# positional (no label alignment) and runs vectorized instead of through one
# .iloc lookup per cell.
s7_scores = np.asarray(trainingDataFeatures_ordered, dtype=float)
python_scores = np.asarray(alignmentsdf, dtype=float)

# Require identical shapes up front so NumPy broadcasting cannot hide a size
# mismatch; this also removes the hard-coded matrix size.
assert s7_scores.shape == python_scores.shape, (s7_scores.shape, python_scores.shape)

# Signed difference (S7 minus Python), same orientation as before.
diff_python_matlab = s7_scores - python_scores

# Report the largest ABSOLUTE deviation: a plain .max() on the signed
# difference ignores negative entries, so a large error in that direction
# would go unnoticed.
print(np.abs(diff_python_matlab).max())

1.4210854715202004e-14
