In [1]:
# Importing required classes
from matplotlib import pyplot as plt
import numpy as np

# Importing main.py
from main import Main

# Importing helper functions from main
from main import init_weight_array_modified
from main import epitope_distance

In [2]:
# Importing Python Files from src Directory
""" Python File Imports"""
from src.pngs import PNGS
from src.panel import Panel
from src.blosum import BLOSUM
from src.weights import Weights
from src.epitope_dist import get_epitope_distance
from src.ic50 import IC50

""" Relative Python Paths """
# Input files, addressed relative to the current working directory.
rel_panel_path = './files/seap2020/136_panel_with_4lts.fa'    # sequence panel
rel_weight_path = './files/seap2020/vrc01_wts.4lts.txt'       # weights
rel_blosum_path = './files/seap2020/BLOSUM62.txt'             # BLOSUM matrix
rel_ic50_path = './files/seap2020/vrc01_ic50.txt'             # IC50 values

""" Instantiating Each Class """
panel = Panel(rel_panel_path)
blosum = BLOSUM(rel_blosum_path)
weights = Weights(rel_weight_path)

# Zero-filled array sized by the length reported by the panel.
seq_length = panel.get_seq_length()
weight_array_modified = np.zeros(seq_length)

# IC50 is given the panel's sequence count reduced by 2.
# NOTE(review): the reason for the "- 2" is not visible here — confirm.
ic50_sequence_count = panel.get_number_of_seq() - 2
ic50 = IC50(rel_ic50_path, ic50_sequence_count)

# print('5 lowest sequences', ic50.get_lowest_ic50_sequences(5))

In [3]:
# Edit this cell to experiment with the inputs passed to epitope_distance.

# Reference sequence: built from the names returned by
# ic50.get_lowest_ic50_sequences(10).
# panel.get_reference_sequence() takes one parameter: a numpy array of the
# names of all the sequences you want to read.
lowest_ic50_names = ic50.get_lowest_ic50_sequences(10)
reference_sequence = panel.get_reference_sequence(lowest_ic50_names)

from scipy.stats import spearmanr

def get_data(ref_seq):
    """Correlate epitope distance with IC50 for one reference sequence.

    Fetches the BLOSUM dictionary and the log10-transformed IC50 weights
    through ``Main``, hands them to ``epitope_distance`` together with
    ``ref_seq``, and runs Spearman's rank correlation on the first two rows
    of the result.

    Parameters
    ----------
    ref_seq
        Reference sequence, forwarded unchanged to ``epitope_distance``.

    Returns
    -------
    tuple
        ``(coef, p)``: Spearman's correlation coefficient and its p-value.
    """
    # 2D result: row 0 holds the epitope distances, row 1 the matching
    # IC50 concentrations.
    distance_vs_ic50 = epitope_distance(
        ref_seq,
        Main.get_blosum_dict(),
        Main.log_base_10(Main.get_ic50_weights()),  # IC50 weights, log base 10
    )
    epitope_distances, ic50_concentrations = distance_vs_ic50[0], distance_vs_ic50[1]

    # Spearman's rank correlation; the coefficient is the value to improve.
    coef, p = spearmanr(epitope_distances, ic50_concentrations)
    return coef, p

In [4]:
# Score the reference sequence and report the Spearman statistics.
test = get_data(reference_sequence)
correlation, significance = test
print('Correlation Coefficient:', correlation)
print('P Value: ', significance)

ep_dist 0.031158147298151834 IC50 Concentration 1.5767903881385104 i 6
ep_dist 0.05551982401548036 IC50 Concentration 1.6989700043360187 i 18
ep_dist 0.06465122122202956 IC50 Concentration 1.6989700043360187 i 37
ep_dist 0.04451873509620585 IC50 Concentration 1.6989700043360187 i 38
ep_dist 0.034694557609772164 IC50 Concentration 1.6989700043360187 i 39
ep_dist 0.05716715686538226 IC50 Concentration 1.6989700043360187 i 51
ep_dist 0.030904341667618157 IC50 Concentration 1.6989700043360187 i 79
ep_dist 0.04097860502574471 IC50 Concentration 1.6989700043360187 i 80
ep_dist 0.06254081992387167 IC50 Concentration 1.6989700043360187 i 108
ep_dist 0.05711080553048442 IC50 Concentration 1.6989700043360187 i 113
ep_dist 0.016389492506742598 IC50 Concentration 1.6989700043360187 i 121
ep_dist 0.036455582113197466 IC50 Concentration 1.6989700043360187 i 125
Correlation Coefficient: 0.11685258288191519
P Value:  0.17548402884339026


In [5]:
# Sweep the argument passed to ic50.get_lowest_ic50_sequences() and record the
# Spearman coefficient obtained for the reference sequence built at each value.
ans_list = []

for i in range(1, 20):  # widen the sweep by raising the bound, e.g. range(1, 135)
    new_ref_seq = panel.get_reference_sequence(ic50.get_lowest_ic50_sequences(i))
    raw_data = get_data(new_ref_seq)
    spearman, p_value = raw_data
    ans_list.append(spearman)

print('ans_list', ans_list)

# Find the largest coefficient and where it occurs.
# The search starts from -inf rather than 0 so that a run in which every
# coefficient is negative still reports a real entry of ans_list instead of a
# fabricated maximum of 0 at index 0. NaN coefficients never compare greater,
# so they are skipped; ties keep the first occurrence.
# Note: the sweep starts at i = 1, so list index k corresponds to i = k + 1.
maxCoef = float('-inf')
index = None
for position, coefficient in enumerate(ans_list):
    if coefficient > maxCoef:
        maxCoef = coefficient
        index = position

print('Max Spearman Correlation Coefficient', maxCoef, ' and its Index is', index)

ep_dist 0.021662457572168003 IC50 Concentration 1.5767903881385104 i 6
ep_dist 0.07502270739737928 IC50 Concentration 1.6989700043360187 i 18
ep_dist 0.0728249795393046 IC50 Concentration 1.6989700043360187 i 37
ep_dist 0.03189079002184811 IC50 Concentration 1.6989700043360187 i 38
ep_dist 0.05327277913759268 IC50 Concentration 1.6989700043360187 i 39
ep_dist 0.049640444795071514 IC50 Concentration 1.6989700043360187 i 51
ep_dist 0.03471586822744843 IC50 Concentration 1.6989700043360187 i 79
ep_dist 0.04481150822593312 IC50 Concentration 1.6989700043360187 i 80
ep_dist 0.08229184571971157 IC50 Concentration 1.6989700043360187 i 108
ep_dist 0.0745853530195924 IC50 Concentration 1.6989700043360187 i 113
ep_dist 0.04983551586659338 IC50 Concentration 1.6989700043360187 i 121
ep_dist 0.07454382374781028 IC50 Concentration 1.6989700043360187 i 125
ep_dist 0.02009239314892336 IC50 Concentration 1.5767903881385104 i 6
ep_dist 0.06662512128723988 IC50 Concentration 1.6989700043360187 i 18
ep_d

ep_dist 0.05716715686538226 IC50 Concentration 1.6989700043360187 i 51
ep_dist 0.030904341667618157 IC50 Concentration 1.6989700043360187 i 79
ep_dist 0.04097860502574471 IC50 Concentration 1.6989700043360187 i 80
ep_dist 0.06254081992387167 IC50 Concentration 1.6989700043360187 i 108
ep_dist 0.05711080553048442 IC50 Concentration 1.6989700043360187 i 113
ep_dist 0.016389492506742598 IC50 Concentration 1.6989700043360187 i 121
ep_dist 0.036455582113197466 IC50 Concentration 1.6989700043360187 i 125
ep_dist 0.031158147298151834 IC50 Concentration 1.5767903881385104 i 6
ep_dist 0.05551982401548036 IC50 Concentration 1.6989700043360187 i 18
ep_dist 0.06465122122202956 IC50 Concentration 1.6989700043360187 i 37
ep_dist 0.04451873509620585 IC50 Concentration 1.6989700043360187 i 38
ep_dist 0.034694557609772164 IC50 Concentration 1.6989700043360187 i 39
ep_dist 0.05716715686538226 IC50 Concentration 1.6989700043360187 i 51
ep_dist 0.030904341667618157 IC50 Concentration 1.6989700043360187 i 