In [93]:
import numpy as np
import pandas as pd
import random
import math
from time import time
import matplotlib.pyplot as plt

In [58]:
peptides_file = r'C:\Users\kcf\Desktop\DTU\22125 Algorithms in bioinformatics Jun 20\Project\predictions_smm' + '\predictions_smm.txt'
#peptides_file = data_dir + "PSSM/A0201.small_lig"
#peptides_file = data_dir + "PSSM/A0201.large_lig"

data = pd.read_csv(peptides_file, sep='\t')
random.seed(42)
#peptides = np.loadtxt(peptides_file, dtype=str).tolist()

In [59]:
print(data.head())
print(data.tail())

      species       allele  length  cv     sequence inequality      ic50  \
0  chimpanzee  Patr A*0901      11   0  ACISSEATTPV          =  221.2610   
1  chimpanzee  Patr A*0901      11   0  AQISSEATTPV          =   41.1889   
2  chimpanzee  Patr A*0901      11   0  ARISSEATTPV          =  621.9980   
3  chimpanzee  Patr A*0901      11   0  AYESSEATTPV          =  421.1170   
4  chimpanzee  Patr A*0901      11   0  AYFSSEATTPV          =   54.9194   

     smm  
0   53.8  
1   53.8  
2   53.8  
3  166.0  
4  166.0  
      species  allele  length  cv   sequence inequality         ic50  \
48823   mouse  H-2 Ld       9   4  SLSAYIIRV          >  77142.90000   
48824   mouse  H-2 Ld       9   4  SPTVWLSVI          =  27000.00000   
48825   mouse  H-2 Ld       9   4  SSYRRPVGI          >  77142.90000   
48826   mouse  H-2 Ld       9   4  SYIPSAEKI          =  30816.10000   
48827   mouse  H-2 Ld       9   4  YPHYMPTNL          =      9.64286   

             smm  
48823    42442.4  
48824 

In [60]:
len(data)

48828

In [61]:
data_dir = r'C:\Users\kcf\Desktop\DTU\22125 Algorithms in bioinformatics Jun 20\data'

In [62]:
alphabet_file = data_dir + r"\Matrices\alphabet"
alphabet = np.loadtxt(alphabet_file, dtype=str)

#alphabet

bg_file = data_dir + r"\Matrices\bg.freq.fmt"
_bg = np.loadtxt(bg_file, dtype=float)

bg = {}
for i in range(0, len(alphabet)):
    bg[alphabet[i]] = _bg[i]

bg

blosum_file = data_dir + r"\Matrices\blosum62.freq_rownorm"
_blosum62 = np.loadtxt(blosum_file, dtype=float).reshape((20, 20)).T

blosum62 = {}

for i, letter_1 in enumerate(alphabet):
    
    blosum62[letter_1] = {}

    for j, letter_2 in enumerate(alphabet):
        
        blosum62[letter_1][letter_2] = _blosum62[i, j]

#blosum62

In [63]:
def initialize_matrix(core_len, alphabet):

    init_matrix = [0]*core_len

    for i in range(0, core_len):

        row = {}

        for letter in alphabet: 
            row[letter] = 0.0

        #fancy way:  row = dict( zip( alphabet, [0.0]*len(alphabet) ) )

        init_matrix[i] = row
        
    return init_matrix



def put_to_zero(matrix):
    
    for i in range(0, len(matrix)):

        for key in matrix[i].keys():
        
            matrix[i][key] = 0.0
    
    return matrix


            
def get_log_odds(peptides, alphabet, bg, scoring_scheme, core_len, c_matrix, f_matrix, g_matrix, p_matrix, w_matrix):

    # Amino Acid Count Matrix (c)

    c_matrix = put_to_zero(c_matrix)

    for position in range(0, core_len):

        # peptides has two elements; element[0] is the peptide sequence, element [1] is the core location
        for element in peptides:
            
            peptide = element[0]
            
            core_start = element[1]
            
            c_matrix[position][peptide[core_start+position]] += 1

            
    # Sequence Weighting
    weights = {}
    sequence_weighting = True
    #sequence_weighting = False

    for element in peptides:

        peptide = element[0]
        core_start = element[1]
            
        # apply sequence weighting
        if sequence_weighting:

            w = 0.0
            neff = 0.0

            for position in range(0, core_len):

                r = 0

                for letter in alphabet:        

                    if c_matrix[position][letter] != 0:

                        r += 1

                s = c_matrix[position][peptide[core_start+position]]

                w += 1.0/(r * s)

                neff += r

            neff = neff / core_len

        # do not apply sequence weighting
        else:

            w = 1  

            neff = len(peptides)  


        weights[peptide] = w

    
    # Observed Frequencies Matrix (f)
    f_matrix = put_to_zero(f_matrix)

    for position in range(0, core_len):

        n = 0;

        for element in peptides:

            peptide = element[0]
            
            core_start = element[1]
              
            f_matrix[position][peptide[core_start+position]] += weights[peptide]

            n += weights[peptide]

        for letter in alphabet: 

            f_matrix[position][letter] = f_matrix[position][letter]/n

    
    # Pseudo Frequencies Matrix (g)
    g_matrix = put_to_zero(g_matrix)

    for position in range(0, core_len):

        for letter_1 in alphabet:
            
            for letter_2 in alphabet:

                 g_matrix[position][letter_1] += f_matrix[position][letter_2] * scoring_scheme[letter_1][letter_2]

                    
    # Combined Frequencies Matrix (p)

    alpha = neff - 1
    beta = 50

    for position in range(0, core_len):

        for letter in alphabet:

            num = alpha*f_matrix[position][letter] + beta*g_matrix[position][letter]
            
            den = alpha + beta

            p_matrix[position][letter] = num / den
        

    # Log Odds Weight Matrix (w)
    for position in range(0, core_len):

        for letter in alphabet:

            if p_matrix[position][letter] != 0:
                
                w_matrix[position][letter] = math.log(p_matrix[position][letter]/bg[letter])/math.log(2)
    
    # Calculate the overall score of the peptides to the LO matrix
    _sum = 0
    for position in range(0, core_len):
        for letter in alphabet:
            _sum += f_matrix[position][letter] * w_matrix[position][letter]
    
    return w_matrix, _sum, p_matrix

In [64]:
def to_psi_blast(matrix):

    header = ["", "A", "R", "N", "D", "C", "Q", "E", "G", "H", "I", "L", "K", "M", "F", "P", "S", "T", "W", "Y", "V"]

    print('{:>4} {:>8} {:>8} {:>8} {:>8} {:>8} {:>8} {:>8} {:>8} {:>8} {:>8} {:>8} {:>8} {:>8} {:>8} {:>8} {:>8} {:>8} {:>8} {:>8} {:>8}'.format(*header)) 

    letter_order = ["A", "R", "N", "D", "C", "Q", "E", "G", "H", "I", "L", "K", "M", "F", "P", "S", "T", "W", "Y", "V"]

    for i, row in enumerate(matrix):

        scores = []

        scores.append(str(i+1) + " A")

        for letter in letter_order:

            score = row[letter]

            scores.append(round(score, 4))

        print('{:>4} {:>8} {:>8} {:>8} {:>8} {:>8} {:>8} {:>8} {:>8} {:>8} {:>8} {:>8} {:>8} {:>8} {:>8} {:>8} {:>8} {:>8} {:>8} {:>8} {:>8}'.format(*scores)) 


In [65]:
def score_peptide(peptide, core_start, core_len, matrix):
    acum = 0
    for i in range(0, core_len):
        acum += matrix[i][peptide[i+core_start]]
    return acum

In [66]:
def initialize():
    
    # define M temperature steps, from T_i to T_f

    #core_len = min_pep_len
    core_len = 9

    T_i = 0.1
    T_f = 0.0001
    T_steps = 10
    T_delta = (T_f - T_i) / T_steps
    
    T = np.linspace(T_i,T_f,T_steps )
    iters_per_point = 4
    iters = len(peptides)*iters_per_point

    return T, iters

In [91]:
peptides = []
core_len=9
discard=0
for index, row in data.iterrows():
    if (len(row['sequence']) >= core_len) and (row['ic50']<100) and (row['allele']=='HLA A*0201'):
        peptides.append(row['sequence'])
    else:
        #print ("Peptide length too short discard", row['sequence'])
        discard+=1
print ('{} peptides discarded'.format(discard))
print ('{} peptides longer than {} amino acids and IC50 < 100'.format(len(peptides),core_len-1))

peptides = sorted(peptides, key=len)
min_pep_len = len(peptides[0])
max_pep_len = len(peptides[-1])

    # random core start
np.random.shuffle(peptides)
cores_start = [0]*len(peptides)



for i in range(0, len(cores_start)):
    if len(peptides[i]) != core_len:
        min_core_start = 0
        max_core_start = len(peptides[i]) - core_len
        cores_start[i] = random.randint(min_core_start, max_core_start)
        
#peptides = list(zip(peptides, cores_start))
peptides = list(map(list,zip(peptides, cores_start)))

47707 peptides discarded
1121 peptides longer than 8 amino acids and IC50 < 100


In [92]:
debug = False
#debug = True
import copy
from math import floor
random.seed(42)
to_keep=10
p_size=50
class one_of_many():
    def __init__(self,peptides):
        self.peptides=copy.deepcopy(peptides)
        self.score=-100
        self.get_score()
        
        
    def get_score(self):
        log_odds_matrix, self.score, _ = get_log_odds(self.peptides, alphabet, bg, blosum62, core_len, c_matrix, f_matrix, g_matrix, p_matrix, w_matrix)
    
    def mutate(self,peptides=None, mutation_rate=0.5):
        self.mu_r=mutation_rate
        if peptides==None:
            for i in range(len(self.peptides)):
                if random.uniform(0,1)<=self.mu_r:
                    max_core_start = len(self.peptides[i][0]) - core_len
                    self.peptides[i][1]=random.randint(0, max_core_start)
            return
        for i in range(len(self.peptides)):
            if random.uniform(0,1)<=self.mu_r:
                max_core_start = len(self.peptides[i][0]) - core_len
                self.peptides[i][1]=random.randint(0, max_core_start)
            else:
                self.peptides[i][1]=peptides[i][1]
        self.get_score()
        
    def splice(self,peptides_1, peptides_2,mutation_rate=0.0):
        size=len(peptides_1)
        split_at=random.randint(0,size)
        for i in range (0,size):
            
            if i <split_at:
                self.peptides[i][1]=peptides_1[i][1]
            else:
                self.peptides[i][1]=peptides_2[i][1]
        if mutation_rate!=0.0:
            self.mutate(mutation_rate=mutation_rate)
        self.get_score()    
    
population=[]
for i in range(p_size):
    
    population.append(one_of_many(peptides))
    population[-1].mutate(mutation_rate=1.0)
    
population.sort(key=lambda tup: tup.score, reverse=True)    
kld = []
iter_=2000
t_start=time()
mr=0.2
for i in range (iter_):
    for j in range (to_keep,p_size):
        if j<int(to_keep*3):
            #partner_1=random.randint(0,to_keep)
            #partner_2=random.randint(0,to_keep)
            #population[j].splice(population[partner_1].peptides,population[partner_2].peptides,mutation_rate=0.02)
            if bool(random.getrandbits(1)):
                partner_2=random.randint(0,to_keep)
                population[j].splice(population[0].peptides,population[partner_2].peptides,mutation_rate=0.02)
            else:
                partner_1=random.randint(0,to_keep)
                population[j].splice(population[partner_1].peptides,population[0].peptides,mutation_rate=0.02)
            
            #print (population[j].score,population[partner_1].score,population[partner_2].score)
        else:
            #parent=random.randint(0,to_keep)
            #population[j].mutate(population[parent].peptides,mutation_rate=mr)
            population[j].mutate(population[0].peptides,mutation_rate=mr)
    population.sort(key=lambda tup: tup.score, reverse=True)
    kld.append(population[0].score)
    if i%50==0:
        print ('Time: {:.1f} min\tIteration: {}\tscore: {:.2f}'.format((time()-t_start)/60,i,population[0].score))
        #mr=0.4*0.9
print ('Time: {:.1f} min\tIteration: {}\tscore: {:.2f}'.format((time()-t_start)/60,i,population[0].score))

Time: 0.1 min	Iteration: 0	score: 2.72
Time: 2.8 min	Iteration: 50	score: 3.07
Time: 6.1 min	Iteration: 100	score: 3.14
Time: 8.9 min	Iteration: 150	score: 3.16
Time: 11.6 min	Iteration: 200	score: 3.18
Time: 14.3 min	Iteration: 250	score: 3.20
Time: 17.7 min	Iteration: 300	score: 3.20
Time: 20.5 min	Iteration: 350	score: 3.21
Time: 23.3 min	Iteration: 400	score: 3.21
Time: 28.4 min	Iteration: 450	score: 3.21
Time: 34.2 min	Iteration: 500	score: 3.22
Time: 41.2 min	Iteration: 550	score: 3.22
Time: 47.0 min	Iteration: 600	score: 3.22
Time: 52.9 min	Iteration: 650	score: 3.22
Time: 58.9 min	Iteration: 700	score: 3.22
Time: 64.7 min	Iteration: 750	score: 3.22
Time: 70.8 min	Iteration: 800	score: 3.22


KeyboardInterrupt: 

In [90]:
#core_len = min_pep_len
core_len = 9


T, iters = initialize()

c_matrix = initialize_matrix(core_len, alphabet)
f_matrix = initialize_matrix(core_len, alphabet)
g_matrix = initialize_matrix(core_len, alphabet)
p_matrix = initialize_matrix(core_len, alphabet)
w_matrix = initialize_matrix(core_len, alphabet)

np.random.seed( 1 ) 

log_odds_matrix, peptide_scores, _ = get_log_odds(peptides, alphabet, bg, blosum62, core_len, c_matrix, f_matrix, g_matrix, p_matrix, w_matrix)

debug = False
#debug = True

kld = []

print( "Initial KLD score: " + str(peptide_scores))
kld.append( peptide_scores )

t0 = time()

for t in T:

    for i in range(0, iters):
            
        # extract peptide
        rand_index = random.randint(0,len(peptides)-1)
        peptide = peptides[rand_index][0]
        core_start_original = peptides[rand_index][1]

        # print stuff
        if debug:
            print("")
            print("------------")
            print("T: " + str(t) + ", i: " + str(i))
            print("------------")
            print("Peptide: " + str(peptide)),
            print("Core start: " + str(core_start_original) + " (" + peptide[core_start_original:core_start_original+core_len] + ")")


        if len(peptide) != core_len:
                
            max_core_start = len(peptide) - core_len

            core_start_shifted = random.randint(0, max_core_start)
                 
            #if debug: print("Shifted core start: " + str(peptide) + " " + str(core_start_shifted) + " (" + peptide[core_start_shifted:core_start_shifted+core_len] +")")
                
            # remove peptide from list
            peptides.remove(peptides[rand_index])

            # get base log_odds
            log_odds_matrix, peptide_scores, p_matrix = get_log_odds(peptides, alphabet, bg, blosum62, core_len, c_matrix, f_matrix, g_matrix, p_matrix, w_matrix)
            #pprint(log_odds_matrix)
                
            # score peptide against log_odds
            e_original = score_peptide(peptide, core_start_original, core_len, log_odds_matrix)
            if debug: print("Energy before shifting: " + str(e_original))

            # score shifted peptide against log_odds
            e_shift = score_peptide(peptide, core_start_shifted, core_len, log_odds_matrix)
            if debug: print("Energy after shifting: " + str(e_shift))

            # energy differential
            de = e_shift-e_original
            if debug: print("Energy differential: " + str(de))
    
            # probability of accepting move
            if ( de > 0):
                p = 1
            else:
                p = np.exp(de/t)
        
            if debug: print("Probability of shifting peptide: " + str(p))
    
            # throw coin
            coin = np.random.uniform(0.0, 1.0, 1)[0]
            if debug: print("RNG: " + str(coin))

            if coin < p:
                if debug: print("RNG < P, Move accepted")
                peptides.append((peptide, core_start_shifted))
                kld.append(peptide_scores)

            else:
                if debug: print("RNG >= P, Move rejected")
                peptides.append((peptide, core_start_original))      
                      
        else:
            if debug: print("Can't shift peptide, it is a " + str(core_len) + "mer")
              
    print( "KLD score t: " + str(t) + " KLD: " + str(peptide_scores))
    
t1 = time()

print("Time elapsed (m):", (t1-t0)/60)

Initial KLD score: 2.687991742662903
KLD score t: 0.1 KLD: 3.0417393959487864
KLD score t: 0.0889 KLD: 3.1255645963486827
KLD score t: 0.07780000000000001 KLD: 3.1313330112220013
KLD score t: 0.06670000000000001 KLD: 3.128789482190224
KLD score t: 0.055600000000000004 KLD: 3.136382045030785
KLD score t: 0.044500000000000005 KLD: 3.1287399667641136
KLD score t: 0.0334 KLD: 3.1305132981050328


KeyboardInterrupt: 

In [84]:
peptide_scores

1.1009358445386024