# Hobohm 1

## Python Imports

In [161]:
import numpy as np
from time import time
from math import sqrt
import pandas as pd
import glob
from scipy.spatial.distance import cosine
from sklearn.metrics.pairwise import cosine_similarity

## Data Imports

## DEFINE THE PATH TO YOUR COURSE DIRECTORY

In [15]:
# Root of the course data directory; the alphabet and BLOSUM50 files are
# read from data_dir + "Matrices/...". Keep the trailing slash, because the
# paths are built by plain string concatenation.
data_dir = "data/"

In [16]:
# Amino-acid alphabet: the letters, in the order used by the matrix file.
alphabet_file = data_dir + "Matrices/alphabet"
alphabet = np.loadtxt(alphabet_file, dtype=str)

# Raw BLOSUM50 scores as an integer array (transposed after loading).
blosum_file = data_dir + "Matrices/BLOSUM50"
_blosum50 = np.loadtxt(blosum_file, dtype=int).T

# Nested lookup table: blosum50[letter_1][letter_2] -> substitution score.
blosum50 = {
    row_letter: {
        column_letter: _blosum50[row, column]
        for column, column_letter in enumerate(alphabet)
    }
    for row, row_letter in enumerate(alphabet)
}

blosum50

{'A': {'A': 5,
  'R': -2,
  'N': -1,
  'D': -2,
  'C': -1,
  'Q': -1,
  'E': -1,
  'G': 0,
  'H': -2,
  'I': -1,
  'L': -2,
  'K': -1,
  'M': -1,
  'F': -3,
  'P': -1,
  'S': 1,
  'T': 0,
  'W': -3,
  'Y': -2,
  'V': 0},
 'R': {'A': -2,
  'R': 7,
  'N': -1,
  'D': -2,
  'C': -4,
  'Q': 1,
  'E': 0,
  'G': -3,
  'H': 0,
  'I': -4,
  'L': -3,
  'K': 3,
  'M': -2,
  'F': -3,
  'P': -3,
  'S': -1,
  'T': -1,
  'W': -3,
  'Y': -1,
  'V': -3},
 'N': {'A': -1,
  'R': -1,
  'N': 7,
  'D': 2,
  'C': -2,
  'Q': 0,
  'E': 0,
  'G': 0,
  'H': 1,
  'I': -3,
  'L': -4,
  'K': 0,
  'M': -2,
  'F': -4,
  'P': -2,
  'S': 1,
  'T': 0,
  'W': -4,
  'Y': -2,
  'V': -3},
 'D': {'A': -2,
  'R': -2,
  'N': 2,
  'D': 8,
  'C': -4,
  'Q': 0,
  'E': 2,
  'G': -1,
  'H': -1,
  'I': -4,
  'L': -4,
  'K': -1,
  'M': -4,
  'F': -5,
  'P': -1,
  'S': 0,
  'T': -1,
  'W': -5,
  'Y': -3,
  'V': -4},
 'C': {'A': -1,
  'R': -4,
  'N': -2,
  'D': -4,
  'C': 13,
  'Q': -3,
  'E': -3,
  'G': -3,
  'H': -3,
  'I': -2,
  'L'

### Sequences

In [76]:
def load_CSV_sequences(alm_file):
    """Read a CSV file into a pandas DataFrame.

    Parameters
    ----------
    alm_file : str or file-like
        Anything accepted by ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed table, exactly as returned by ``pandas.read_csv``.
    """
    return pd.read_csv(alm_file)

In [149]:
def encode(peptides, encoding_scheme, alphabet):
    """Turn peptides into flat numeric vectors using a substitution table.

    Every residue of a peptide is replaced by its row of
    ``encoding_scheme`` (one value per letter of ``alphabet``, in alphabet
    order); the rows are concatenated and a constant ``1`` (bias term) is
    appended at the end.

    Parameters
    ----------
    peptides : iterable of str
        Sequences to encode.
    encoding_scheme : mapping
        Nested mapping indexed as ``encoding_scheme[residue][letter]``.
    alphabet : iterable of str
        Letters that define the columns of each residue's encoding.

    Returns
    -------
    numpy.ndarray
        One row per peptide, built with ``np.array`` from the list of
        encoded peptides.
    """
    rows = []

    for sequence in peptides:
        # residue-major order: all alphabet columns of residue 0, then residue 1, ...
        row = [
            encoding_scheme[residue][column]
            for residue in sequence
            for column in alphabet
        ]
        row.append(1)  # bias term
        rows.append(row)

    return np.array(rows)

In [165]:
def homology_function(alignment_length, matches, threshold, peptide1 = None, peptide2 = None):
    """Decide whether two sequences are homologous.

    Two rules are implemented:

    * Both peptides given and of equal length: the peptides are encoded
      with ``encode`` (using the notebook-level ``blosum50`` and
      ``alphabet``) and compared with ``scipy.spatial.distance.cosine``.
      The pair is homologous when that distance is below ``threshold``.
    * Any other case: the pair is homologous when ``matches`` is greater
      than ``2.9 * sqrt(alignment_length)``. ``threshold`` is not used on
      this path.

    Parameters
    ----------
    alignment_length : int
        Length of the alignment between the two sequences.
    matches : int
        Number of identical positions in that alignment.
    threshold : float
        Cut-off for the cosine distance (first rule only).
    peptide1, peptide2 : str, optional
        The two sequences; both are needed for the first rule.

    Returns
    -------
    tuple
        ``("discard", score)`` for a homologous pair and
        ``("keep", score)`` otherwise. ``score`` is the cosine distance
        under the first rule and the match cut-off
        ``2.9 * sqrt(alignment_length)`` under the second, so the two are
        not on the same scale.
    """
    # Identity checks with `and`: `!= None` combined with bitwise `&`
    # compares element-wise (and then fails in `if`) for array-like input.
    same_length_pair = (
        peptide1 is not None
        and peptide2 is not None
        and len(peptide1) == len(peptide2)
    )

    if same_length_pair:
        encoded_1, encoded_2 = encode([peptide1, peptide2], blosum50, alphabet)
        homology_score = cosine(np.array(encoded_1), np.array(encoded_2))

        # small distance = similar peptides
        if homology_score < threshold:
            return "discard", homology_score
        return "keep", homology_score

    # Alignment-based rule for missing or unequal-length peptides.
    homology_score = 2.9 * sqrt(alignment_length)

    if matches > homology_score:
        return "discard", homology_score
    return "keep", homology_score

## Smith-Waterman O2

### This code is identical to the code you wrote the other day

In [18]:
def smith_waterman(query, database, scoring_scheme, gap_open, gap_extension):
    """Align ``query`` against ``database`` and trace the alignment back.

    Runs ``smith_waterman_alignment`` to fill the matrices, then hands the
    D and E matrices together with the coordinates of the best score to
    ``smith_waterman_traceback``.

    Returns
    -------
    tuple
        ``(aligned_query, aligned_database, matches)`` as produced by
        ``smith_waterman_traceback``.
    """
    filled = smith_waterman_alignment(query, database, scoring_scheme, gap_open, gap_extension)

    # P, Q and the best score itself are not needed for the traceback.
    _, _, D_matrix, E_matrix, i_max, j_max, _ = filled

    aligned_query, aligned_database, matches = smith_waterman_traceback(
        E_matrix, D_matrix, i_max, j_max, query, database, gap_open, gap_extension
    )

    return aligned_query, aligned_database, matches


def smith_waterman_alignment(query, database, scoring_scheme, gap_open, gap_extension):
    """Fill the Smith-Waterman matrices using separate gap-open/extension costs.

    The matrices have shape ``(len(query)+1, len(database)+1)`` and are
    filled from the bottom-right towards the top-left: cell ``(i, j)`` is
    computed from cells ``(i+1, j)``, ``(i, j+1)`` and ``(i+1, j+1)``.

    Parameters
    ----------
    query, database : str
        The two sequences.
    scoring_scheme : mapping
        Nested mapping indexed as ``scoring_scheme[query[i]][database[j]]``.
    gap_open, gap_extension : int
        Values *added* to a score when a gap is opened or extended
        (the notebook passes negative numbers).

    Returns
    -------
    tuple
        ``(P_matrix, Q_matrix, D_matrix, E_matrix, i_max, j_max, max_score)``.
        ``D_matrix`` holds the scores clamped at 0, ``E_matrix`` holds the
        direction code of each cell (0 = stop, 1 = diagonal, 2/3 = gap
        open/extension in database, 4/5 = gap open/extension in query).
        ``i_max``/``j_max`` locate the largest unclamped score
        ``max_score``. For an empty ``query`` or ``database`` no cell is
        evaluated and ``(0, 0, -9)`` is returned for the last three values.
    """
    # Matrix dimensions
    M = len(query)
    N = len(database)

    # Built-in int: the np.int alias was removed in NumPy 1.24.
    D_matrix = np.zeros((M+1, N+1), int)
    P_matrix = np.zeros((M+1, N+1), int)
    Q_matrix = np.zeros((M+1, N+1), int)

    # Direction codes; kept as an object array as before.
    E_matrix = np.zeros((M+1, N+1), dtype=object)

    # Best cell so far. Both coordinates are initialised so that empty
    # input returns instead of failing on an unbound j coordinate.
    D_matrix_max_score = -9
    D_matrix_i_max, D_matrix_j_max = 0, 0

    for i in range(M-1, -1, -1):
        for j in range(N-1, -1, -1):

            # Q_matrix[i,j]: best score ending with a gap in the database
            gap_open_database = D_matrix[i+1,j] + gap_open
            gap_extension_database = Q_matrix[i+1,j] + gap_extension
            Q_matrix[i,j] = max(gap_open_database, gap_extension_database)

            # P_matrix[i,j]: best score ending with a gap in the query
            gap_open_query = D_matrix[i,j+1] + gap_open
            gap_extension_query = P_matrix[i,j+1] + gap_extension
            P_matrix[i,j] = max(gap_open_query, gap_extension_query)

            # aligning query[i] with database[j]
            diagonal_score = D_matrix[i+1,j+1] + scoring_scheme[query[i]][database[j]]

            # The list order decides ties: max() returns the first maximum.
            candidates = [(1, diagonal_score),
                          (2, gap_open_database),
                          (4, gap_open_query),
                          (3, gap_extension_database),
                          (5, gap_extension_query)]

            direction, max_score = max(candidates, key=lambda x: x[1])

            # Local alignment: non-positive scores are reset to 0 / "stop".
            if max_score > 0:
                E_matrix[i,j] = direction
                D_matrix[i, j] = max_score
            else:
                E_matrix[i,j] = 0
                D_matrix[i, j] = 0

            # track the global maximum (strict '>' keeps the first one found)
            if max_score > D_matrix_max_score:
                D_matrix_max_score = max_score
                D_matrix_i_max = i
                D_matrix_j_max = j

    return P_matrix, Q_matrix, D_matrix, E_matrix, D_matrix_i_max, D_matrix_j_max, D_matrix_max_score


def smith_waterman_traceback(E_matrix, D_matrix, i_max, j_max, query, database, gap_open, gap_extension):
    """Rebuild the alignment that starts at ``(i_max, j_max)``.

    Starting from ``(i_max, j_max)`` the function follows the direction
    codes in ``E_matrix`` with increasing ``i`` (query index) and ``j``
    (database index) until it meets a 0 entry or reaches the end of either
    sequence.

    Direction codes read from ``E_matrix``:
    0 = stop, 1 = match/mismatch, 2 = gap opening in database,
    3 = gap extension in database, 4 = gap opening in query,
    5 = gap extension in query.

    Parameters
    ----------
    E_matrix, D_matrix
        Direction and score matrices, indexed as ``[i, j]``.
    i_max, j_max : int
        Cell at which the traceback starts.
    query, database : str
        The two sequences.
    gap_open, gap_extension
        Gap values; used only to recover the length of an extended gap.

    Returns
    -------
    tuple
        ``(aligned_query, aligned_database, matches)``: two lists of single
        characters ("-" marks a gap) and the number of aligned positions
        where both sequences carry the same character.
    """
    
    # Matrix dimensions
    M = len(query)
    N = len(database)
    
    # aligned query string
    aligned_query = []
    
    # aligned database string
    aligned_database = []
    
    # total identical matches
    matches = 0

        
    # start from max_i, max_j
    i, j = i_max, j_max
    # NOTE: the direction tests below are independent `if`s, not `elif`s, so
    # after one step has moved (i, j) the following tests look at the new
    # cell and several steps may be handled in a single pass of this loop.
    while i < M and j < N:

        # E[i,j] = 0, stop back tracking
        if E_matrix[i, j] == 0:
            break
        
        # E[i,j] = 1, match
        if E_matrix[i, j] == 1:
            aligned_query.append(query[i])
            aligned_database.append(database[j])
            # only identical residues count as matches
            if ( query[i] == database[j]):
                matches += 1
            i += 1
            j += 1
        
        
        # E[i,j] = 2, gap opening in database
        if E_matrix[i, j] == 2:
            aligned_database.append("-")
            aligned_query.append(query[i])
            i += 1

            
        # E[i,j] = 3, gap extension in database
        if E_matrix[i, j] == 3:
                   
            # shortest extended gap has length 2: one opening + one extension
            count = i + 2
            score = D_matrix[count, j] + gap_open + gap_extension
            
            # Find length of gap: advance `count` until the score rebuilt
            # from D[count, j] equals D[i, j] (squared difference < 1e-5)
            while((score - D_matrix[i, j])*(score - D_matrix[i, j]) >= 0.00001):   
                count += 1
                score = D_matrix[count, j] + gap_open + (count-i-1)*gap_extension

            # range(i, count) is evaluated once, so changing i inside the
            # loop does not alter the number of gap positions emitted
            for k in range(i, count):
                aligned_database.append("-")
                aligned_query.append(query[i])
                i += 1
            
            
        # E[i,j] = 4, gap opening in query
        if E_matrix[i, j] == 4:
            aligned_query.append("-")
            aligned_database.append(database[j])
            j += 1
        
        
        # E[i,j] = 5, gap extension in query
        if E_matrix[i, j] == 5:
             
            # shortest extended gap has length 2: one opening + one extension
            count = j + 2
            score = D_matrix[i, count] + gap_open + gap_extension
            
            # Find length of gap (same search as for code 3, along j).
            # NOTE(review): the tolerance here is 1e-4 while the database
            # branch uses 1e-5 -- confirm whether the difference is intended.
            while((score - D_matrix[i, j])*(score - D_matrix[i, j]) >= 0.0001): 
                count += 1
                score = D_matrix[i, count] + gap_open + (count-j-1)*gap_extension

            for k in range(j, count):
                aligned_query.append("-")
                aligned_database.append(database[j])
                j += 1

                
    return aligned_query, aligned_database, matches

## Hobohm 1

### Similarity Function

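### The similarity threshold is applied by `homology_function`, defined in the Sequences section above; the cell below keeps the original alignment-only version, commented out, for reference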

In [116]:
#def homology_function(alignment_length, matches):
#
#    homology_score = 2.9 * sqrt(alignment_length) # FIX FOR SHORT PEPTIDES
#    
#    if matches > homology_score: ## Add the inequally sign
#        return "discard", homology_score
#    else:
#        return "keep", homology_score

### Main Loop

In [None]:
# Hobohm 1 clustering: every training file is clustered on its own and the
# table, extended with 'cluster' and 'ratio' columns, is written next to it
# as "<training_file>_cluster.csv".
training_files = glob.glob("data/train/*train.csv")

# parameters (identical for every file)
scoring_scheme = blosum50
gap_open = -11
gap_extension = -1
threshold = 0.35

for training_file in training_files:
    print(training_file)

    alm_file = training_file
    data = load_CSV_sequences(alm_file)

    # Plain list: positions, not index labels, identify the rows from here on.
    candidate_sequences = data['sequence'].tolist()

    print ("# Numner of elements:", len(candidate_sequences))

    # accepted_sequences[k] is the representative of cluster k.
    accepted_sequences = []

    # One cluster id per row, in row order. Storing them per row (instead of
    # in a dict keyed by sequence) keeps the list aligned with the table
    # even when the same sequence occurs in several rows.
    row_clusters = []

    for query in candidate_sequences:

        assigned_cluster = None

        # Compare against the representatives accepted so far. For the first
        # row there are none, so it always founds cluster 0.
        for cluster_id, database in enumerate(accepted_sequences):

            aligned_query, aligned_database, matches = smith_waterman(query, database, scoring_scheme, gap_open, gap_extension)

            alignment_length = len(aligned_query)

            homology_outcome, homology_score = homology_function(alignment_length, matches, threshold, query, database)
            if homology_score < threshold:
                print(query,database,homology_score)

            # query is not unique: it joins the cluster of the first
            # representative it is homologous to
            if homology_outcome == "discard":
                assigned_cluster = cluster_id
                break

        # query is unique: it becomes the representative of a new cluster
        if assigned_cluster is None:
            assigned_cluster = len(accepted_sequences)
            accepted_sequences.append(query)

        row_clusters.append(assigned_cluster)

    data['cluster'] = row_clusters

    # ratio = 1 / (number of rows in the row's cluster)
    data['ratio'] = 1 / data.groupby('cluster')['cluster'].transform('count')

    data.to_csv(training_file+'_cluster.csv', index=False)

data/train\HLA A_0101_train.csv
# Numner of elements: 82
IADMGHLKY ISDYDYYRY 0.30495950339585887
SSPLFNNFY VSSIFISFY 0.346102117328987
FADINGKLY HTEFEGQVY 0.32442606393466467
ATDKAAAAY LTDDMIAAY 0.33732969947120406
KSDGTGTIY VSDGGPNLY 0.3428653849206721
LSAFSLHSY LTEIDIRDY 0.3220514861749578
NSDPEFNVL YTNPQFNVY 0.26244513733273434
LSDDAVVCY LTDDMIAAY 0.23074025477521187
HLDMLRHLY RLDAFRQTY 0.29100384870818974
LADTSLSGY LTEIDIRDY 0.3266767837403092
YRSDIVGTY LTDDMIAAY 0.3050376350614249
GTEMFRHGY GTQLFEDNY 0.22731246739210853
AADKAAAAY ATAKAAAAY 0.16437923223170503
FVSVYFSDY YVDHYYRDY 0.2999309046751629
LTKQYLNLY VSSIFISFY 0.3464873554195964
LIENELMNY MIEPRTLQY 0.29782366577008534
CSNDKSLVY GSEDRDLLY 0.2606152320676305
FLDQWWTEY YVDHYYRDY 0.3153940124744733
data/train\HLA A_0201_train.csv
# Numner of elements: 944
FLYGGLLLA VILGVLLLI 0.32855434626087177
FLYGALVLA VILGVLLLI 0.28288601620876486
IILFILFFA VILGVLLLI 0.2913432103341277
RLLDDLVIV RFLEDYFGV 0.31583488404763704
LIFFVIILA VILGVL

FLKDVMESM YLPEVISTI 0.26479810543089644
FANHAFTLV FANNEFTLV 0.12504194263603574
ALLKHRFEI ALIHHNTHL 0.3311757447223854
TMLSIILVI VILGVLLLI 0.15838940386401001
LLAQFTSAI VLHSFTDAI 0.23398720334033618
NVFISPASI SMFLMTATL 0.30641081365096257
FIYSIMETI VLHSFTDAI 0.3466321304478923
VLIQRNPQL ALIHHNTHL 0.3072595732216944
FMDGKQACV FLGGTPVCL 0.259294336233882
ILPVIFLSI AITLVVISV 0.26052604820537006
QLFHLCLII HLIKIPLLI 0.33328134178702007
LLIHFLLSL VILGVLLLI 0.32448850273400265
ILGLPTQTV LLAVGATKV 0.33952150043480533
KLFYVYYNL AMFTTMYNI 0.31262844379025856
LLSEIRFYI LLTEVETYV 0.18010376077327217
FLYGALALA VILGVLLLI 0.34323387416476225
LLNMRDLIV CLALSDLLV 0.24575451772801826
FIDILLFVI FLAIKLYGV 0.337407970784922
KVIKLVKSL KIMEIVSHL 0.1985454822844258
ILAQVPFSV ITYQVPFSV 0.16616773488859293
FLSHDFTLV FANNEFTLV 0.18898653726460413
ALQNVMISI AITLVVISV 0.286565577285285
FTLNHVLAL ILYDNVVTL 0.342941006002786
YLLPAIVHI YLFPGPVYV 0.2775083611733298
SLSTKLKQV RLEARIAQL 0.27119647688039683
YLLEKSRAI RLL

RLVDFFPDI RLYDYFTRV 0.2388836786669536
YLVSSLSEI FVIGGMTGV 0.2755499940209084
VLYDEFVTI KVYDKLFPV 0.34081032335848105
KVAEIVHFL KIMEIVSHL 0.2853438118017446
FVNYDFTIV FANNEFTLV 0.21771022235403248
RLAELIGPA GLTEVFGST 0.3135841259113681
HLINKLLST QIIGYVIGT 0.3303533213599882
ELDSNLYRI RLEARIAQL 0.33645740726407103
KVAELVHFL KIMEIVSHL 0.301203254608316
KMIPLLFIL VILGVLLLI 0.3133329823199823
FMMVLPGAA FSAVISGSV 0.3401474546441028
VLAKDGTEV LLAVGATKV 0.33506487145142316
VICSFLVFL VILGVLLLI 0.27760294530227414
NIAEYIAGL HMWNFISGI 0.33810083625202414
LLALQQLEV LMTLYQIQV 0.1942788398665639
LLLEAGALV LLTEVETYV 0.3446925080821652
TLFLLFLEI AITLVVISV 0.30194906571063473
YFVAYQATV YLVAYQATT 0.12220621148913491
FLHYCNSYA YLYFCSSDV 0.33826208528678436
HIFYQLANV LMWYELSKI 0.3493248189033412
YIALCKVTV YLYFCSSDV 0.3417050759150805
LLDCIMFQS ILDSVGIEA 0.3473107146657931
TLWAIINTI AMYVAIQAV 0.3484515220386315
ALAKAAAAM ATAKAAAAV 0.1721835677801593
KLDYWSFQL SVDFYQFRV 0.2507828438673321
ILMQVPFSV ITYQVPF

ALAKAAAAL ATAKAAAAV 0.1453084022641018
SLYNTVATI YLFNAIETM 0.2978377916313093
YTVAYQATV YLVAYQATT 0.16584525641421421
FIISVISLV VILGVLLLI 0.2446000539733062
NVIGLIVIL VILGVLLLI 0.2064896099695256
ILDIAGFEI VLSIMAFIL 0.30907797666367554
FLLLTSIPI FLLIRYITT 0.32778464671777463
ITFQVPFSV ITYQVPFSV 0.026762754950225842
IMPQGEAGL ILPDKIDGL 0.3487606356109211
KLFGSLAFV DLMGYIPLV 0.3406354539625691
RLNDFLGLL SLSHYFTLV 0.33059371344564004
LLYILRYIV VLSIMAFIL 0.25984651636374345
FVFRSPFIV YLFPGPVYV 0.30828353929014896
YTDQVPFSV ITYQVPFSV 0.25807933425368335
LLFGAPVYV VILGVLLLI 0.3409991028154492
RMILYLESV RLAVYIDKV 0.19246331745736223
FVDYNFTIV FANNEFTLV 0.255819998944403
GILGFVFTL VILGVLLLI 0.3115619017850151
TLIDIWFLA SMLGIWFFT 0.15123363622985386
SLSSQLSNL RLEARIAQL 0.2911498235965616
YLCTFMIIT YLRLYIILA 0.3241619696616683
LLDLFGPEV LLTFWNPPT 0.3433581364661136
TLPELNLSL SVSDFDLRI 0.2819423912418416
FLWHVRKRV ALWEIQQVV 0.31358817380230053
SLFSLLLVI VILGVLLLI 0.22749036429820246
GLDVLTAKV GMK

In [176]:
data

Unnamed: 0,species,allele,length,cv,sequence,inequality,ic50,smm,cluster,ratio
0,human,HLA A*0101,9,1,YTDDYPMYK,=,18.8700,292.2,0,1.0
1,human,HLA A*0101,9,1,QSITRSLIY,=,94.4151,341.8,1,1.0
2,human,HLA A*0101,9,3,KSDPIMLLK,=,345.2000,1098.7,2,1.0
3,human,HLA A*0101,9,0,GTATYLPPY,=,120.0000,797.4,3,1.0
4,human,HLA A*0101,9,0,SSPLFNNFY,=,117.0000,1077.8,4,1.0
...,...,...,...,...,...,...,...,...,...,...
77,human,HLA A*0101,9,4,FVSVYFSDY,=,135.0000,1340.3,66,0.5
78,human,HLA A*0101,9,0,ISDYDYYRY,=,47.6600,180.5,71,1.0
79,human,HLA A*0101,9,1,CADGTRHTY,=,438.5000,67.7,72,1.0
80,human,HLA A*0101,9,0,LTEIDIRDY,=,65.0000,287.7,73,1.0


In [156]:
data

Unnamed: 0,species,allele,length,cv,sequence,inequality,ic50,smm,cluster,ratio
0,human,HLA B*0801,9,3,FVRTLFQQM,=,188.0,1023.5,0,0.333333
1,human,HLA B*0801,9,2,YLKKWLNSF,=,69.5,3199.7,1,1.0
2,human,HLA B*0801,9,2,RDALGRTAL,=,499.5,3602.5,0,0.333333
3,human,HLA B*0801,9,2,FPRGQGVPI,=,129.5,7094.6,2,1.0
4,human,HLA B*0801,9,2,NNKSRLVAF,=,161.0,1823.9,0,0.333333
5,human,HLA B*0801,9,2,IPRRNVATL,=,71.0,3715.6,3,0.5
6,human,HLA B*0801,9,0,QARQMVQAM,=,466.0,724.2,4,0.5
7,human,HLA B*0801,9,2,IPKRNRSIL,=,268.0,1570.8,5,1.0
8,human,HLA B*0801,9,4,LPQTRWQAV,=,188.55,7135.4,6,1.0
9,human,HLA B*0801,9,4,FLHPKHWGT,=,285.45,14099.4,3,0.5
