# Hobohm 1

## Python Imports

In [161]:
import numpy as np
from time import time
from math import sqrt
import pandas as pd
import glob
from scipy.spatial.distance import cosine
from sklearn.metrics.pairwise import cosine_similarity

## Data Imports

## DEFINE THE PATH TO YOUR COURSE DIRECTORY

In [15]:
# Root of the course data directory. Keep the trailing "/": the file paths in
# the following cell are built by plain string concatenation with this value.
data_dir = "data/"

In [16]:
# Amino-acid alphabet, in the same order as the rows/columns of the matrix file.
alphabet_file = data_dir + "Matrices/alphabet"
alphabet = np.loadtxt(alphabet_file, dtype=str)

alphabet

# Raw BLOSUM50 scores as a (transposed) integer array.
blosum_file = data_dir + "Matrices/BLOSUM50"
_blosum50 = np.loadtxt(blosum_file, dtype=int).T

# Nested lookup table: blosum50[letter_1][letter_2] -> substitution score.
blosum50 = {
    row_letter: {
        column_letter: _blosum50[row_index, column_index]
        for column_index, column_letter in enumerate(alphabet)
    }
    for row_index, row_letter in enumerate(alphabet)
}

blosum50

{'A': {'A': 5,
  'R': -2,
  'N': -1,
  'D': -2,
  'C': -1,
  'Q': -1,
  'E': -1,
  'G': 0,
  'H': -2,
  'I': -1,
  'L': -2,
  'K': -1,
  'M': -1,
  'F': -3,
  'P': -1,
  'S': 1,
  'T': 0,
  'W': -3,
  'Y': -2,
  'V': 0},
 'R': {'A': -2,
  'R': 7,
  'N': -1,
  'D': -2,
  'C': -4,
  'Q': 1,
  'E': 0,
  'G': -3,
  'H': 0,
  'I': -4,
  'L': -3,
  'K': 3,
  'M': -2,
  'F': -3,
  'P': -3,
  'S': -1,
  'T': -1,
  'W': -3,
  'Y': -1,
  'V': -3},
 'N': {'A': -1,
  'R': -1,
  'N': 7,
  'D': 2,
  'C': -2,
  'Q': 0,
  'E': 0,
  'G': 0,
  'H': 1,
  'I': -3,
  'L': -4,
  'K': 0,
  'M': -2,
  'F': -4,
  'P': -2,
  'S': 1,
  'T': 0,
  'W': -4,
  'Y': -2,
  'V': -3},
 'D': {'A': -2,
  'R': -2,
  'N': 2,
  'D': 8,
  'C': -4,
  'Q': 0,
  'E': 2,
  'G': -1,
  'H': -1,
  'I': -4,
  'L': -4,
  'K': -1,
  'M': -4,
  'F': -5,
  'P': -1,
  'S': 0,
  'T': -1,
  'W': -5,
  'Y': -3,
  'V': -4},
 'C': {'A': -1,
  'R': -4,
  'N': -2,
  'D': -4,
  'C': 13,
  'Q': -3,
  'E': -3,
  'G': -3,
  'H': -3,
  'I': -2,
  'L'

### Sequences

In [76]:
def load_CSV_sequences(alm_file):
    """Read a CSV file (path or file-like object) and return it as a DataFrame."""
    return pd.read_csv(alm_file)

In [149]:
def encode(peptides, encoding_scheme, alphabet):
    """Encode peptides as flat score vectors.

    For every residue of a peptide, the scores of that residue against each
    letter of ``alphabet`` (looked up as ``encoding_scheme[residue][letter]``)
    are concatenated; a constant 1 (bias term) is appended at the end.

    Returns a NumPy array with one row per peptide.
    """
    def encode_one(peptide):
        # residue-major order: all alphabet scores of residue 0, then residue 1, ...
        vector = [
            encoding_scheme[residue][letter]
            for residue in peptide
            for letter in alphabet
        ]
        vector.append(1)  # bias
        return vector

    return np.array([encode_one(peptide) for peptide in peptides])

In [165]:
def homology_function(alignment_length, matches, threshold, peptide1 = None, peptide2 = None):
    """Decide whether a candidate is homologous to an already accepted sequence.

    Two criteria are used:

    * If both peptides are given and have the same length, they are encoded
      with ``encode(..., blosum50, alphabet)`` and compared by cosine
      *distance* (``scipy.spatial.distance.cosine``). A distance below
      ``threshold`` means the peptides are similar -> "discard".
    * Otherwise the alignment-based cutoff ``2.9 * sqrt(alignment_length)`` is
      used: more identical matches than the cutoff -> "discard".

    Parameters
    ----------
    alignment_length : length of the alignment (used by the fallback only).
    matches : number of identical aligned positions (fallback only).
    threshold : cosine-distance cutoff (peptide criterion only).
    peptide1, peptide2 : optional sequences to compare directly.

    Returns
    -------
    (outcome, homology_score) where outcome is "discard" or "keep" and
    homology_score is the cosine distance or the sqrt cutoff, depending on
    which criterion was applied.
    """
    # `is not None` instead of `!= None` / `&`: the latter is evaluated
    # element-wise on array-like peptides and then fails in the `if`.
    same_length_peptides = (
        peptide1 is not None
        and peptide2 is not None
        and len(peptide1) == len(peptide2)
    )

    if same_length_peptides:
        encoded_1, encoded_2 = encode([peptide1, peptide2], blosum50, alphabet)
        homology_score = cosine(encoded_1, encoded_2)
        if homology_score < threshold:
            return "discard", homology_score
        return "keep", homology_score

    # Fallback: match-count cutoff that grows with the alignment length.
    homology_score = 2.9 * sqrt(alignment_length)

    if matches > homology_score:
        return "discard", homology_score
    return "keep", homology_score

## Smith-Waterman O2

### This code is identical to the code you wrote the other day

In [18]:
def smith_waterman(query, database, scoring_scheme, gap_open, gap_extension):
    """Align ``query`` against ``database`` and return the traced-back alignment.

    Runs the matrix fill, then the traceback from the best-scoring cell.
    Returns ``(aligned_query, aligned_database, matches)``.
    """
    # Only D, E and the start cell are needed for the traceback.
    (_p_matrix, _q_matrix, D_matrix, E_matrix,
     i_max, j_max, _max_score) = smith_waterman_alignment(
        query, database, scoring_scheme, gap_open, gap_extension
    )

    aligned_query, aligned_database, matches = smith_waterman_traceback(
        E_matrix, D_matrix, i_max, j_max, query, database, gap_open, gap_extension
    )
    return aligned_query, aligned_database, matches


def smith_waterman_alignment(query, database, scoring_scheme, gap_open, gap_extension):
    """Fill the Smith-Waterman matrices (affine gaps) for ``query`` vs ``database``.

    The matrices are filled from the bottom-right corner towards (0, 0), so
    cell (i, j) describes the best local alignment *starting* at query[i] /
    database[j].

    Parameters
    ----------
    query, database : sequences to align.
    scoring_scheme : nested mapping, ``scoring_scheme[a][b]`` -> substitution score.
    gap_open, gap_extension : gap scores that are *added* (pass negative values
        to penalise gaps).

    Returns
    -------
    (P_matrix, Q_matrix, D_matrix, E_matrix, i_max, j_max, max_score)

    * D_matrix: local alignment scores (floored at 0).
    * Q_matrix / P_matrix: best gap scores in the database / query.
    * E_matrix: traceback codes - 0 stop, 1 match, 2 gap open in database,
      3 gap extension in database, 4 gap open in query, 5 gap extension in query.
    * i_max, j_max, max_score: cell and value of the best score seen. If no
      cell scores above the initial -9 (e.g. an empty sequence), the position
      is (len(query), len(database)), a boundary cell whose E entry is 0.
    """
    # Matrix dimensions
    M = len(query)
    N = len(database)

    # Integer score matrices. Built-in int is used because the np.int alias
    # was removed from NumPy (1.24).
    D_matrix = np.zeros((M+1, N+1), int)
    P_matrix = np.zeros((M+1, N+1), int)
    Q_matrix = np.zeros((M+1, N+1), int)

    # Traceback directions
    E_matrix = np.zeros((M+1, N+1), dtype=object)

    # Best score seen so far and where it was found. The position defaults to
    # the bottom-right boundary cell so it is always defined.
    D_matrix_max_score = -9
    D_matrix_i_max, D_matrix_j_max = M, N

    # Main loop
    for i in range(M-1, -1, -1):
        for j in range(N-1, -1, -1):

            # Q_matrix[i,j] entry: gap in the database (step along the query)
            gap_open_database = D_matrix[i+1,j] + gap_open
            gap_extension_database = Q_matrix[i+1,j] + gap_extension
            Q_matrix[i,j] = max(gap_open_database, gap_extension_database)

            # P_matrix[i,j] entry: gap in the query (step along the database)
            gap_open_query = D_matrix[i,j+1] + gap_open
            gap_extension_query = P_matrix[i,j+1] + gap_extension
            P_matrix[i,j] = max(gap_open_query, gap_extension_query)

            # match / mismatch
            diagonal_score = D_matrix[i+1,j+1] + scoring_scheme[query[i]][database[j]]

            # Order matters: on ties max() keeps the first candidate.
            candidates = [(1, diagonal_score),
                          (2, gap_open_database),
                          (4, gap_open_query),
                          (3, gap_extension_database),
                          (5, gap_extension_query)]

            direction, max_score = max(candidates, key=lambda x: x[1])

            # Local alignment: non-positive scores reset the cell to 0 / stop.
            if max_score > 0:
                E_matrix[i,j] = direction
                D_matrix[i, j] = max_score
            else:
                E_matrix[i,j] = 0
                D_matrix[i, j] = 0

            # fetch global max score
            if max_score > D_matrix_max_score:
                D_matrix_max_score = max_score
                D_matrix_i_max = i
                D_matrix_j_max = j

    return P_matrix, Q_matrix, D_matrix, E_matrix, D_matrix_i_max, D_matrix_j_max, D_matrix_max_score


def smith_waterman_traceback(E_matrix, D_matrix, i_max, j_max, query, database, gap_open, gap_extension):
    """Walk E_matrix from (i_max, j_max) and rebuild the aligned sequences.

    The walk moves towards increasing i (query index) and j (database index)
    and stops at the first cell whose code is 0 or when either index reaches
    the end of its sequence.

    Parameters
    ----------
    E_matrix : direction codes per cell, interpreted as
        0 stop, 1 match, 2 gap open in database, 3 gap extension in database,
        4 gap open in query, 5 gap extension in query.
    D_matrix : score matrix, used only to measure the length of extended gaps.
    i_max, j_max : start cell of the walk.
    query, database : the two sequences.
    gap_open, gap_extension : gap scores used to recompute gap lengths.

    Returns
    -------
    (aligned_query, aligned_database, matches): two equally long lists of
    characters ("-" marks a gap) and the number of identical aligned positions.

    NOTE(review): presumably E_matrix / D_matrix come from
    smith_waterman_alignment; the gap-length scans below have no bounds check
    and rely on those matrices being consistent with the gap scores.
    """
    
    # Matrix dimensions
    M = len(query)
    N = len(database)
    
    # aligned query, collected as a list of characters / "-"
    aligned_query = []
    
    # aligned database, collected as a list of characters / "-"
    aligned_database = []
    
    # total identical matches
    matches = 0

        
    # start from max_i, max_j
    i, j = i_max, j_max
    while i < M and j < N:

        # NOTE: the checks below are independent `if`s (not `elif`), so after
        # one step has moved (i, j) the later checks in the same iteration
        # already inspect the new cell.

        # E[i,j] = 0, stop back tracking
        if E_matrix[i, j] == 0:
            break
        
        # E[i,j] = 1, match (aligned pair; counted only if the residues are identical)
        if E_matrix[i, j] == 1:
            aligned_query.append(query[i])
            aligned_database.append(database[j])
            if ( query[i] == database[j]):
                matches += 1
            i += 1
            j += 1
        
        
        # E[i,j] = 2, gap opening in database (single-position gap)
        if E_matrix[i, j] == 2:
            aligned_database.append("-")
            aligned_query.append(query[i])
            i += 1

            
        # E[i,j] = 3, gap extension in database
        if E_matrix[i, j] == 3:
                   
            # shortest extended gap spans two positions
            count = i + 2
            score = D_matrix[count, j] + gap_open + gap_extension
            
            # Find length of gap: advance count until the recomputed gap score
            # equals D[i,j] (squared difference below the tolerance)
            while((score - D_matrix[i, j])*(score - D_matrix[i, j]) >= 0.00001):   
                count += 1
                score = D_matrix[count, j] + gap_open + (count-i-1)*gap_extension

            # emit one gap column per skipped query position; i ends at count
            for k in range(i, count):
                aligned_database.append("-")
                aligned_query.append(query[i])
                i += 1
            
            
        # E[i,j] = 4, gap opening in query (single-position gap)
        if E_matrix[i, j] == 4:
            aligned_query.append("-")
            aligned_database.append(database[j])
            j += 1
        
        
        # E[i,j] = 5, gap extension in query
        if E_matrix[i, j] == 5:
             
            # shortest extended gap spans two positions
            count = j + 2
            score = D_matrix[i, count] + gap_open + gap_extension
            
            # Find length of gap (same scan as above, along the database axis;
            # note the tolerance here is 0.0001, not 0.00001)
            while((score - D_matrix[i, j])*(score - D_matrix[i, j]) >= 0.0001): 
                count += 1
                score = D_matrix[i, count] + gap_open + (count-j-1)*gap_extension

            # emit one gap column per skipped database position; j ends at count
            for k in range(j, count):
                aligned_query.append("-")
                aligned_database.append(database[j])
                j += 1

                
    return aligned_query, aligned_database, matches

## Hobohm 1

### Similarity Function

### This code defines the threshold for similarity

In [116]:
#def homology_function(alignment_length, matches):
#
#    homology_score = 2.9 * sqrt(alignment_length) # FIX FOR SHORT PEPTIDES
#    
#    if matches > homology_score: ## Add the inequally sign
#        return "discard", homology_score
#    else:
#        return "keep", homology_score

### Main Loop

In [178]:
# Hobohm 1 clustering of every training file.
# For each file: walk the peptides in order, compare each one against the
# cluster representatives accepted so far, and either attach it to the first
# representative it is homologous to or make it a new representative.
# The result (cluster id + weight 1/cluster size per row) is written next to
# the input as "<input file>_cluster.csv".

# sorted(): glob returns files in arbitrary order; keep runs reproducible.
training_files = sorted(glob.glob("data/train/*train.csv"))

# parameters (identical for every file)
scoring_scheme = blosum50
gap_open = -11
gap_extension = -1
threshold = 0.35  # cosine-distance cutoff used by homology_function

for training_file in training_files:
    print(training_file)

    data = load_CSV_sequences(training_file)
    # Plain list: positional access, independent of the DataFrame index.
    candidate_sequences = data['sequence'].tolist()

    print ("# Numner of elements:", len(candidate_sequences))

    # Cluster representatives; the cluster id of a representative is its
    # position in this list.
    accepted_sequences = []
    # One cluster id per input row, in row order. Kept as a list (not a dict
    # keyed by sequence) so duplicated peptides still get one entry each.
    cluster_ids = []

    for query in candidate_sequences:

        assigned_cluster = None

        for cluster_id, database in enumerate(accepted_sequences):

            aligned_query, aligned_database, matches = smith_waterman(query, database, scoring_scheme, gap_open, gap_extension)

            alignment_length = len(aligned_query)

            homology_outcome, homology_score = homology_function(alignment_length, matches, threshold, query, database)
            if homology_score < threshold:
                print(query,database,homology_score)

            # query is not unique: join the cluster of the first homolog found
            if homology_outcome == "discard":
                assigned_cluster = cluster_id
                break

        # query is unique (also covers the very first sequence)
        if assigned_cluster is None:
            assigned_cluster = len(accepted_sequences)
            accepted_sequences.append(query)

        cluster_ids.append(assigned_cluster)

    # Cluster sizes in a single pass, then weight every row by 1/size.
    cluster_sizes = {}
    for cluster_id in cluster_ids:
        cluster_sizes[cluster_id] = cluster_sizes.get(cluster_id, 0) + 1

    ratios = [1/cluster_sizes[cluster_id] for cluster_id in cluster_ids]

    data['cluster'] = cluster_ids
    data['ratio'] = ratios

    data.to_csv(training_file+'_cluster.csv', index=False)

data/train\HLA A_0101_train.csv
# Numner of elements: 82
IADMGHLKY ISDYDYYRY 0.30495950339585887
SSPLFNNFY VSSIFISFY 0.346102117328987
FADINGKLY HTEFEGQVY 0.32442606393466467
ATDKAAAAY LTDDMIAAY 0.33732969947120406
KSDGTGTIY VSDGGPNLY 0.3428653849206721
LSAFSLHSY LTEIDIRDY 0.3220514861749578
NSDPEFNVL YTNPQFNVY 0.26244513733273434
LSDDAVVCY LTDDMIAAY 0.23074025477521187
HLDMLRHLY RLDAFRQTY 0.29100384870818974
LADTSLSGY LTEIDIRDY 0.3266767837403092
YRSDIVGTY LTDDMIAAY 0.3050376350614249
GTEMFRHGY GTQLFEDNY 0.22731246739210853
AADKAAAAY ATAKAAAAY 0.16437923223170503
FVSVYFSDY YVDHYYRDY 0.2999309046751629
LTKQYLNLY VSSIFISFY 0.3464873554195964
LIENELMNY MIEPRTLQY 0.29782366577008534
CSNDKSLVY GSEDRDLLY 0.2606152320676305
FLDQWWTEY YVDHYYRDY 0.3153940124744733
data/train\HLA A_0201_train.csv
# Numner of elements: 944
FLYGGLLLA VILGVLLLI 0.32855434626087177
FLYGALVLA VILGVLLLI 0.28288601620876486
IILFILFFA VILGVLLLI 0.2913432103341277
RLLDDLVIV RFLEDYFGV 0.31583488404763704
LIFFVIILA VILGVL

FLKDVMESM YLPEVISTI 0.26479810543089644
FANHAFTLV FANNEFTLV 0.12504194263603574
ALLKHRFEI ALIHHNTHL 0.3311757447223854
TMLSIILVI VILGVLLLI 0.15838940386401001
LLAQFTSAI VLHSFTDAI 0.23398720334033618
NVFISPASI SMFLMTATL 0.30641081365096257
FIYSIMETI VLHSFTDAI 0.3466321304478923
VLIQRNPQL ALIHHNTHL 0.3072595732216944
FMDGKQACV FLGGTPVCL 0.259294336233882
ILPVIFLSI AITLVVISV 0.26052604820537006
QLFHLCLII HLIKIPLLI 0.33328134178702007
LLIHFLLSL VILGVLLLI 0.32448850273400265
ILGLPTQTV LLAVGATKV 0.33952150043480533
KLFYVYYNL AMFTTMYNI 0.31262844379025856
LLSEIRFYI LLTEVETYV 0.18010376077327217
FLYGALALA VILGVLLLI 0.34323387416476225
LLNMRDLIV CLALSDLLV 0.24575451772801826
FIDILLFVI FLAIKLYGV 0.337407970784922
KVIKLVKSL KIMEIVSHL 0.1985454822844258
ILAQVPFSV ITYQVPFSV 0.16616773488859293
FLSHDFTLV FANNEFTLV 0.18898653726460413
ALQNVMISI AITLVVISV 0.286565577285285
FTLNHVLAL ILYDNVVTL 0.342941006002786
YLLPAIVHI YLFPGPVYV 0.2775083611733298
SLSTKLKQV RLEARIAQL 0.27119647688039683
YLLEKSRAI RLL

RLVDFFPDI RLYDYFTRV 0.2388836786669536
YLVSSLSEI FVIGGMTGV 0.2755499940209084
VLYDEFVTI KVYDKLFPV 0.34081032335848105
KVAEIVHFL KIMEIVSHL 0.2853438118017446
FVNYDFTIV FANNEFTLV 0.21771022235403248
RLAELIGPA GLTEVFGST 0.3135841259113681
HLINKLLST QIIGYVIGT 0.3303533213599882
ELDSNLYRI RLEARIAQL 0.33645740726407103
KVAELVHFL KIMEIVSHL 0.301203254608316
KMIPLLFIL VILGVLLLI 0.3133329823199823
FMMVLPGAA FSAVISGSV 0.3401474546441028
VLAKDGTEV LLAVGATKV 0.33506487145142316
VICSFLVFL VILGVLLLI 0.27760294530227414
NIAEYIAGL HMWNFISGI 0.33810083625202414
LLALQQLEV LMTLYQIQV 0.1942788398665639
LLLEAGALV LLTEVETYV 0.3446925080821652
TLFLLFLEI AITLVVISV 0.30194906571063473
YFVAYQATV YLVAYQATT 0.12220621148913491
FLHYCNSYA YLYFCSSDV 0.33826208528678436
HIFYQLANV LMWYELSKI 0.3493248189033412
YIALCKVTV YLYFCSSDV 0.3417050759150805
LLDCIMFQS ILDSVGIEA 0.3473107146657931
TLWAIINTI AMYVAIQAV 0.3484515220386315
ALAKAAAAM ATAKAAAAV 0.1721835677801593
KLDYWSFQL SVDFYQFRV 0.2507828438673321
ILMQVPFSV ITYQVPF

ALAKAAAAL ATAKAAAAV 0.1453084022641018
SLYNTVATI YLFNAIETM 0.2978377916313093
YTVAYQATV YLVAYQATT 0.16584525641421421
FIISVISLV VILGVLLLI 0.2446000539733062
NVIGLIVIL VILGVLLLI 0.2064896099695256
ILDIAGFEI VLSIMAFIL 0.30907797666367554
FLLLTSIPI FLLIRYITT 0.32778464671777463
ITFQVPFSV ITYQVPFSV 0.026762754950225842
IMPQGEAGL ILPDKIDGL 0.3487606356109211
KLFGSLAFV DLMGYIPLV 0.3406354539625691
RLNDFLGLL SLSHYFTLV 0.33059371344564004
LLYILRYIV VLSIMAFIL 0.25984651636374345
FVFRSPFIV YLFPGPVYV 0.30828353929014896
YTDQVPFSV ITYQVPFSV 0.25807933425368335
LLFGAPVYV VILGVLLLI 0.3409991028154492
RMILYLESV RLAVYIDKV 0.19246331745736223
FVDYNFTIV FANNEFTLV 0.255819998944403
GILGFVFTL VILGVLLLI 0.3115619017850151
TLIDIWFLA SMLGIWFFT 0.15123363622985386
SLSSQLSNL RLEARIAQL 0.2911498235965616
YLCTFMIIT YLRLYIILA 0.3241619696616683
LLDLFGPEV LLTFWNPPT 0.3433581364661136
TLPELNLSL SVSDFDLRI 0.2819423912418416
FLWHVRKRV ALWEIQQVV 0.31358817380230053
SLFSLLLVI VILGVLLLI 0.22749036429820246
GLDVLTAKV GMK

NLAEDIMRL SMEAEMIQL 0.310999445317629
VLALYSPPL VMYAFTTPL 0.3187132634446679
ELRRQVDQL SLSSQLSNL 0.3113645160345201
LMTLYQIQV MMNITRLEV 0.26926108800148785
VISKIYTLI LISLINSLV 0.3098164865029519
SLFNTVATV SLFNTIATL 0.028511663934478126
KMFCQLAKT KMFCQLAKV 0.0519492882087208
IIMAINVFT VLTSVDIET 0.32324584192175765
FANNGFTLV FVDYNFSLV 0.2696468599099937
ALAKAAAAL ASAKAAAAV 0.218322781041428
SLFNTIATI SLFNTIATL 0.015377840227288164
ELADKVTKL SLSSQLSNL 0.2591630692969462
KIGDKFQTV KVYGRYSAV 0.3434160428354629
ALMEITSRY ALVEICTEM 0.2878774029916634
SLPPPGTRV SMPPPGTRV 0.013802830871601723
GLESIEQNL EMETLQSQL 0.2920023648171083
KLKDVLLQV YVNAILYQI 0.3492815373919633
LLALQQLEV ALALEQYGI 0.2953250486852824
IVGAETFYV MAGVEVRYI 0.34828447912137694
DLERKVESL SLSSQLSNL 0.29975606464596527
YIESKAKQL FLEQQNKIL 0.33681025129341313
SLFYTVATL AMYVAIQAV 0.34624378753766605
KIMSIGFEA KILSVFFLA 0.31637241399887506
GLRALRETL GTSVIRSNI 0.32274307198741004
SLEATFIDV SMEAEMIQL 0.24634995386936165
YLYVDKNFI FL

LLSGAGEHL FIASAPQQL 0.31636423513363365
SVSRDFTLV SLSHDFTLV 0.08686994862546049
VLSDFKSWL LILNFLDWI 0.2916995517823393
WIKDIMTST WVMDTLNGI 0.3194342188588206
VLIALSVLA IIMAINVFT 0.1621432416693599
NVISKIYTL DIINSVSII 0.30753915690378386
ELMESRMRI SLIGSKTQI 0.3117830542842025
IISTFHLSI IIAVFDSKL 0.33088125284728087
ASAKAAAAV ALAKAAAAA 0.2886174339107014
LIALSVLAV LLQLTVWGI 0.2783334693509011
RMQKEITAL KLQKDLEGL 0.19114084891848715
GLVGLVTFL GLIMVLSFL 0.18269616285871026
ALIRILQQL YIIRVTTEL 0.2830295038283399
LLALQQLEV VLFLQMMNV 0.3017716461218919
TLMNVITLV GLIMVLSFL 0.326273043610396
GIIITVGML GLIMVLSFL 0.2130657301813228
VLSDFKTWL LILNFLDWI 0.3113900413856542
IVYGRSNAI IIYSKAGNI 0.20794860362008127
SMSQELAEL PLNDNIATL 0.3205018833760549
GLILFVLAL SIMAFILGI 0.21920775240625423
YLSGANLNL YLSGANLNV 0.021602316960857904
ALKGTNESL RLRDLNQAV 0.32307233037588423
VMPEKRNVV MMAKEEELV 0.2793642369509012
FLSHNFTLV SLSHDFTLV 0.18142627822007873
VLMIKALEL VLFLQMMNV 0.24791353587512766
WLMKNMDPL LLI

DLLNVTYNI ELSRLRYNL 0.31411782660879073
ALMPLYACI VLLPFYETL 0.27367892276441796
CLTSTVQLV CIRNASKFV 0.28570426363747214
QLFKYVPSA KVEKYLPEV 0.32742262097384733
MMFDAMGAL IIYSKAGNI 0.3362893286577633
NIYSALMTL SLFYTIATI 0.34652127484433637
RVSRPTTVV HVGRPTTVV 0.12071162613995068
SVSVGTGIL NIREGTHVL 0.34486346309064586
KIDYYIPYV KVEKYLPEV 0.2629139043053512
ILHCANFNV ILHNGAYSL 0.31003883529318144
FLSNGHVTI YLSGANLNV 0.2758924481593148
TVQEFIFSA SIMAFILGI 0.33805350220479014
FLIVSLCPT LLIKTLSPA 0.31200362075024535
FMYSTVATI FLTSVINRV 0.31792070122034155
DIMTSTRTI DLTAALRDV 0.29846112253162194
ILFLTVATL SLFYTIATI 0.23115501713642894
FMGVIYIMI FVDTMSIYI 0.3432854643958956
ALEAKIAQL ILLAELEQL 0.3390214943792915
GLCNYGGIL ALCRWGLLL 0.28754101789273656
FVNYDFALV SLSHDFTLV 0.27760916881281084
QLMAEKLQL SLIGSKTQI 0.2929571428896466
NVIEDITFL NILMDSIFV 0.31121267960919874
RIVVALSSL KMAVEVGSI 0.261903832077812
AVITETIPI AAIDRQVSV 0.33956680801473405
SIMETIDPV LLIKTLSPA 0.3113730016088758
SLFNTIATI

FIAEIDHWI FTAKINEMV 0.3021141832028844
YLVAYKATV YLVAYQATI 0.030249456827262322
VICSFLVFL VLIAGIILL 0.3388684188473463
NMLSTVLGV ATLNTLITL 0.3197734975905677
KLMPICMDV KLTPLCVTL 0.18081098219450809
KVIQYLAYV KVLSIMAFI 0.21020589434730874
LTNAISSRV FSAVISGSV 0.31654935132835826
FVFDRPLPV YIYGIPLSL 0.3099778577750365
MLLNVQTLI KVLSIMAFI 0.3364246458025628
ALPHAILRL GVPHSVFIA 0.34327763151312196
YLVAYQATV YLVAYQATI 0.004770793974953169
TLVDICFWS GLVDLFVFS 0.34052900572582634
YMIKKLLKI YLISIFLHL 0.30794036757467147
VLLPFYETL ALMPLYACI 0.27367892276441796
AMAKAAAAV ASAKAAAAV 0.16870335879868104
FVDGVPFVV FVDTMSIYI 0.2957105804942962
YLPEVISTI FVKKMLPKI 0.32206368728682866
KQYGDIDLL TQSGALEVL 0.3149244590463369
YVVAYQATV YLVAYQATI 0.0295166220277171
FILGIIITV YIYGIPLSL 0.26428846130120454
FANHNFTLV IANYNFTLV 0.14084377580488783
KLLNMRDLI KVLSIMAFI 0.3077840340020509
TIAVITETI TIDAINKCV 0.3323951977943005
ATAKAAAAV ASAKAAAAV 0.0267863853541479
NIFMTLVPV GLYSSTVPV 0.2833078002217895
FTEGKINPL 

FFVFIHMVR FMRFFQLLR 0.29709929713069294
RLGVRATRK KVGVYKMHK 0.3372131505359799
LVKMINHLK ILKKLSSIK 0.28847233999852073
FLWTQSLRR YLFNQHIKK 0.2801205212931771
TTLLNETAK RQILDNAAK 0.2786715546923354
LVFNSISAR FIFSALDEK 0.23511654504497903
RVISDGYFK MVIENGILK 0.3041287851130424
YLYNKYSFK KLFKKTDFK 0.33926422711362014
TFMIITSTK ATVVIGTSK 0.3394513576947238
SSNVANYQK SSRVDRYSK 0.3053709823220786
QLLRLMADK SMMVILPDK 0.3093687657637193
MLRLFDFNK IFAFIDFSK 0.2500680624220116
QLFKPLTKK KVIQPRVEK 0.33252949255800923
TFRERYSYK TTKDYFSFK 0.3414724753352518
MLFTSTNDK LIFCHSKKK 0.3331295582577005
IVFNLPVSK VVFGILIKR 0.2577275311927891
FLRSIAMLK ILKKLSSIK 0.2854406570582235
LIYRRRLMK MVIENGILK 0.3408253591629361
QQFANVISK STYSDICSK 0.3414946090778941
KSISSIFGY KVVDTFISY 0.3498581962252728
LLLNTRQLK ILIKRRQQK 0.2746819653837833
NTTYDFLAR TTRYKYLNK 0.3113776501035993
RTQNVLGEK KAPNVISSK 0.2663673492626423
SVLNLVIVK SVLRAVLPR 0.30895679768359285
DSMDVLAEK RSLQTIASK 0.32018970989297924
KLMEEYLRR KVVDTFIS

GTLSYDNLK SILSLETVK 0.32314752519326795
ALFMYYAKR GLFVYLIRY 0.3310820474924372
LAISAVYFK WILDRLFFK 0.3424765343133179
DILSIIDAK SVMEVYDGR 0.26860770070128237
SVLEVFEGR SVMEVYDGR 0.0742860517842372
LMYALEPRK LVYIFEPEK 0.1840352755745256
KLGDQFGRK ALDGTFQRK 0.34555646254658046
SLFRAVITK SVLRAVLPR 0.18026181759149662
KVIQPRVEK ISIRPRVTK 0.33402987265574935
MVDELVTRK LTQDLVQEK 0.2875698372059663
LLSINSSFY VMHINSPFK 0.302508309781952
DSMDVLAEK KSSSILARR 0.3104428144657051
SMMVILPDK KIKLILANK 0.3072836879058155
VSMMSMYGK ISLNSMYTR 0.27430332144921754
LVKMINHLK MATMLEYVR 0.3055313710602585
KSISSMTIR KSISKSNAK 0.3457514327570419
RSIAMLKSK KSYSLIRPK 0.27131645603035437
ATMLEYVRY VTMMKYCSY 0.28968636888663435
VTSSGAIYK VTSSGTIYK 0.02610375905370943
RINEEKHEK KLDDVEKEK 0.3193994550753042
AIILHQQQK LIFCHSKKK 0.3456011780051027
ITFFQEVPH VTMFEALPH 0.1605566050547016
LLIWAYLSK IVIWGKTPK 0.29341458907372175
VVNYDNSTK ALNFPGSQK 0.3210418133131445
RVLFSIFYK RVYINVVVK 0.3181411559295474
YLFNQHIKK SLYDEH

LFKNVRLLK VIEDITFLR 0.25452129850064187
DVLKTRLFR MTMRRRLFK 0.3298501079926437
KSINKVYGR KSISSIFGY 0.20453829865718676
KLQARNIQK PISASDMQK 0.29687057221195023
DLLNSMMNR RFVEELLHR 0.32317799590906693
VTFQGKFKK AALDGTFQR 0.28345268201232743
RVFNNYMPY RLFYTFFSY 0.33936895629863173
KSLLLLNTR KALMQLTTK 0.26669105254651704
QIFEVYWYL RVYEALYYV 0.2825253207343793
IVLFQRFLR VALYRRIQR 0.2919199926673227
IINAHRIPK LMQGSTLPR 0.3115209478796016
TNFESFTVK VSLSAYIIR 0.34041343474668007
SSIKSKSRR NSISARALK 0.32468320983376886
LLGPGRPYK LLGPGRPYR 0.012032469556099046
LMSIISTFH MTKILEPFR 0.3235845946730965
ISIRPRVTK VALYRRIQR 0.335741816889193
VFKDSFLRK MVSDTIMKR 0.26677977006060927
QTNFKSLLR KSMLKELIK 0.31104412153587324
IITPVVFYR VIEDITFLR 0.2496501234474029
STELIRRVR RTKLMSNIK 0.2862599548314585
RIAQGVLQR RFVEELLHR 0.3030502148922092
SFYVNRGFK RFYITTRYK 0.30055540970012773
ALKKLIIDR KLPRMFLPK 0.3223547860029329
LIKFISDNK ILQLIRHGR 0.29735830296142984
LVKSYSLIR FMRFFQLLR 0.31783309160030293
ATFSVPMEK 

VVFGILIKR IVFNLPVSK 0.2577275311927891
SVLEVFEGR GVVRVWDVK 0.3451681253853547
DIVNNFITK PLFNNFYKR 0.3028992435765875
TTLLNETAK YTLFQQTGR 0.33871029402000385
RFVKFNDYR EILKINSVK 0.33176545436644334
FLWTQSLRR YLFNQHIKK 0.2801205212931771
DISKLTNFK EILKINSVK 0.3185856076095628
LSARNKLFK VTSSGAIYK 0.30907679596796644
SILDRIDTR TTVNTLSER 0.3460577843622368
LSIVVDINK ISRVNDLNR 0.31021332776184973
LVFNSISAR TTVNTLSER 0.3295418542321753
ETITEKTFK RTFNEDLFR 0.3107056679103951
DLIVTFRER DTIVSRSSR 0.34221490759579176
DLIAMENLK EILKINSVK 0.24104869488790293
NTMCTEETK DTIVSRSSR 0.34943328675667906
NVAVIDKAK DVDIYDAVR 0.32213095747115084
TLYVKALTK AVFIHNFKR 0.3194165505867298
TIQRFSSLR EILKINSVK 0.3414538864415161
VTSSGTIYK VTSSGAIYK 0.02610375905370943
IAEYIAGLK IAPGIADIR 0.31933988901831756
NQVKFYFNK SNIQFNISK 0.32742786131283497
KVMFVIRFK EIIFLKLFK 0.3131430990930859
ETFKIDAVR EILKINSVK 0.19020830510151587
NLTDTNFKK TISKDNLER 0.32608512847869797
GIFKNNDVR SIYSRPKIK 0.3410323639868157
QMLTSGEYK EL

EIMDKEQLL EVIERINLL 0.27483348857167555
IANYNFTLV FANHAFTLV 0.2018104107677433
SVDVDIYDA TIAVSVYGA 0.23720189358735044
DLSDQIAEL ELQAQIAEL 0.17571710994870882
FANSKFTLV FANHAFTLV 0.1440730644817061
DISSFYWSL NISGYNFSL 0.31343119371981487
KVIKLVKSL EVIERINLL 0.34249761303293547
SVPLPCQLM EVRIPVDLV 0.3297578698561653
SISEINEWL TIAHINTLI 0.31308327197347574
LTFDVFRPL STFNMWREI 0.32969848982957195
AAAKAAAAV ATAKAAAAV 0.03801045110792767
VLFLQMMNV MMLVPLITV 0.2985975025544194
YLVAYQAKV YVVAYQATV 0.0889127479851689
TIDAINKCV NVSRVVECL 0.33825995594823655
SAVFKDSFL DAMIHKTYI 0.342888001153849
NLAEDIMRL NIYSALMTL 0.3456471378695085
ETVKMGAFM NSFELGVWV 0.3358026755385348
ELADKVTKL EVIERINLL 0.3308289591556761
MIGNYFSGV NIAEYIAGL 0.31882391333007776
EVKTLSSYI DLKHATDYI 0.3108203960824365
ELQAALARV ELQAQIAEL 0.16654967328324888
NVFISPASI NTFVNFNSV 0.3318286368828406
DVSRPTTVV DVSRPTTVL 0.021225671520298306
LIALSVLAV VAAVIIMAI 0.2806956944803908
QIFNIISYI GLFDFVNFV 0.22999229599598348
VISVIFYFI VA

QLMAEKLQL AVITETIPI 0.3435473994804855
FIISVISLV WLGAAITLV 0.3323893554339693
YTDQVPFSV ITDQVPFSV 0.08857702303454584
KLVSISNFI IILAISALL 0.3226305843808822
SIFLHLVKI RIFYNILEI 0.2307571008881989
YLPLSVFII TILLGIFFL 0.3428914761770583
LLNATDIAV ILDSVGIEA 0.3158236275286842
YIFRNTINM RIFYNILEI 0.33375636502181827
FTDQVPFSV ITDQVPFSV 0.06066078253883411
KLQVELDNV KMAVEVGSI 0.21248418141528935
FLGGTTVCL YLSGANLNL 0.3061301007572672
FLLTRILTI NLLDRLLLI 0.3194119648989897
GLADAFILL NLLDRLLLI 0.25529990592250684
FLYGALVLA VLYPVIFIT 0.3217856555680534
FANHAFTLV FCSDALTLI 0.2684761083044015
LVMDKNHAI IIMEEGNSI 0.21845385158915254
ILDQVPFSV ITDQVPFSV 0.0623108971233205
TLWAIINTI KLYTIVSTL 0.22085880220770238
VVFQTSATI IILAISALL 0.3007648456555153
FLDWIKDIM YLTAIQDFI 0.3355609292211368
QLPLESDAV KMAVEVGSI 0.3477105697588586
NLAEDIMRL ALPHAILRL 0.28631852564144367
TMFKIVYSL SLYKGVYEL 0.2621336822971513
SIFVSTMPV RIFYNILEI 0.3297939927364002
YILYIVFCI YLLAVCGCI 0.3143503525498843
LLYMTSATI FLLLADA

YIVGANIET YLSGANLNL 0.24894184513665296
VLYDEFVTI IMIGHLVGV 0.3403371377276502
YVIKVSARV IILAISALL 0.336902178330156
YVLLHLLVV TILLGIFFL 0.3289245798391043
ILPVIFLSI VLTILYYGA 0.2994892113669264
LLSLFSTLV IILAISALL 0.34327197932503206
LLFGYPVYV YLVSFGVWI 0.3444705072763161
SMDTLLFFL SLDSLVHLL 0.22352261333943102
TIAVITETI ALMAITKNV 0.2869237048417216
GLLDSIKMI ACMDGFEVV 0.33156598149636907
VILGVLLLI VLYPVIFIT 0.2882520634313559
LMMSCISCI YLLAVCGCI 0.3385415549568418
KMFCQLAKT KMFCQLAKV 0.0519492882087208
TSLGLLYTV NCLSLLLSV 0.31084598633322724
FLYGALALA FLYGGLLLA 0.12115657256558754
FLSLGLVSL ILSLETVKM 0.29126038794500775
LMDMITLSL ITDQVPFSV 0.31564969674345833
YILRGLLEA FLYGGLLLA 0.327511999001702
GLVGLVTFL SIYECITFL 0.3450836893441205
IILLILSCI LMAVVLASL 0.2752698149425382
RLFSYNFTT RLFDFNKQA 0.3321399076132314
FLSHNFTLV FCSDALTLI 0.30811373308123957
FIQNIDFKA ILDSVGIEA 0.3223378302737273
YIDAYVSRL YMNGTMSQV 0.32270222108325786
FTWYGIAAL LMWYELSKI 0.27226119026152007
KIFEYGFTF ELLEMK

YVMTMILFL TILLGIFFL 0.34029576175462695
AMFTAALNI AVITETIPI 0.3273280141813384
MMNITRLEV ILSLETVKM 0.31523350532840444
AVIGALLAV NCLSLLLSV 0.32657021286725363
FVNYNFTLV FVNYDFTIV 0.04124311310663353
ILKKIIPTL LMAVVLASL 0.33242741150179944
LTYSQLMTL FMYSTVATI 0.3305837107407592
TLYDFDYYI SLENFRAYV 0.33810966746023985
AILHNIYRL ALPHAILRL 0.2881734577486048
SMSQELAEL SLSTKLKQV 0.2515343057722659
QLLQANPIL RLYQASAVM 0.2533627283693689
SLYADSPSV HLMSDNPKA 0.30142244216471703
RLMELPVKT KIMSIGFEA 0.31145398860183804
IMIGVLVGV LMAVVLASL 0.3411422729328891
QIFNIISYI KVIQYLAYV 0.26832520208842214
ALVGLFVLL TLIDIWFLA 0.31802712851243253
ITATFTAPL LLAQFTSAI 0.2969300991838968
YLVAYQKTV YLVAYQATT 0.13544926838154003
LLPSLFLLL ILFIMFMLI 0.31660305335535543
ILAIIFLVL VLTILYYGA 0.34021141305385727
SVVIYDFLV KLFTHDIML 0.34599129453691213
LLFILFYFA VLYPVIFIT 0.3468782222931517
RMILYLESV KMMLFYMDL 0.32856561717116484
CLTSTVQLV SLDSLVHLL 0.3442706082070882
FLHKRFTLV FVNYDFTIV 0.271278946669002
LLMMTLPSI L

SLNRNFTLV FANHDFTLV 0.321574437608918
SIVCIVAAV GLILFVLAL 0.33446147351024746
LISSDGARV ILAADLEKL 0.3165300094856984
TLVDLCFWS GLVDLFVFS 0.3324016392815149
YLVAYKATV YLVAKQATV 0.17254107638856642
YLVAYQAKV YIIRVTTEL 0.34991130443455776
GLFDFVNFV GLVDLFVFS 0.3482781584096222
TLQSFRQDV GLRALRETL 0.3149388029325786
MIGNYFSGV ILAGYGAGV 0.3274179552071351
VLIAGIILL VLVGGVLAA 0.21464614441217156
ILSDENYLL VLQQNNSFI 0.3342908382074582
EMETLQSQL MVEYLENQL 0.33781446867613596
VLPFDIKYI ILAADLEKL 0.33720670184849455
FVSCDFTIV FANHDFTLV 0.25286961752100523
FMYTKHSML FLFMDRDAL 0.3401716383112856
YNYSLSAAV FSWTITDAV 0.2881100655804488
WVMDTLNGI WLMKNMDPL 0.23390239058055728
FLLLADARV YLVAKQATV 0.33654293425616144
TICLKNEGV ALALEQYGI 0.33170146925022026
ILATLNTLI IMTSYQYLI 0.30683805205567605
IVTFINDYA ITTLLNETA 0.2616029018007274
SLFYTVATL SLFNTAATL 0.22611183947155833
VLFLQMMNV SIFVSTMPV 0.34345097204089803
ILVGYMSNL ILAGYGAGV 0.2884128816492494
TLILSNKLL SAVFKDSFL 0.30975547810938764
LSAEELMSL FT

YMIKKLLKI YLLMHLVSL 0.2799082470474209
KLLYAAEMV KMIYDLNAV 0.3193566125752929
ATYGIIVPV ATLNTLITL 0.3454956355687264
RVYEALYYV QIFNIISYI 0.30805394242462125
SLEGDLEDL EMKTQLEEL 0.26838771139771733
ALMDLLMFS GLVDLFVFS 0.14760832735449358
ATAKAAAAV AIAKAAAAV 0.11193261579663638
KLKDVLLQV KIEDLINQL 0.23459487919861344
HILHAYCGI GLIYTYSGL 0.3315165466810047
KLMPICMDV KLTPLCVTL 0.18081098219450809
SLSHDFTLV FANHDFTLV 0.24652884874719705
SISSVLTIL ELDNVTGLL 0.30967496267834704
TVYPKTHYV STYPGNTFV 0.34464728337803263
YIVAYQATV YLVAKQATV 0.1622147386569478
FMYSTAATI SLFNTAATL 0.30678966809645647
GLYSSTVPV SLFNTAATL 0.33694834555675346
LVSDCASTI FTDTCGASI 0.30678858605382775
EVNDTHYTV QLSNNKYVL 0.31309616646284355
SMMSMYGKA GLTEVFGST 0.3393042479222932
LMTAISQGI ILAGYGAGV 0.3236887250478293
VLSDFKTWL SISEINEWL 0.2680550637625999
TTNNLLEQL KIEDLINQL 0.2620044167849468
FANSKFTLV FANHDFTLV 0.14360236747590094
ALKGTNESL SLAAAKKQL 0.34381168687449803
ELCAEAEEL EMKTQLEEL 0.3441628064670974
IVGAETFYV 

FANHKFTLV FVNEKYCII 0.2429988499462734
RLFTKVKPL RVFKKIMSI 0.2694878001625508
RIITILQDI DLTAALRDV 0.29696277046082853
YLKAYQATV YLVAYQATI 0.16491612221690466
LLFNEKLKV VVFQTSATI 0.34074379627251883
LVSDCASTI LLNETAKVI 0.29748368586862584
SVSVGTGIL RIKIAPGIA 0.31261995972267675
FTNSQIFNI ILDDNLYKV 0.33758252022114843
LTYSQLMTL RVFKKIMSI 0.3352045007916614
KINEMVDEL SIMETIDPV 0.33165283276314517
FLSHDFTLV FVNEKYCII 0.285919972621946
LLNESNIFL LLNETAKVI 0.31581347930895187
QIFEVYWYL RTFHIFYYL 0.2548362484006699
RLRSSVPGV RLRQDTEDI 0.30254988458974963
ILPDKIDGL LLTDTIESA 0.27814532989771423
SLFYTIATI NVFISPASI 0.33471382430313623
ALEAKIAQL AMNREVSSL 0.28191538565795426
LLTFWNPPV LLTFWNPPT 0.041525999652305745
SMFSTVATI TLMNVITLV 0.32426630748350305
MMNERDVSV LMTLDDLAI 0.32956072423602356
IISYIILFI ILSCIFAFI 0.2861301993380222
ALAPSTMKI NLAEDIMRL 0.3092907140207757
FMYTKHSML FLWGPRALV 0.34692477871123417
RVSRPTTVV HVSRPTTVV 0.07189499910109454
GLRQQLEDI DLTAALRDV 0.30683718509202196
VLTDFKT

RIITILQDI KVIKLVKSL 0.18807312905875517
YLVAYQATT KLVAYQATV 0.2232028415390247
NVIGLIVIL DLMGYIPLV 0.282847801335647
AVITETIPI FVFDRPLPV 0.34901364956115566
SLQEEIAFL SMSQELAEL 0.3028630079501411
NIEVKLFIV HVDGKILFV 0.3168226441050095
NMLSTVLGV SLFNTVATV 0.30251722872444875
ILISLINSL KVIKLVKSL 0.26078671124451946
AVIIMAINV ITVLTSVDI 0.28231837493403333
NIYSALMTL SLFNTVATV 0.2669937952765883
FANHKFTLV FVNRRFTLV 0.12836451444220387
FIISTLNKI WVMDTLNGI 0.24869195953552803
SVIFYFISI STSFYLISI 0.2112425645616568
ILDDNLYKV LVKSGLTEV 0.3232830315976891
FMYTKHSML YLYVDKNFI 0.3227245214150619
FSLGAAVKA YIVGANIET 0.3015501679918823
FSWTITDAV YNYSLSAAV 0.2881100655804488
ILDSVGIEA LMDCIIFES 0.3431719979527149
GLFDFVNFV SLFNTVATV 0.3328661172692605
MILVPLITV MMLVPLITV 0.027533348988252437
FMYSTAATI SLFNTVATV 0.34935243743572186
GLLGNVSTV SLFNTVATV 0.24531204809700358
ITLILSNKL ITVLTSVDI 0.33638658876570904
YVILKDPRI YLVTRHADV 0.3442477291812489
FAIEALAKA LLIKTLSPA 0.3149848662532342
ALAKAAAAL ALAK

KQMYRKFSR RQTVSRFKK 0.3284528177724242
YLTMKAIEK YLLVKWIRK 0.31661488456974185
GVSENIFLK KINSNFLLK 0.2707677306952304
KLFAAETLK KLITPNYMK 0.34723712515288874
ELYDTSPTK AIFQSSATK 0.3043161667005636
MLIYSMWGK VSMMSMYGK 0.33092463105524816
VALSSLVSK ASLPTTIAK 0.32406775283953926
GLCAHILLY RVCEKMALY 0.3178668567882065
YLLVKWYRK FLLRHYYNK 0.3424474372644112
RMFLAMITY RLFYTFFSY 0.25422113974411165
VVLASLIYR RVLFSIFYK 0.33390647593542433
KLDDVEKEK KTNDINVRR 0.32916717270654783
VTSSGTIYK FLKSGAVVK 0.3339002086037084
FLKENKLNK YIRRNMINK 0.22403467049431514
MFLTSVINR SFIISTLNK 0.2799205223523564
RVAVNKSNK RLGVRATRK 0.3075369628162037
ILKKLSSIK ALERLLSLK 0.2617265360230594
FIFSALDEK LVFNSISAR 0.23511654504497903
TTYDFLARK DSMDVLAEK 0.29037307546813873
HLIFCHSKK QLCYCPASK 0.3465515823221694
LIKFISDNK LITLILSNK 0.2634066324657308
LVYIFEPEK QLFTFSPRR 0.32061588021766496
QLFKPLTKK KVVNPLFEK 0.2933662761324932
SSRVDRYSK RQTVSRFKK 0.3245187865054313
MLFTSTNDK LVFNSISAR 0.3007895691281143
MVDELVTRK ALER

AIDRQVSVK RVNKGTGVK 0.34104123658924435
ALERLLSLK TIERIFNAK 0.21344533606539484
KSISSMTIR KAFNHASVK 0.3480861590819948
MVSDTIMKR VMANNVKKK 0.33533496671616425
VSLKKTNDK LSVETITEK 0.31839533734671055
SSLRREHIK MSLQRQFLR 0.34085964873911634
STLNFNNLR QTVDFTDCR 0.29402814788058607
GLCAHILLY ELCGAFLFY 0.2970987762516131
ALFMYYAKR GLFVYLIRY 0.3310820474924372
DLVKSSFVK GIFQSSMTK 0.3180861264669166
DIFVSLVKK RVYINVVVK 0.28527655567660626
PSIFLIITK GTMYILLKK 0.2558744832500136
LFDKDTFFK IMDKEQLLK 0.23983636688432874
SQIFNIISY ATSLDVINY 0.32045291271587173
ILNFLDWIK TFQLLNMIK 0.33830080493831316
SLYDEHIKK YLFNQHIKK 0.19466657924287367
RTFNEDLFR KMLTFDVFR 0.32733801401055707
EIIFLKLFK KVMFVIRFK 0.3131430990930859
THLGPQFCK TYLGPLNCK 0.3285889147453642
KSMREEYRK KSIENKHQR 0.29250095706717893
ISLNSMYTR LSVETITEK 0.3409089350767984
FSTSAADIK FTSTNDKIK 0.28357375027688847
TYLGPQFCK TYLGPLNCK 0.2684896542597096
PISASDMQK TISKDNLER 0.293111213857736
SSKGLACYR RTGDIGCFK 0.3043188107239597
SVTKSSSWK PV

IMPKAGLLI IMPKTGFLI 0.06253301989742965
AWIDNYNKF AQIDNYNKF 0.16324350824779155
NYARTEDFF PYKRIEELL 0.3365340344437733
AYSLTLQGL VYDFAFRDL 0.31766569007762946
YFILVNLLI EYVLLLFLL 0.338414047588141
VMPKTGLLI IMPKTGFLI 0.04645269853590683
VYMIMVKCW LYNLLIRCL 0.28027843662609886
EYLQLVFGI EYVLLLFLL 0.29434895559241137
TYLPTNASL AYINADSSI 0.25583306219255453
IFALISFLL EYVLLLFLL 0.32737500452668167
TYGIIVPVL LYNLLIRCL 0.3328963204807768
LYGPDAPTI LYGPDTPII 0.0792895899820476
TWKPTIFLL NWKPIVQFL 0.2705329783172654
data/train\HLA_A_2403_train.csv
# Numner of elements: 23
data/train\HLA_A_2601_train.csv
# Numner of elements: 42
YVIKVSARV NVIKVSARV 0.14239454644710337
EIIELTRTL NVIKVSARV 0.2902438280437192
EVIKVSARV NVIKVSARV 0.04450423551807603
SSFFMNRFY SSPLFNNFY 0.34377337183812273
YVIKVNARV NVIKVSARV 0.16810137918725032
YVIKVIARV NVIKVSARV 0.2602085451739353
YVIKVEARV NVIKVSARV 0.18914150521522544
YVIKVSSRV NVIKVSARV 0.1643195301484175
YVIKVSFRV NVIKVSARV 0.25704281405811824
YLKKWLNSF YVRGY

ILSKIPYLR VIEDITFLR 0.21675845441592845
IASKINNNR LTDTIESAK 0.322338408279904
SAICSVVRR KAMLYIIRR 0.3220345194125266
RFVKFNDYR STLNFNNLR 0.32690364170012665
SILDRIDTR ALYRRIQRR 0.3207561103973239
QVKDNIISR MVSDTIMKR 0.25773903260291575
DLLNSMMNR NAISSRVDR 0.3405745751189122
LMQGSTLPR VLRENTSPK 0.3437412456783202
DVKASMLEK PIRVCLLPR 0.34236305690417046
ELRRAAIDR TISKDNLER 0.34211892670638533
DSMDVLAEK TTVNTLSER 0.28245504630024165
ISIIVLFQR VTFLLLCGR 0.3336844302666212
LVKMINHLK MATMLEYVR 0.3055313710602585
HINTLIQYR DVSLIIEYK 0.24585567323687219
VALYRRIQR IVLFQRFLR 0.2919199926673227
VLCVKKFYK IVLFQRFLR 0.30574359173122534
IFAFIDFSK FFVFIHMVR 0.33268030069384413
TLISSDGAR EFINTGSSK 0.3343871438038525
AVNHYFKTR EINEWLSSK 0.3389296409084833
SYLNVSDFR STLNFNNLR 0.27924047823611053
FMRFFQLLR ILEYLYIMR 0.29932786549514645
NSISARALK NAISSRVDR 0.3286702426433278
ISLNSMYTR ISRDELWAR 0.32708344033631576
TFRERYSYK DFRDYQSYR 0.3235382943068016
MLQKEYMER TISKDNLER 0.33524264146153915
VFKAMETFK IYK

SVPLPCQLM EVRIPVDLV 0.3297578698561653
ILFIMFMLI IMFMLIFNV 0.2978467315806703
YLVKYQATV YTVAYQATV 0.15579092064171984
MLLNVQTLI TLMNVITLV 0.25188186352593733
RVYEALYYV NIYSALMTL 0.3306500150975198
EVLRPTTLV EIMDKEQLL 0.3356075509393387
SIVCIVAAV TLMNVITLV 0.3333077386078849
DISSFYWSL ELSRLRYNL 0.3473750795951438
ELQAQIAEL ELADKVTKL 0.25058301498128766
ELANEVKVL EIMDKEQLL 0.32726942041541596
FANYNFTLV FANCNFTLV 0.17607143816108084
DVSRPTTVV DVSRPTTVL 0.021225671520298306
IVVALSSLV LMTAISQGI 0.34505820213987126
RVSRPTTVV DVSRPTTVL 0.11424504955488413
LTYSQLMTL NIYSALMTL 0.34102644023013984
ALAKAAAAL AAAKAAAAV 0.15556884578799823
MMFDAMGAL LMYDIINSV 0.2406012473911342
IILFILFFA IMFMLIFNV 0.3446700714529841
FANHNFTLV FANCNFTLV 0.18558319773513154
EVIERINLL KVAELVHFL 0.3252913387975631
STVDVRNIV STIANSNII 0.31241021487615084
TLNFPISPI NVTYNIKPV 0.2696737068358491
ELLRPTTVV DVSRPTTVL 0.19860184145883086
LISLINSLV MSDIFHALV 0.34203869267918585
DVSRPTAVV DVSRPTTVL 0.04383948498806456
TLFIDRGSI

LPTLFGRGV FPGLYGASI 0.2463195732545832
MPLETQLAI LPTNASLSF 0.31048177095209106
LPYPQPQPF LPYSQPQPF 0.06214896478888832
data/train\HLA_B_0801_train.csv
# Numner of elements: 16
FLKDVMESM FVRTLFQQM 0.2670519640641188
FVRQCFNPM FVRTLFQQM 0.2929993152665237


In [176]:
data

Unnamed: 0,species,allele,length,cv,sequence,inequality,ic50,smm,cluster,ratio
0,human,HLA A*0101,9,1,YTDDYPMYK,=,18.8700,292.2,0,1.0
1,human,HLA A*0101,9,1,QSITRSLIY,=,94.4151,341.8,1,1.0
2,human,HLA A*0101,9,3,KSDPIMLLK,=,345.2000,1098.7,2,1.0
3,human,HLA A*0101,9,0,GTATYLPPY,=,120.0000,797.4,3,1.0
4,human,HLA A*0101,9,0,SSPLFNNFY,=,117.0000,1077.8,4,1.0
...,...,...,...,...,...,...,...,...,...,...
77,human,HLA A*0101,9,4,FVSVYFSDY,=,135.0000,1340.3,66,0.5
78,human,HLA A*0101,9,0,ISDYDYYRY,=,47.6600,180.5,71,1.0
79,human,HLA A*0101,9,1,CADGTRHTY,=,438.5000,67.7,72,1.0
80,human,HLA A*0101,9,0,LTEIDIRDY,=,65.0000,287.7,73,1.0


In [156]:
data

Unnamed: 0,species,allele,length,cv,sequence,inequality,ic50,smm,cluster,ratio
0,human,HLA B*0801,9,3,FVRTLFQQM,=,188.0,1023.5,0,0.333333
1,human,HLA B*0801,9,2,YLKKWLNSF,=,69.5,3199.7,1,1.0
2,human,HLA B*0801,9,2,RDALGRTAL,=,499.5,3602.5,0,0.333333
3,human,HLA B*0801,9,2,FPRGQGVPI,=,129.5,7094.6,2,1.0
4,human,HLA B*0801,9,2,NNKSRLVAF,=,161.0,1823.9,0,0.333333
5,human,HLA B*0801,9,2,IPRRNVATL,=,71.0,3715.6,3,0.5
6,human,HLA B*0801,9,0,QARQMVQAM,=,466.0,724.2,4,0.5
7,human,HLA B*0801,9,2,IPKRNRSIL,=,268.0,1570.8,5,1.0
8,human,HLA B*0801,9,4,LPQTRWQAV,=,188.55,7135.4,6,1.0
9,human,HLA B*0801,9,4,FLHPKHWGT,=,285.45,14099.4,3,0.5
