In [1]:
# Prior to this, peptides have already been filtered by library design in Step 0
# This notebook will take AS-MS data that has been filtered by library design and rigorously remove the sequence isomers

In [1]:
import os, difflib
import pandas as pd
import numpy as np

In [2]:
# file_list = [],
# for file in os.listdir(os.getcwd()):
#     if file.endswith('.csv'):
#         file_list.append(file)

In [3]:
file = 'Example Raw Data.csv'
df = pd.read_csv(file)

In [4]:
df

Unnamed: 0,Notebook,Scan,Peptide,ALC (%),m/z,z,Obs Parent Mass,RT,Mass,ppm
0,284-126,3203,NNGGGLGSGQASK,89,573.2864,2,1144.5582,22.55,1144.5586,-0.3
1,268-020,5770,TGSAADVADYSAK,96,627.7993,2,1253.5840,27.06,1253.5889,-3.8
2,268-186,318,GSASGLSFKGSHK,99,421.2267,3,1260.6582,9.91,1260.6575,0.7
3,256-092,10187,GMASDMADYAAGK,99,643.7775,2,1285.5404,43.45,1285.5432,-2.2
4,268-175,4195,GLASGLSFKGSHK,99,429.9094,3,1286.7063,25.99,1286.7095,-2.5
...,...,...,...,...,...,...,...,...,...,...
4262,284-079,12732,WYTHMMFPWMWFK,95,630.6179,3,1888.8318,74.98,1888.8403,-4.4
4263,261-179,26317,LYYYMTPWWPYWK,95,948.4512,2,1894.8878,87.87,1894.8906,-1.5
4264,228-002,38310,YLYYMTPWPWYWK,98,956.4479,2,1910.8812,96.49,1910.8855,-2.2
4265,271-146,5783,WRNKFWLQWHWYK,97,495.0081,4,1976.0032,48.36,1976.0110,-3.9


In [5]:
# Filter out based on ALC, ppm, and Uniqueness create a new dataframe
ALC_Cutoff = 85 # inclusive of this number and higher
ppm_error_cutoff = 10 # exclusive of this number, i.e., if 10, then 9.9 ppm is kept

# ALC, which is sequencing confidence, This was previously filtered, and is filtered again here
ALC_list = df['ALC (%)'].to_list()
Good_ALC_list = []
for i,ALC in enumerate(ALC_list):
    if ALC > ALC_Cutoff-1: Good_ALC_list.append(i)
df_ALC = df.iloc[Good_ALC_list,:]

# ppm error
ppm_list = df_ALC['ppm'].to_list()
Good_ppm_list = []
for i,ppm in enumerate(ppm_list):
    if ppm > -ppm_error_cutoff and ppm < ppm_error_cutoff: Good_ppm_list.append(i)
df = df_ALC.iloc[Good_ppm_list,:]

# # Use this if it is needed
# # Remove nonunique peptides
# Unique_list = df_ppm['Uniqueness'].to_list()
# Good_unique_list = []
# for i,uniq in enumerate(Unique_list):
#     if uniq == 'UNIQUE': Good_unique_list.append(i)
# df = df_ppm.iloc[Good_unique_list,:] # overwrites earlier dataframe

# Calculate and sort by observed parent mass, OPM
m_z = df['m/z'].to_list()
zs = df['z'].to_list()
OPM = []
for i,mz in enumerate(m_z):
    OPM.append(mz*zs[i]-zs[i]*1.0073)
df['Obs Parent Mass'] = OPM
df = df.sort_values(by=['Obs Parent Mass']) # technically unnecessary, but useful to allow manual inspection

df.reset_index(drop=True,inplace=True) # required such that the indices of lists and dataframes match

# Clean up peptides for comparison, remove KU if it exists, remove ' 's that sometime exist
peptides_start = df['Peptide'].tolist()
peptides = [pep.replace('KU', 'K') for pep in peptides_start]
betterpeptides = [string for string in peptides if string != '']
df['Peptide'] = betterpeptides
print(f'ALC, ppm, and Unique filtering completed, removed {len(ALC_list)-len(peptides_start)} peptides')

ALC, ppm, and Unique filtering completed, removed 66 peptides


In [6]:
df

Unnamed: 0,Notebook,Scan,Peptide,ALC (%),m/z,z,Obs Parent Mass,RT,Mass,ppm
0,284-126,3203,NNGGGLGSGQASK,89,573.2864,2,1144.5582,22.55,1144.5586,-0.3
1,268-020,5770,TGSAADVADYSAK,96,627.7993,2,1253.5840,27.06,1253.5889,-3.8
2,268-186,318,GSASGLSFKGSHK,99,421.2267,3,1260.6582,9.91,1260.6575,0.7
3,256-092,10187,GMASDMADYAAGK,99,643.7775,2,1285.5404,43.45,1285.5432,-2.2
4,268-175,4195,GLASGLSFKGSHK,99,429.9094,3,1286.7063,25.99,1286.7095,-2.5
...,...,...,...,...,...,...,...,...,...,...
4196,284-079,12732,WYTHMMFPWMWFK,95,630.6179,3,1888.8318,74.98,1888.8403,-4.4
4197,261-179,26317,LYYYMTPWWPYWK,95,948.4512,2,1894.8878,87.87,1894.8906,-1.5
4198,228-002,38310,YLYYMTPWPWYWK,98,956.4479,2,1910.8812,96.49,1910.8855,-2.2
4199,271-146,5783,WRNKFWLQWHWYK,97,495.0081,4,1976.0032,48.36,1976.0110,-3.9


In [7]:
# Step 1, remove peptide duplicates
df = df.drop_duplicates(subset=['Peptide'])
df.reset_index(drop=True,inplace=True)
print(f'Step 1 completed, removed {len(peptides_start)-len(df.index)} duplicates')

Step 1 completed, removed 0 duplicates


In [8]:
df

Unnamed: 0,Notebook,Scan,Peptide,ALC (%),m/z,z,Obs Parent Mass,RT,Mass,ppm
0,284-126,3203,NNGGGLGSGQASK,89,573.2864,2,1144.5582,22.55,1144.5586,-0.3
1,268-020,5770,TGSAADVADYSAK,96,627.7993,2,1253.5840,27.06,1253.5889,-3.8
2,268-186,318,GSASGLSFKGSHK,99,421.2267,3,1260.6582,9.91,1260.6575,0.7
3,256-092,10187,GMASDMADYAAGK,99,643.7775,2,1285.5404,43.45,1285.5432,-2.2
4,268-175,4195,GLASGLSFKGSHK,99,429.9094,3,1286.7063,25.99,1286.7095,-2.5
...,...,...,...,...,...,...,...,...,...,...
4196,284-079,12732,WYTHMMFPWMWFK,95,630.6179,3,1888.8318,74.98,1888.8403,-4.4
4197,261-179,26317,LYYYMTPWWPYWK,95,948.4512,2,1894.8878,87.87,1894.8906,-1.5
4198,228-002,38310,YLYYMTPWPWYWK,98,956.4479,2,1910.8812,96.49,1910.8855,-2.2
4199,271-146,5783,WRNKFWLQWHWYK,97,495.0081,4,1976.0032,48.36,1976.0110,-3.9


In [9]:
# Step 2: Remove sequence isomers with the same precursor mass -OR- with a different specific precursor mass
# There are multiple situations where the removal of precursors of different masses is justified including:
    # 1) incorrect monoisotopic precursor selection (abs delta of 1)
    # 2) oxidations (abs delta of 16, 32)
    # 3) and sodium adduct (abs delta of 22)
# 0.69 similarity was emperically determined and is very lenient in the comparison (multiple mismatches / shifts allowed)
# the mass differences are the tight requirement here; and thus, and thus, ALC cutoffs are not considered
# RT differences are not considered in case the data was acquired using different gradients, though they were used to set the similarity requirement
# The highest ALC peptide will be retained, with ppm as tie-breaker

# Extract lists from dataframe for enumeration
peptides = df['Peptide'].tolist()
OPM = df['Obs Parent Mass'].tolist()
ALC = df['ALC (%)'].tolist()
ppm = df['ppm'].tolist()

# Make a list of all the peptides to remove, Step 2:
similarity_cutoff = 0.69      # more than this
Mass_diff_list = [0,1,16,32,22] # and a mass difference within this list

to_drop = []
for i in range(len(peptides)):
    if len(peptides) > 500 and i % 100 == 0: print(f"Step 2: {i} of {len(peptides)}")
    for k in range(len(peptides)):
        sim = difflib.SequenceMatcher(a=peptides[k], b=peptides[i])
        simz = sim.ratio()
        difference = np.abs(float(OPM[i]) - float(OPM[k]))
        if i != k and simz > similarity_cutoff:
            # print(f"Trouble {peptides[k]} {ALC[k]} {round(float(OPM[k]),3)} and {peptides[i]} {ALC[i]} {round(float(OPM[i]),3)}, similarity {round(float(simz),3)} difference {round(float(difference),3)}")
            if round(float(difference)) in Mass_diff_list:
                if ALC[k] > ALC[i]:
                    print(f"1: I've marked {peptides[i]} {ALC[i]}, index is {i}, similarity {round(float(simz),3)}, Mass difference {round(float(difference),3)}")
                    to_drop.append(i)
                elif ALC[k] < ALC[i]:
                    print(f"2: I've marked {peptides[k]} {ALC[k]}, index is {k}, similarity {round(float(simz),3)}, Mass difference {round(float(difference),3)}")
                    to_drop.append(k)
                elif ALC[k] == ALC[i]:    
                    if abs(float(ppm[k])) < abs(float(ppm[i])):
                        print(f"3: I've marked {peptides[k]} {ALC[k]}, index is {k}, similarity {round(float(simz),3)}, Mass difference {round(float(difference),3)}")
                        to_drop.append(k)
                    elif abs(float(ppm[i])) < abs(float(ppm[k])):
                        print(f"4: I've marked {peptides[i]} {ALC[i]}, index is {i}, similarity {round(float(simz),3)}, Mass difference {round(float(difference),3)}")
                        to_drop.append(i)
                    else:
                        print(f"5: I've marked {peptides[k]} {ALC[k]}, index is {k}, similarity {round(float(simz),3)}, Mass difference {round(float(difference),3)}")
                        to_drop.append(k)

Step 2: 0 of 4201
2: I've marked DLADYAAVDGGFK 91, index is 19, similarity 0.692, Mass difference 32.004
1: I've marked LTTGDAPDYADGK 88, index is 10, similarity 0.692, Mass difference 21.954
1: I've marked PDVPDYAVASGDK 90, index is 14, similarity 0.692, Mass difference 32.064
3: I've marked APGDLPDYAVTNK 95, index is 27, similarity 0.692, Mass difference 22.094
2: I've marked NSFEGDAPDYAGK 90, index is 43, similarity 0.692, Mass difference 32.007
1: I've marked DLADYAAVDGGFK 91, index is 19, similarity 0.692, Mass difference 32.004
1: I've marked DLADYAAVDGGFK 91, index is 19, similarity 0.692, Mass difference 15.996
2: I've marked LTTGDAPDYADGK 88, index is 10, similarity 0.692, Mass difference 21.954
4: I've marked SVDTGDMADYAGK 95, index is 20, similarity 0.692, Mass difference 15.977
1: I've marked PGQNLDVGDYAAK 96, index is 21, similarity 0.692, Mass difference 21.981
1: I've marked QGSADLPDYAANK 97, index is 22, similarity 0.692, Mass difference 31.992
4: I've marked GGVLRDVSDY

2: I've marked HKYGADVVDYAAK 92, index is 245, similarity 0.692, Mass difference 22.105
3: I've marked DPGTYDLVDYAGK 99, index is 140, similarity 0.692, Mass difference 1.009
3: I've marked VGDPGDLHDYAEK 99, index is 150, similarity 0.692, Mass difference 0.981
2: I've marked VNGTDDLTDYSAK 96, index is 100, similarity 0.692, Mass difference 16.087
4: I've marked VGDPGDLHDYAEK 99, index is 150, similarity 0.692, Mass difference 0.981
4: I've marked VGDPGDLHDYAEK 99, index is 150, similarity 0.692, Mass difference 16.013
2: I've marked VGEHDDLEDYAGK 98, index is 317, similarity 0.692, Mass difference 31.99
2: I've marked LVGGLFDLHDYAK 93, index is 322, similarity 0.769, Mass difference 32.111
2: I've marked SQADLPDYSATVK 96, index is 90, similarity 0.692, Mass difference 21.931
1: I've marked ASLSYDVQDYAGK 97, index is 154, similarity 0.692, Mass difference 22.013
3: I've marked TSMEDVADYADGK 99, index is 105, similarity 0.923, Mass difference 15.994
1: I've marked QSPVDLADYSAQK 92, inde

2: I've marked MLDAPDYAQDSAK 87, index is 187, similarity 0.846, Mass difference 15.994
1: I've marked MLDAPDYANTDAK 93, index is 266, similarity 0.692, Mass difference 15.99
1: I've marked YPWALDAADYAGK 95, index is 268, similarity 0.769, Mass difference 32.022
1: I've marked YPWALDAADYAGK 95, index is 268, similarity 0.692, Mass difference 22.081
1: I've marked GPVQGDVMPYAYK 88, index is 269, similarity 0.846, Mass difference 1.001
4: I've marked WGSLTDLTDYAAK 99, index is 270, similarity 0.692, Mass difference 22.056
3: I've marked SSDVDAPDYAELK 99, index is 129, similarity 0.692, Mass difference 32.003
4: I've marked EFDLADYAATTPK 99, index is 275, similarity 0.692, Mass difference 0.994
2: I've marked EGVFDLADYAMDK 85, index is 521, similarity 0.692, Mass difference 31.969
4: I've marked VSAGDMEDYAEQK 93, index is 276, similarity 0.846, Mass difference 15.991
3: I've marked PYLANDVADYAAK 99, index is 133, similarity 0.692, Mass difference 31.952
3: I've marked EFDLADYAATTPK 99, in

3: I've marked LVLFDATDYADSK 99, index is 382, similarity 0.692, Mass difference 1.022
3: I've marked QSLTDWADYAASK 99, index is 370, similarity 0.692, Mass difference 1.023
3: I've marked LVLFDATDYADSK 99, index is 382, similarity 0.692, Mass difference 1.019
4: I've marked YGDDVQDYAQVGK 99, index is 380, similarity 0.692, Mass difference 0.997
1: I've marked DGVPDYAADLYMK 85, index is 381, similarity 0.692, Mass difference 1.036
4: I've marked LVLFDATDYADSK 99, index is 382, similarity 0.692, Mass difference 1.022
4: I've marked LVLFDATDYADSK 99, index is 382, similarity 0.692, Mass difference 1.019
2: I've marked EGVFDLADYAMDK 85, index is 521, similarity 0.692, Mass difference 15.938
3: I've marked EPVQNDVADYSAK 99, index is 235, similarity 0.692, Mass difference 22.056
2: I've marked VTPGFDLEDYAAK 98, index is 194, similarity 0.692, Mass difference 32.039
3: I've marked LGSDLEDYAHTMK 99, index is 577, similarity 0.692, Mass difference 21.95
4: I've marked VPLGTHDLEDYAK 99, index i

2: I've marked LAANMVDYADARK 86, index is 358, similarity 0.692, Mass difference 15.908
2: I've marked RMDDYADAVGNVK 89, index is 356, similarity 0.846, Mass difference 15.991
1: I've marked DMRDYADAVGNVK 91, index is 479, similarity 0.692, Mass difference 31.954
2: I've marked MGADLHDYAAFMK 98, index is 843, similarity 0.769, Mass difference 31.94
4: I've marked LLDFPDYAFGLAK 96, index is 484, similarity 0.692, Mass difference 16.004
3: I've marked WLGLDDTSDYAAK 99, index is 363, similarity 0.769, Mass difference 15.973
2: I've marked MPFSDAPDYADNK 92, index is 647, similarity 0.692, Mass difference 15.927
3: I've marked AAEQDVFDYADDK 99, index is 649, similarity 0.692, Mass difference 15.925
3: I've marked AFGGEDVEDYAWK 99, index is 652, similarity 0.692, Mass difference 15.942
4: I've marked YDMDVADYASGHK 99, index is 492, similarity 0.692, Mass difference 22.057
1: I've marked MGLPTDHHDYAAK 93, index is 494, similarity 0.692, Mass difference 32.02
2: I've marked FGLPTDHHDYAAK 90, i

3: I've marked AYGYDLPDYADAK 99, index is 418, similarity 0.692, Mass difference 16.026
4: I've marked VYDLFDYADTAGK 99, index is 560, similarity 0.692, Mass difference 22.028
4: I've marked LTVETDVSDYAHK 99, index is 562, similarity 0.692, Mass difference 32.009
2: I've marked LNHNVDVSDYASK 92, index is 419, similarity 0.692, Mass difference 16.021
1: I've marked SFYDLEDYANGGK 98, index is 564, similarity 0.692, Mass difference 1.011
4: I've marked SFYDLEDYANGGK 98, index is 564, similarity 0.692, Mass difference 1.061
1: I've marked YGQFSDVADYADK 94, index is 566, similarity 0.692, Mass difference 16.053
1: I've marked LDAPDYADNNWGK 91, index is 567, similarity 0.692, Mass difference 32.016
1: I've marked MLDVEDYSHGGQK 89, index is 568, similarity 0.692, Mass difference 1.046
4: I've marked DDWPDYALVADAK 99, index is 569, similarity 0.692, Mass difference 22.06
2: I've marked VPAEMPDVQDSFK 86, index is 429, similarity 0.846, Mass difference 15.996
4: I've marked KPTEDVTDYADPK 99, ind

1: I've marked ETFDVPDYAEATK 89, index is 639, similarity 0.692, Mass difference 31.997
4: I've marked VESLADYPDYADK 99, index is 640, similarity 0.692, Mass difference 1.034
2: I've marked ENFSDAPDYADNK 96, index is 635, similarity 0.692, Mass difference 0.064
1: I've marked PNQYSDLADYATK 97, index is 641, similarity 0.692, Mass difference 22.04
1: I've marked PNQYSDLADYATK 97, index is 641, similarity 0.769, Mass difference 22.072
3: I've marked LAYMKDVVDYAAK 99, index is 658, similarity 0.692, Mass difference 1.063
2: I've marked VGPVVDLDDYAYK 97, index is 359, similarity 0.692, Mass difference 31.99
2: I've marked TKGGLDRHDYAFK 90, index is 937, similarity 0.692, Mass difference 22.06
1: I've marked SKGFMDPRDYAAK 89, index is 644, similarity 0.692, Mass difference 22.048
5: I've marked LAYMKDVVDYAAK 99, index is 658, similarity 0.846, Mass difference 1.017
1: I've marked MPFSDAPDYADNK 92, index is 647, similarity 0.692, Mass difference 15.927
1: I've marked MPFSDAPDYADNK 92, index 

1: I've marked VTELPDVWDYAGK 87, index is 720, similarity 0.692, Mass difference 1.054
1: I've marked VTELPDVWDYAGK 87, index is 720, similarity 0.692, Mass difference 0.991
1: I've marked VTELPDVWDYAGK 87, index is 720, similarity 0.692, Mass difference 21.933
1: I've marked TQTPLDLEDYAVK 97, index is 722, similarity 0.692, Mass difference 32.057
4: I've marked RELADVKDYAANK 99, index is 723, similarity 0.692, Mass difference 0.042
3: I've marked NLKDVKDYAAQTK 99, index is 739, similarity 0.692, Mass difference 1.021
4: I've marked VGWADLKDYAARK 99, index is 725, similarity 0.692, Mass difference 32.144
4: I've marked VGWADLKDYAARK 99, index is 725, similarity 0.692, Mass difference 15.916
4: I've marked VGWADLKDYAARK 99, index is 725, similarity 0.692, Mass difference 31.94
2: I've marked EYGMVDAEDYTGK 94, index is 557, similarity 0.846, Mass difference 15.988
3: I've marked YDMDVADYASGHK 99, index is 492, similarity 0.692, Mass difference 22.057
3: I've marked PVLFDMADYAHSK 99, inde

4: I've marked PLFGMDPRDYASK 98, index is 784, similarity 0.692, Mass difference 16.01
4: I've marked LMDKEDYAVTAPK 99, index is 785, similarity 0.692, Mass difference 16.052
3: I've marked MLDKEDYAVTAPK 99, index is 594, similarity 0.923, Mass difference 15.994
1: I've marked GDLQDYAFVNNLK 97, index is 786, similarity 0.692, Mass difference 0.991
2: I've marked MKDLPDYAFEGVK 98, index is 1011, similarity 0.692, Mass difference 15.977
1: I've marked GMHDMLDYADMAK 95, index is 789, similarity 0.769, Mass difference 16.08
2: I've marked GEHSFDTPDYADK 88, index is 598, similarity 0.692, Mass difference 16.02
2: I've marked EMLDPDLMDYAGK 85, index is 1271, similarity 0.692, Mass difference 32.003
1: I've marked PSAMNDLADYAWK 92, index is 792, similarity 0.692, Mass difference 1.016
2: I've marked GSHDLADYAHNWK 87, index is 1019, similarity 0.692, Mass difference 16.015
3: I've marked ATHELDVVDYADK 99, index is 542, similarity 0.692, Mass difference 21.973
2: I've marked GEHSFDTPDYADK 88, i

3: I've marked NAVTHDLQDYAEK 99, index is 873, similarity 0.692, Mass difference 0.923
4: I've marked LTVHDLKDYAAEK 99, index is 868, similarity 0.692, Mass difference 15.928
3: I've marked MTVHDMPDYAANK 99, index is 1176, similarity 0.692, Mass difference 21.867
3: I've marked TRVFDLVDYAAHK 99, index is 1354, similarity 0.692, Mass difference 32.019
2: I've marked SSDVTDYADKFKK 87, index is 876, similarity 0.692, Mass difference 0.067
1: I've marked WNPLDMHDYAGGK 85, index is 870, similarity 0.846, Mass difference 15.997
1: I've marked LLMMDVTDYAGDK 97, index is 871, similarity 0.692, Mass difference 21.991
1: I've marked LLMMDVTDYAGDK 97, index is 871, similarity 0.692, Mass difference 32.09
4: I've marked NAVTHDLQDYAEK 99, index is 873, similarity 0.692, Mass difference 0.923
4: I've marked DLHDYAHATVFSK 99, index is 874, similarity 0.769, Mass difference 32.043
4: I've marked DLHDYAHATVFSK 99, index is 874, similarity 0.692, Mass difference 32.03
4: I've marked DLHDYAHATVFSK 99, in

4: I've marked FGEPPEDLEDYAK 99, index is 956, similarity 0.692, Mass difference 1.077
2: I've marked AHVDLEDYAENQK 96, index is 1306, similarity 0.692, Mass difference 21.982
2: I've marked VDLPFDLKDYSAK 91, index is 977, similarity 0.846, Mass difference 1.002
1: I've marked MGAMDDMQDYASK 87, index is 964, similarity 0.692, Mass difference 0.086
2: I've marked VGGDWQDYADKEK 85, index is 969, similarity 0.692, Mass difference 0.045
2: I've marked TVGMHDNMEYADK 87, index is 1485, similarity 0.846, Mass difference 31.98
2: I've marked RDVSADMHDYAAK 88, index is 744, similarity 0.692, Mass difference 16.001
1: I've marked YADMHDYAAQPTK 93, index is 966, similarity 0.923, Mass difference 15.991
1: I've marked QDEDLQDYAANTK 97, index is 967, similarity 0.692, Mass difference 1.002
2: I've marked LDAPDYADNNWGK 91, index is 567, similarity 0.692, Mass difference 32.016
1: I've marked SLMDLPDYADENK 98, index is 968, similarity 0.692, Mass difference 0.075
1: I've marked SLMDLPDYADENK 98, inde

2: I've marked GTWGNDWEDYAAK 96, index is 998, similarity 0.692, Mass difference 1.057
2: I've marked GMHDMLDYADMAK 95, index is 789, similarity 0.769, Mass difference 16.08
1: I've marked LGFVDDLEDYAEK 89, index is 1024, similarity 0.692, Mass difference 15.983
1: I've marked LGFVDDLEDYAEK 89, index is 1024, similarity 0.692, Mass difference 1.055
1: I've marked LGFVDDLEDYAEK 89, index is 1024, similarity 0.692, Mass difference 0.029
1: I've marked LGFVDDLEDYAEK 89, index is 1024, similarity 0.769, Mass difference 15.932
1: I've marked LGFVDDLEDYAEK 89, index is 1024, similarity 0.692, Mass difference 15.956
1: I've marked LGFVDDLEDYAEK 89, index is 1024, similarity 0.692, Mass difference 15.982
1: I've marked LGFVDDLEDYAEK 89, index is 1024, similarity 0.692, Mass difference 22.142
3: I've marked LFGNDLEDYAVNK 99, index is 795, similarity 0.692, Mass difference 15.999
2: I've marked EMLDPDLMDYAGK 85, index is 1271, similarity 0.692, Mass difference 15.921
4: I've marked VQADLQDYAEHPK

2: I've marked LRGYTDTEDYASK 98, index is 1096, similarity 0.769, Mass difference 0.045
1: I've marked FKDDLEDYSTSAK 92, index is 1094, similarity 0.692, Mass difference 15.949
1: I've marked VDQDYAYYSVPAK 90, index is 1095, similarity 0.692, Mass difference 16.017
1: I've marked LRGYTDTEDYASK 98, index is 1096, similarity 0.846, Mass difference 31.991
1: I've marked LRGYTDTEDYASK 98, index is 1096, similarity 0.692, Mass difference 0.94
1: I've marked LRGYTDTEDYASK 98, index is 1096, similarity 0.769, Mass difference 0.045
3: I've marked LRGTDTEDYAAFK 99, index is 654, similarity 0.692, Mass difference 31.994
3: I've marked LTVHDLKDYAAEK 99, index is 868, similarity 0.692, Mass difference 15.928
3: I've marked LHVRTDTQDYAAK 99, index is 1088, similarity 0.692, Mass difference 0.942
3: I've marked YNTVEDMADYADK 99, index is 1337, similarity 0.692, Mass difference 15.927
3: I've marked YDTMDMQDYAAVK 99, index is 1602, similarity 0.692, Mass difference 31.936
3: I've marked LYQADMMDYAAVK

1: I've marked GLARDWEDYASLK 95, index is 1167, similarity 0.692, Mass difference 22.028
2: I've marked SSWSTDYEDYAAK 91, index is 1135, similarity 0.692, Mass difference 1.134
1: I've marked KAWLTDLEDYAAK 94, index is 1168, similarity 0.692, Mass difference 0.03
2: I've marked RQTLTDLEDYAAK 87, index is 1170, similarity 0.769, Mass difference 0.008
1: I've marked KAWLTDLEDYAAK 94, index is 1168, similarity 0.692, Mass difference 16.035
2: I've marked LVKRALDLPDYAK 90, index is 853, similarity 0.692, Mass difference 21.899
1: I've marked VYALLQDQPDYAK 93, index is 1169, similarity 0.692, Mass difference 16.028
1: I've marked RQTLTDLEDYAAK 87, index is 1170, similarity 0.769, Mass difference 0.008
1: I've marked RQTLTDLEDYAAK 87, index is 1170, similarity 0.692, Mass difference 1.034
2: I've marked VPQLTDVQDYSVK 96, index is 708, similarity 0.692, Mass difference 32.041
2: I've marked KDDVQDYAHSYAK 96, index is 1428, similarity 0.692, Mass difference 15.895
4: I've marked KLPLFDVQDYASK 

3: I've marked SDVFFDHPDYASK 99, index is 1235, similarity 0.692, Mass difference 0.02
4: I've marked YSDVNDLPDYAQK 99, index is 1237, similarity 0.692, Mass difference 21.983
1: I've marked SFWDKADYAAEPK 96, index is 1239, similarity 0.692, Mass difference 15.918
4: I've marked DVADKEDYAAMRK 99, index is 1241, similarity 0.692, Mass difference 0.017
1: I've marked YPNGDLEDYSARK 95, index is 1242, similarity 0.692, Mass difference 15.973
2: I've marked AQLKQDVRDYAAK 88, index is 911, similarity 0.692, Mass difference 21.908
2: I've marked MKTDLQDYADLAK 98, index is 991, similarity 0.692, Mass difference 15.976
1: I've marked QVVDMEDYSVAKK 89, index is 1244, similarity 0.923, Mass difference 15.991
1: I've marked QVVDMEDYSVAKK 89, index is 1244, similarity 0.769, Mass difference 0.993
1: I've marked KMTDLQDYADLAK 97, index is 1245, similarity 0.692, Mass difference 32.004
2: I've marked VTNKTDLLDYADK 94, index is 767, similarity 0.692, Mass difference 31.971
1: I've marked KMTDLQDYADLAK

4: I've marked DLQDYASHLKVLK 99, index is 1290, similarity 0.692, Mass difference 32.109
2: I've marked DDLQDYSALLRAK 97, index is 938, similarity 0.692, Mass difference 22.061
4: I've marked DLQDYASHLKVLK 99, index is 1290, similarity 0.692, Mass difference 16.08
3: I've marked FNTDVQDYAENAK 99, index is 1038, similarity 0.692, Mass difference 15.946
2: I've marked NGNDDLEDYAFEK 95, index is 1270, similarity 0.692, Mass difference 0.981
3: I've marked FDAKKDVTDYAEK 99, index is 1285, similarity 0.692, Mass difference 0.873
2: I've marked AEDMHDYADQVAK 98, index is 939, similarity 0.692, Mass difference 22.021
2: I've marked MDDLMDYAGVEQK 97, index is 1036, similarity 0.692, Mass difference 16.006
2: I've marked EMLDPDLMDYAGK 85, index is 1271, similarity 0.692, Mass difference 1.014
2: I've marked MYLDDLRDYASGK 98, index is 1552, similarity 0.692, Mass difference 16.065
2: I've marked QPFYGDMMDYASK 96, index is 1626, similarity 0.692, Mass difference 21.932
2: I've marked FALFNDMEDYAV

3: I've marked VDLQDYAEVAHMK 99, index is 1350, similarity 0.692, Mass difference 0.079
1: I've marked LKDLQDYSAKEPK 86, index is 1355, similarity 0.692, Mass difference 0.971
1: I've marked LKDLQDYSAKEPK 86, index is 1355, similarity 0.692, Mass difference 21.978
2: I've marked FRATYDVEDYAGK 98, index is 1346, similarity 0.692, Mass difference 0.91
2: I've marked QMDLFDYASYGPK 88, index is 1342, similarity 0.769, Mass difference 0.979
1: I've marked NENDWPDYAAVNK 97, index is 1361, similarity 0.692, Mass difference 0.913
2: I've marked TPDWPDYAVDNFK 87, index is 1858, similarity 0.692, Mass difference 32.027
1: I've marked GGTYFDVHDYSFK 93, index is 1362, similarity 0.692, Mass difference 32.066
1: I've marked LTFMDMPDYAGFK 91, index is 1363, similarity 0.692, Mass difference 16.023
1: I've marked LTFMDMPDYAGFK 91, index is 1363, similarity 0.692, Mass difference 21.936
4: I've marked PANVEWDMPDYAK 99, index is 1364, similarity 0.692, Mass difference 1.048
2: I've marked PANVEWDMDPYAK

1: I've marked LGDLNDMEDYARK 95, index is 1430, similarity 0.692, Mass difference 21.967
3: I've marked MFDVHDYANNAVK 99, index is 1156, similarity 0.692, Mass difference 16.046
2: I've marked EYMDHDYAAYVGK 97, index is 1754, similarity 0.692, Mass difference 21.92
3: I've marked LVDEQDYAAKEDK 99, index is 1161, similarity 0.692, Mass difference 16.046
5: I've marked GYDLNDLQDYAMK 99, index is 1756, similarity 0.692, Mass difference 21.915
2: I've marked SVPDLQDYAESRK 98, index is 932, similarity 0.692, Mass difference 32.03
4: I've marked APDLQDYAEKYVK 99, index is 1435, similarity 0.692, Mass difference 31.951
1: I've marked YDLVDYSARPVNK 88, index is 1436, similarity 0.692, Mass difference 1.06
1: I've marked YDLVDYSARPVNK 88, index is 1436, similarity 0.692, Mass difference 16.031
4: I've marked LYWVKDAPDYAAK 99, index is 1437, similarity 0.692, Mass difference 16.085
2: I've marked KFWVKDAPDYAAK 95, index is 1420, similarity 0.846, Mass difference 0.985
3: I've marked WMVDAHDYAFSA

2: I've marked LDLVDKVDYSDTK 89, index is 976, similarity 0.692, Mass difference 32.041
Step 2: 1500 of 4201
1: I've marked NDVHDYASDWGHK 94, index is 1500, similarity 0.692, Mass difference 0.965
1: I've marked TAYVDQRDYADVK 93, index is 1505, similarity 0.692, Mass difference 22.079
1: I've marked TAYVDQRDYADVK 93, index is 1505, similarity 0.692, Mass difference 31.989
1: I've marked TAYVDQRDYADVK 93, index is 1505, similarity 0.692, Mass difference 32.012
1: I've marked LNQFDFPDYSAVK 98, index is 1506, similarity 0.692, Mass difference 32.041
1: I've marked LNQFDFPDYSAVK 98, index is 1506, similarity 0.692, Mass difference 16.008
2: I've marked VDVADYADFKHHK 98, index is 1527, similarity 0.692, Mass difference 1.004
1: I've marked YDMPDSYARVAKK 90, index is 1508, similarity 0.692, Mass difference 1.045
2: I've marked NMQDLDMQDYAGK 98, index is 1251, similarity 0.692, Mass difference 16.033
1: I've marked DYTPDYASWMPAK 87, index is 1512, similarity 0.692, Mass difference 0.974
2: I'

3: I've marked LYLGADMQDYAEK 99, index is 1065, similarity 0.692, Mass difference 31.983
3: I've marked ESVQDYAYAPMFK 99, index is 1582, similarity 0.692, Mass difference 0.022
3: I've marked EFVHDLKDYAAPK 99, index is 1320, similarity 0.692, Mass difference 15.912
2: I've marked SNEFSDMLDYAEK 87, index is 1810, similarity 0.692, Mass difference 15.956
3: I've marked EPVMWDKQDYAAK 99, index is 2094, similarity 0.692, Mass difference 32.055
1: I've marked EPSNNDLPDYAWK 89, index is 1581, similarity 0.692, Mass difference 32.023
1: I've marked EPSNNDLPDYAWK 89, index is 1581, similarity 0.692, Mass difference 0.02
4: I've marked ESVQDYAYAPMFK 99, index is 1582, similarity 0.692, Mass difference 0.022
1: I've marked YFQVADKNDYASK 96, index is 1583, similarity 0.692, Mass difference 22.022
2: I've marked LMHMPHDYASGMK 87, index is 1326, similarity 0.692, Mass difference 15.997
4: I've marked FSYDLHDYASGMK 99, index is 1586, similarity 0.692, Mass difference 0.057
4: I've marked FTDFDDEDYAA

1: I've marked ENMLDVQDYADDK 97, index is 1668, similarity 0.692, Mass difference 0.038
2: I've marked NEMLDVQDYADDK 94, index is 1928, similarity 0.923, Mass difference 16.0
1: I've marked ENMLDVQDYADDK 97, index is 1668, similarity 0.692, Mass difference 16.023
1: I've marked FDEPDYAAWEQGK 93, index is 1669, similarity 0.692, Mass difference 31.929
1: I've marked FDEPDYAAWEQGK 93, index is 1669, similarity 0.692, Mass difference 21.956
1: I've marked FDEPDYAAWEQGK 93, index is 1669, similarity 0.692, Mass difference 16.03
2: I've marked ENMLDVQDYADDK 97, index is 1668, similarity 0.692, Mass difference 0.038
2: I've marked NEMLDVQDYADDK 94, index is 1928, similarity 0.692, Mass difference 15.962
3: I've marked LKDLEDYAAVEYK 99, index is 1690, similarity 0.692, Mass difference 1.054
3: I've marked VRTLRDLEDYAVK 99, index is 2049, similarity 0.692, Mass difference 22.132
4: I've marked TKFDWDVPDYAAK 99, index is 1672, similarity 0.692, Mass difference 22.032
2: I've marked KDDVQDYAHSYA

3: I've marked QTYLNDLQDYAGK 99, index is 1261, similarity 0.692, Mass difference 31.977
4: I've marked LTDLQDYSADEYK 99, index is 1744, similarity 0.692, Mass difference 0.035
2: I've marked ELTEADFQDYAYK 98, index is 2298, similarity 0.692, Mass difference 32.004
4: I've marked VLFEYDSPDYANK 99, index is 1746, similarity 0.692, Mass difference 32.013
3: I've marked LTDLQDYSADEYK 99, index is 1744, similarity 0.692, Mass difference 0.035
1: I've marked WGVHVDKQDYADK 97, index is 1748, similarity 0.692, Mass difference 0.984
2: I've marked YFWGVDKNDYASK 94, index is 2300, similarity 0.692, Mass difference 31.994
1: I've marked MTKRDKADYAAFK 89, index is 1749, similarity 0.769, Mass difference 15.959
3: I've marked FMWADAEDYAESK 98, index is 1774, similarity 0.692, Mass difference 1.004
1: I've marked FDLTDYADMDNNK 93, index is 1751, similarity 0.692, Mass difference 32.061
3: I've marked LYWVKDAPDYAAK 99, index is 1437, similarity 0.692, Mass difference 21.875
2: I've marked EYMDHDYAAY

4: I've marked MTSEVDFPDYAYK 99, index is 1821, similarity 0.692, Mass difference 16.026
2: I've marked NYMNDGFPDYAYK 85, index is 2383, similarity 0.692, Mass difference 31.983
2: I've marked WTVDQPDYADPYK 98, index is 2387, similarity 0.692, Mass difference 32.031
3: I've marked LQDLVDWPDYSSK 99, index is 1827, similarity 0.692, Mass difference 0.058
2: I've marked QDLQDYAHSMTPK 94, index is 1328, similarity 0.692, Mass difference 32.007
4: I've marked MLYTFDVPDYSSK 99, index is 1825, similarity 0.692, Mass difference 0.026
2: I've marked FLSFPDLQDYQSK 98, index is 2222, similarity 0.692, Mass difference 22.028
4: I've marked AFFHPDLQDYANK 99, index is 1826, similarity 0.692, Mass difference 32.024
2: I've marked LQQLDTPDYASWK 92, index is 1816, similarity 0.692, Mass difference 0.983
4: I've marked LQDLVDWPDYSSK 99, index is 1827, similarity 0.692, Mass difference 0.058
3: I've marked MLYTFDVPDYSSK 99, index is 1825, similarity 0.692, Mass difference 0.026
2: I've marked ASQEDWQDYAE

2: I've marked LTDLLFYAEMRAK 86, index is 1925, similarity 0.692, Mass difference 0.998
4: I've marked FKVLKDVVDYSEK 99, index is 1911, similarity 0.692, Mass difference 15.993
2: I've marked FPLQDDWDGYADK 86, index is 1896, similarity 0.692, Mass difference 1.023
3: I've marked WLQSFDSPDYADK 99, index is 1941, similarity 0.692, Mass difference 0.997
1: I've marked SRFSMVDHDYAVK 91, index is 1916, similarity 0.923, Mass difference 15.997
1: I've marked SRFSMVDHDYAVK 91, index is 1916, similarity 0.692, Mass difference 1.095
3: I've marked WDLVDDYPDYASK 99, index is 2192, similarity 0.692, Mass difference 15.966
2: I've marked KFWVKDAPDYAAK 95, index is 1420, similarity 0.692, Mass difference 31.956
2: I've marked YFWGVDKNDYASK 94, index is 2300, similarity 0.692, Mass difference 21.983
2: I've marked YFQVADKNDYASK 96, index is 1583, similarity 0.692, Mass difference 22.022
1: I've marked FYLQPDLDPYATK 96, index is 1922, similarity 0.692, Mass difference 32.05
1: I've marked LTDLLFYAEMR

1: I've marked ADMEDYAQNYPLK 97, index is 1984, similarity 0.692, Mass difference 0.055
2: I've marked MTYEDVQDYADQK 95, index is 2534, similarity 0.692, Mass difference 31.993
2: I've marked FTDMHDYAEDVAK 98, index is 1466, similarity 0.692, Mass difference 32.046
4: I've marked FFPDHPDYAEHAK 99, index is 1986, similarity 0.692, Mass difference 15.996
3: I've marked FFAPLHDYAYHPK 99, index is 2540, similarity 0.692, Mass difference 32.08
4: I've marked PWLVDHQDYADSK 99, index is 1987, similarity 0.692, Mass difference 1.032
4: I've marked PWLVDHQDYADSK 99, index is 1987, similarity 0.692, Mass difference 32.073
2: I've marked ADMEDYAQNYPLK 97, index is 1984, similarity 0.692, Mass difference 0.055
3: I've marked TVLPDLHDYAWDK 99, index is 1977, similarity 0.692, Mass difference 1.003
2: I've marked LNFMDQHDYAAHK 98, index is 2535, similarity 0.692, Mass difference 31.913
2: I've marked MRDPLDYAEHVVK 94, index is 1978, similarity 0.692, Mass difference 1.041
2: I've marked LRLVVDDHDYAE

3: I've marked DELYTDMEDYASK 99, index is 2070, similarity 0.692, Mass difference 0.091
4: I've marked LNRDLPDMEDYAK 99, index is 2077, similarity 0.692, Mass difference 31.971
2: I've marked QQSDFEDYAAQQK 96, index is 1694, similarity 0.692, Mass difference 22.095
4: I've marked QRLQQSDVEDYAK 98, index is 2082, similarity 0.692, Mass difference 1.05
1: I've marked QRLQQSDVEDYAK 98, index is 2082, similarity 0.692, Mass difference 15.963
4: I've marked TPQFDPRDYAELK 99, index is 2083, similarity 0.692, Mass difference 31.99
2: I've marked LEHDMDVPDYADK 98, index is 1792, similarity 0.692, Mass difference 16.144
2: I've marked KQLYLQDVPDYAK 98, index is 2099, similarity 0.692, Mass difference 1.013
2: I've marked FPVRDDVRDYADK 98, index is 2353, similarity 0.692, Mass difference 15.971
2: I've marked FPSLRDRVDYAEK 96, index is 2356, similarity 0.692, Mass difference 16.007
1: I've marked MNMETDMPDYASK 93, index is 2085, similarity 0.692, Mass difference 0.94
1: I've marked MNMETDMPDYASK

1: I've marked PVWYQTDAEDYAK 94, index is 2174, similarity 0.692, Mass difference 0.035
1: I've marked PVWYQTDAEDYAK 94, index is 2174, similarity 0.692, Mass difference 0.963
1: I've marked NPQDWLQDYAHAK 93, index is 2176, similarity 0.692, Mass difference 32.008
1: I've marked NPQDWLQDYAHAK 93, index is 2176, similarity 0.692, Mass difference 1.007
1: I've marked NPQDWLQDYAHAK 93, index is 2176, similarity 0.846, Mass difference 0.981
2: I've marked NPLWDLQDYASHK 92, index is 2207, similarity 0.769, Mass difference 1.017
3: I've marked LTQMNDQMDYAEK 99, index is 2189, similarity 0.692, Mass difference 0.93
2: I've marked MRFTFDQADYADK 98, index is 2570, similarity 0.692, Mass difference 21.961
1: I've marked FFPVWDLNDYAAK 97, index is 2179, similarity 0.692, Mass difference 0.951
2: I've marked QVDLHDYAWDDLK 94, index is 2741, similarity 0.692, Mass difference 31.949
3: I've marked FKVLKDVVDYSEK 99, index is 1911, similarity 0.692, Mass difference 15.993
2: I've marked LYVVLMVKDYSEK 

4: I've marked DSFDDWEDYAAQK 96, index is 2244, similarity 0.692, Mass difference 31.958
3: I've marked LFQDDLEDYAMTK 99, index is 2231, similarity 0.692, Mass difference 0.996
3: I've marked NRWKPDLEDYSAK 99, index is 2819, similarity 0.692, Mass difference 32.078
3: I've marked PWALERDLEDYAK 99, index is 2541, similarity 0.692, Mass difference 16.047
1: I've marked FFLEVDRQSYASK 87, index is 2252, similarity 0.846, Mass difference 1.002
2: I've marked FLKDVWDYAHPSK 85, index is 2544, similarity 0.692, Mass difference 15.97
1: I've marked WSKKLWPDYAAPK 90, index is 2254, similarity 0.692, Mass difference 32.004
4: I've marked MDDMQDYSWTNGK 99, index is 2255, similarity 0.692, Mass difference 31.944
2: I've marked MQPDDLPDYAHEK 98, index is 1705, similarity 0.692, Mass difference 31.973
2: I've marked NKQDYLDVPDYAK 98, index is 1883, similarity 0.692, Mass difference 21.899
2: I've marked MQDVPDYADWHSK 97, index is 2269, similarity 0.692, Mass difference 1.028
3: I've marked MQMDMDMVDY

2: I've marked TTFDLMDYAAMRK 97, index is 1781, similarity 0.692, Mass difference 32.051
3: I've marked NDLHDYAYDEFSK 99, index is 2714, similarity 0.692, Mass difference 21.904
3: I've marked YNMDDVQDYAMAK 99, index is 1790, similarity 0.769, Mass difference 32.049
2: I've marked VFHDLNDVQDYAK 97, index is 1805, similarity 0.769, Mass difference 31.959
1: I've marked VDWPDYATFEQPK 86, index is 2343, similarity 0.692, Mass difference 1.038
1: I've marked VDWPDYATFEQPK 86, index is 2343, similarity 0.692, Mass difference 32.012
1: I've marked KDLDWEDYAANQK 92, index is 2344, similarity 0.692, Mass difference 0.034
2: I've marked QRLQQSDVEDYAK 98, index is 2082, similarity 0.692, Mass difference 15.963
4: I've marked VQQDVEDYAHYTK 99, index is 2345, similarity 0.692, Mass difference 1.015
4: I've marked FFQTDKEDYAFGK 99, index is 2346, similarity 0.692, Mass difference 1.011
1: I've marked FDLPVDFHDYAEK 97, index is 2347, similarity 0.692, Mass difference 31.962
1: I've marked FDLPVDFHDY

2: I've marked TDDVHDYAHFDHK 94, index is 2416, similarity 0.692, Mass difference 0.019
2: I've marked RFVKDVHDYAFPK 85, index is 2821, similarity 0.692, Mass difference 22.151
3: I've marked VFQFDDLRDYADK 99, index is 2956, similarity 0.692, Mass difference 32.069
4: I've marked EMDLQDLQDYAMK 99, index is 2422, similarity 0.692, Mass difference 15.991
4: I've marked EMDLQDLQDYAMK 99, index is 2422, similarity 0.692, Mass difference 1.069
3: I've marked DMLWDLQDYASHK 99, index is 2813, similarity 0.692, Mass difference 22.026
2: I've marked EDLQDYAFMDEQK 98, index is 2949, similarity 0.692, Mass difference 31.986
4: I've marked WAWQTDTQDYASK 99, index is 2423, similarity 0.692, Mass difference 15.994
4: I've marked WAWQTDTQDYASK 99, index is 2423, similarity 0.692, Mass difference 0.998
2: I've marked YWMAHDTQDYASK 88, index is 2694, similarity 0.769, Mass difference 15.98
2: I've marked RLTDLFDVQDYAK 95, index is 2147, similarity 0.692, Mass difference 15.921
1: I've marked LVWTQDDVMD

1: I've marked HRLMQDLPDYATK 85, index is 2511, similarity 0.692, Mass difference 32.067
1: I've marked HRLMQDLPDYATK 85, index is 2511, similarity 0.923, Mass difference 15.99
1: I've marked RDLHDYAWTDLAK 97, index is 2512, similarity 0.692, Mass difference 0.044
4: I've marked TKVMLDPKDYSYK 99, index is 2513, similarity 0.923, Mass difference 16.0
4: I've marked KLLMDLHDYAWAK 99, index is 2515, similarity 0.692, Mass difference 31.968
2: I've marked RDLHDYAWTDLAK 97, index is 2512, similarity 0.692, Mass difference 0.044
1: I've marked QYDEPDYAEVMTK 91, index is 2517, similarity 0.692, Mass difference 15.954
1: I've marked QYDEPDYAEVMTK 91, index is 2517, similarity 0.692, Mass difference 1.091
1: I've marked HDVHFDSRDYADK 98, index is 2519, similarity 0.692, Mass difference 16.014
3: I've marked NFLPYDKEDYATK 99, index is 2508, similarity 0.692, Mass difference 0.968
2: I've marked QHYWDLPDYAVAK 96, index is 2539, similarity 0.846, Mass difference 1.009
1: I've marked PKQFHDKLDYADK 

2: I've marked DFEDVRDYAEHSK 91, index is 2616, similarity 0.692, Mass difference 1.03
1: I've marked HKNLDDMPDYAMK 97, index is 2598, similarity 0.692, Mass difference 0.019
2: I've marked FVQDWADYAHVDK 87, index is 2312, similarity 0.692, Mass difference 15.981
Step 2: 2600 of 4201
1: I've marked NLDMQDYAFHMPK 98, index is 2601, similarity 0.692, Mass difference 15.898
2: I've marked HKNLDDMPDYAMK 97, index is 2598, similarity 0.692, Mass difference 0.019
2: I've marked NVMDMQDYAAHRK 90, index is 2615, similarity 0.692, Mass difference 0.988
1: I've marked KYFDNDFPDYASK 89, index is 2602, similarity 0.692, Mass difference 0.075
1: I've marked NFDLEDYAHRDSK 96, index is 2603, similarity 0.692, Mass difference 22.027
1: I've marked NFDLEDYAHRDSK 96, index is 2603, similarity 0.692, Mass difference 1.056
2: I've marked PYMRDPDQYAELK 92, index is 2864, similarity 0.692, Mass difference 16.023
1: I've marked QNLTDLQDYAWDK 86, index is 2606, similarity 0.692, Mass difference 16.044
1: I've

1: I've marked FAFWSDVHDYSNK 96, index is 2696, similarity 0.692, Mass difference 0.954
3: I've marked YDMEDTPDYADMK 99, index is 2304, similarity 0.692, Mass difference 22.135
3: I've marked WDTPDYADDKNFK 99, index is 2682, similarity 0.692, Mass difference 1.032
2: I've marked WNLDMVDYAAKFK 97, index is 2719, similarity 0.692, Mass difference 1.03
2: I've marked YWFTDPRDYAVAK 97, index is 2957, similarity 0.692, Mass difference 16.047
2: I've marked PYDMFDYADRVKK 95, index is 3169, similarity 0.692, Mass difference 32.039
Step 2: 2700 of 4201
1: I've marked NMVQDYAFYAHLK 89, index is 2702, similarity 0.846, Mass difference 1.02
3: I've marked LVDRPDYAGFFRK 99, index is 2148, similarity 0.692, Mass difference 31.927
3: I've marked LVLEFDKFDYAEK 99, index is 2725, similarity 0.692, Mass difference 1.06
2: I've marked LPDLPMYAYWFGK 89, index is 2723, similarity 0.692, Mass difference 1.014
2: I've marked TNLDKHDYAFTMK 95, index is 2141, similarity 0.692, Mass difference 32.031
2: I've m

2: I've marked MPVDDLHDYSHYK 92, index is 2772, similarity 0.692, Mass difference 0.027
3: I've marked ATDLHDYSYYFPK 99, index is 2777, similarity 0.692, Mass difference 0.051
1: I've marked SNDDERLMDYAYK 87, index is 2770, similarity 0.692, Mass difference 21.955
1: I've marked SNDDERLMDYAYK 87, index is 2770, similarity 0.692, Mass difference 0.912
1: I've marked MPVDDLHDYSHYK 92, index is 2772, similarity 0.692, Mass difference 0.027
2: I've marked NYMNDGFPDYAYK 85, index is 2383, similarity 0.769, Mass difference 22.054
1: I've marked MNVAWDFPDYAYK 92, index is 2775, similarity 0.692, Mass difference 21.972
1: I've marked MNVAWDFPDYAYK 92, index is 2775, similarity 0.692, Mass difference 0.93
4: I've marked WYESDVKDYADTK 99, index is 2776, similarity 0.692, Mass difference 15.965
4: I've marked ATDLHDYSYYFPK 99, index is 2777, similarity 0.692, Mass difference 0.051
2: I've marked LSLDQHDYAHMFK 98, index is 2796, similarity 0.692, Mass difference 0.926
2: I've marked SLLDQHDYAHMFK 

2: I've marked AMNYFDPKDYADK 85, index is 2309, similarity 0.769, Mass difference 32.048
1: I've marked MFNFYDPKDYSAK 97, index is 2861, similarity 0.692, Mass difference 21.968
1: I've marked MFNFYDPKDYSAK 97, index is 2861, similarity 0.692, Mass difference 15.968
2: I've marked PDWFADFPDYSHK 90, index is 2846, similarity 0.692, Mass difference 1.04
2: I've marked PFDRWPDYASRSK 95, index is 2852, similarity 0.692, Mass difference 0.96
3: I've marked LFYFMDRADYASK 99, index is 2879, similarity 0.692, Mass difference 1.017
1: I've marked PYMRDPDQYAELK 92, index is 2864, similarity 0.692, Mass difference 16.023
1: I've marked PYMRDPDQYAELK 92, index is 2864, similarity 0.692, Mass difference 22.049
2: I've marked FWRVGDFEDYAAK 97, index is 2502, similarity 0.692, Mass difference 22.025
2: I've marked FNVMQDHEDYANK 94, index is 2869, similarity 0.692, Mass difference 0.913
1: I've marked STEDDHWPDYAYK 94, index is 2868, similarity 0.692, Mass difference 16.032
1: I've marked FNVMQDHEDYAN

1: I've marked NKHLDDKVDYAWK 98, index is 2960, similarity 0.769, Mass difference 15.959
2: I've marked LRDWADKPDYAWK 96, index is 3392, similarity 0.692, Mass difference 32.012
3: I've marked LPYLFDMHDYASK 99, index is 2428, similarity 0.692, Mass difference 32.102
2: I've marked LDFYKDVRDYADK 98, index is 3170, similarity 0.692, Mass difference 15.949
1: I've marked HAYMFDQVDYAEK 98, index is 2965, similarity 0.692, Mass difference 22.011
2: I've marked EDLQDYAFMDEQK 98, index is 2949, similarity 0.692, Mass difference 1.05
3: I've marked VFQFDDLRDYADK 99, index is 2956, similarity 0.692, Mass difference 0.972
2: I've marked FMHLEQDLEDYAK 97, index is 3265, similarity 0.692, Mass difference 21.998
3: I've marked LDKEDYAHFHHDK 99, index is 3267, similarity 0.692, Mass difference 22.02
2: I've marked HHSQWDVQDYSAK 91, index is 2446, similarity 0.692, Mass difference 32.031
3: I've marked NYTLQDHQDYADK 99, index is 2619, similarity 0.692, Mass difference 22.034
3: I've marked MLAKFDVHDY

4: I've marked KFDNEDWHDYAAK 99, index is 3046, similarity 0.692, Mass difference 22.002
4: I've marked WAMPNDLEDYAWK 99, index is 3047, similarity 0.692, Mass difference 1.054
4: I've marked WAMPNDLEDYAWK 99, index is 3047, similarity 0.769, Mass difference 0.988
3: I've marked VFFDFPDYAATWK 99, index is 2559, similarity 0.692, Mass difference 31.992
3: I've marked VWRWDLTDYAANK 99, index is 3036, similarity 0.692, Mass difference 0.968
4: I've marked RWVTDLEDYANEK 99, index is 3050, similarity 0.692, Mass difference 0.995
1: I've marked LVDVMPYAHYTWK 89, index is 3052, similarity 0.846, Mass difference 1.004
4: I've marked HDDLHDYAFLHKK 99, index is 3053, similarity 0.692, Mass difference 32.09
2: I've marked GHRYDLHDYAFTK 98, index is 2832, similarity 0.692, Mass difference 16.028
3: I've marked MNTFFDKQDYAFK 99, index is 3489, similarity 0.692, Mass difference 31.937
1: I've marked FFMDQEDYADTNK 93, index is 3056, similarity 0.769, Mass difference 1.099
3: I've marked FVDDWPDYAMFQK

1: I've marked NNYDVRDAYDQMK 91, index is 3158, similarity 0.923, Mass difference 15.997
1: I've marked MDHDYAFNSKFEK 91, index is 3159, similarity 0.923, Mass difference 16.001
1: I've marked LEYDKEDYAQMDK 88, index is 3160, similarity 0.692, Mass difference 0.121
2: I've marked TVAWHDWQDYAEK 92, index is 3180, similarity 0.692, Mass difference 1.007
1: I've marked WFFTDMPDYANLK 88, index is 3163, similarity 0.692, Mass difference 15.959
3: I've marked EQTLLWDLHDYAK 99, index is 2959, similarity 0.692, Mass difference 15.941
2: I've marked DYKMRDLEDYATK 87, index is 3388, similarity 0.769, Mass difference 15.993
2: I've marked NKHLDDKVDYAWK 98, index is 2960, similarity 0.769, Mass difference 15.959
3: I've marked MPDQLDKHDYAWK 99, index is 3154, similarity 0.692, Mass difference 1.009
2: I've marked LRDWADKPDYAWK 96, index is 3392, similarity 0.692, Mass difference 16.053
1: I've marked PYDMFDYADRVKK 95, index is 3169, similarity 0.692, Mass difference 32.039
1: I've marked LDFYKDVRD

3: I've marked SYLDDMQDYAYKK 99, index is 3279, similarity 0.692, Mass difference 0.018
2: I've marked FTSFYEDYEDYAK 94, index is 3558, similarity 0.769, Mass difference 21.99
2: I've marked RSYKLDQQDYAYK 95, index is 3565, similarity 0.692, Mass difference 22.113
1: I've marked YPMFDVPMYADYK 87, index is 3276, similarity 0.846, Mass difference 1.003
2: I've marked FFMDQEDYADTNK 93, index is 3056, similarity 0.692, Mass difference 16.064
4: I've marked SYLDDMQDYAYKK 99, index is 3279, similarity 0.692, Mass difference 0.018
2: I've marked RSYKLDQQDYAYK 95, index is 3565, similarity 0.769, Mass difference 22.095
1: I've marked SDRHDYAFAYWPK 97, index is 3280, similarity 0.692, Mass difference 15.972
1: I've marked SDRHDYAFAYWPK 97, index is 3280, similarity 0.692, Mass difference 1.035
1: I've marked LHDVWDHADYAWK 93, index is 3281, similarity 0.692, Mass difference 22.03
1: I've marked LHDVWDHADYAWK 93, index is 3281, similarity 0.769, Mass difference 0.0
1: I've marked LHDVWDHADYAWK 9

4: I've marked DLSADRHDYAWWK 99, index is 3381, similarity 0.692, Mass difference 16.005
5: I've marked VEDLHDYAAMWWK 99, index is 3387, similarity 0.692, Mass difference 0.992
2: I've marked NDKLNDMRDYAWK 97, index is 3641, similarity 0.692, Mass difference 22.006
1: I've marked FLVKDRRDYADHK 98, index is 3382, similarity 0.692, Mass difference 31.936
2: I've marked WDMHDYADDMGFK 94, index is 3372, similarity 0.769, Mass difference 1.061
4: I've marked WVDWMDYADDGYK 99, index is 3384, similarity 0.692, Mass difference 0.958
2: I've marked QVEDLDMEDYAWK 94, index is 3086, similarity 0.692, Mass difference 22.05
5: I've marked DLSADRHDYAWWK 99, index is 3381, similarity 0.692, Mass difference 0.992
4: I've marked VEDLHDYAAMWWK 99, index is 3387, similarity 0.692, Mass difference 22.038
1: I've marked DYKMRDLEDYATK 87, index is 3388, similarity 0.769, Mass difference 32.059
1: I've marked DYKMRDLEDYATK 87, index is 3388, similarity 0.769, Mass difference 15.993
1: I've marked LRDWADKPDYA

2: I've marked LDRHDYAHFQQLK 97, index is 3496, similarity 0.692, Mass difference 0.077
2: I've marked YFFMDLPDYANFK 95, index is 3660, similarity 0.692, Mass difference 15.938
1: I've marked LDRHDYAHFQQLK 97, index is 3496, similarity 0.692, Mass difference 0.077
1: I've marked HFTDMQDYSDREK 94, index is 3498, similarity 0.692, Mass difference 16.047
2: I've marked WTDLQDYSFDVRK 90, index is 3510, similarity 0.692, Mass difference 1.084
Step 2: 3500 of 4201
2: I've marked FKRWDKPDYAADK 95, index is 3069, similarity 0.692, Mass difference 32.028
1: I've marked FYMHDAFQDYAHK 90, index is 3506, similarity 0.692, Mass difference 1.003
3: I've marked DFDKEDYARVWTK 98, index is 3511, similarity 0.692, Mass difference 0.039
1: I've marked WTDLQDYSFDVRK 90, index is 3510, similarity 0.692, Mass difference 1.084
4: I've marked DFDKEDYARVWTK 98, index is 3511, similarity 0.692, Mass difference 0.039
1: I've marked WKLQYDQQDYASK 94, index is 3512, similarity 0.692, Mass difference 31.979
2: I've

2: I've marked RNFDRRDRDYAAK 88, index is 3621, similarity 0.692, Mass difference 0.001
1: I've marked RNFDRRDRDYAAK 88, index is 3621, similarity 0.692, Mass difference 0.001
2: I've marked YVYDMMDYAHNTK 91, index is 3613, similarity 0.692, Mass difference 0.999
2: I've marked TRLWDVHDYATFK 97, index is 3232, similarity 0.692, Mass difference 31.961
2: I've marked FDWHDYAEHVQSK 90, index is 3357, similarity 0.692, Mass difference 22.056
4: I've marked FLYDVHDYAFEHK 99, index is 3627, similarity 0.769, Mass difference 1.053
1: I've marked ERDLHDYAFYLNK 98, index is 3629, similarity 0.692, Mass difference 16.132
1: I've marked ERDLHDYAFYLNK 98, index is 3629, similarity 0.846, Mass difference 0.008
4: I've marked VWDLHDYAFYLNK 99, index is 3630, similarity 0.692, Mass difference 16.14
2: I've marked ERDLHDYAFYLNK 98, index is 3629, similarity 0.846, Mass difference 0.008
1: I've marked TLYSWDMEDYAMK 86, index is 3634, similarity 0.923, Mass difference 31.988
1: I've marked TLYSWDMEDYAMK

4: I've marked DWDSWDKQDYAHK 99, index is 3711, similarity 0.692, Mass difference 1.031
1: I've marked WRMGFDVHDYSHK 95, index is 3712, similarity 0.692, Mass difference 0.035
2: I've marked MYKDLYDYADKYK 96, index is 3884, similarity 0.692, Mass difference 22.03
4: I've marked YLFDLQDYAFDRK 99, index is 3718, similarity 0.692, Mass difference 32.019
3: I've marked WNDQKDYAWNVKK 96, index is 3736, similarity 0.692, Mass difference 0.95
1: I've marked WDDDEMDYAEMDK 91, index is 3720, similarity 0.692, Mass difference 1.103
2: I've marked WDFKDSYEDYAEK 89, index is 3739, similarity 0.692, Mass difference 1.14
1: I've marked FSEDQHDYSEFYK 92, index is 3721, similarity 0.692, Mass difference 16.079
3: I've marked FQYFDMHDYASQK 99, index is 3738, similarity 0.692, Mass difference 0.996
4: I've marked FNFYDMHDYAEVK 99, index is 3722, similarity 0.692, Mass difference 1.016
1: I've marked PEMYVYDREDYAK 90, index is 3723, similarity 0.923, Mass difference 15.995
1: I've marked VSQEYDYQDYAWK 93

2: I've marked YYDVADYAFWEHK 86, index is 3823, similarity 0.692, Mass difference 0.066
1: I've marked MNDYWDYASDHYK 89, index is 3828, similarity 0.692, Mass difference 0.066
2: I've marked MNDYWDYASDHYK 89, index is 3828, similarity 0.692, Mass difference 0.066
4: I've marked YWRYLDDEDYAAK 99, index is 3833, similarity 0.692, Mass difference 22.045
3: I've marked YVWYRDKNDYAAK 99, index is 3701, similarity 0.692, Mass difference 15.943
1: I've marked RLMHFDWPDYAEK 94, index is 3834, similarity 0.692, Mass difference 32.03
3: I've marked WRMDLHDYSHQDK 99, index is 3969, similarity 0.692, Mass difference 21.964
3: I've marked YLDMHDYSFHRLK 99, index is 4005, similarity 0.692, Mass difference 32.017
1: I've marked RTWRKDLEDYAQK 90, index is 3843, similarity 0.692, Mass difference 16.004
1: I've marked WVDRHDYANYDEK 97, index is 3852, similarity 0.692, Mass difference 31.854
1: I've marked WVDRHDYANYDEK 97, index is 3852, similarity 0.692, Mass difference 15.976
2: I've marked FSEDQHDYSE

4: I've marked YLDMHDYSFHRLK 99, index is 4005, similarity 0.692, Mass difference 32.017
2: I've marked NRDDMHDYSYFRK 86, index is 4083, similarity 0.692, Mass difference 21.941
2: I've marked FRSMYDMPDYSWK 88, index is 4071, similarity 0.846, Mass difference 15.994
2: I've marked MYWHDRHDYARAK 91, index is 4090, similarity 0.692, Mass difference 21.979
3: I've marked RWDLQDYAWYQGK 98, index is 3956, similarity 0.692, Mass difference 16.063
2: I've marked FRKWDTRDYAYPK 92, index is 4024, similarity 0.846, Mass difference 1.001
4: I've marked RWQYDLEDYAQMK 93, index is 4022, similarity 0.692, Mass difference 21.972
1: I've marked FRKWDTRDYAYPK 92, index is 4024, similarity 0.846, Mass difference 1.001
1: I've marked WNMFQDLHDYAYK 98, index is 4026, similarity 0.692, Mass difference 15.933
4: I've marked HLTWQWDREDYAK 99, index is 4036, similarity 0.692, Mass difference 31.983
2: I've marked WNLDKHDYSEKWK 94, index is 4043, similarity 0.692, Mass difference 0.057
2: I've marked WNDMHDYSH

In [10]:
# Remove peptides marked in Step 2 above, making a new dataframe
unique_to_drop = list(np.unique(to_drop))
new_index = list(df.index.values)
for bad_pep in unique_to_drop:
    new_index.remove(bad_pep)
df_intermed = df.iloc[new_index,:]
df_intermed.reset_index(drop=True,inplace=True)
print(f'Step 2 completed, removed {len(peptides)-len(df_intermed.index)} isomers')

Step 2 completed, removed 1568 isomers


In [11]:
# Step 3: Remove sequence isomers by high degree of similarity 
    # Similarity > 0.92, which is seen only for simple swap of 2 amino acids
# THIS IS AN EXTREMELY CONSERVATIVE STEP, sure to remove any POSSIBLE sequence isomers. On clean data, it removes <5% of the data
# RT cannot be used because different gradients were used to collect this data, but is reported

# Extract lists from dataframe for enumeration
peptides = df_intermed['Peptide'].tolist()
OPM = df_intermed['Obs Parent Mass'].tolist()
ALC = df_intermed['ALC (%)'].tolist()
ppm = df_intermed['ppm'].tolist()
RT = df_intermed['ppm'].tolist()

# Make a list of all the peptides to remove, Step 3:
similarity_cutoff = 0.92      # more than this

to_drop = []
for i in range(len(peptides)):
    if len(peptides) > 500 and i % 100 == 0: print(f"Step 3: {i} of {len(peptides)}")
    for k in range(len(peptides)):
        sim = difflib.SequenceMatcher(a=peptides[k], b=peptides[i])
        simz = sim.ratio()
        difference = np.abs(float(OPM[i]) - float(OPM[k]))
        RT_difference = np.abs(float(RT[i]) - float(RT[k]))
        if i != k and simz > similarity_cutoff:
            #print(f"Trouble {peptides[k]} {ALC[k]} {round(float(OPM[k]),3)} and {peptides[i]} {ALC[i]} {round(float(OPM[i]),3)}, similarity {round(float(simz),3)} difference {round(float(difference),3)}")
            if ALC[k] > ALC[i]:
                print(f"1: I've marked {peptides[i]} {ALC[i]}, index is {i}, similarity {round(float(simz),3)}, Mass difference {round(float(difference),3)}, RT difference {round(float(RT_difference),2)}")
                to_drop.append(i)
            elif ALC[k] < ALC[i]:
                print(f"2: I've marked {peptides[k]} {ALC[k]}, index is {k}, similarity {round(float(simz),3)}, Mass difference {round(float(difference),3)}, RT difference {round(float(RT_difference),2)}")
                to_drop.append(k)
            elif ALC[k] == ALC[i]:    
                if abs(float(ppm[k])) < abs(float(ppm[i])):
                    print(f"3: I've marked {peptides[k]} {ALC[k]}, index is {k}, similarity {round(float(simz),3)}, Mass difference {round(float(difference),3)}, RT difference {round(float(RT_difference),2)}")
                    to_drop.append(k)
                elif abs(float(ppm[i])) < abs(float(ppm[k])):
                    print(f"4: I've marked {peptides[i]} {ALC[i]}, index is {i}, similarity {round(float(simz),3)}, Mass difference {round(float(difference),3)}, RT difference {round(float(RT_difference),2)}")
                    to_drop.append(i)
                else:
                    print(f"5: I've marked {peptides[k]} {ALC[k]}, index is {k}, similarity {round(float(simz),3)}, Mass difference {round(float(difference),3)}, RT difference {round(float(RT_difference),2)}")
                    to_drop.append(k)

Step 3: 0 of 2633
4: I've marked GSASGLSFKGSHK 99, index is 2, similarity 0.923, Mass difference 26.048, RT difference 3.2
3: I've marked GSASGLSFKGSHK 99, index is 2, similarity 0.923, Mass difference 26.048, RT difference 3.2
2: I've marked ELASGLSFKGSHK 96, index is 20, similarity 0.923, Mass difference 72.021, RT difference 0.3
1: I've marked ELASGLSFKGSHK 96, index is 20, similarity 0.923, Mass difference 72.021, RT difference 0.3
2: I've marked VNPVMDVGDYAEK 98, index is 175, similarity 0.923, Mass difference 28.035, RT difference 2.6
Step 3: 100 of 2633
1: I've marked VNPVMDVGDYAEK 98, index is 175, similarity 0.923, Mass difference 28.035, RT difference 2.6
4: I've marked EVDLSDYADKVGK 99, index is 185, similarity 0.923, Mass difference 34.019, RT difference 0.9
Step 3: 200 of 2633
Step 3: 300 of 2633
3: I've marked EVDLSDYADKVGK 99, index is 185, similarity 0.923, Mass difference 34.019, RT difference 0.9
Step 3: 400 of 2633
4: I've marked LMGFQDVQDYAAK 99, index is 424, simil

In [12]:
# Remove peptides marked in Step 3 above, making a new dataframe
unique_to_drop = list(np.unique(to_drop))
new_index = list(df_intermed.index.values)
for bad_pep in unique_to_drop:
    new_index.remove(bad_pep)
df_final = df_intermed.iloc[new_index,:]
df_final.reset_index(drop=True,inplace=True)
print(f'Step 3 completed, removed {len(peptides)-len(new_index)} potential isomers')

Step 3 completed, removed 12 potential isomers


In [13]:
# Save dataframe without index
df_final.to_csv(f'{file[:-4]}_clean.csv', index=False)
print(f"Output saved, of {len(ALC_list)} peptides originally present, {len(new_index)} were retained")

Output saved, of 4267 peptides originally present, 2621 were retained
