In [None]:
# This is an example of filtering a list of de novo sequenced peptides based on library design (using regex) as output from PEAKS Studio 8.5
# If you do not have sequencing confidence or other metrics, you can remove a line to just filter by library design

In [1]:
import pandas as pd
import re

In [2]:
file = 'all de novo candidates.csv'
df = pd.read_csv(file)

In [3]:
# Regular expression pattern to match the peptide design
# Pattern for X12K with X = {A,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,V,W,Y, and M(+15.99), ending in 'K(-.98)' the C-terminal Lys carboxamide}
pattern = r'^((?:[ADEFGHIJKLMNOPQRSTVWY]|M\(+15\.99\)){12})K\(-\.98\)$'

In [4]:
# Filter rows based on the peptide design
filtered_df = df[df['Peptide'].str.contains(pattern)]

  filtered_df = df[df['Peptide'].str.contains(pattern)]


In [5]:
# Roughly filter the peptides based on ALC >= 70, which is sequencing confidence, and ppm sequencing error -10 < ppm < 10
filtered_df = filtered_df.query('`ALC (%)` >= 70 and (`ppm` < -10 or `ppm` > 10)')

In [6]:
filtered_df # It is ok if there are sequence isomers or duplicates. The next step will remove those.

Unnamed: 0,Fraction,Source File,Feature,Peptide,Scan,Tag length,ALC (%),Length,m/z,z,RT,Mass,ppm,local confidence (%),tag (>=0%),mode
46801,27,284-188-12ca5-Alq6-Repl1-Scout-1.raw,-,ELEPTDKYYFFEK(-.98),4061 4062,13,72,13,569.9544,3,27.45,1706.8191,13.1,83 85 93 87 82 74 56 29 25 35 93 99 100,ELEPTDKYYFFEK(-.98),HCD/ETHCD
46822,27,284-188-12ca5-Alq6-Repl1-Scout-1.raw,-,ELEPTDKYYFFEK(-.98),4062,13,72,13,569.9544,3,27.46,1706.8191,13.1,83 85 93 87 82 74 56 29 25 35 93 99 100,ELEPTDKYYFFEK(-.98),ETHCD
56773,27,284-188-12ca5-Alq6-Repl1-Scout-1.raw,20644,KASYYMWQHQSSK(-.98),4624,13,71,13,548.2576,3,30.79,1641.7722,-13.0,74 84 82 71 35 21 19 65 90 91 97 99 99,KASYYMWQHQSSK(-.98),ETHCD
56774,27,284-188-12ca5-Alq6-Repl1-Scout-1.raw,20644,KASYYWMQHQSSK(-.98),4624,13,71,13,548.2576,3,30.79,1641.7722,-13.0,74 84 82 70 35 16 26 65 90 91 97 99 99,KASYYWMQHQSSK(-.98),ETHCD
56775,27,284-188-12ca5-Alq6-Repl1-Scout-1.raw,20644,AKSYYMWQHQSSK(-.98),4624,13,70,13,548.2576,3,30.79,1641.7722,-13.0,71 81 82 70 34 21 19 64 90 90 97 99 99,AKSYYMWQHQSSK(-.98),ETHCD
73957,27,284-188-12ca5-Alq6-Repl1-Scout-1.raw,-,LFYWWYLMTTYNK(-.98),5478 5479,13,73,13,609.9603,3,36.11,1826.8855,-14.4,96 49 36 35 79 82 83 69 86 86 74 85 99,LFYWWYLMTTYNK(-.98),HCD/ETHCD
73978,27,284-188-12ca5-Alq6-Repl1-Scout-1.raw,-,LFYWWYLMTTYNK(-.98),5479,13,73,13,609.9603,3,36.11,1826.8855,-14.4,96 49 36 35 76 82 83 69 86 86 74 85 99,LFYWWYLMTTYNK(-.98),ETHCD
73979,27,284-188-12ca5-Alq6-Repl1-Scout-1.raw,-,LWFYWYLMTTYNK(-.98),5479,13,72,13,609.9603,3,36.11,1826.8855,-14.4,92 29 40 52 72 82 83 68 86 85 73 85 99,LWFYWYLMTTYNK(-.98),ETHCD
73980,27,284-188-12ca5-Alq6-Repl1-Scout-1.raw,-,LFWYWYLMTTYNK(-.98),5479,13,72,13,609.9603,3,36.11,1826.8855,-14.4,90 45 23 47 74 83 84 69 86 86 74 85 99,LFWYWYLMTTYNK(-.98),ETHCD
73981,27,284-188-12ca5-Alq6-Repl1-Scout-1.raw,-,LWFYWYLMTTNYK(-.98),5479,13,71,13,609.9603,3,36.11,1826.8855,-14.4,92 28 40 52 72 82 83 67 83 81 62 87 99,LWFYWYLMTTNYK(-.98),ETHCD


In [7]:
filtered_df.to_csv(f'Filtered by Lib {file}')