# In silico Trypsin digestion program

This is an in silico trypsin digestion program. The input is a FASTA file containing the protein sequences to be digested; the output is a FASTA-formatted file containing all trypsin-digested peptides, each labelled with the accession of the protein it came from.
Adapted from Yafeng Zhu (https://github.com/yafeng/trypsin/tree/master)

In [12]:
# Print the Python interpreter version so the environment used for this run is recorded
import sys
print(sys.version)

3.10.12 (main, Sep 11 2024, 15:47:36) [GCC 11.4.0]


In [21]:
# Define function for trypsin digestion
def TRYPSIN(proseq, miss_cleavage):
    """Digest a protein sequence in silico with trypsin.

    Cleavage happens after every 'K' or 'R' that is not immediately followed
    by 'P'. The last residue of the sequence is never treated as a cleavage
    site of its own; the sequence end always closes the final peptide.

    Parameters
    ----------
    proseq : str
        Protein sequence (one-letter amino-acid codes, upper case).
    miss_cleavage : int
        Maximum number of missed cleavages allowed per peptide. Any
        non-negative integer is accepted.

    Returns
    -------
    list of str
        Peptides ordered by start position and, for the same start, by
        increasing number of missed cleavages. A sequence without any
        cleavage site (including the empty string) is returned unchanged as
        the only element.

    Raises
    ------
    ValueError
        If ``miss_cleavage`` is negative or not an integral number.
    """
    max_missed = int(miss_cleavage)
    if max_missed != miss_cleavage or max_missed < 0:
        raise ValueError(
            f"miss_cleavage must be a non-negative integer, got {miss_cleavage!r}"
        )

    # Peptide boundaries: sequence start, every cleavage position, sequence end.
    cut_sites = [0]
    for i in range(len(proseq) - 1):
        if proseq[i] in ('K', 'R') and proseq[i + 1] != 'P':
            cut_sites.append(i + 1)
    if cut_sites[-1] != len(proseq):
        cut_sites.append(len(proseq))

    # No trypsin site in the protein sequence (or empty input): return it as-is.
    if len(cut_sites) <= 2:
        return [proseq]

    peptides = []
    last = len(cut_sites) - 1
    for start in range(last):
        # A peptide may span up to max_missed + 1 consecutive fragments, but
        # never beyond the end of the sequence.
        furthest = min(start + max_missed + 1, last)
        for end in range(start + 1, furthest + 1):
            peptides.append(proseq[cut_sites[start]:cut_sites[end]])

    return peptides

In [22]:
# Run configuration: file locations and digestion settings used by the cells below
input_file = 'data/MCF7_altORFs_proteins.fasta'  # FASTA file with the protein sequences to digest; update with your actual file name
output_file = 'data/MCF7_altProt_Tryp_digest.fasta'  # Destination for the digested peptides (written in FASTA format); update with your desired output file name
miss_cleavage = 1  # Number of allowed missed cleavages passed to TRYPSIN (e.g., 1)

In [23]:
# Import package
from Bio import SeqIO

In [25]:
# Digest every protein of the input FASTA file and store the peptides in the output file
with open(output_file, 'w') as output:
    for record in SeqIO.parse(input_file, 'fasta'):
        proseq = str(record.seq)
        peptide_list = TRYPSIN(proseq, miss_cleavage)

        # One FASTA entry per peptide; the ID is the parent protein ID plus a
        # 1-based peptide counter
        output.writelines(
            f">{record.id}_peptide{idx + 1}\n{peptide}\n"
            for idx, peptide in enumerate(peptide_list)
        )

print(f"Peptide digestion completed. Results saved in {output_file}.")

Peptide digestion completed. Results saved in data/MCF7_altProt_Tryp_digest.fasta.


In [26]:
# Visual sanity check: preview the first few records of the input and output files
## Number of sequences (records) to show from each file
num_sequences_to_display = 5  # Adjust this as needed

## Preview the undigested proteins
with open(input_file, "r") as input_handle:
    records = SeqIO.parse(input_handle, "fasta")
    print(f"NON-Digesed Proteins")
    # Pairing with a bounded range stops after the requested number of records
    for i, record in zip(range(num_sequences_to_display), records):
        print(f"Record ID: {record.id}")
        print(f"Integral Protein Sequence: {record.seq}\n")

## Preview the digested peptides
with open(output_file, "r") as output_handle:
    records = SeqIO.parse(output_handle, "fasta")
    print(f"TRYPSIN-Digesed Proteins")
    for i, record in zip(range(num_sequences_to_display), records):
        print(f"Record ID: {record.id}")
        print(f"Digested Protein Sequence: {record.seq}\n")

NON-Digesed Proteins
Record ID: SRR6730014.301_ORF.1
Integral Protein Sequence: LRGPGLLLHGGR

Record ID: SRR6730014.302_ORF.1
Integral Protein Sequence: MWSLSDLLGLRPSFSEDESCSLRRALRDWA

Record ID: SRR6730014.302_ORF.2
Integral Protein Sequence: LICWGCVHPSQKMSPVPCAGH

Record ID: SRR6730014.303_ORF.1
Integral Protein Sequence: LLCWGCVYPSQKMSPVPCAGH

Record ID: SRR6730014.304_ORF.1
Integral Protein Sequence: LNCWGCVHPSQKMIPVPCAGH

TRYPSIN-Digesed Proteins
Record ID: SRR6730014.301_ORF.1_peptide1
Digested Protein Sequence: LR

Record ID: SRR6730014.301_ORF.1_peptide2
Digested Protein Sequence: LRGPGLLLHGGR

Record ID: SRR6730014.301_ORF.1_peptide3
Digested Protein Sequence: GPGLLLHGGR

Record ID: SRR6730014.302_ORF.1_peptide1
Digested Protein Sequence: MWSLSDLLGLRPSFSEDESCSLR

Record ID: SRR6730014.302_ORF.1_peptide2
Digested Protein Sequence: MWSLSDLLGLRPSFSEDESCSLRR

