The output from the Skyline document (if using a novel document and not the preformed human/conserved document) does not include Peptide Notes with the corresponding histone mark (e.g. H3 K27me1). I want to use the hard-coded table of histone marks and their respective mass shifts provided in the complementary `generate_hptms.ipynb` script to reverse-engineer the Skyline Peptide Modified Sequence back into biological histone mark notation (e.g. `PEK[+42.04695]TIDER` would become `H3 K3me1`).


input: a CSV with Peptide Modified Sequences from `Skyline>File>Export>Custom Report...` formats

output: a new column with the Peptide Note using "histone mod" language (H3K27me1, H3K4ac, etc)

In [1]:
import sys
import os
from os import listdir
from os.path import isfile, join
import pandas as pd
import re
import numpy as np
from Bio import SeqIO
from pyteomics import fasta, parser, mass, achrom, electrochem, auxiliary
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import itertools

sys.stdout.write("Imported required packages successfully.\n")

# set the master table for modification mass shifts
MZSHIFT_DICT = {'[nPR]':'[+56]',
                '[AC]': '[+42]',
                '[PR]': '[+56]',
                '[ME1]': '[+70]',
                '[ME2]': '[+28]',
                '[ME3]': '[+42]',
                '[PH]': '[+80]',
                '[nME3PR]': '[+98.1]',
                '[nME2PR]': '[+84.1]',
                '[nME1PR]': '[+126.1]',
                '[nPR2]': '[+112.1]',
                '[nAC]': '[+98]',
                '[nACPR]': '[+84.1]'}

Imported required packages successfully.


In [10]:
## digest protein into peptides

# read in FASTA
fasta_file = os.path.join(os.getcwd(), 
                          "../../collab_greer/data/uniprot-dicty_histones_7entries_acc20201231.fasta")

protein_df = pd.DataFrame()  # Initialize a dataframe to store results
for protein in tqdm(SeqIO.parse(fasta_file, "fasta")):
    
    protein_sequence = str(protein.seq).upper()
    
    # cleave initial "start" methionine if present
    if protein_sequence[0] == "M":
        protein_sequence = protein_sequence[1:]

    # add this protein to the dataframe
    new_df = pd.DataFrame({'protein': protein.id,
                           'protein_sequence': protein_sequence}, index=[0])
    
    protein_df = protein_df.append(new_df)
protein_df = protein_df.drop_duplicates()
print(protein_df.head())
##
## read in Skyline Export Report with Peptide Modified Sequences
##

# read in Skyline Export Report with Peptide Modified Sequences
skyline_df = pd.read_csv(os.path.join(os.getcwd(), 
                                      "../../collab_greer/data/greer_onlyhistlibrary_groupcomparison_mound-v-vegetative.csv"))
print(skyline_df.head())


7it [00:00, 1276.31it/s]

                 protein                                   protein_sequence
0     sp|P54671|H1_DICDI  GPKAPTTPTKKAAATKSKPKPNHPTYQVMISTAIAHYKDRTGSSQP...
0   sp|Q54WG6|H2AX_DICDI  SETKPASSKPAAAAKPKKVIPRVSRTGEPKSKPESRSARAGITFPV...
0   sp|Q54LA5|H2AZ_DICDI  TESETTSKKVNKRVKPVPKSTKAGLIFPVGRIHRMLKNKVPLKRVS...
0  sp|Q54LP8|H2BV3_DICDI  VFVKGQKKATKGSTQSGEEKTASTTPKVTKTPTEGGEKKRKKRKSD...
0   sp|O15819|H33A_DICDI  ARTKQTARKSTGAKVPRKHIGSKQAHKQTPVSSSSGGVKKVHRFRP...
                                     Protein                 Peptide  \
0                       sp|Q54WG6|H2AX_DICDI  SETKPASSKPAAAAKPKKVIPR   
1                       sp|Q54WG6|H2AX_DICDI  SETKPASSKPAAAAKPKKVIPR   
2  sp|O15819|H33A_DICDI,sp|Q55BN9|H33B_DICDI                  TKQTAR   
3                       sp|Q54WG6|H2AX_DICDI  SETKPASSKPAAAAKPKKVIPR   
4                       sp|Q54WG6|H2AX_DICDI  SETKPASSKPAAAAKPKKVIPR   

                           Peptide Modified Sequence  MS Level  \
0  S[+42]ETK[+56]PASS[+80]K[+28]PAAAAK[+56]PK




In [42]:
# remove propionylations ([+56], [+112.1])
skyline_df['new_pep_seq'] = skyline_df['Peptide Modified Sequence']
skyline_df['new_pep_seq'] = skyline_df['new_pep_seq'].str.replace(r'\[\+56\]', '')
skyline_df['new_pep_seq'] = skyline_df['new_pep_seq'].str.replace(r'\[\+112.1\]', '')


decode_df = pd.DataFrame()  # Initialize a dataframe to store results
for index, row in skyline_df.iterrows():
    peptide = row['Peptide']
    mod_seq = row['new_pep_seq']
    fc = row['Fold Change Result']
    pval = row['Adjusted P-Value']
    
    protein_match = list(protein_df[protein_df['protein_sequence'].str.contains(peptide)]['protein'])
    
    for protein in protein_match:
        sequence = protein_df[protein_df['protein'] == protein]['protein_sequence'][0]
        aa_index = sequence.find(peptide)  # get the amino acid index position
        
        residue_list = re.sub( r"([A-Z])", r" \1", mod_seq).split()

        # decode the histone modification
        histone_mod = ''
        for i in range(len(residue_list)): 
            if '[' in residue_list[i]:
                aa_pos = aa_index + i + 1  # have to +1 for indexing
                new_mod = residue_list[i][0] + str(aa_pos) + residue_list[i][1:]
                histone_mod = histone_mod + new_mod
        
        # add this protein/peptide to the dataframe
        new_df = pd.DataFrame({'Protein Name': protein,
                               'Peptide Sequence': peptide,
                               'Peptide Modified Sequence': row['Peptide Modified Sequence'],
                               'histone mod': histone_mod,
                               'Fold Change Result': fc,
                               'Adjusted P-Value': pval}, index=[0])
    
        decode_df = decode_df.append(new_df)
        
decode_df = decode_df.drop_duplicates()


print(decode_df)

            Protein Name        Peptide Sequence  \
0   sp|Q54WG6|H2AX_DICDI  SETKPASSKPAAAAKPKKVIPR   
0   sp|Q54WG6|H2AX_DICDI  SETKPASSKPAAAAKPKKVIPR   
0   sp|O15819|H33A_DICDI                  TKQTAR   
0   sp|Q55BN9|H33B_DICDI                  TKQTAR   
0   sp|Q54WG6|H2AX_DICDI  SETKPASSKPAAAAKPKKVIPR   
..                   ...                     ...   
0   sp|O15819|H33A_DICDI             EIAQEFKTDLR   
0   sp|Q55BN9|H33B_DICDI             EIAQEFKTDLR   
0   sp|O15819|H33A_DICDI             EIAQEFKTDLR   
0   sp|Q55BN9|H33B_DICDI             EIAQEFKTDLR   
0   sp|Q54LA5|H2AZ_DICDI      VKPVPKSTKAGLIFPVGR   

                            Peptide Modified Sequence  \
0   S[+42]ETK[+56]PASS[+80]K[+28]PAAAAK[+56]PK[+70...   
0   S[+42]ETK[+56]PASS[+80]KPAAAAK[+70]PK[+56]K[+5...   
0                                    T[+56]K[+70]QTAR   
0                                    T[+56]K[+70]QTAR   
0   S[+42]ETK[+56]PASS[+80]K[+28]PAAAAK[+56]PK[+56...   
..                               

In [43]:
output_df = decode_df.groupby('histone mod', 'Fold Change Result', 'Adjusted P-Value')['Protein Name'].apply(lambda x: ','.join(x)).reset_index()
print(output_df)

ValueError: No axis named Fold Change Result for object type <class 'pandas.core.frame.DataFrame'>