In [69]:
import os
from os import listdir
from os.path import isfile, join
import pandas as pd
from pyteomics import mztab
import re

project_dir = 'D:/Penn/proj/als_ptms/'
annsolo_dir = os.path.join(project_dir, "./results/annsolo")


In [71]:
##
## Read in the open modification search results
##

# function to concatenate multiple results into one dataframe
def concat_frx(file_list, score_tau):
    concat_df = pd.DataFrame([])
    for file in file_list:
        table = mztab.MzTab(os.path.join(annsolo_dir, file))
        df = table.spectrum_match_table
        df = df[['sequence', 'charge', 'exp_mass_to_charge', 'calc_mass_to_charge', 'search_engine_score[1]']]
        df.reset_index(inplace=True)
        df = df.loc[(df['search_engine_score[1]'] >= score_tau)]
        concat_df = concat_df.append(df)
    return concat_df

# retrieve a list of all result files in the annsolo directory
annsolo_results = ([f for f in listdir(annsolo_dir) if 
                   isfile(join(annsolo_dir, f)) and ".mztab" in f])

# concatenate results into one dataframe, filtering for "high quality" PTMs
score_cutoff = 0.75  
annsolo_df = concat_frx(annsolo_results, score_cutoff)
annsolo_df.head()

# function to calculate the delta mass given the query m/z, library m/z, and charge (z)
def calc_delta_mass(df_row):
    query_mz = df_row['exp_mass_to_charge']
    lib_mz = df_row['calc_mass_to_charge']
    z = df_row['charge']
    delta = (query_mz-lib_mz)*z
    return delta

# add a column to the ann solo df with the delta mass
annsolo_df['delta_mass'] = annsolo_df.apply(lambda row: calc_delta_mass(row), axis=1)

# add column to ann solo with stripped (unmodified) peptide sequence, but keep the modified sequence
annsolo_df['PeptideSeq'] = annsolo_df['sequence'].str.replace(r"\[.[0-9]*\]","")
results_df = annsolo_df
del annsolo_df


##
## Map delta masses to their modification 
##

# read in the ABRF delta mass chart
deltamass_interpreter = pd.read_csv(os.path.join(project_dir, '../../resources/ann_solo/abrf_deltamass.csv'), engine='python')
deltamass_interpreter['Avg. Mass Change'] = deltamass_interpreter['Avg. Mass Change'].str.replace(',','').astype(int)

# consolidate the delta mass so that interpetations of the same delta mass are concatenated into one
new_table = pd.DataFrame([])
for mass_shift in deltamass_interpreter['Avg. Mass Change'].drop_duplicates():
    temp_df = deltamass_interpreter.loc[deltamass_interpreter['Avg. Mass Change'] == mass_shift]
    
    if len(temp_df) == 1:
        temp_df['Modification Type'] = temp_df['Modification']
    else:
        mod_list = temp_df['Modification']
        temp_df['Modification Type'] = mod_list.str.cat(sep=',')
    
    temp_df = temp_df[['Avg. Mass Change', 'Modification Type']].drop_duplicates()
    new_table = new_table.append(temp_df)
    
deltamass_interpreter = new_table
del new_table

# merge the two together based on a rounded integer value of the delta_mass column
results_df['Avg. Mass Change'] = results_df.apply(lambda row: int(row['delta_mass']), axis=1)

# filter average mass changes <14 (not likely interesting and/or not interpretable)
results_df = results_df.loc[(abs(results_df['Avg. Mass Change']) > 14)]
results_df = pd.merge(results_df, deltamass_interpreter, on='Avg. Mass Change', how='left')

# Repeat for +1 mass and -1 mass
results_df['Avg. Mass Change'] = results_df.apply(lambda row: (row['Avg. Mass Change']+1), axis=1)
results_df = pd.merge(results_df, deltamass_interpreter, on='Avg. Mass Change', how='left')
results_df['Avg. Mass Change'] = results_df.apply(lambda row: (row['Avg. Mass Change']-2), axis=1)
results_df = pd.merge(results_df, deltamass_interpreter, on='Avg. Mass Change', how='left')
results_df['Avg. Mass Change'] = results_df.apply(lambda row: (row['Avg. Mass Change']+1), axis=1)

# clean up the multiple modification columns (maybe strip NA, then concatenate?)
results_df['Modification +/- 1'] = results_df['Modification Type_x'].astype(str)+"; "+results_df['Modification Type_y'].astype(str)+"; "+results_df['Modification Type'].astype(str)
results_df.drop(columns=['Modification Type_x', 'Modification Type_y', 'Modification Type'], inplace=True)
results_df['Modification +/- 1'] = results_df['Modification +/- 1'].str.replace('nan; ', '').astype(str)
results_df['Modification +/- 1'] = results_df['Modification +/- 1'].str.replace(' nan;', '').astype(str)
results_df['Modification +/- 1'] = results_df['Modification +/- 1'].str.replace('; nan', '').astype(str)


##
## Massage and annotate dataframe for downstream statistical analyses
##

# annotate sample groups
batchlist = pd.read_csv(os.path.join(project_dir, './data/batchlist.txt'), header=None, sep="\t", engine='python')
batchlist.columns = ['FileName', 'SampleGroup']
batchlist['FileName'] = batchlist['FileName'].map(lambda x: x.lstrip('C:\\Users\\linds\\Downloads\\ALS_CSF_Biomarker_Study-1582674932238\\Q-Exactive_Plus\\').rstrip('.mzml'))
results_df['FileName'] = results_df['PSM_ID'].map(lambda x: re.search("TN_CSF_062617_[0-9]{2}", x)[0])
results_df = pd.merge(results_df, batchlist, on='FileName', how='left')

# remove any unmapped modifications
results_df = results_df.loc[results_df['Modification +/- 1'] != 'nan']

results_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


                           PSM_ID  \
0    TN_CSF_062617_02.7916.7916.5   
1    TN_CSF_062617_02.7927.7927.5   
2    TN_CSF_062617_02.8983.8983.3   
3  TN_CSF_062617_02.10183.10183.4   
4  TN_CSF_062617_02.10428.10428.3   

                                            sequence  charge  \
0  KVLADVLQDIANDNISSADYTQDPSVTPC[160]C[160]NPYQIA...       5   
1  KVLADVLQDIANDNISSADYTQDPSVTPC[160]C[160]NPYQIA...       5   
2                                    TDTSHHDQDHPTFNK       3   
3                                ADDKETC[160]FAEEGKK       4   
4                       C[160]C[160]AAADPHEC[160]YAK       3   

   exp_mass_to_charge  calc_mass_to_charge  search_engine_score[1]  \
0          921.167053           863.204918                0.784513   
1          921.969055           863.204918                0.763284   
2          604.591675           593.927557                0.777612   
3          411.685851           407.687161                0.780780   
4          523.535678           518.204101

In [None]:
##
## Calculate spectral counts
##



In [74]:
# write out the results for dissemination
results_df.to_csv(os.path.join(project_dir, './results/openmodsearch_als_ptms.csv'), index=False)