In [2]:
import pyteomics
import pandas as pd
from pyteomics import mzml, auxiliary
import matplotlib.pyplot as plt
import spectrum_utils.plot as sup
import spectrum_utils.spectrum as sus



In [3]:
# Open the mzML run that the extracted-ion chromatogram (XIC) is built from;
# spectra are selected from it by m/z and retention time further down.
file1 = mzml.MzML(
    '/Volumes/LaCie/phospho_white_list/PXD000472/peakpicking1-1/b0775p96_Control_1.mzML'
)


In [6]:

def clean_values(df):
    """Reduce ``df`` to one row per scan, keeping the most intense peak.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``'scan'``, ``'intensity'`` and ``'time'``.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) holding, for every distinct
        ``'scan'`` value, the row with the highest ``'intensity'``, ordered
        by ``'time'``. Original index labels are preserved.
    """
    return (
        df
        # ascending intensity, so the strongest peak of each scan comes last
        .sort_values('intensity')
        # keep only that last (strongest) row per scan
        .drop_duplicates(subset=["scan"], keep="last")
        # chronological order is easier to read
        .sort_values('time')
    )

In [7]:
def get_values(target_mz, peak_time, data, tol=0.1, rt_window=5/60):
    """Collect MS1 peaks near ``target_mz`` in a retention-time window.

    Every spectrum returned by ``data.time[peak_time - rt_window :
    peak_time + rt_window]`` is inspected; for MS1 spectra, each peak whose
    m/z lies strictly between ``target_mz - tol`` and ``target_mz + tol``
    becomes one row. The rows are then passed through ``clean_values``.

    Parameters
    ----------
    target_mz : float
        Centre of the m/z window.
    peak_time : float
        Centre of the retention-time window, in the unit used by
        ``data.time`` (presumably minutes -- TODO confirm against the file).
    data : object
        Spectrum reader that supports time slicing through ``data.time[a:b]``
        and yields dict-like spectra with the keys ``'ms level'``, ``'id'``,
        ``'scanList'``, ``'m/z array'`` and ``'intensity array'``
        (e.g. the ``pyteomics`` mzML reader opened earlier in this notebook).
    tol : float, optional
        Half-width of the m/z window (bounds are exclusive). Defaults to 0.1.
    rt_window : float, optional
        Half-width of the retention-time window, same unit as ``peak_time``.
        Defaults to ``5/60``.

    Returns
    -------
    pandas.DataFrame
        Output of ``clean_values`` applied to a frame with the columns
        ``'intensity'``, ``'mz'``, ``'scan'`` and ``'time'``. The frame is
        empty (but keeps those columns) when nothing matches.
    """
    mz_min = target_mz - tol
    mz_max = target_mz + tol

    # Rows are gathered in a plain list and turned into a DataFrame once at
    # the end: growing a frame row by row is quadratic, and DataFrame.append
    # no longer exists in pandas >= 2.0.
    records = []

    for spot in data.time[peak_time - rt_window : peak_time + rt_window]:
        # only MS1 scans contribute to the chromatogram
        if spot['ms level'] != 1:
            continue

        # getting the time
        time = spot['scanList']['scan'][0].get('scan start time')

        # get scan number: everything after 'scan=' in the spectrum id.
        # When the marker is missing, fall back to the whole id instead of
        # slicing from a bogus offset (str.find returns -1 in that case).
        scan_string = spot['id']
        marker = scan_string.find('scan=')
        scan_num = scan_string[marker + 5:] if marker != -1 else scan_string

        # keep every peak that falls strictly inside the m/z window
        for mz_value, intensity in zip(spot['m/z array'], spot['intensity array']):
            if mz_min < mz_value < mz_max:
                records.append({
                    'scan': scan_num,
                    'time': time,
                    'intensity': intensity,
                    'mz': mz_value,
                })

    # An explicit list gives a deterministic column order (a set does not)
    # and keeps the columns present even when no peak matched.
    df = pd.DataFrame(records, columns=['intensity', 'mz', 'scan', 'time'])

    return clean_values(df)

In [8]:
# Target for the chromatogram: the m/z to follow and the retention time
# around which get_values looks for MS1 scans.
mz = 982.952369
rt = 62.3792

df = get_values(target_mz=mz, peak_time=rt, data=file1)

In [9]:
# Display the frame returned by get_values for the target m/z and retention time.
df

Unnamed: 0,intensity,mz,scan,time
2,6369043.5,983.037351,7454,62.29397
7,6373104.0,983.038047,7455,62.306077
12,5033637.5,983.036997,7456,62.318247
17,5357552.0,983.038223,7457,62.330283
22,4005049.5,983.036758,7458,62.342412
27,3761561.5,983.037507,7459,62.354585
32,3770353.75,983.03725,7460,62.366773
38,3257212.75,983.0368,7461,62.378753
41,3776821.0,983.037051,7462,62.390745
48,2862925.5,983.037089,7465,62.413633
