In [1]:
import numpy as np 
import pandas as pd 
import altair as alt 
import matplotlib.pyplot as plt
import ntpath
import os 
import re 
from Commons.data_processing import *
from Commons.ms_handler import *
from Commons.my_mzml import *

In [2]:
xmls = get_files('.\mzXML', exts=['.mzXML'])

data_file = get_files('.\Working_Datafiles', exts=['extract.csv'])[0]
df = pd.read_csv(data_file)
df.head(2)

Unnamed: 0,accession,description,checked,confidence,annotated_sequence,modifications,master_protein_accessions,rt_min,mz_da,charge,...,sequence,data_source,temperature,run,concentration,dilution,glycan,glycan_type,degree_sial,pep_mods
0,Q3SZR3,Alpha-1-acid glycoprotein OS=Bos taurus OX=991...,True,High,[R].QNGTLSK.[V],1xHexNAc(3)Hex(4)NeuAc(1) [N2],Q3SZR3,27.2321,1148.97754,2,...,QNGTLSK,1_30C_Run3,30C,Run3,1.0,1x,HexNAc(3)Hex(4)NeuAc(1),Sialylated,Monosialylated,QNGTLSK_1xHexNAc(3)Hex(4)NeuAc(1) [N2]
1,Q3SZR3,Alpha-1-acid glycoprotein OS=Bos taurus OX=991...,True,High,[R].QNGTLSK.[V],1xHexNAc(4)Hex(5)NeuAc(1) [N2],Q3SZR3,28.0701,1331.04211,2,...,QNGTLSK,1_30C_Run1,30C,Run1,1.0,1x,HexNAc(4)Hex(5)NeuAc(1),Sialylated,Monosialylated,QNGTLSK_1xHexNAc(4)Hex(5)NeuAc(1) [N2]


In [3]:
files_of_interest = [f for f in xmls if re.search('_1_', f)]
parsers = dict()
for file in files_of_interest:
    base = ntpath.splitext(ntpath.basename(file))[0]
    ident = '_'.join(base.split('_')[2:])
    
    parsers[ident] = mzXML(file, separate_levels=True)

In [4]:
df[['sequence', 'glycan', 'temperature', 'run', 'num_psms']].sort_values(['glycan', 'temperature', 'run'])

Unnamed: 0,sequence,glycan,temperature,run,num_psms
2,QNGTLSK,HexNAc(3)Hex(4)NeuAc(1),30C,Run1,14
4,QNGTLSK,HexNAc(3)Hex(4)NeuAc(1),30C,Run2,12
0,QNGTLSK,HexNAc(3)Hex(4)NeuAc(1),30C,Run3,13
6,QNGTLSK,HexNAc(3)Hex(4)NeuAc(1),45C,Run1,10
9,QNGTLSK,HexNAc(3)Hex(4)NeuAc(1),45C,Run2,12
11,QNGTLSK,HexNAc(3)Hex(4)NeuAc(1),45C,Run3,9
17,QNGTLSK,HexNAc(3)Hex(4)NeuAc(1),60C,Run1,8
13,QNGTLSK,HexNAc(3)Hex(4)NeuAc(1),60C,Run2,7
15,QNGTLSK,HexNAc(3)Hex(4)NeuAc(1),60C,Run3,8
1,QNGTLSK,HexNAc(4)Hex(5)NeuAc(1),30C,Run1,16


In [5]:
res = pd.DataFrame()

for i, row in df.iterrows():
    sequence = row['sequence']
    mass = row['mz_da']
    charge = row['charge']
    file_alias = row['data_source']
    temperature = row['temperature']
    rt = row['rt_min']
    run = row['run']
    glycan = row['glycan']

    time, intensity = parsers[file_alias].ms1_extract(mass, tolerance=5)
    intensity = gaussian_filter(intensity, sigma=2)
    
    sub = pd.DataFrame({
        'Time': time,
        'Intensity': intensity
    })
    sub.loc[:, "Sequence"] = sequence
    sub.loc[:, "Glycan"] = glycan
    sub.loc[:, "Mass"] = mass
    sub.loc[:, "Charge"] = charge
    sub.loc[:, "RT"] = rt
    sub.loc[:, "Temperature"] = temperature
    sub.loc[:, "Run"] = run

    res = pd.concat([res, sub])
    
res.reset_index(inplace=True)
res = res.drop('index', axis=1)

In [22]:
res.Sequence.unique()

array(['QNGTLSK'], dtype=object)

In [7]:
res.loc[:, 'rt_left'] = res.RT-0.5
res.loc[:, 'rt_right'] = res.RT+0.5

In [9]:
ms1_info = res[(res.Time >= 23) & (res.Time <= 32)]

my_colors = alt.Color(
    "Temperature:N",
    scale=alt.Scale(
        domain=["30C", "45C", "60C"], range=["#6E6581", "#B0B2BB", "#6B8A97"]
    ),
)

base = (
    alt.Chart(ms1_info)
    .mark_line()
    .encode(
        x=alt.X("Time:Q", title="Time (min)"),
        y=alt.Y("Intensity:Q", title="Relative Abundance", axis=alt.Axis(format=".2e")),
        color=my_colors,
    )
)

lower_lims = (
    alt.Chart(ms1_info)
    .mark_rule(strokeDash=[5, 5])
    .encode(
        x=alt.X("rt_left:Q", title=""),
    )
)

upper_lims = (
    alt.Chart(ms1_info)
    .mark_rule(strokeDash=[5, 5])
    .encode(
        x=alt.X("rt_right:Q", title=""),
    )
)

area = (
    alt.Chart(ms1_info)
    .mark_area(opacity=0.7)
    .encode(
        x=alt.X("Time:Q", title="Time (min)"),
        y=alt.Y("Intensity:Q", title="Relative Abundance"),
        color=my_colors,
    )
    .properties(width=150, height=150)
)

layer = alt.layer(base, lower_lims, upper_lims, area).facet(
    column="Temperature:N", row="Run:N"
)

chart = alt.vconcat()
for glycan in ms1_info.Glycan.unique():
    l = layer.transform_filter(alt.datum.Glycan == glycan).properties(title=f"{glycan}")
    # l.save(f'{glycan}.png', scale_factor=20)
    chart &= l


In [11]:
chart.save(r'.\Figures\EICAllFound.svg')

In [11]:
data_file = get_files('.\Working_Datafiles', exts=['notfound.csv'])[0]
df = pd.read_csv(data_file)
df.head(2)

Unnamed: 0,accession,description,checked,confidence,annotated_sequence,modifications,master_protein_accessions,rt_min,mz_da,charge,...,sequence,data_source,temperature,run,concentration,dilution,glycan,glycan_type,degree_sial,pep_mods
0,Q3SZR3,Alpha-1-acid glycoprotein OS=Bos taurus OX=991...,True,High,[R].QNGTLSK.[V],1xHexNAc(5)Hex(6)NeuAc(1) [N2],Q3SZR3,24.5519,1513.6062,2,...,QNGTLSK,1_30C_Run3,30C,Run3,1.0,1x,HexNAc(5)Hex(6)NeuAc(1),Sialylated,Monosialylated,QNGTLSK_1xHexNAc(5)Hex(6)NeuAc(1) [N2]
1,Q3SZR3,Alpha-1-acid glycoprotein OS=Bos taurus OX=991...,True,High,[R].QNGTLSK.[V],1xHexNAc(5)Hex(6)NeuAc(1) [N2],Q3SZR3,24.9925,1513.61023,2,...,QNGTLSK,1_30C_Run1,30C,Run1,1.0,1x,HexNAc(5)Hex(6)NeuAc(1),Sialylated,Monosialylated,QNGTLSK_1xHexNAc(5)Hex(6)NeuAc(1) [N2]


In [12]:
df[['data_source', 'num_psms']]

Unnamed: 0,data_source,num_psms
0,1_30C_Run3,7
1,1_30C_Run1,8
2,1_45C_Run1,7
3,1_45C_Run2,8
4,1_45C_Run3,7


In [13]:
nf = pd.DataFrame()

avg_mass = df.mz_da.mean()

for key in parsers:
    time, intensity = parsers[key].ms1_extract(avg_mass, tolerance=5)
    intensity = gaussian_filter(intensity, sigma=2)

    sub = pd.DataFrame({
        'Time': time,
        'Intensity': intensity
    })
    
    _, temp, run = key.split('_')
    sub.loc[:, "Temperature"] = temp
    sub.loc[:, "Run"] = run

    nf = pd.concat([nf, sub])
    
nf.reset_index(inplace=True)
nf = nf.drop('index', axis=1)

In [14]:
nf

Unnamed: 0,Time,Intensity,Temperature,Run
0,0.001091,0.000000,30C,Run1
1,0.005070,0.000000,30C,Run1
2,0.033369,0.000000,30C,Run1
3,0.043958,0.000000,30C,Run1
4,0.069268,0.000000,30C,Run1
...,...,...,...,...
47984,81.968333,48.293748,60C,Run3
47985,81.977000,12.210572,60C,Run3
47986,81.982000,2.404404,60C,Run3
47987,81.990500,0.368727,60C,Run3


In [19]:
_nf = nf[(nf.Time >= 20) & (nf.Time<=30)]

my_colors = alt.Color(
    "Temperature:N",
    scale=alt.Scale(
        domain=["30C", "45C", "60C"], range=["#6E6581", "#B0B2BB", "#6B8A97"]
    ),
)

base = (
    alt.Chart(_nf)
    .mark_line()
    .encode(
        x=alt.X("Time:Q", title="Time (min)"),
        y=alt.Y("Intensity:Q", title="Relative Abundance", axis=alt.Axis(format=".2e")),
        color=my_colors,
    )
)

lower_lims = (
    alt.Chart(_nf)
    .mark_rule(strokeDash=[5, 5])
    .encode(
        x=alt.X("rt_left:Q", title=""),
    )
    .transform_calculate(rt_left="datum.RT - 0.5")
)

upper_lims = (
    alt.Chart(_nf)
    .mark_rule(strokeDash=[5, 5])
    .encode(
        x=alt.X("rt_right:Q", title=""),
    )
    .transform_calculate(rt_right="datum.RT + 0.5")
)

area = (
    alt.Chart(_nf)
    .mark_area(opacity=0.7)
    .encode(
        x=alt.X("Time:Q", title="Time (min)"),
        y=alt.Y("Intensity:Q", title="Relative Abundance"),
        color=my_colors,
    )
    .properties(width=150, height=150
    )
)

layer = alt.layer(base, lower_lims, upper_lims, area).facet(
    column="Temperature:N", row="Run:N"
).properties(title='QNGTLSK + HexNAc(5)Hex(6)NeuAc(1)')


In [21]:
layer.save(r'.\Figures\30Only.svg')

In [243]:
data_file = get_files('.\Working_Datafiles', exts=['only.csv'])[0]
df = pd.read_csv(data_file)
df.head()

Unnamed: 0,accession,description,checked,confidence,annotated_sequence,modifications,master_protein_accessions,rt_min,mz_da,charge,...,sequence,data_source,temperature,run,concentration,dilution,glycan,glycan_type,degree_sial,pep_mods
0,Q3SZR3,Alpha-1-acid glycoprotein OS=Bos taurus OX=991...,True,High,[R].NPEYNK.[S],1xHexNAc(5)Hex(6)NeuAc(1) [N5],Q3SZR3,35.1691,1522.08777,2,...,NPEYNK,1_60C_Run2,60C,Run2,1.0,1x,HexNAc(5)Hex(6)NeuAc(1),Sialylated,Monosialylated,NPEYNK_1xHexNAc(5)Hex(6)NeuAc(1) [N5]
1,Q3SZR3,Alpha-1-acid glycoprotein OS=Bos taurus OX=991...,True,High,[R].NPEYNK.[S],1xHexNAc(5)Hex(6)NeuAc(1) [N5],Q3SZR3,34.7748,1521.58704,2,...,NPEYNK,1_60C_Run3,60C,Run3,1.0,1x,HexNAc(5)Hex(6)NeuAc(1),Sialylated,Monosialylated,NPEYNK_1xHexNAc(5)Hex(6)NeuAc(1) [N5]
2,Q3SZR3,Alpha-1-acid glycoprotein OS=Bos taurus OX=991...,True,High,[R].NPEYNK.[S],1xHexNAc(5)Hex(6)NeuAc(1) [N5],Q3SZR3,35.0144,1522.09497,2,...,NPEYNK,1_60C_Run1,60C,Run1,1.0,1x,HexNAc(5)Hex(6)NeuAc(1),Sialylated,Monosialylated,NPEYNK_1xHexNAc(5)Hex(6)NeuAc(1) [N5]


In [252]:
df[['mz_da', 'charge', 'delta_mz_da', 'theo_mh_da', 'delta_m_ppm']]

Unnamed: 0,mz_da,charge,delta_mz_da,theo_mh_da,delta_m_ppm
0,1522.08777,2,0.00085,3043.16656,0.56
1,1521.58704,2,-0.49988,3043.16656,-328.53
2,1522.09497,2,0.00805,3043.16656,5.29


In [253]:
3043.16656 / 2

1521.58328

In [18]:
only_45 = pd.DataFrame()

mass = 1268.01453

for key in parsers:
    time, intensity = parsers[key].ms1_extract(mass, tolerance=10)
    intensity = gaussian_filter(intensity, sigma=2)

    sub = pd.DataFrame({
        'Time': time,
        'Intensity': intensity
    })
    
    _, temp, run = key.split('_')
    sub.loc[:, "Temperature"] = temp
    sub.loc[:, "Run"] = run

    only_45 = pd.concat([only_45, sub])
    
only_45.reset_index(inplace=True)
only_45 = only_45.drop('index', axis=1)

In [19]:
only_45 = only_45[(only_45.Time >= 20) & (only_45.Time<=35)]

my_colors = alt.Color(
    "Temperature:N",
    scale=alt.Scale(
        domain=["30C", "45C", "60C"], range=["#6E6581", "#B0B2BB", "#6B8A97"]
    ),
)

base = (
    alt.Chart(only_45)
    .mark_line()
    .encode(
        x=alt.X("Time:Q", title="Time (min)"),
        y=alt.Y("Intensity:Q", title="Relative Abundance", axis=alt.Axis(format=".2e")),
        color=my_colors,
    )
)

lower_lims = (
    alt.Chart(only_45)
    .mark_rule(strokeDash=[5, 5])
    .encode(
        x=alt.X("rt_left:Q", title=""),
    )
    .transform_calculate(rt_left="datum.RT - 0.5")
)

upper_lims = (
    alt.Chart(only_45)
    .mark_rule(strokeDash=[5, 5])
    .encode(
        x=alt.X("rt_right:Q", title=""),
    )
    .transform_calculate(rt_right="datum.RT + 0.5")
)

area = (
    alt.Chart(only_45)
    .mark_area(opacity=0.7)
    .encode(
        x=alt.X("Time:Q", title="Time (min)"),
        y=alt.Y("Intensity:Q", title="Relative Abundance"),
        color=my_colors,
    )
    .properties(width=150, height=150)
)

layer = alt.layer(base, lower_lims, upper_lims, area).facet(
    column="Temperature:N", row="Run:N"
).properties(title='NPEYNK+HexNAc(5)Hex(6)NeuGc(1)')
layer.save(r'.\Figures\45Only.svg')
layer

In [308]:
data_file = get_files('.\Working_Datafiles', exts=['60_only_2.csv'])[0]
df = pd.read_csv(data_file)
df.head()

only_60 = pd.DataFrame()

mass = df[df.delta_m_ppm == df.delta_m_ppm.min()].mz_da.item()

for key in parsers:
    time, intensity = parsers[key].ms1_extract(mass, tolerance=5)
    intensity = gaussian_filter(intensity, sigma=2)

    sub = pd.DataFrame({
        'Time': time,
        'Intensity': intensity
    })
    
    _, temp, run = key.split('_')
    sub.loc[:, "Temperature"] = temp
    sub.loc[:, "Run"] = run

    only_60 = pd.concat([only_60, sub])
    
only_60.reset_index(inplace=True)
only_60 = only_60.drop('index', axis=1)

In [309]:
only_60 = only_60[(only_60.Time >= 40) & (only_60.Time<=70)]

my_colors = alt.Color(
    "Temperature:N",
    scale=alt.Scale(
        domain=["30C", "45C", "60C"], range=["#6E6581", "#B0B2BB", "#6B8A97"]
    ),
)

base = (
    alt.Chart(only_60)
    .mark_line()
    .encode(
        x=alt.X("Time:Q", title="Time (min)"),
        y=alt.Y("Intensity:Q", title="Relative Abundance", axis=alt.Axis(format=".2e")),
        color=my_colors,
    )
)

lower_lims = (
    alt.Chart(only_60)
    .mark_rule(strokeDash=[5, 5])
    .encode(
        x=alt.X("rt_left:Q", title=""),
    )
    .transform_calculate(rt_left="datum.RT - 0.5")
)

upper_lims = (
    alt.Chart(only_60)
    .mark_rule(strokeDash=[5, 5])
    .encode(
        x=alt.X("rt_right:Q", title=""),
    )
    .transform_calculate(rt_right="datum.RT + 0.5")
)

area = (
    alt.Chart(only_60)
    .mark_area(opacity=0.7)
    .encode(
        x=alt.X("Time:Q", title="Time (min)"),
        y=alt.Y("Intensity:Q", title="Relative Abundance"),
        color=my_colors,
    )
    .properties(width=150, height=150)
)

layer = alt.layer(base, lower_lims, upper_lims, area).facet(
    column="Temperature:N", row="Run:N"
)
layer


In [310]:
df

Unnamed: 0,accession,description,checked,confidence,annotated_sequence,modifications,master_protein_accessions,rt_min,mz_da,charge,...,sequence,data_source,temperature,run,concentration,dilution,glycan,glycan_type,degree_sial,pep_mods
0,P12763,Alpha-2-HS-glycoprotein OS=Bos taurus OX=9913 ...,True,High,[K].LCPDCPLLAPLNDSR.[V],2xCarbamidomethyl [C2; C5]; 1xHexNAc(4)Hex(5)N...,P12763,59.2946,1321.20605,3,...,LCPDCPLLAPLNDSR,1_60C_Run2,60C,Run2,1.0,1x,HexNAc(4)Hex(5)NeuAc(1)NeuGc(1),Sialylated,Disialylated,LCPDCPLLAPLNDSR_HexNAc(4)Hex(5)NeuAc(1)NeuGc(1)
1,P12763,Alpha-2-HS-glycoprotein OS=Bos taurus OX=9913 ...,True,High,[K].LCPDCPLLAPLNDSR.[V],2xCarbamidomethyl [C2; C5]; 1xHexNAc(4)Hex(5)N...,P12763,58.8623,1321.20483,3,...,LCPDCPLLAPLNDSR,1_60C_Run3,60C,Run3,1.0,1x,HexNAc(4)Hex(5)NeuAc(1)NeuGc(1),Sialylated,Disialylated,LCPDCPLLAPLNDSR_HexNAc(4)Hex(5)NeuAc(1)NeuGc(1)


In [277]:
mass_error(1321.206, 1321.2079)

-1.4380779892439197

In [316]:
data_file = get_files('.\Working_Datafiles', exts=['30_only_2.csv'])[0]
df = pd.read_csv(data_file)
df.head()

only_30 = pd.DataFrame()

mass = df[df.delta_m_ppm == df.delta_m_ppm.min()].mz_da.item()

for key in parsers:
    time, intensity = parsers[key].ms1_extract(mass, tolerance=5)
    intensity = gaussian_filter(intensity, sigma=2)

    sub = pd.DataFrame({
        'Time': time,
        'Intensity': intensity
    })
    
    _, temp, run = key.split('_')
    sub.loc[:, "Temperature"] = temp
    sub.loc[:, "Run"] = run

    only_30 = pd.concat([only_30, sub])
    
only_30.reset_index(inplace=True)
only_30 = only_30.drop('index', axis=1)

In [321]:
only_30 = only_30[(only_30.Time >= 35) & (only_30.Time<=45)]

my_colors = alt.Color(
    "Temperature:N",
    scale=alt.Scale(
        domain=["30C", "45C", "60C"], range=["#6E6581", "#B0B2BB", "#6B8A97"]
    ),
)

base = (
    alt.Chart(only_30)
    .mark_line()
    .encode(
        x=alt.X("Time:Q", title="Time (min)"),
        y=alt.Y("Intensity:Q", title="Relative Abundance", axis=alt.Axis(format=".2e")),
        color=my_colors,
    )
)

lower_lims = (
    alt.Chart(only_30)
    .mark_rule(strokeDash=[5, 5])
    .encode(
        x=alt.X("rt_left:Q", title=""),
    )
    .transform_calculate(rt_left="datum.RT - 0.5")
)

upper_lims = (
    alt.Chart(only_30)
    .mark_rule(strokeDash=[5, 5])
    .encode(
        x=alt.X("rt_right:Q", title=""),
    )
    .transform_calculate(rt_right="datum.RT + 0.5")
)

area = (
    alt.Chart(only_30)
    .mark_area(opacity=0.7)
    .encode(
        x=alt.X("Time:Q", title="Time (min)"),
        y=alt.Y("Intensity:Q", title="Relative Abundance"),
        color=my_colors,
    )
    .properties(width=150, height=150)
)

layer = alt.layer(base, lower_lims, upper_lims, area).facet(
    column="Temperature:N", row="Run:N"
)
layer


In [320]:
mass

1142.43469