In [1]:
import sys, os, re, copy
import dill as pickle # this serializes all the functions inside the quantification dict
import numpy as np
import scipy as sp
from scipy.optimize import newton, minimize, fsolve
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.backends.backend_pdf
import matplotlib.colors as mcolors
palette = list(mcolors.TABLEAU_COLORS.keys())
%matplotlib inline
import seaborn as sns
# Plot styling. sns.set_theme() sets style, palette, context and font scale in
# one call, so any sns.set_style()/sns.set() call placed before it is
# overwritten and has no effect on the figures; only the effective calls are kept.
sns.set_theme(style="ticks", palette="muted")
# Larger fonts and line widths for figures.
sns.set_context("talk")

## qwertyu
Lorem ipsum
* Media, conditions, cell lines, setup etc
* 15N-amide Gln was used because it was wrongly assumed that it would label Gly; this is the reason for the cumbersome natural isotope abundance correction. However, the correction is small (on the order of 1-4%).
* Use of Asn labelling
* bla


In [2]:
def impute_conc(piece_wise_fit_metab, response_ratio):
    '''
    This function imputes the concentration from a response ratio.

    piece_wise_fit_metab = dict mapping a (max, min) response ratio
        interval to the callable that converts a response ratio
        inside that interval to a concentration.
    response_ratio = scalar response ratio to convert.

    Returns a tuple (conc, estimator). The estimator is
    'extrapolation under' when the matching interval has 0 as a bound,
    'extrapolation over' when it has infinity as a bound and
    'intrapolation' otherwise.

    Raises IndexError when no interval contains the response ratio
    (this is always the case for NaN).
    '''
    # Intervals are inclusive on both ends; when a value sits on a shared
    # boundary the first interval in dict order wins.
    # The dict is searched directly so the keys are used exactly as stored.
    for interval, fit_func in piece_wise_fit_metab.items():
        max_v, min_v = interval
        if min_v <= response_ratio <= max_v:
            break
    else:
        raise IndexError(
            'response ratio {} is not covered by any interval of the piece-wise fit'.format(response_ratio)
        )
    conc = fit_func(response_ratio)
    # The spelling 'intrapolation' is kept unchanged because the label is
    # returned to, and may be compared by, the callers.
    estimator = 'intrapolation'
    if 0 in interval:
        estimator = 'extrapolation under'
    elif np.inf in interval:
        estimator = 'extrapolation over'
    return(conc, estimator)

In [3]:
def NB_ratio_nitrogen(Npos, label_abundance_15N, natural_abundance_14N=0.99636):
    '''
    Calculates a multiplier to correct concentration
    of nitrogen labelled compounds.
    Multiply the return value with the calculated concentration
    to perform the correction.

    Npos = positions that can be labelled
    label_abundance_15N = probability that a labelled position carries 15N
    natural_abundance_14N = probability that a position carries 14N
                            under natural abundance

    Returns frac_natural/frac_label, i.e.
    natural_abundance_14N**Npos / label_abundance_15N**Npos.
    '''
    # Import the submodule explicitly: `import scipy as sp` on its own does
    # not guarantee that `sp.stats` has been loaded on every SciPy version.
    from scipy import stats

    # Under natural abundance
    # what is the expected fraction of the compound
    # with _no_ 15N on all the positions that could be labelled:
    frac_natural = stats.binom.pmf(Npos, Npos, natural_abundance_14N)
    # With the label
    # what is the expected fraction of the compound
    # with 15N on all the positions that could be labelled:
    frac_label = stats.binom.pmf(Npos, Npos, label_abundance_15N)
    return(frac_natural/frac_label)

#### Correction for aspartate
Aspartate is a special case for quantification because U-13C labelled asparagine was fed to the cells to discriminate between protein levels of asparagine and aspartate, since protein-bound U-13C asparagine turns into U-13C aspartate upon acid hydrolysis; however, U-13C aspartate is also part of the internal standards for quantification. Thus the U-13C aspartate coming from protein U-13C asparagine has to be decoupled from the U-13C aspartate coming from the internal standard. This is achieved by finding the ratio of unlabelled to labelled aspartate, by running samples without internal standards, and then correcting the response ratio accordingly.

Let $A_1$ be the peak area of unlabelled aspartate, $A_2$ be that of U-13C aspartate derived from hydrolyzed U-13C asparagine and $N$ be that derived from the internal standard. We then have:
$$
Y = \frac{A_1}{A_2+N} \Rightarrow N = \frac{A_1}{Y} - A_2
$$

From the ratio we have:
$$
X = \frac{A_1}{A_2} \Rightarrow A_2 = \frac{A_1}{X}
$$

By substitution we get:
$$
N = \frac{A_1}{Y}-\frac{A_1}{X}
$$


And the corrected response ratio ($RR$) for finding the concentration of protein aspartate and asparagine can then be found:
$$
RR = \frac{A_1+A_2}{N} = \frac{A_1+\frac{A_1}{X}}{\frac{A_1}{Y}-\frac{A_1}{X}} = \frac{X+1}{\frac{X}{Y}-1}
$$

The response ratio is converted to a concentration of total aspartate through the calibration curve $f(RR)$. Then the fraction of aspartate that derives from hydrolyzed asparagine is calculated using the ratio of unlabelled to labelled aspartate, $X$. Incomplete labelling with asparagine is taken into account by using the fraction of asparagine that is labelled before acid hydrolysis ($F$). It is not necessary to correct for natural isotope abundance because the 13C enrichment in the U-13C asparagine is approximately equal to the natural 12C abundance. Thus, the final concentration will be:
$$
[Asn] = f(RR) \frac{1}{F(1+X)}
$$

$$
[Asp] = f(RR) (1 - \frac{1}{F(1+X)})
$$




In [4]:
def asp_corr(X, Y, F, f):
    '''
    Splits the aspartate signal into an aspartate and an asparagine
    concentration, following the correction derived in the markdown above.

    X = ratio of unlabelled to labelled aspartate
    Y = measured (uncorrected) response ratio
    F = labelling fraction of asparagine
    f = piece-wise fit handed to impute_conc to turn the corrected
        response ratio into a total concentration

    Returns (asp_conc, asn_conc, estimator). A negative corrected
    response ratio short-circuits to (0, 0, 'extrapolation under')
    without consulting the fit.

    Note: Y is used as a divisor, so a plain Python float Y of 0
    raises ZeroDivisionError.

    Worked example (total = 12, of which 5 aspartate and 7 asparagine):
    f_conc = lambda x: x*10
    F = 0.98 # labelling fraction
    A1 = 500+700*(1-F)
    A2 = 700*F
    N = 1000
    X = A1/A2
    Y = A1/(A2+N)
    RR = (X+1)/(X/Y-1)           # -> 1.2
    Asp_total = f_conc(RR)       # -> 12
    Asn_conc = Asp_total*   1/(F*(X+1))    # -> 7
    Asp_conc = Asp_total*(1-1/(F*(X+1)))   # -> 5
    '''
    corrected_rr = (X+1)/(X/Y-1)

    # Guard clause: a negative corrected ratio cannot be mapped to a concentration.
    if corrected_rr < 0:
        return 0, 0, 'extrapolation under'

    total_conc, estimator = impute_conc(f, corrected_rr)
    # F*(X+1) is the denominator shared by both final-concentration formulas.
    denominator = F*(X+1)
    from_asparagine = total_conc/denominator
    from_aspartate = total_conc*(1-1/denominator)
    return from_aspartate, from_asparagine, estimator

In [5]:
# Dict to correct for nitrogen natural isotope abundance:
NB_correct_dict = {
    'Guanine 15N3 pos': 3,
    'Adenine 15N2 pos': 2,
    'Uracil 15N1 neg': 1,
    'Cytosine 15N2 pos': 2,
    'Thymine 15N1 neg': 1,
    'Hypoxanthine 15N2 pos': 2,
    'Xanthine 15N2 neg': 2,
    'Guanosine 15N3 pos': 3,
    'Adenosine 15N2 pos': 2,
    'Uridine 15N1 neg': 1,
    'Cytidine 15N2 pos': 2,
    'Deoxyuridine 15N1 neg': 1,
}
# List of amino acids with quantification data:
AA_quant_list = ['Asparagine', 'Aspartate neg', 'Glutamate neg', 'Arginine pos', 'Tyrosine pos', 'Serine neg', 'Proline pos', 'Threonine neg', 'Lysine pos', 'Leucine pos', 'Valine pos', 'Phenylalanine pos', 'Isoleucine pos', 'Alanine pos']

14

In [6]:
### Read quantification function ###
# Each pickle is loaded into a dict that is later indexed by metabolite name and
# handed to impute_conc()/asp_corr() as the piece-wise fit.
# `pickle` is dill here (see the imports).
# Unpickling executes code from the file, so only load files you created or trust.
dict_pickle_fnam = 'AA-nucleoside_quant-dict.pickle'
with open(dict_pickle_fnam, 'rb') as handle:
    piece_wise_fit_AA_nucs = pickle.load(handle)
# NOTE(review): the nucleobase file name below is commented out and the
# nucleoside file is loaded a second time, so piece_wise_fit_nucb holds the same
# content as piece_wise_fit_AA_nucs and the merge adds nothing new.
# TODO: switch to the nucleobase file once it is available.
# dict_pickle_fnam = 'AA-nucleobase_quant-dict.pickle'
dict_pickle_fnam = 'AA-nucleoside_quant-dict.pickle' # change to above
with open(dict_pickle_fnam, 'rb') as handle:
    piece_wise_fit_nucb = pickle.load(handle)
# Merge the two quantitation dictionaries
# (on duplicate keys the entry from piece_wise_fit_nucb wins):
piece_wise_fit_quant = {**piece_wise_fit_AA_nucs, **piece_wise_fit_nucb}

### Read measurements ###
### Replace all N/F with 0 before start ###
# (Manual prerequisite on the spreadsheet; "N/F" presumably marks peaks that
# were not found - confirm.)
# sheet_name=None reads every sheet into a dict keyed by sheet name.
esheet_dict_mes = pd.read_excel('143B-H1299_Nuc-Prot_quant.xlsx', sheet_name=None)
# Per-sample annotations, joined onto every metabolite sheet below.
annotation_df = pd.read_excel('annotations.xlsx')
metab_dict_mes = dict()   # sheet name -> annotated measurement dataframe
metab_names_mes = list()  # sheet names in processing order
for k in esheet_dict_mes.keys():
    # Sheets with 'U-13C' in their name are skipped.
    if 'U-13C' not in k:
        metab_names_mes.append(k)
        # Work on a copy so the sheets as read from Excel stay untouched.
        metab_dict_mes[k] = copy.deepcopy(esheet_dict_mes[k])
        # Response ratio = peak area relative to the internal standard response.
        metab_dict_mes[k]['Response Ratio'] = metab_dict_mes[k]['Area'] / metab_dict_mes[k]['ISTD Response']
        # NaN and +inf (e.g. from a zero ISTD response) are mapped to a response ratio of 0.
        metab_dict_mes[k]['Response Ratio'] = metab_dict_mes[k]['Response Ratio'].fillna(0).replace(np.inf, 0)
        # The sample name is the last '_'-separated token of the file name.
        metab_dict_mes[k]['Sample_name'] = [fn.split('_')[-1] for fn in metab_dict_mes[k]['Filename']]
        # Add annotations:
        # NOTE(review): merge() defaults to an inner join, so samples without a
        # matching annotation row are dropped silently - confirm this is intended.
        metab_dict_mes[k] = metab_dict_mes[k].merge(annotation_df, left_on='Sample_name', right_on='Sample_name')
        # Drop bookkeeping columns. Only aspartate sheets keep 'Asp_X' and
        # 'Asn_frac', which the aspartate correction reads later.
        if 'Aspartate' in k:
            metab_dict_mes[k] = metab_dict_mes[k].drop(['Flag Details', 'Theoretical Amt', 'Filename', 'Type', 'RT', 'Sample ID'], axis=1)
        else:
            metab_dict_mes[k] = metab_dict_mes[k].drop(['Flag Details', 'Theoretical Amt', 'Filename', 'Type', 'RT', 'Sample ID', 'Asp_X', 'Asn_frac'], axis=1)

### Impute concentration and add to metabolite dataframe ###
# For every measured metabolite that has a piece-wise fit, add the columns
# 'imputed_sample_conc' and 'imputed_sample_estimator' to its dataframe.
# Metabolites without a fit are left without these columns.
rr_mes = dict() # for plotting: name without isotope label -> response ratios
imp_conc_mes = dict() # for plotting: name without isotope label -> imputed concentrations
# Iterate over a copy because 'Asparagine' is appended to metab_names_mes inside the loop.
for metab in metab_names_mes[:]:
    # Remove 15N from name:
    # 4 tokens -> keep first, second and last; 3 tokens -> keep first and last;
    # 2 tokens -> name carries no isotope token and is used as is.
    metab_split = metab.split()
    if len(metab_split) == 4:
        metab_no_iso = ' '.join([metab_split[0], metab_split[1], metab_split[-1]])
    elif len(metab_split) == 3:
        metab_no_iso = ' '.join([metab_split[0], metab_split[-1]])
    elif len(metab_split) == 2:
        metab_no_iso = metab
    else:
        raise Exception('{} not recognized metabolite name format'.format(metab))
    # Assign imputed concentration:
    if metab_no_iso in piece_wise_fit_quant:
        if 'Aspartate' not in metab_no_iso: # special case for aspartate
            conc_list = list()
            estimator_list = list()
            # One (15N label abundance, response ratio) pair per sample row.
            for label_abundance_15N, rr in zip(metab_dict_mes[metab]['Gln_frac'].values, metab_dict_mes[metab]['Response Ratio'].values):
                conc, estimator = impute_conc(piece_wise_fit_quant[metab_no_iso], rr)
                # The isotope correction is looked up by the full name (including
                # the 15N token); compounds not listed are left uncorrected.
                if metab in NB_correct_dict:
                    Npos = NB_correct_dict[metab]
                    NB_mult = NB_ratio_nitrogen(Npos, label_abundance_15N)
                else:
                    NB_mult = 1
                conc_list.append(NB_mult*conc)
                estimator_list.append(estimator)
            metab_dict_mes[metab]['imputed_sample_conc'] = conc_list
            metab_dict_mes[metab]['imputed_sample_estimator'] = estimator_list
        else:
            asp_conc_list = list()
            asn_conc_list = list()
            estimator_list = list()
            for X, Y, F in zip(metab_dict_mes[metab]['Asp_X'].values, metab_dict_mes[metab]['Response Ratio'].values, metab_dict_mes[metab]['Asn_frac'].values):
                asp_conc, asn_conc, estimator = asp_corr(X, Y, F, piece_wise_fit_quant[metab_no_iso])  # X = Asp_X, Y = response ratio, F = Asn_frac
                asp_conc_list.append(asp_conc)
                asn_conc_list.append(asn_conc)
                estimator_list.append(estimator)
            # Asparagine has no sheet of its own: its dataframe starts as a deep copy
            # of the aspartate dataframe (taken before the imputed columns are added)
            # and then receives the asparagine concentrations.
            metab_dict_mes['Asparagine'] = copy.deepcopy(metab_dict_mes[metab])
            metab_names_mes.append('Asparagine')
            metab_dict_mes[metab]['imputed_sample_conc'] = asp_conc_list
            metab_dict_mes[metab]['imputed_sample_estimator'] = estimator_list
            metab_dict_mes['Asparagine']['imputed_sample_conc'] = asn_conc_list
            metab_dict_mes['Asparagine']['imputed_sample_estimator'] = estimator_list

        # Extract data for plotting:
        # Values from sheets that share the same name without isotope label are pooled.
        # For aspartate the stored response ratio is the uncorrected 'Response Ratio' column.
        if metab_no_iso in rr_mes:
            rr_mes[metab_no_iso].extend(list(metab_dict_mes[metab]['Response Ratio'].values))
            imp_conc_mes[metab_no_iso].extend(list(metab_dict_mes[metab]['imputed_sample_conc'].values))
        else:
            rr_mes[metab_no_iso] = list(metab_dict_mes[metab]['Response Ratio'].values)
            imp_conc_mes[metab_no_iso] = list(metab_dict_mes[metab]['imputed_sample_conc'].values)

In [7]:
# Preview the first metabolite's dataframe without the raw measurement columns.
metab_dict_mes[metab_names_mes[0]].drop(
    columns=['Compound', 'Formula', 'Response Ratio', 'Area', 'ISTD Response']
)

Unnamed: 0,Sample_name,cell_line,plate,replicate,hydrolysis,dilution,cell_numb,cell_vol,frac_transferred,solvent_vol,Gln_frac
0,Q01,143B,1,1,none,1.00,472133.333333,2.517313,0.25,1000,0.985233
1,Q02,143B,1,1,short,1.25,472133.333333,2.517313,0.25,1000,0.985233
2,Q03,143B,1,1,short,1.25,472133.333333,2.517313,0.25,1000,0.985233
3,Q04,143B,1,1,long,0.50,472133.333333,2.517313,0.25,1000,0.985233
4,Q05,143B,1,1,long,0.50,472133.333333,2.517313,0.25,1000,0.985233
...,...,...,...,...,...,...,...,...,...,...,...
85,Q86,H1299,3,3,none,1.00,766466.666667,5.031903,0.25,1000,0.995494
86,Q87,H1299,3,3,short,1.25,766466.666667,5.031903,0.25,1000,0.995494
87,Q88,H1299,3,3,short,1.25,766466.666667,5.031903,0.25,1000,0.995494
88,Q89,H1299,3,3,long,0.50,766466.666667,5.031903,0.25,1000,0.995494


In [8]:
# Start from the first metabolite's dataframe with the raw measurement columns
# removed, then add one concentration column per quantified amino acid.
df = copy.deepcopy(metab_dict_mes[metab_names_mes[0]]).drop(['Compound', 'Formula', 'Response Ratio', 'Area', 'ISTD Response'], axis=1)
for nam in metab_names_mes:
    if nam in AA_quant_list:
        # Column assignment aligns on the index, so make sure every dataframe
        # lists the same samples row for row before combining them.
        assert(metab_dict_mes[nam]['Sample_name'].equals(df['Sample_name']))
        df[nam] = metab_dict_mes[nam]['imputed_sample_conc']

# Keep only hydrolysed samples and average within cell line, plate and hydrolysis type.
mask = df['hydrolysis'] != 'none'
df = df[mask]
# numeric_only=True skips text columns such as 'Sample_name'; without it
# pandas >= 2.0 raises a TypeError instead of silently dropping them.
df_mean = df.groupby(['cell_line', 'plate', 'hydrolysis']).mean(numeric_only=True).reset_index()


In [9]:
# Restrict to the long hydrolysis rows and work on an independent copy.
mask = df_mean['hydrolysis'] == 'long'
df_mean_cell_conc = copy.deepcopy(df_mean.loc[mask])
# Scale every amino acid column row-wise by
# solvent_vol / (dilution * frac_transferred * cell_vol).
df_mean_cell_conc.loc[:, AA_quant_list] = df_mean_cell_conc.loc[:, AA_quant_list].mul(
    df_mean_cell_conc['solvent_vol']
    / (
        df_mean_cell_conc['dilution']
        * df_mean_cell_conc['frac_transferred']
        * df_mean_cell_conc['cell_vol']
    ),
    axis=0,
)


In [10]:
# Sum over all quantified amino acids, one value per row.
df_mean_cell_conc[AA_quant_list].sum(axis='columns')

0     444014.174583
2     569917.385512
4     465605.127663
6     398696.477019
8     430114.037304
10    362532.636127
dtype: float64

In [473]:
# Write the scaled concentrations to conc.xlsx, leaving out the columns that
# were only needed for filtering and scaling.
df_mean_cell_conc.drop(
    columns=['hydrolysis', 'dilution', 'frac_transferred', 'solvent_vol']
).to_excel("conc.xlsx")

In [11]:
# Show the scaled concentration of every quantified amino acid.
df_mean_cell_conc[AA_quant_list]

Unnamed: 0,Asparagine,Aspartate neg,Glutamate neg,Arginine pos,Tyrosine pos,Serine neg,Proline pos,Threonine neg,Lysine pos,Leucine pos,Valine pos,Phenylalanine pos,Isoleucine pos,Alanine pos
0,56833.407173,54947.291646,87043.580771,21095.777004,8275.7988,28265.338569,20744.441518,24561.257438,31596.686827,34714.648664,14082.708801,14217.523298,14324.547942,33311.166133
2,78832.795315,76233.197078,105545.827605,26698.941262,11161.63946,37411.082744,26608.459235,29959.734122,39764.948754,43565.589963,17088.454911,17766.800685,16669.429157,42610.485222
4,52986.123013,42227.486667,86120.207878,25171.710752,10706.749875,32940.436836,22859.53468,26851.801841,36688.334276,39288.186892,18732.276911,16233.338296,17345.422571,37453.517175
6,49229.792655,48249.91343,72620.228135,20283.65496,8310.878911,29511.020123,18046.713439,23177.868037,29689.676602,33068.092651,12808.989302,13841.593383,11905.052438,27953.002952
8,53201.466512,51986.324921,75669.136672,22188.844878,9103.698295,31982.213348,19086.08116,25241.172814,32400.730168,36139.430915,14827.313579,15070.246075,13920.395315,29296.982653
10,38690.649714,30810.817261,66137.849758,20330.213435,8747.041111,27325.443199,16745.13753,22112.85468,29058.952004,32499.715917,15809.226395,13503.306388,14271.509255,26489.919479
