In [3]:
### Notes ###


# Finish resources/readme
# Make main readme
    # Purpose
    # Project structure (i.e. jupyter notebook does this, resources does that etc.)
        # Structure for datasets
    # Input files and requirements
    # Output format e.g. isotope and adduct lists

# Input data requirement:
# Name	Molecular Weight	RT [min]



# Run current version on bile duct cys tracing
    # Go through Excel file and make sure everything is alright
# Run the tracing experiments and do a quick check
# Update/rerun Compound Discoverer with new settings
# Quick check on bile duct cys tracing that all is right
# Pretty guide through this notebook





In [4]:
# Enable IPython's autoreload extension in mode 2, so that edits to imported
# modules are reloaded before each cell executes.
# Re-running this cell in a live kernel prints a notice that the extension is
# already loaded.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [5]:
# Imports and display configuration for the notebook.
import pandas as pd
# Show up to 50 columns when a DataFrame is displayed.
pd.set_option('display.max_columns', 50)
import numpy as np
import copy
import random
import json
import pprint
# Compact pretty-printer (41 characters wide) used to print dictionaries below.
pp = pprint.PrettyPrinter(width=41, compact=True)

In [6]:
# Project helpers used throughout the notebook.
from resources.helper import PeakData, Isotopes, make_blacklist_dict
# Tables of known cysteine- and glutamine-labelled compounds:
known_cys_fnam = 'resources/known_cys_labeled.tab'
known_gln_fnam = 'resources/known_gln_labeled.tab'
# Blacklist table, converted to a dictionary by the helper; it is passed to
# flag_blacklist() after pair finding.
blacklist_fnam = 'resources/blacklist.tab'
blacklist_dict = make_blacklist_dict(blacklist_fnam)

# Read isotope information into an object:
# (isotope_obj is used below for mass-shift and formula-to-mass calculations)
IUPAC_atomic_masses = 'resources/IUPAC-atomic-masses.csv'
IUPAC_atomic_abundances = 'resources/IUPAC-atomic-abundances.html'
isotope_obj = Isotopes(IUPAC_atomic_masses, IUPAC_atomic_abundances)

# Adducts:
# (path to the adduct table passed to flag_adducts() further down)
adducts_fnam = 'resources/adducts.tab'

In [7]:
# Settings for peak filtering, peak-pair finding and peak-pair filtering.
params = dict(
    # --- Isotope formula of each label used in the experiments ---
    labels=dict(
        cys='[13]C3 [15]N',
        gly='[13]C2',
        gln='[13]C5',
    ),

    # --- Peak filtering ---
    # Minimum peak area that has to be reached in at least one sample.
    min_area=10000,
    # Minimum molecular weight of a compound.
    min_MW=70,

    # Peak merging: peaks are merged when they are within a ppm mass distance
    # AND within a retention time difference OR a retention time difference x2
    # AND above a minimum peak area correlation coefficient.
    merge_ppm_tol=200,    # max. mass distance (ppm) between peaks to merge
    merge_RT_tol=0.4,     # max. retention time difference between peaks to merge
    merge_corr_tol=0.7,   # min. peak area correlation coefficient

    # --- Peak pair finding ---
    pair_ppm_tol=4,       # max. deviation (ppm) from the expected mass shift between pairs
    pair_RT_tol=0.25,     # max. retention time difference between pairs
    pair_min_area=1,      # min. number of samples that pass the area ratio criterion

    # Accepted range, or several ranges, of the labelled to unlabelled peak
    # area ratio. These are tuned per experiment (see the examples below).
    area_ratio_cutoff=dict(
        # GSSG theoretical parent/(parent+heavy) is 0.25 with 50/50 labelled.
        cys=((0.2, 0.28),
             (0.34, 0.42)),
        gly=((0.08, 0.24),),
        gln=((0.18, 0.6),),
    ),

    # --- Peak pair filtering ---
    # Adduct flagging: a peak is flagged as an adduct when it is within a ppm
    # mass distance and a retention time difference of its parent ion and has
    # a smaller total peak area than the parent.
    adduct_ppm_tol=200,   # max. mass distance (ppm) to call an adduct
    adduct_RT_tol=0.8,    # max. retention time difference to call an adduct

    # Isotope flagging: same criteria as for adducts, with an additional
    # minimum peak area correlation coefficient.
    isotope_ppm_tol=200,   # max. mass distance (ppm) to call an isotope
    isotope_RT_tol=0.4,    # max. retention time difference to call an isotope
    isotope_corr_tol=0.7,  # min. peak area correlation coefficient to call an isotope
)

# Derive the mass shift of every label from its isotope formula and store the
# result under params['MW_shift'], keyed by label name.
params['MW_shift'] = {
    label_name: isotope_obj.isotopes2mass_shift(label_formula)
    for label_name, label_formula in params['labels'].items()
}
print('Calculated mass shift for entered labels:')
pp.pprint(params['MW_shift'])

Calculated mass shift for entered labels:
{'cys': 4.007099400040005,
 'gln': 5.016774176699997,
 'gly': 2.0067096706799994}


In [8]:
# Data as Excel file (one file per polarity):
datafile_pos = 'projects/cys_tracing_bile-duct/pos/bile-duct_cys-tracing_pos.xlsx'
datafile_neg = 'projects/cys_tracing_bile-duct/neg/bile-duct_cys-tracing_neg.xlsx'
#datafile_neg = 'projects/SLC33A1/neg/SLC33A1_KO_BSO_control_neg.xlsx'

# JSON file with "sample_name": "content_string":
datafile_col_desc = 'projects/cys_tracing_bile-duct/bile-duct_cys-tracing.json'
# Parse the JSON with the json module instead of eval(): eval() executes
# arbitrary code found in the file, and the bare open() never closed the handle.
with open(datafile_col_desc, 'r') as col_desc_file:
    col_names = json.load(col_desc_file)

# This just specifies which samples to run which mass shift analysis on
# (here: every sample is analysed for the 'cys' label):
sample_label = {'cys': list(col_names.values())}
# Create the peak data object for this experiment:
peak_obj = PeakData('bile_duct_cys_tracing', col_names, sample_label, params)
# Read peaks for both polarities (filtering statistics are printed per polarity):
peak_obj.read_peaks(datafile_pos, 'pos')
peak_obj.read_peaks(datafile_neg, 'neg')

Running peak filtering for polarity: pos
Filtered 1314 peaks out based on.
Minimum peak area: 39
Minimum molecular weight: 13
Merged closely related peaks: 1262
2659 peaks left.

Running peak filtering for polarity: neg
Filtered 1523 peaks out based on.
Minimum peak area: 278
Minimum molecular weight: 0
Merged closely related peaks: 1245
2844 peaks left.



In [9]:
# Disabled: blacklisted peaks are not removed at this stage; the blacklist is
# instead applied as a flag after pair finding (see the flag_blacklist call below).
#peak_obj.remove_blacklist_peaks(blacklist_dict)

In [10]:
# Annotate peaks using the table of known cysteine-labelled compounds.
# isotope_obj.formula2mass is passed as a callable — presumably used to turn the
# formulas in the table into masses; confirm in resources/helper.
peak_obj.annotate_known(known_cys_fnam, isotope_obj.formula2mass)

In [11]:
# Positive mode: summary statistics of the 'm' and 'm+4' fractions for the known
# cysteine-labelled compounds. The table is used to choose the area-ratio cutoff below.
peak_obj.pick_ratio(known_cys_fnam, 'pos', isotope_obj.formula2mass, 'cys', ['m', 'm+4'])

Unnamed: 0,GSH (m),GSH (m+4),GSSG (m),GSSG (m+4),R5P Cys (m),R5P Cys (m+4),G3P Cys (m),G3P Cys (m+4),Cysteine (m),Cysteine (m+4),DHAP Cys (m),DHAP Cys (m+4),Glutamylcysteine (m),Glutamylcysteine (m+4),Succinylcysteine (m),Succinylcysteine (m+4),Succinylglutathione (m),Succinylglutathione (m+4),Cysteinylglycine (m),Cysteinylglycine (m+4)
count,39.0,39.0,39.0,39.0,39.0,39.0,39.0,39.0,39.0,39.0,39.0,39.0,39.0,39.0,39.0,39.0,39.0,39.0,39.0,39.0
mean,0.704667,0.295333,0.535899,0.464101,0.617985,0.382015,0.745506,0.254494,0.672429,0.327571,0.745506,0.254494,0.815044,0.184956,0.803489,0.196511,0.777847,0.222153,0.769016,0.230984
std,0.029361,0.029361,0.043179,0.043179,0.090858,0.090858,0.10755,0.10755,0.060368,0.060368,0.10755,0.10755,0.101935,0.101935,0.075721,0.075721,0.066436,0.066436,0.032486,0.032486
min,0.654754,0.242591,0.473532,0.3528,0.491636,0.238903,0.435944,0.064782,0.48446,0.193007,0.435944,0.064782,0.541866,0.059811,0.514196,0.06565,0.649806,0.099514,0.703366,0.171975
25%,0.677279,0.272741,0.510386,0.448885,0.515631,0.307871,0.710011,0.196353,0.639572,0.3007,0.710011,0.196353,0.729061,0.088115,0.767872,0.153664,0.728339,0.175957,0.745079,0.206049
50%,0.711886,0.288114,0.524275,0.475725,0.638528,0.361472,0.748679,0.251321,0.670007,0.329993,0.748679,0.251321,0.838569,0.161431,0.799534,0.200466,0.784592,0.215408,0.773484,0.226516
75%,0.727259,0.322721,0.551115,0.489614,0.692129,0.484369,0.803647,0.289989,0.6993,0.360428,0.803647,0.289989,0.911885,0.270939,0.846336,0.232128,0.824043,0.271661,0.793951,0.254921
max,0.757409,0.345246,0.6472,0.526468,0.761097,0.508364,0.935218,0.564056,0.806993,0.51554,0.935218,0.564056,0.940189,0.458134,0.93435,0.485804,0.900486,0.350194,0.828025,0.296634


In [12]:
# Negative mode: same summary of the 'm' and 'm+4' fractions for the known
# cysteine-labelled compounds.
peak_obj.pick_ratio(known_cys_fnam, 'neg', isotope_obj.formula2mass, 'cys', ['m', 'm+4'])

Unnamed: 0,GSH (m),GSH (m+4),GSSG (m),GSSG (m+4),S-Lactoylglutathione (m),S-Lactoylglutathione (m+4),G3P Cys (m),G3P Cys (m+4),DHAP Cys (m),DHAP Cys (m+4),Succinylcysteine (m),Succinylcysteine (m+4),Succinylglutathione (m),Succinylglutathione (m+4),Cysteinylglycine (m),Cysteinylglycine (m+4)
count,39.0,39.0,39.0,39.0,39.0,39.0,39.0,39.0,39.0,39.0,39.0,39.0,39.0,39.0,39.0,39.0
mean,0.632897,0.367103,0.663485,0.336515,0.40117,0.59883,0.547971,0.452029,0.547971,0.452029,0.797764,0.202236,0.793956,0.206044,0.753635,0.246365
std,0.069345,0.069345,0.045107,0.045107,0.279736,0.279736,0.132664,0.132664,0.132664,0.132664,0.070585,0.070585,0.072303,0.072303,0.073681,0.073681
min,0.508745,0.256307,0.519174,0.237073,0.038095,0.137008,0.225017,0.310233,0.225017,0.310233,0.674792,0.063651,0.642558,0.082988,0.660328,0.040321
25%,0.573743,0.31717,0.631594,0.304773,0.118878,0.327,0.453137,0.36121,0.453137,0.36121,0.747483,0.15452,0.752367,0.160187,0.698705,0.214553
50%,0.648583,0.351417,0.66761,0.33239,0.336046,0.663954,0.611784,0.388216,0.611784,0.388216,0.780862,0.219138,0.807149,0.192851,0.74299,0.25701
75%,0.68283,0.426257,0.695227,0.368406,0.673,0.881122,0.63879,0.546863,0.63879,0.546863,0.84548,0.252517,0.839813,0.247633,0.785447,0.301295
max,0.743693,0.491255,0.762927,0.480826,0.862992,0.961905,0.689767,0.774983,0.689767,0.774983,0.936349,0.325208,0.917012,0.357442,0.959679,0.339672


In [13]:
# Based on above analysis cutoff is chosen:
# A single wide range replaces the two default 'cys' ranges set in params.
# NOTE(review): params was already handed to PeakData above; this override only
# reaches find_pairs() if PeakData keeps a reference to the same dict — confirm.
params['area_ratio_cutoff']['cys'] = ((0.05, 0.55),)

In [14]:
# Find pairs:
# Run the peak-pair search separately for each polarity.
peak_obj.find_pairs('pos')
peak_obj.find_pairs('neg')

In [15]:
# Flag (rather than remove) pairs that match the blacklist or the adduct table,
# for both polarities.
peak_obj.flag_blacklist(blacklist_dict, polarity='both')
peak_obj.flag_adducts(adducts_fnam, polarity='both')

In [16]:
# Build the set of isotopes to consider and flag matching pairs in both polarities.
# min_abs=1e-6 is presumably a minimum abundance threshold — confirm in resources/helper.
isotope_set = isotope_obj.find_iso_set(min_abs=1e-6)
peak_obj.flag_isotopes(isotope_set, polarity='both')

In [17]:
# Display the parent peak areas of the positive-mode 'cys' pairs, together with
# the flag and annotation columns.
peak_obj.label_pairs['cys']['pos']['peak_pair_area_parent']

Unnamed: 0,pair_id,MW_parent,RT_parent,MW_heavy,RT_heavy,polarity,label,name,Isotopes,Adducts,Blacklist,known_anno,cclp_1,cclp_2,cclp_3,ssp25_1,ssp25_2,ssp25_3,tfk_1,tfk_2,tfk_3,icc2_1,icc2_2,icc2_3,tgbc_1,...,tgbc_3,ocug1_1,ocug1_2,ocug1_3,snu308_1,snu308_2,snu308_3,kku100_1,kku100_2,kku100_3,ysccc_1,ysccc_2,ysccc_3,rbe_1,rbe_2,rbe_3,snu1196_1,snu1196_2,snu1196_3,uok262_1,uok262_2,uok262_3,a549_1,a549_2,a549_3
0,"((121.01976, 8.619), (125.02686, 8.636), pos, ...",121.02,8.619,125.027,8.636,pos,cys,Cysteine,,,,Cysteine,1552.28,3447.87,4717.68,238777.0,207467.0,320614.0,142408.0,140247.0,156743.0,6762.86,11768.9,4369.7,25779.5,...,24306.0,103608.0,73719.8,85899.1,31690.5,29104.6,28148.4,110189.0,145990.0,145780.0,2307.9,2003.08,1370.01,3579.32,5286.55,7038.94,13150.0,27055.3,18159.9,17279.0,10151.4,11439.3,67554.7,68239.1,49993.6
1,"((133.01974, 5.534), (137.02686, 5.534), pos, ...",133.02,5.534,137.027,5.534,pos,cys,Timonacic,,,,,5792.99,7565.96,10467.6,222309.0,229248.0,216545.0,197523.0,182412.0,204462.0,24509.1,34530.2,31527.8,139439.0,...,139335.0,320199.0,327878.0,302345.0,155187.0,161157.0,199129.0,208412.0,298470.0,314376.0,2320.92,2285.79,3123.67,10165.6,13020.9,25543.4,8091.07,4440.75,3907.82,81295.2,66903.2,64506.3,147628.0,156350.0,190156.0
2,"((145.01977, 7.718), (149.02689, 7.726), pos, ...",145.02,7.718,149.027,7.726,pos,cys,3_4-Dehydrothiomorpholine-3-carboxylate,,,,,1226.65,1803.76,2245.88,149580.0,142019.0,104267.0,14968.2,12351.5,17779.0,6271.48,6115.16,6456.88,2519.34,...,2327.2,7605.86,10678.5,14836.0,5595.94,4278.15,5597.07,11246.9,1889.31,11014.2,577.057,706.362,959.471,990.348,836.72,2040.86,8513.37,6286.52,5926.18,8868.02,8057.88,6027.13,5038.75,6089.6,9586.6
3,"((147.0354, 2.773), (151.04248, 2.772), pos, cys)",147.035,2.773,151.042,2.772,pos,cys,Thiomorpholine3-carboxylate,,,,,2611.73,5154.53,2797.3,67091.1,61220.4,65621.9,53340.1,44538.1,57010.1,4439.42,3795.36,3598.07,28518.2,...,25989.1,57993.2,47875.7,54875.3,34468.7,42213.7,40871.8,37744.0,44412.4,48609.8,629.241,748.875,880.213,3381.24,3403.24,2398.37,1289.02,540.843,959.179,15468.3,10421.8,6927.99,12130.6,12854.1,13296.7
4,"((147.0354, 4.281), (151.04248, 4.284), pos, cys)",147.035,4.281,151.042,4.284,pos,cys,Thiomorpholine3-carboxylate,,,,,222468.0,305549.0,204378.0,2898480.0,2986800.0,2824340.0,2323980.0,2287060.0,2549820.0,336798.0,422342.0,286847.0,1516410.0,...,1706820.0,3102150.0,3062990.0,3448800.0,2245700.0,2097400.0,2541880.0,2360300.0,2748640.0,2741890.0,35531.6,50483.0,57431.5,169301.0,178809.0,235781.0,98195.2,45220.8,48179.4,1074630.0,823559.0,784184.0,1110730.0,1148060.0,1220240.0
5,"((161.05104, 2.696), (165.05819, 2.697), pos, ...",161.051,2.696,165.058,2.697,pos,cys,allylcysteine,,,,,6070.73,8234.39,8378.18,153510.0,111253.0,179470.0,110582.0,125663.0,108947.0,13871.6,21727.7,48804.0,77172.4,...,97185.1,160352.0,182737.0,151680.0,74850.7,76340.4,82644.1,96808.5,137839.0,124968.0,1468.25,814.142,1957.04,13476.0,5330.17,7810.77,2857.7,1672.05,1135.49,49176.4,31620.0,76048.2,68107.7,57746.2,67735.3
6,"((161.05105, 3.995), (165.05819, 3.998), pos, ...",161.051,3.995,165.058,3.998,pos,cys,allylcysteine,,,,,15331.9,25310.9,23639.7,453252.0,361145.0,326787.0,398055.0,385909.0,323045.0,44450.6,55803.3,46494.7,202220.0,...,269431.0,536563.0,392344.0,433590.0,280509.0,306297.0,267248.0,326493.0,405229.0,380619.0,4482.68,6560.31,6116.89,39213.5,26784.3,40180.2,11967.9,4611.89,4625.24,121918.0,78339.0,101447.0,183713.0,165135.0,200894.0
7,"((175.03028, 2.706), (179.03739, 2.715), pos, ...",175.03,2.706,179.037,2.715,pos,cys,3-Acetyl-4-thiazolidinecarboxylic acid RT@4.13,,,,,538.103,668.369,878.588,1826.5,1626.99,2582.12,14336.7,9222.62,12686.0,9811.72,15834.1,6817.29,4553.82,...,3524.18,24212.2,55882.5,46247.1,24036.1,18637.2,40953.2,3026.53,11899.8,9976.18,1516.08,1051.51,912.362,3645.83,1701.03,3053.79,2062.57,890.735,1378.15,5881.35,4250.59,4283.48,3644.69,5146.82,6982.82
8,"((175.0303, 4.346), (179.03736, 4.348), pos, cys)",175.03,4.346,179.037,4.348,pos,cys,3-Acetyl-4-thiazolidinecarboxylic acid RT@4.13,,,,,6666.86,11287.4,16892.6,56303.2,58675.4,63012.4,388296.0,328176.0,465389.0,254253.0,384724.0,208808.0,112511.0,...,146537.0,757271.0,991531.0,1015860.0,700111.0,657581.0,955387.0,136792.0,413165.0,442410.0,11192.5,16912.9,14400.4,44780.2,54035.1,56583.0,22949.9,13392.5,13267.5,220929.0,176719.0,192937.0,81164.4,104965.0,139139.0
9,"((175.06671, 2.67), (179.07376, 2.67), pos, cys)",175.067,2.67,179.074,2.67,pos,cys,"2,5,5-trimethylthiazolidine-4-carboxylic acid ...",,,,,16711.6,18882.6,18208.8,163037.0,168570.0,189116.0,144530.0,137557.0,149292.0,24269.4,26326.9,17898.1,89045.1,...,99950.0,203078.0,193561.0,222943.0,118633.0,111315.0,116284.0,147501.0,174525.0,167887.0,9604.07,31943.4,11423.3,13968.8,21207.6,19395.7,47070.4,26150.7,20419.7,62897.5,57027.0,50636.1,66994.9,59126.6,72644.4


In [18]:
# Write the positive-mode pairs; the first argument is the output path without
# a file extension.
peak_obj.write_pairs('projects/cys_tracing_bile-duct/pos/cys_tracing_bile-duct_pairs_pos_new', 'pos')

In [19]:
# Write the negative-mode pairs into the neg/ project folder (the original path
# pointed at pos/, a copy-paste slip from the positive-mode cell); the first
# argument is the output path without a file extension.
peak_obj.write_pairs('projects/cys_tracing_bile-duct/neg/cys_tracing_bile-duct_pairs_neg_new', 'neg')

In [30]:
# Debug inspection: show the min_label attribute of the peak object.
peak_obj.min_label

1

In [34]:
# Debug inspection: mapping from raw sample identifiers to sample names.
peak_obj.sample_names

{'av040821_040821_av04': 'cclp_1',
 'av040821_040821_av05': 'cclp_2',
 'av040821_040821_av06': 'cclp_3',
 'av040821_040821_av10': 'ssp25_1',
 'av040821_040821_av11': 'ssp25_2',
 'av040821_040821_av12': 'ssp25_3',
 'av040821_040821_av16': 'tfk_1',
 'av040821_040821_av17': 'tfk_2',
 'av040821_040821_av18': 'tfk_3',
 'av040821_040821_av22': 'icc2_1',
 'av040821_040821_av23': 'icc2_2',
 'av040821_040821_av24': 'icc2_3',
 'av040821_040821_av28': 'tgbc_1',
 'av040821_040821_av29': 'tgbc_2',
 'av040821_040821_av30': 'tgbc_3',
 'av040821_040821_av34': 'ocug1_1',
 'av040821_040821_av35': 'ocug1_2',
 'av040821_040821_av36': 'ocug1_3',
 'av040821_040821_av40': 'snu308_1',
 'av040821_040821_av41': 'snu308_2',
 'av040821_040821_av42': 'snu308_3',
 'av040821_040821_av46': 'kku100_1',
 'av040821_040821_av47': 'kku100_2',
 'av040821_040821_av48': 'kku100_3',
 'av040821_041221_av52': 'ysccc_1',
 'av040821_041221_av53': 'ysccc_2',
 'av040821_041221_av54': 'ysccc_3',
 'av040821_041221_av58': 'rbe_1',
 'a

In [35]:
# Debug inspection: samples assigned to each label.
peak_obj.sample_label

{'cys': ['cclp_1',
  'cclp_2',
  'cclp_3',
  'ssp25_1',
  'ssp25_2',
  'ssp25_3',
  'tfk_1',
  'tfk_2',
  'tfk_3',
  'icc2_1',
  'icc2_2',
  'icc2_3',
  'tgbc_1',
  'tgbc_2',
  'tgbc_3',
  'ocug1_1',
  'ocug1_2',
  'ocug1_3',
  'snu308_1',
  'snu308_2',
  'snu308_3',
  'kku100_1',
  'kku100_2',
  'kku100_3',
  'ysccc_1',
  'ysccc_2',
  'ysccc_3',
  'rbe_1',
  'rbe_2',
  'rbe_3',
  'snu1196_1',
  'snu1196_2',
  'snu1196_3',
  'uok262_1',
  'uok262_2',
  'uok262_3',
  'a549_1',
  'a549_2',
  'a549_3']}

In [33]:
# Debug inspection: column names used for the 'cys' label. These appear
# upper-cased, while the names in sample_label are lower-case.
peak_obj.label_peaks['cys']['label_colnames']

['CCLP_1',
 'CCLP_2',
 'CCLP_3',
 'SSP25_1',
 'SSP25_2',
 'SSP25_3',
 'TFK_1',
 'TFK_2',
 'TFK_3',
 'ICC2_1',
 'ICC2_2',
 'ICC2_3',
 'TGBC_1',
 'TGBC_2',
 'TGBC_3',
 'OCUG1_1',
 'OCUG1_2',
 'OCUG1_3',
 'SNU308_1',
 'SNU308_2',
 'SNU308_3',
 'KKU100_1',
 'KKU100_2',
 'KKU100_3',
 'YSCCC_1',
 'YSCCC_2',
 'YSCCC_3',
 'RBE_1',
 'RBE_2',
 'RBE_3',
 'SNU1196_1',
 'SNU1196_2',
 'SNU1196_3',
 'UOK262_1',
 'UOK262_2',
 'UOK262_3',
 'A549_1',
 'A549_2',
 'A549_3']

In [59]:
# Data as Excel file:
# NOTE(review): absolute, machine-specific paths — this cell only runs where this
# Google Drive folder exists (the saved output is a FileNotFoundError).
datafile_pos = '/Users/krdav/Google Drive/MCB/Sullivan_lab/mass_shift_search/branch_03_29_20/SLC33A1/pos/SLC33A1_KO_BSO_control_pairs_pos.xlsx'
datafile_neg = '/Users/krdav/Google Drive/MCB/Sullivan_lab/mass_shift_search/branch_03_29_20/SLC33A1/neg/SLC33A1_KO_BSO_control_pairs_neg.xlsx'

# JSON file with "sample_name": "content_string":
datafile_col_desc = '/Users/krdav/Google Drive/MCB/Sullivan_lab/mass_shift_search/branch_03_29_20/SLC33A1/SLC33A1_KO_BSO_control.json'

# Parse the JSON with the json module instead of eval(): eval() executes
# arbitrary code found in the file, and the bare open() never closed the handle.
with open(datafile_col_desc, 'r') as col_desc_file:
    col_names = json.load(col_desc_file)
# This just specifies which samples to run which mass shift analysis on:
sample_label = {'cys': list(col_names.values())}
# Read the data:
# NOTE(review): this call passes an extra positional argument (1) compared with
# the PeakData call in the bile-duct cell near the top of the notebook — confirm
# it matches the current PeakData signature.
peak_obj = PeakData('SLC33A1_cys_tracing', col_names, sample_label, 1, params)
# Read peaks:
peak_obj.read_peaks(datafile_pos, 'pos')
peak_obj.read_peaks(datafile_neg, 'neg')


FileNotFoundError: [Errno 2] No such file or directory: '/Users/krdav/Google Drive/MCB/Sullivan_lab/mass_shift_search/branch_03_29_20/SLC33A1/SLC33A1_KO_BSO_control.json'

In [60]:
# Restrict the ratio analysis to the samples whose name contains 'intra'.
intra_label_names = [n for n in sample_label['cys'] if 'intra' in n]
# NOTE(review): this calls a free function pick_ratio() that is not imported in
# the visible import cell, and the saved output is a TypeError (missing 'params').
# The cells near the top use the peak_obj.pick_ratio(...) method instead — this
# call looks stale; confirm and port it.
pick_ratio(peak_obj.peak_data_pos, peak_obj.area_colnames_pos, intra_label_names, known_cys_fnam, ['m', 'm+4'])

TypeError: pick_ratio() missing 1 required positional argument: 'params'

In [10]:
# Negative mode: m / m+4 fractions of known cysteine-labelled compounds in the 'intra' samples.
# NOTE(review): uses the free function pick_ratio(), which is not imported in the
# visible import cell; the saved output comes from an earlier session.
pick_ratio(peak_obj.peak_data_neg, peak_obj.area_colnames_neg, intra_label_names, known_cys_fnam, ['m', 'm+4'])

Unnamed: 0,GSH_m,GSH_m+4,S-Glutathionyl-cysteine_m,S-Glutathionyl-cysteine_m+4,GSSG_m,GSSG_m+4,Cystine_m,Cystine_m+4
count,18.0,18.0,18.0,18.0,18.0,18.0,18.0,18.0
mean,0.749375,0.250625,0.521994,0.478006,0.628581,0.371419,0.478846,0.521154
std,0.038978,0.038978,0.108185,0.108185,0.098954,0.098954,0.112098,0.112098
min,0.608552,0.209858,0.374614,0.275311,0.455718,0.201084,0.286949,0.285293
25%,0.751545,0.230593,0.42872,0.373754,0.596003,0.348669,0.417201,0.478529
50%,0.75572,0.24428,0.489234,0.510766,0.606638,0.393362,0.456501,0.543499
75%,0.769407,0.248455,0.626246,0.57128,0.651331,0.403997,0.521471,0.582799
max,0.790142,0.391448,0.724689,0.625386,0.798916,0.544282,0.714707,0.713051


In [11]:
# Based on above analysis cutoff is chosen:
# Two accepted 'cys' ranges (these equal the defaults in the params cell).
params['area_ratio_cutoff']['cys'] = ((0.2, 0.28), (0.34, 0.42))

In [12]:
# Find pairs:
peak_obj.find_pairs('pos')
peak_obj.find_pairs('neg')
# Write pairs as Excel:
# NOTE(review): the preceding load cell builds the SLC33A1 dataset, but these
# output names point at cys_tracing_bile-duct — looks like a copy-paste leftover;
# confirm the intended target before running.
peak_obj.write_pairs('cys_tracing_bile-duct/pos/cys_tracing_bile-duct_pairs_pos', 'pos')
peak_obj.write_pairs('cys_tracing_bile-duct/neg/cys_tracing_bile-duct_pairs_neg', 'neg')

In [13]:
# Turn the filtered positive-mode pairs spreadsheet into a .filterset file.
# NOTE(review): write_filterset() is not imported in the visible import cell, and
# the input is an absolute, machine-specific path.
filter_pos = '/Users/krdav/Google Drive/MCB/Sullivan_lab/mass_shift_search/branch_03_29_20/SLC33A1/pos/SLC33A1_KO_BSO_control_pairs_pos_filter.xlsx'
filename_name_pos = 'SLC33A1/pos/SLC33A1_KO_BSO_control_pairs_pos_filter.filterset'
write_filterset(filter_pos, filename_name_pos)

In [14]:
# Turn the filtered negative-mode pairs spreadsheet into a .filterset file.
# NOTE(review): write_filterset() is not imported in the visible import cell, and
# the input is an absolute, machine-specific path.
filter_neg = '/Users/krdav/Google Drive/MCB/Sullivan_lab/mass_shift_search/branch_03_29_20/SLC33A1/neg/SLC33A1_KO_BSO_control_pairs_neg_filter.xlsx'
filename_name_neg = 'SLC33A1/neg/SLC33A1_KO_BSO_control_pairs_neg_filter.filterset'
write_filterset(filter_neg, filename_name_neg)

In [48]:
# Data as Excel file:
# NOTE(review): absolute, machine-specific paths — this cell only runs where this
# Google Drive folder exists.
datafile_pos = '/Users/krdav/Google Drive/MCB/Sullivan_lab/mass_shift_search/branch_03_29_20/cys_tracing/pos/cys_tracing_BSO_control_pos.xlsx'
datafile_neg = '/Users/krdav/Google Drive/MCB/Sullivan_lab/mass_shift_search/branch_03_29_20/cys_tracing/neg/cys_tracing_BSO_control_neg.xlsx'

# JSON file with "sample_name": "content_string":
datafile_col_desc = '/Users/krdav/Google Drive/MCB/Sullivan_lab/mass_shift_search/branch_03_29_20/cys_tracing/cys_tracing_BSO_control.json'

# Parse the JSON with the json module instead of eval(): eval() executes
# arbitrary code found in the file, and the bare open() never closed the handle.
with open(datafile_col_desc, 'r') as col_desc_file:
    col_names = json.load(col_desc_file)
# This just specifies which samples to run which mass shift analysis on
# (samples whose name contains 'blank' or 'BSO' are excluded):
sample_label = {'cys': [n for n in list(col_names.values()) if 'blank' not in n and 'BSO' not in n]}
# Read the data:
# NOTE(review): this call passes an extra positional argument (1) compared with
# the PeakData call in the bile-duct cell near the top of the notebook — confirm
# it matches the current PeakData signature.
peak_obj = PeakData('cys_tracing', col_names, sample_label, 1, params)
# Read peaks:
peak_obj.read_peaks(datafile_pos, 'pos')
peak_obj.read_peaks(datafile_neg, 'neg')

Filtered 0 peaks out. 16870 peaks left.
Filtered 0 peaks out. 7917 peaks left.


In [16]:
# Restrict the ratio analysis to the samples whose name contains 'intra'.
intra_label_names = [n for n in sample_label['cys'] if 'intra' in n]
# Positive mode: m / m+4 fractions of known cysteine-labelled compounds.
# NOTE(review): uses the free function pick_ratio(), which is not imported in the
# visible import cell; the cells near the top use peak_obj.pick_ratio(...) instead.
pick_ratio(peak_obj.peak_data_pos, peak_obj.area_colnames_pos, intra_label_names, known_cys_fnam, ['m', 'm+4'])

Unnamed: 0,GSH_m,GSH_m+4,GSSG_m,GSSG_m+4,Cystine_m,Cystine_m+4,S-Lactoylglutathione_m,S-Lactoylglutathione_m+4
count,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0
mean,0.749566,0.250434,0.60306,0.39694,0.890055,0.109945,0.63968,0.36032
std,0.011359,0.011359,0.058084,0.058084,0.086078,0.086078,0.186614,0.186614
min,0.722637,0.23318,0.453819,0.265154,0.587343,0.053287,0.270197,0.095994
25%,0.742505,0.243661,0.577031,0.378606,0.892646,0.06472,0.534997,0.221872
50%,0.750758,0.249242,0.610556,0.389444,0.917108,0.082892,0.651667,0.348333
75%,0.756339,0.257495,0.621394,0.422969,0.93528,0.107354,0.778128,0.465003
max,0.76682,0.277363,0.734846,0.546181,0.946713,0.412657,0.904006,0.729803


In [17]:
# Negative mode: m / m+4 fractions of known cysteine-labelled compounds in the 'intra' samples.
# NOTE(review): uses the free function pick_ratio(), which is not imported in the
# visible import cell.
pick_ratio(peak_obj.peak_data_neg, peak_obj.area_colnames_neg, intra_label_names, known_cys_fnam, ['m', 'm+4'])

Unnamed: 0,GSH_m,GSH_m+4,GSSG_m,GSSG_m+4,Cystine_m,Cystine_m+4
count,20.0,20.0,20.0,20.0,20.0,20.0
mean,0.744862,0.255138,0.601879,0.398121,0.534536,0.465464
std,0.009874,0.009874,0.091513,0.091513,0.084828,0.084828
min,0.725068,0.233624,0.447604,0.263661,0.36149,0.27641
25%,0.73816,0.249895,0.537536,0.318107,0.497768,0.425466
50%,0.742668,0.257332,0.613386,0.386614,0.515666,0.484334
75%,0.750105,0.26184,0.681893,0.462464,0.574534,0.502232
max,0.766376,0.274932,0.736339,0.552396,0.72359,0.63851


In [47]:
# Based on above analysis cutoff is chosen:
# Two accepted 'cys' ranges, placed around the GSH and GSSG m+4 fractions
# reported by the ratio tables above.
params['area_ratio_cutoff']['cys'] = ((0.23, 0.28), (0.37, 0.42))

In [49]:
# Find pairs:
# NOTE(review): the saved output of this cell is a KeyboardInterrupt, so the
# run was aborted and the files below may not have been written.
peak_obj.find_pairs('pos')
peak_obj.find_pairs('neg')
# Write pairs as Excel:
peak_obj.write_pairs('cys_tracing/pos/cys_tracing_BSO_control_pairs_pos', 'pos')
peak_obj.write_pairs('cys_tracing/neg/cys_tracing_BSO_control_pairs_neg', 'neg')

KeyboardInterrupt: 

In [None]:
# Turn the filtered positive-mode pairs spreadsheet into a .filterset file.
# NOTE(review): write_filterset() is not imported in the visible import cell, and
# the input is an absolute, machine-specific path. This cell has not been executed.
filter_pos = '/Users/krdav/Google Drive/MCB/Sullivan_lab/mass_shift_search/branch_03_29_20/cys_tracing/pos/cys_tracing_BSO_control_pairs_pos_filter.xlsx'
filename_name_pos = 'cys_tracing/pos/cys_tracing_BSO_control_pairs_pos_filter.filterset'
write_filterset(filter_pos, filename_name_pos)

In [None]:
# Turn the filtered negative-mode pairs spreadsheet into a .filterset file.
# NOTE(review): write_filterset() is not imported in the visible import cell, and
# the input is an absolute, machine-specific path. This cell has not been executed.
filter_neg = '/Users/krdav/Google Drive/MCB/Sullivan_lab/mass_shift_search/branch_03_29_20/cys_tracing/neg/cys_tracing_BSO_control_pairs_neg_filter.xlsx'
filename_name_neg = 'cys_tracing/neg/cys_tracing_BSO_control_pairs_neg_filter.filterset'
write_filterset(filter_neg, filename_name_neg)

In [54]:
# Data as Excel file:
# NOTE(review): absolute, machine-specific paths — this cell only runs where this
# Google Drive folder exists.
datafile_pos = '/Users/krdav/Google Drive/MCB/Sullivan_lab/mass_shift_search/branch_03_29_20/cys_tracing_three_cell_lines/pos/three_cell_lines_cys_tracing_pos.xlsx'
datafile_neg = '/Users/krdav/Google Drive/MCB/Sullivan_lab/mass_shift_search/branch_03_29_20/cys_tracing_three_cell_lines/neg/three_cell_lines_cys_tracing_neg.xlsx'

# JSON file with "sample_name": "content_string":
datafile_col_desc = '/Users/krdav/Google Drive/MCB/Sullivan_lab/mass_shift_search/branch_03_29_20/cys_tracing_three_cell_lines/cys_tracing_three_cell_lines.json'

# Parse the JSON with the json module instead of eval(): eval() executes
# arbitrary code found in the file, and the bare open() never closed the handle.
with open(datafile_col_desc, 'r') as col_desc_file:
    col_names = json.load(col_desc_file)
# This just specifies which samples to run which mass shift analysis on
# (one explicit sample list per label):
sample_label = {
    'cys': ['B16_Cys_p1', 'B16_Cys_p2', 'HCT116_Cys_p1', 'HCT116_Cys_p2', 'HUH7_Cys_p1', 'HUH7_Cys_p2',
               'B16_Cys_n1', 'B16_Cys_n2', 'HCT116_Cys_n1', 'HCT116_Cys_n2', 'HUH7_Cys_n1', 'HUH7_Cys_n2'],
    'gln': ['B16_Gln_p1', 'B16_Gln_p2', 'HCT116_Gln_p1', 'HCT116_Gln_p2', 'HUH7_Gln_p1', 'HUH7_Gln_p2',
               'B16_Gln_n1', 'B16_Gln_n2', 'HCT116_Gln_n1', 'HCT116_Gln_n2', 'HUH7_Gln_n1', 'HUH7_Gln_n2']
}

# Read the data:
# NOTE(review): this call passes an extra positional argument (2) compared with
# the PeakData call in the bile-duct cell near the top of the notebook — confirm
# it matches the current PeakData signature.
peak_obj = PeakData('cys_tracing_three_cell_lines', col_names, sample_label, 2, params)
# Read peaks:
peak_obj.read_peaks(datafile_pos, 'pos')
peak_obj.read_peaks(datafile_neg, 'neg')

Filtered 0 peaks out. 17890 peaks left.
Filtered 0 peaks out. 21515 peaks left.


In [55]:
# Positive mode: m / m+4 fractions of known cysteine-labelled compounds in the 'cys' samples.
# NOTE(review): uses the free function pick_ratio(), which is not imported in the
# visible import cell.
pick_ratio(peak_obj.peak_data_pos, peak_obj.area_colnames_pos, sample_label['cys'], known_cys_fnam, ['m', 'm+4'])

Unnamed: 0,GSH_m,GSH_m+4,GSSG_m,GSSG_m+4,S-Lactoylglutathione_m,S-Lactoylglutathione_m+4,S-(Formylmethyl)glutathione_m,S-(Formylmethyl)glutathione_m+4
count,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0
mean,0.569966,0.430034,0.413248,0.586752,0.442596,0.557404,0.569054,0.430946
std,0.010166,0.010166,0.018826,0.018826,0.111632,0.111632,0.063904,0.063904
min,0.556794,0.418184,0.394089,0.564164,0.269642,0.431306,0.498035,0.354839
25%,0.561803,0.423798,0.397343,0.569854,0.391868,0.47018,0.511675,0.379299
50%,0.572881,0.427119,0.409683,0.590317,0.441243,0.558757,0.570968,0.429032
75%,0.576202,0.438197,0.430146,0.602657,0.52982,0.608132,0.620701,0.488325
max,0.581816,0.443206,0.435836,0.605911,0.568694,0.730358,0.645161,0.501965


In [56]:
# Negative mode: m / m+4 fractions of known cysteine-labelled compounds in the 'cys' samples.
# NOTE(review): uses the free function pick_ratio(), which is not imported in the
# visible import cell.
pick_ratio(peak_obj.peak_data_neg, peak_obj.area_colnames_neg, sample_label['cys'], known_cys_fnam, ['m', 'm+4'])

Unnamed: 0,GSH_m,GSH_m+4,S-Glutathionyl-cysteine_m,S-Glutathionyl-cysteine_m+4,GSSG_m,GSSG_m+4,S-(Formylmethyl)glutathione_m,S-(Formylmethyl)glutathione_m+4
count,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0
mean,0.575397,0.424603,0.459412,0.540588,0.433302,0.566698,0.585779,0.414221
std,0.014168,0.014168,0.082605,0.082605,0.028915,0.028915,0.016095,0.016095
min,0.557188,0.411601,0.360158,0.442697,0.394857,0.522006,0.570408,0.396468
25%,0.563622,0.415052,0.387002,0.483665,0.420471,0.552839,0.571123,0.400478
50%,0.581524,0.418476,0.47592,0.52408,0.427591,0.572409,0.584907,0.415093
75%,0.584948,0.436378,0.516335,0.612998,0.447161,0.579529,0.599522,0.428877
max,0.588399,0.442812,0.557303,0.639842,0.477994,0.605143,0.603532,0.429592


In [57]:
# Based on above analysis cutoff is chosen:
# NOTE(review): the commented-out alternative below has an upper bound of 45,
# which looks like a typo for 0.45.
#params['area_ratio_cutoff']['cys'] = ((0.40, 45), (0.555, 0.605))
# NOTE(review): the two active ranges overlap (0.5-0.55); confirm whether a
# single range (0.2, 0.8) was intended.
params['area_ratio_cutoff']['cys'] = ((0.2, 0.55), (0.5, 0.8))

In [58]:
# Positive mode: m / m+5 fractions of known glutamine-labelled compounds in the 'gln' samples.
# NOTE(review): uses the free function pick_ratio(), which is not imported in the
# visible import cell.
pick_ratio(peak_obj.peak_data_pos, peak_obj.area_colnames_pos, sample_label['gln'], known_gln_fnam, ['m', 'm+5'])

Unnamed: 0,GSH_m,GSH_m+5,GSSG_m,GSSG_m+5,Glutamine_m,Glutamine_m+5,S-(Formylmethyl)glutathione_m,S-(Formylmethyl)glutathione_m+5
count,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0
mean,0.710004,0.289996,0.495551,0.504449,0.681321,0.318679,0.62211,0.37789
std,0.046537,0.046537,0.067136,0.067136,0.246122,0.246122,0.058232,0.058232
min,0.675878,0.215999,0.437061,0.402181,0.499758,0.00197,0.508943,0.330725
25%,0.681334,0.26399,0.444241,0.460352,0.511704,0.114654,0.621336,0.347341
50%,0.682766,0.317234,0.469933,0.530067,0.545552,0.454448,0.641836,0.358164
75%,0.73601,0.318666,0.539648,0.555759,0.885346,0.488296,0.652659,0.378664
max,0.784001,0.324122,0.597819,0.562939,0.99803,0.500242,0.669275,0.491057


In [59]:
# Negative mode: m / m+5 fractions of known glutamine-labelled compounds in the 'gln' samples.
# NOTE(review): uses the free function pick_ratio(), which is not imported in the
# visible import cell.
pick_ratio(peak_obj.peak_data_neg, peak_obj.area_colnames_neg, sample_label['gln'], known_gln_fnam, ['m', 'm+5'])

Unnamed: 0,GSH_m,GSH_m+5,GSSG_m,GSSG_m+5,Glutamine_m,Glutamine_m+5,S-(Formylmethyl)glutathione_m,S-(Formylmethyl)glutathione_m+5
count,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0
mean,0.707229,0.292771,0.507203,0.492797,0.569884,0.430116,0.705972,0.294028
std,0.0441,0.0441,0.069031,0.069031,0.092609,0.092609,0.057023,0.057023
min,0.665801,0.236557,0.423435,0.386273,0.490369,0.306792,0.639362,0.208282
25%,0.67848,0.255298,0.473454,0.454107,0.498479,0.354279,0.675548,0.258536
50%,0.687418,0.312582,0.485195,0.514805,0.532595,0.467405,0.686553,0.313447
75%,0.744702,0.32152,0.545893,0.526546,0.645721,0.501521,0.741464,0.324452
max,0.763443,0.334199,0.613727,0.576565,0.693208,0.509631,0.791718,0.360638


In [60]:
# Based on above analysis cutoff is chosen:
# The commented-out line is a narrower alternative that is not in use.
#params['area_ratio_cutoff']['gln'] = ((0.255, 0.325), (0.46, 0.56))
# NOTE(review): the two active ranges overlap (0.46-0.47); confirm whether a
# single range (0.1, 0.70) was intended.
params['area_ratio_cutoff']['gln'] = ((0.1, 0.47), (0.46, 0.70))


In [61]:
# Find pairs:
peak_obj.find_pairs('pos')
peak_obj.find_pairs('neg')
# Write pairs as Excel:
peak_obj.write_pairs('cys_tracing_three_cell_lines/pos/three_cell_lines_cys_tracing_pairs_pos', 'pos')
peak_obj.write_pairs('cys_tracing_three_cell_lines/neg/three_cell_lines_cys_tracing_pairs_neg', 'neg')
# Take intersection between cys and gln:
# NOTE(review): the write_pairs() calls that follow presumably export the
# intersected pairs, i.e. intersection_pairs() is assumed to update the state
# that write_pairs() reads — confirm in resources/helper.
peak_obj.intersection_pairs(('cys', 'gln'), 'pos')
peak_obj.intersection_pairs(('cys', 'gln'), 'neg')
# Write pairs as Excel:
peak_obj.write_pairs('cys_tracing_three_cell_lines/pos/three_cell_lines_cys_tracing_pairs_cys_gln_intersection_pos', 'pos')
peak_obj.write_pairs('cys_tracing_three_cell_lines/neg/three_cell_lines_cys_tracing_pairs_cys_gln_intersection_neg', 'neg')

In [30]:
# Turn the filtered positive-mode pairs spreadsheet into a .filterset file.
# NOTE(review): write_filterset() is not imported in the visible import cell, and
# the input is an absolute, machine-specific path.
filter_pos = '/Users/krdav/Google Drive/MCB/Sullivan_lab/mass_shift_search/branch_03_29_20/cys_tracing_three_cell_lines/pos/three_cell_lines_cys_tracing_pairs_pos_filter.xlsx'
filename_name_pos = 'cys_tracing_three_cell_lines/pos/three_cell_lines_cys_tracing_pairs_pos_filter.filterset'
write_filterset(filter_pos, filename_name_pos)

In [53]:
# Turn the filtered negative-mode pairs spreadsheet into a .filterset file.
# NOTE(review): write_filterset() is not imported in the visible import cell, and
# the input is an absolute, machine-specific path.
filter_neg = '/Users/krdav/Google Drive/MCB/Sullivan_lab/mass_shift_search/branch_03_29_20/cys_tracing_three_cell_lines/neg/three_cell_lines_cys_tracing_pairs_neg_filter.xlsx'
filename_name_neg = 'cys_tracing_three_cell_lines/neg/three_cell_lines_cys_tracing_pairs_neg_filter.filterset'
write_filterset(filter_neg, filename_name_neg)

In [51]:
# Data as Excel file:
# NOTE(review): absolute, machine-specific paths — this cell only runs where this
# Google Drive folder exists. The same experiment is loaded from relative
# 'projects/...' paths in the cell near the top of the notebook.
datafile_pos = '/Users/krdav/Google Drive/MCB/Sullivan_lab/mass_shift_search/branch_03_29_20/cys_tracing_bile-duct/pos/bile-duct_cys-tracing_pos.xlsx'
datafile_neg = '/Users/krdav/Google Drive/MCB/Sullivan_lab/mass_shift_search/branch_03_29_20/cys_tracing_bile-duct/neg/bile-duct_cys-tracing_neg.xlsx'

# JSON file with "sample_name": "content_string":
datafile_col_desc = '/Users/krdav/Google Drive/MCB/Sullivan_lab/mass_shift_search/branch_03_29_20/cys_tracing_bile-duct/bile-duct_cys-tracing.json'

# Parse the JSON with the json module instead of eval(): eval() executes
# arbitrary code found in the file, and the bare open() never closed the handle.
with open(datafile_col_desc, 'r') as col_desc_file:
    col_names = json.load(col_desc_file)
# This just specifies which samples to run which mass shift analysis on
# (samples whose name contains 'blank' or 'BSO' are excluded):
sample_label = {'cys': [n for n in list(col_names.values()) if 'blank' not in n and 'BSO' not in n]}
# Read the data:
# NOTE(review): this call passes an extra positional argument (1) compared with
# the PeakData call in the bile-duct cell near the top of the notebook — confirm
# it matches the current PeakData signature.
peak_obj = PeakData('bile-duct_cys-tracing', col_names, sample_label, 1, params)
# Read peaks:
peak_obj.read_peaks(datafile_pos, 'pos')
peak_obj.read_peaks(datafile_neg, 'neg')

Filtered 0 peaks out. 3973 peaks left.
Filtered 39 peaks out. 4328 peaks left.


In [43]:
# Positive mode: m / m+4 fractions of known cysteine-labelled compounds.
# peak_obj.area_colnames_pos is passed twice: in the argument position where the
# other pick_ratio() calls pass the area columns, and again where they pass the
# list of sample names.
# NOTE(review): uses the free function pick_ratio(), which is not imported in the
# visible import cell.
pick_ratio(peak_obj.peak_data_pos, peak_obj.area_colnames_pos, peak_obj.area_colnames_pos, known_cys_fnam, ['m', 'm+4'])

Unnamed: 0,GSH_m,GSH_m+4,GSSG_m,GSSG_m+4
count,39.0,39.0,39.0,39.0
mean,0.704605,0.295395,0.535899,0.464101
std,0.029365,0.029365,0.043179,0.043179
min,0.654711,0.242629,0.473532,0.3528
25%,0.677197,0.272801,0.510386,0.448885
50%,0.711859,0.288141,0.524275,0.475725
75%,0.727199,0.322803,0.551115,0.489614
max,0.757371,0.345289,0.6472,0.526468


In [50]:
# Based on above analysis cutoff is chosen:
# NOTE(review): the two ranges share the boundary 0.35, so together they cover
# 0.24-0.53; confirm whether a single range was intended.
params['area_ratio_cutoff']['cys'] = ((0.24, 0.35), (0.35, 0.53))

In [52]:
# Find pairs:
# Run the peak-pair search separately for each polarity.
peak_obj.find_pairs('pos')
peak_obj.find_pairs('neg')
# Write pairs as Excel:
# Output paths are relative to the current working directory, one per polarity.
peak_obj.write_pairs('cys_tracing_bile-duct/pos/bile-duct_cys-tracing_pairs_pos', 'pos')
peak_obj.write_pairs('cys_tracing_bile-duct/neg/bile-duct_cys-tracing_pairs_neg', 'neg')