In [1]:
### Notes ###

# Consolidate peaks that are very close.
# Often this is due to a slight mismatch in m/z
# could simply merge by addition.


# Filter adducts and isotopes
#   Adducts don't correlate well with parent compound abundance
#   Isopote peaks correlate well with its isopote peaks (use this as a criterium)



# Input data requirement:
# Name	Molecular Weight	RT [min]







##### Filtering isotopes #####
# Specify which isotopes to search for
# Specify a MW_ppm_tol
# Specify a RT_tol


##### Filtering adducts #####
# Specify which adduct to search for
# Specify a MW_ppm_tol
# Specify a RT_tol



In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import pandas as pd
pd.set_option('display.max_columns', 50)
import numpy as np
import copy
import random
import json
import pprint
pp = pprint.PrettyPrinter(width=41, compact=True)

In [4]:
from resources.helper import PeakData, pick_ratio, isotopes2mass_shift, make_blacklist_dict
elements_fnam = 'resources/element_masses.tab'
known_cys_fnam = 'resources/known_cys_labeled.tab'
known_gln_fnam = 'resources/known_gln_labeled.tab'
blacklist_fnam = 'resources/blacklist.tab'
blacklist_dict = make_blacklist_dict(blacklist_fnam)

In [5]:
# Define parameters for peak filtering and peak pair finding and filtering:
params = {
    'labels': { # Specify the formula for the isotopes in the labels used
        'cys': '[13]C3 [15]N',
        'gly': '[13]C2',
        'gln': '[13]C5'
    },
    'min_area': 5000,         # for a given peak the peak area at least one sample has to be above this value
    'min_MW': 70,             # the minimum molecular weight of a compound in the peak list
    'MW_shift_ppm_tol': 4,    # PPM mass tolerance when searching for peak pairs
    'RT_tol': 0.25,           # Retention time tolerance when searching for peak pairs

    # Range, or list of ranges, of labelled to unlabelled peak area ratios
    # within the accepted cutoff
    # These ratios are adjusted on an experiment-to-experiment basis (see below examples).
    'area_ratio_cutoff': {
        'cys': ((0.2, 0.28),
                (0.34, 0.42)),  # GSSG theoretical parent/(parent+heavy) is 0.25 with 50/50 labelled
        'gly': ((0.08, 0.24),),
        'gln': ((0.18, 0.6),),
    },
}

# Calculate the mass shift for each label:
params['MW_shift'] = dict()
for label in params['labels']:
    params['MW_shift'][label] = isotopes2mass_shift(params['labels'][label], elements_fnam)
print('Calculated mass shift for entered labels:')
pp.pprint(params['MW_shift'])

Calculated mass shift for entered labels:
{'cys': 4.007100000000008,
 'gln': 5.01677500000001,
 'gly': 2.0067100000000018}


In [6]:
# Data as Excel file:
datafile_pos = 'projects/SLC33A1/pos/SLC33A1_KO_BSO_control_pos.xlsx'
datafile_neg = 'projects/SLC33A1/neg/SLC33A1_KO_BSO_control_neg.xlsx'

# JSON file with "sample_name": "content_string":
datafile_col_desc = 'projects/SLC33A1/SLC33A1_KO_BSO_control.json'
# Use eval function to turn JSON into a dictionary:
col_names = eval(open(datafile_col_desc, 'r').read())

# This just specifies which samples to run which mass shift analysis on:
sample_label = {'cys': list(col_names.values())}
# Read the data:
params['min_area'] = 10000
peak_obj = PeakData('SLC33A1_cys_tracing', col_names, sample_label, 1, params)
# Read peaks:
peak_obj.read_peaks(datafile_pos, 'pos')
peak_obj.read_peaks(datafile_neg, 'neg')

Filtered 295 peaks out based on minimum peak area (0) and minimum molecular weight (295). 10172 peaks left.
Filtered 0 peaks out based on minimum peak area (0) and minimum molecular weight (0). 4585 peaks left.


In [7]:
peak_obj.remove_blacklist_peaks(blacklist_dict)

Blacklist filter in positive polarity filtered 3 peaks out. 10169 peaks left.
Blacklist filter in negative polarity filtered 8 peaks out. 4577 peaks left.


In [8]:
intra_label_names = [n for n in peak_obj.sample_label['cys'] if 'intra' in n]
pick_ratio(peak_obj.peak_data_pos, peak_obj.area_colnames_pos, intra_label_names, known_cys_fnam, elements_fnam, ['m', 'm+4'], params)

Unnamed: 0,GSH (m),GSH (m+4),S-Glutathionyl-cysteine (m),S-Glutathionyl-cysteine (m+4),GSSG (m),GSSG (m+4),Cystine (m),Cystine (m+4),S-Lactoylglutathione (m),S-Lactoylglutathione (m+4)
count,18.0,18.0,18.0,18.0,18.0,18.0,18.0,18.0,18.0,18.0
mean,0.762995,0.237005,0.54776,0.45224,0.608568,0.391432,0.798537,0.201463,0.611027,0.388973
std,0.011387,0.011387,0.134278,0.134278,0.057578,0.057578,0.154761,0.154761,0.208055,0.208055
min,0.742863,0.219501,0.253226,0.31076,0.439347,0.27266,0.443392,0.040515,0.243774,0.132663
25%,0.756543,0.226949,0.443241,0.333877,0.590786,0.382094,0.693779,0.079296,0.461205,0.229739
50%,0.759055,0.240945,0.598852,0.401148,0.605012,0.394988,0.861909,0.138091,0.639021,0.360979
75%,0.773051,0.243457,0.666123,0.556759,0.617906,0.409214,0.920704,0.306221,0.770261,0.538795
max,0.780499,0.257137,0.68924,0.746774,0.72734,0.560653,0.959485,0.556608,0.867337,0.756226


In [126]:
# Based on above analysis cutoff is chosen:
params['area_ratio_cutoff']['cys'] = ((0.2, 0.28), (0.34, 0.42))

In [127]:
# Find pairs:
#peak_obj.find_pairs('pos')
peak_obj.find_pairs('neg')

In [59]:
# Data as Excel file:
datafile_pos = '/Users/krdav/Google Drive/MCB/Sullivan_lab/mass_shift_search/branch_03_29_20/SLC33A1/pos/SLC33A1_KO_BSO_control_pairs_pos.xlsx'
datafile_neg = '/Users/krdav/Google Drive/MCB/Sullivan_lab/mass_shift_search/branch_03_29_20/SLC33A1/neg/SLC33A1_KO_BSO_control_pairs_neg.xlsx'

# JSON file with "sample_name": "content_string":
datafile_col_desc = '/Users/krdav/Google Drive/MCB/Sullivan_lab/mass_shift_search/branch_03_29_20/SLC33A1/SLC33A1_KO_BSO_control.json'

col_names = eval(open(datafile_col_desc, 'r').read())
# This just specifies which samples to run which mass shift analysis on:
sample_label = {'cys': list(col_names.values())}
# Read the data:
peak_obj = PeakData('SLC33A1_cys_tracing', col_names, sample_label, 1, params)
# Read peaks:
peak_obj.read_peaks(datafile_pos, 'pos')
peak_obj.read_peaks(datafile_neg, 'neg')


FileNotFoundError: [Errno 2] No such file or directory: '/Users/krdav/Google Drive/MCB/Sullivan_lab/mass_shift_search/branch_03_29_20/SLC33A1/SLC33A1_KO_BSO_control.json'

In [60]:
intra_label_names = [n for n in sample_label['cys'] if 'intra' in n]
pick_ratio(peak_obj.peak_data_pos, peak_obj.area_colnames_pos, intra_label_names, known_cys_fnam, ['m', 'm+4'])

TypeError: pick_ratio() missing 1 required positional argument: 'params'

In [10]:
pick_ratio(peak_obj.peak_data_neg, peak_obj.area_colnames_neg, intra_label_names, known_cys_fnam, ['m', 'm+4'])

Unnamed: 0,GSH_m,GSH_m+4,S-Glutathionyl-cysteine_m,S-Glutathionyl-cysteine_m+4,GSSG_m,GSSG_m+4,Cystine_m,Cystine_m+4
count,18.0,18.0,18.0,18.0,18.0,18.0,18.0,18.0
mean,0.749375,0.250625,0.521994,0.478006,0.628581,0.371419,0.478846,0.521154
std,0.038978,0.038978,0.108185,0.108185,0.098954,0.098954,0.112098,0.112098
min,0.608552,0.209858,0.374614,0.275311,0.455718,0.201084,0.286949,0.285293
25%,0.751545,0.230593,0.42872,0.373754,0.596003,0.348669,0.417201,0.478529
50%,0.75572,0.24428,0.489234,0.510766,0.606638,0.393362,0.456501,0.543499
75%,0.769407,0.248455,0.626246,0.57128,0.651331,0.403997,0.521471,0.582799
max,0.790142,0.391448,0.724689,0.625386,0.798916,0.544282,0.714707,0.713051


In [11]:
# Based on above analysis cutoff is chosen:
params['area_ratio_cutoff']['cys'] = ((0.2, 0.28), (0.34, 0.42))

In [12]:
# Find pairs:
peak_obj.find_pairs('pos')
peak_obj.find_pairs('neg')
# Write pairs as Excel:
peak_obj.write_pairs('cys_tracing_bile-duct/pos/cys_tracing_bile-duct_pairs_pos', 'pos')
peak_obj.write_pairs('cys_tracing_bile-duct/neg/cys_tracing_bile-duct_pairs_neg', 'neg')

In [13]:
filter_pos = '/Users/krdav/Google Drive/MCB/Sullivan_lab/mass_shift_search/branch_03_29_20/SLC33A1/pos/SLC33A1_KO_BSO_control_pairs_pos_filter.xlsx'
filename_name_pos = 'SLC33A1/pos/SLC33A1_KO_BSO_control_pairs_pos_filter.filterset'
write_filterset(filter_pos, filename_name_pos)

In [14]:
filter_neg = '/Users/krdav/Google Drive/MCB/Sullivan_lab/mass_shift_search/branch_03_29_20/SLC33A1/neg/SLC33A1_KO_BSO_control_pairs_neg_filter.xlsx'
filename_name_neg = 'SLC33A1/neg/SLC33A1_KO_BSO_control_pairs_neg_filter.filterset'
write_filterset(filter_neg, filename_name_neg)

In [48]:
# Data as Excel file:
datafile_pos = '/Users/krdav/Google Drive/MCB/Sullivan_lab/mass_shift_search/branch_03_29_20/cys_tracing/pos/cys_tracing_BSO_control_pos.xlsx'
datafile_neg = '/Users/krdav/Google Drive/MCB/Sullivan_lab/mass_shift_search/branch_03_29_20/cys_tracing/neg/cys_tracing_BSO_control_neg.xlsx'

# JSON file with "sample_name": "content_string":
datafile_col_desc = '/Users/krdav/Google Drive/MCB/Sullivan_lab/mass_shift_search/branch_03_29_20/cys_tracing/cys_tracing_BSO_control.json'

col_names = eval(open(datafile_col_desc, 'r').read())
# This just specifies which samples to run which mass shift analysis on:
sample_label = {'cys': [n for n in list(col_names.values()) if 'blank' not in n and 'BSO' not in n]}
# Read the data:
peak_obj = PeakData('cys_tracing', col_names, sample_label, 1, params)
# Read peaks:
peak_obj.read_peaks(datafile_pos, 'pos')
peak_obj.read_peaks(datafile_neg, 'neg')

Filtered 0 peaks out. 16870 peaks left.
Filtered 0 peaks out. 7917 peaks left.


In [16]:
intra_label_names = [n for n in sample_label['cys'] if 'intra' in n]
pick_ratio(peak_obj.peak_data_pos, peak_obj.area_colnames_pos, intra_label_names, known_cys_fnam, ['m', 'm+4'])

Unnamed: 0,GSH_m,GSH_m+4,GSSG_m,GSSG_m+4,Cystine_m,Cystine_m+4,S-Lactoylglutathione_m,S-Lactoylglutathione_m+4
count,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0
mean,0.749566,0.250434,0.60306,0.39694,0.890055,0.109945,0.63968,0.36032
std,0.011359,0.011359,0.058084,0.058084,0.086078,0.086078,0.186614,0.186614
min,0.722637,0.23318,0.453819,0.265154,0.587343,0.053287,0.270197,0.095994
25%,0.742505,0.243661,0.577031,0.378606,0.892646,0.06472,0.534997,0.221872
50%,0.750758,0.249242,0.610556,0.389444,0.917108,0.082892,0.651667,0.348333
75%,0.756339,0.257495,0.621394,0.422969,0.93528,0.107354,0.778128,0.465003
max,0.76682,0.277363,0.734846,0.546181,0.946713,0.412657,0.904006,0.729803


In [17]:
pick_ratio(peak_obj.peak_data_neg, peak_obj.area_colnames_neg, intra_label_names, known_cys_fnam, ['m', 'm+4'])

Unnamed: 0,GSH_m,GSH_m+4,GSSG_m,GSSG_m+4,Cystine_m,Cystine_m+4
count,20.0,20.0,20.0,20.0,20.0,20.0
mean,0.744862,0.255138,0.601879,0.398121,0.534536,0.465464
std,0.009874,0.009874,0.091513,0.091513,0.084828,0.084828
min,0.725068,0.233624,0.447604,0.263661,0.36149,0.27641
25%,0.73816,0.249895,0.537536,0.318107,0.497768,0.425466
50%,0.742668,0.257332,0.613386,0.386614,0.515666,0.484334
75%,0.750105,0.26184,0.681893,0.462464,0.574534,0.502232
max,0.766376,0.274932,0.736339,0.552396,0.72359,0.63851


In [47]:
# Based on above analysis cutoff is chosen:
params['area_ratio_cutoff']['cys'] = ((0.23, 0.28), (0.37, 0.42))

In [49]:
# Find pairs:
peak_obj.find_pairs('pos')
peak_obj.find_pairs('neg')
# Write pairs as Excel:
peak_obj.write_pairs('cys_tracing/pos/cys_tracing_BSO_control_pairs_pos', 'pos')
peak_obj.write_pairs('cys_tracing/neg/cys_tracing_BSO_control_pairs_neg', 'neg')

KeyboardInterrupt: 

In [None]:
filter_pos = '/Users/krdav/Google Drive/MCB/Sullivan_lab/mass_shift_search/branch_03_29_20/cys_tracing/pos/cys_tracing_BSO_control_pairs_pos_filter.xlsx'
filename_name_pos = 'cys_tracing/pos/cys_tracing_BSO_control_pairs_pos_filter.filterset'
write_filterset(filter_pos, filename_name_pos)

In [None]:
filter_neg = '/Users/krdav/Google Drive/MCB/Sullivan_lab/mass_shift_search/branch_03_29_20/cys_tracing/neg/cys_tracing_BSO_control_pairs_neg_filter.xlsx'
filename_name_neg = 'cys_tracing/neg/cys_tracing_BSO_control_pairs_neg_filter.filterset'
write_filterset(filter_neg, filename_name_neg)

In [54]:
# Data as Excel file:
datafile_pos = '/Users/krdav/Google Drive/MCB/Sullivan_lab/mass_shift_search/branch_03_29_20/cys_tracing_three_cell_lines/pos/three_cell_lines_cys_tracing_pos.xlsx'
datafile_neg = '/Users/krdav/Google Drive/MCB/Sullivan_lab/mass_shift_search/branch_03_29_20/cys_tracing_three_cell_lines/neg/three_cell_lines_cys_tracing_neg.xlsx'

# JSON file with "sample_name": "content_string":
datafile_col_desc = '/Users/krdav/Google Drive/MCB/Sullivan_lab/mass_shift_search/branch_03_29_20/cys_tracing_three_cell_lines/cys_tracing_three_cell_lines.json'

col_names = eval(open(datafile_col_desc, 'r').read())
# This just specifies which samples to run which mass shift analysis on:
sample_label = {
    'cys': ['B16_Cys_p1', 'B16_Cys_p2', 'HCT116_Cys_p1', 'HCT116_Cys_p2', 'HUH7_Cys_p1', 'HUH7_Cys_p2',
               'B16_Cys_n1', 'B16_Cys_n2', 'HCT116_Cys_n1', 'HCT116_Cys_n2', 'HUH7_Cys_n1', 'HUH7_Cys_n2'],
    'gln': ['B16_Gln_p1', 'B16_Gln_p2', 'HCT116_Gln_p1', 'HCT116_Gln_p2', 'HUH7_Gln_p1', 'HUH7_Gln_p2',
               'B16_Gln_n1', 'B16_Gln_n2', 'HCT116_Gln_n1', 'HCT116_Gln_n2', 'HUH7_Gln_n1', 'HUH7_Gln_n2']
}

# Read the data:
peak_obj = PeakData('cys_tracing_three_cell_lines', col_names, sample_label, 2, params)
# Read peaks:
peak_obj.read_peaks(datafile_pos, 'pos')
peak_obj.read_peaks(datafile_neg, 'neg')

Filtered 0 peaks out. 17890 peaks left.
Filtered 0 peaks out. 21515 peaks left.


In [55]:
pick_ratio(peak_obj.peak_data_pos, peak_obj.area_colnames_pos, sample_label['cys'], known_cys_fnam, ['m', 'm+4'])

Unnamed: 0,GSH_m,GSH_m+4,GSSG_m,GSSG_m+4,S-Lactoylglutathione_m,S-Lactoylglutathione_m+4,S-(Formylmethyl)glutathione_m,S-(Formylmethyl)glutathione_m+4
count,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0
mean,0.569966,0.430034,0.413248,0.586752,0.442596,0.557404,0.569054,0.430946
std,0.010166,0.010166,0.018826,0.018826,0.111632,0.111632,0.063904,0.063904
min,0.556794,0.418184,0.394089,0.564164,0.269642,0.431306,0.498035,0.354839
25%,0.561803,0.423798,0.397343,0.569854,0.391868,0.47018,0.511675,0.379299
50%,0.572881,0.427119,0.409683,0.590317,0.441243,0.558757,0.570968,0.429032
75%,0.576202,0.438197,0.430146,0.602657,0.52982,0.608132,0.620701,0.488325
max,0.581816,0.443206,0.435836,0.605911,0.568694,0.730358,0.645161,0.501965


In [56]:
pick_ratio(peak_obj.peak_data_neg, peak_obj.area_colnames_neg, sample_label['cys'], known_cys_fnam, ['m', 'm+4'])

Unnamed: 0,GSH_m,GSH_m+4,S-Glutathionyl-cysteine_m,S-Glutathionyl-cysteine_m+4,GSSG_m,GSSG_m+4,S-(Formylmethyl)glutathione_m,S-(Formylmethyl)glutathione_m+4
count,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0
mean,0.575397,0.424603,0.459412,0.540588,0.433302,0.566698,0.585779,0.414221
std,0.014168,0.014168,0.082605,0.082605,0.028915,0.028915,0.016095,0.016095
min,0.557188,0.411601,0.360158,0.442697,0.394857,0.522006,0.570408,0.396468
25%,0.563622,0.415052,0.387002,0.483665,0.420471,0.552839,0.571123,0.400478
50%,0.581524,0.418476,0.47592,0.52408,0.427591,0.572409,0.584907,0.415093
75%,0.584948,0.436378,0.516335,0.612998,0.447161,0.579529,0.599522,0.428877
max,0.588399,0.442812,0.557303,0.639842,0.477994,0.605143,0.603532,0.429592


In [57]:
# Based on above analysis cutoff is chosen:
#params['area_ratio_cutoff']['cys'] = ((0.40, 45), (0.555, 0.605))
params['area_ratio_cutoff']['cys'] = ((0.2, 0.55), (0.5, 0.8))

In [58]:
pick_ratio(peak_obj.peak_data_pos, peak_obj.area_colnames_pos, sample_label['gln'], known_gln_fnam, ['m', 'm+5'])

Unnamed: 0,GSH_m,GSH_m+5,GSSG_m,GSSG_m+5,Glutamine_m,Glutamine_m+5,S-(Formylmethyl)glutathione_m,S-(Formylmethyl)glutathione_m+5
count,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0
mean,0.710004,0.289996,0.495551,0.504449,0.681321,0.318679,0.62211,0.37789
std,0.046537,0.046537,0.067136,0.067136,0.246122,0.246122,0.058232,0.058232
min,0.675878,0.215999,0.437061,0.402181,0.499758,0.00197,0.508943,0.330725
25%,0.681334,0.26399,0.444241,0.460352,0.511704,0.114654,0.621336,0.347341
50%,0.682766,0.317234,0.469933,0.530067,0.545552,0.454448,0.641836,0.358164
75%,0.73601,0.318666,0.539648,0.555759,0.885346,0.488296,0.652659,0.378664
max,0.784001,0.324122,0.597819,0.562939,0.99803,0.500242,0.669275,0.491057


In [59]:
pick_ratio(peak_obj.peak_data_neg, peak_obj.area_colnames_neg, sample_label['gln'], known_gln_fnam, ['m', 'm+5'])

Unnamed: 0,GSH_m,GSH_m+5,GSSG_m,GSSG_m+5,Glutamine_m,Glutamine_m+5,S-(Formylmethyl)glutathione_m,S-(Formylmethyl)glutathione_m+5
count,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0
mean,0.707229,0.292771,0.507203,0.492797,0.569884,0.430116,0.705972,0.294028
std,0.0441,0.0441,0.069031,0.069031,0.092609,0.092609,0.057023,0.057023
min,0.665801,0.236557,0.423435,0.386273,0.490369,0.306792,0.639362,0.208282
25%,0.67848,0.255298,0.473454,0.454107,0.498479,0.354279,0.675548,0.258536
50%,0.687418,0.312582,0.485195,0.514805,0.532595,0.467405,0.686553,0.313447
75%,0.744702,0.32152,0.545893,0.526546,0.645721,0.501521,0.741464,0.324452
max,0.763443,0.334199,0.613727,0.576565,0.693208,0.509631,0.791718,0.360638


In [60]:
# Based on above analysis cutoff is chosen:
#params['area_ratio_cutoff']['gln'] = ((0.255, 0.325), (0.46, 0.56))
params['area_ratio_cutoff']['gln'] = ((0.1, 0.47), (0.46, 0.70))


In [61]:
# Find pairs:
peak_obj.find_pairs('pos')
peak_obj.find_pairs('neg')
# Write pairs as Excel:
peak_obj.write_pairs('cys_tracing_three_cell_lines/pos/three_cell_lines_cys_tracing_pairs_pos', 'pos')
peak_obj.write_pairs('cys_tracing_three_cell_lines/neg/three_cell_lines_cys_tracing_pairs_neg', 'neg')
# Take intersection between cys and gln:
peak_obj.intersection_pairs(('cys', 'gln'), 'pos')
peak_obj.intersection_pairs(('cys', 'gln'), 'neg')
# Write pairs as Excel:
peak_obj.write_pairs('cys_tracing_three_cell_lines/pos/three_cell_lines_cys_tracing_pairs_cys_gln_intersection_pos', 'pos')
peak_obj.write_pairs('cys_tracing_three_cell_lines/neg/three_cell_lines_cys_tracing_pairs_cys_gln_intersection_neg', 'neg')

In [30]:
filter_pos = '/Users/krdav/Google Drive/MCB/Sullivan_lab/mass_shift_search/branch_03_29_20/cys_tracing_three_cell_lines/pos/three_cell_lines_cys_tracing_pairs_pos_filter.xlsx'
filename_name_pos = 'cys_tracing_three_cell_lines/pos/three_cell_lines_cys_tracing_pairs_pos_filter.filterset'
write_filterset(filter_pos, filename_name_pos)

In [53]:
filter_neg = '/Users/krdav/Google Drive/MCB/Sullivan_lab/mass_shift_search/branch_03_29_20/cys_tracing_three_cell_lines/neg/three_cell_lines_cys_tracing_pairs_neg_filter.xlsx'
filename_name_neg = 'cys_tracing_three_cell_lines/neg/three_cell_lines_cys_tracing_pairs_neg_filter.filterset'
write_filterset(filter_neg, filename_name_neg)

In [51]:
# Data as Excel file:
datafile_pos = '/Users/krdav/Google Drive/MCB/Sullivan_lab/mass_shift_search/branch_03_29_20/cys_tracing_bile-duct/pos/bile-duct_cys-tracing_pos.xlsx'
datafile_neg = '/Users/krdav/Google Drive/MCB/Sullivan_lab/mass_shift_search/branch_03_29_20/cys_tracing_bile-duct/neg/bile-duct_cys-tracing_neg.xlsx'

# JSON file with "sample_name": "content_string":
datafile_col_desc = '/Users/krdav/Google Drive/MCB/Sullivan_lab/mass_shift_search/branch_03_29_20/cys_tracing_bile-duct/bile-duct_cys-tracing.json'

col_names = eval(open(datafile_col_desc, 'r').read())
# This just specifies which samples to run which mass shift analysis on:
sample_label = {'cys': [n for n in list(col_names.values()) if 'blank' not in n and 'BSO' not in n]}
# Read the data:
peak_obj = PeakData('bile-duct_cys-tracing', col_names, sample_label, 1, params)
# Read peaks:
peak_obj.read_peaks(datafile_pos, 'pos')
peak_obj.read_peaks(datafile_neg, 'neg')

Filtered 0 peaks out. 3973 peaks left.
Filtered 39 peaks out. 4328 peaks left.


In [43]:
pick_ratio(peak_obj.peak_data_pos, peak_obj.area_colnames_pos, peak_obj.area_colnames_pos, known_cys_fnam, ['m', 'm+4'])

Unnamed: 0,GSH_m,GSH_m+4,GSSG_m,GSSG_m+4
count,39.0,39.0,39.0,39.0
mean,0.704605,0.295395,0.535899,0.464101
std,0.029365,0.029365,0.043179,0.043179
min,0.654711,0.242629,0.473532,0.3528
25%,0.677197,0.272801,0.510386,0.448885
50%,0.711859,0.288141,0.524275,0.475725
75%,0.727199,0.322803,0.551115,0.489614
max,0.757371,0.345289,0.6472,0.526468


In [50]:
# Based on above analysis cutoff is chosen:
params['area_ratio_cutoff']['cys'] = ((0.24, 0.35), (0.35, 0.53))

In [52]:
# Find pairs:
peak_obj.find_pairs('pos')
peak_obj.find_pairs('neg')
# Write pairs as Excel:
peak_obj.write_pairs('cys_tracing_bile-duct/pos/bile-duct_cys-tracing_pairs_pos', 'pos')
peak_obj.write_pairs('cys_tracing_bile-duct/neg/bile-duct_cys-tracing_pairs_neg', 'neg')