In [None]:
%matplotlib inline

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import numpy as np
import pandas as pd
import sys
import scipy.stats
import pylab as plt
from IPython import display
import pylab as plt
import glob
from collections import defaultdict

In [None]:
# Make the parent directory importable so the `vimms` imports in the next cell resolve
# (the relative entry is resolved against the kernel's current working directory).
sys.path.append('..')

In [None]:
from vimms.Chemicals import *
from vimms.Chromatograms import *
from vimms.MassSpec import *
from vimms.Controller import *
from vimms.Common import *
from vimms.DataGenerator import *
from vimms.DsDA import *

In [None]:
 set_log_level_warning()
# set_log_level_info()
# set_log_level_debug()

In [None]:
# Directory containing the input files loaded in the next cells
# (peak sampler and compound list).
# NOTE(review): this is a machine-specific absolute Windows path; it has to be edited
# before the notebook can run anywhere else. The relative alternative is kept below.
# base_dir = '..\\data'
base_dir = 'C:\\Users\\joewa\\University of Glasgow\\Vinny Davies - CLDS Metabolomics Project\\Trained Models'

In [None]:
# Peak sampler object; it is passed to ChemicalCreator and its `density_estimator`
# is handed to the mass spectrometer further down.
# NOTE(review): `.p` is presumably a pickle -- only load files from a trusted source.
ps = load_obj(os.path.join(base_dir, 'peak_sampler_mz_rt_int_19_beers_fullscan.p'))

In [None]:
# Compound list (`hmdb_compounds.p`) that is passed to ChemicalCreator below.
# NOTE(review): `.p` is presumably a pickle -- only load files from a trusted source.
hmdb = load_obj(os.path.join(base_dir, 'hmdb_compounds.p'))

In [None]:
# Root output directory; the base dataset, chemical files, .mzML files and
# missing-chemical tables are all written underneath it.
# NOTE(review): machine-specific absolute Windows path -- edit before running elsewhere.
out_dir = 'C:\\Users\\joewa\\Work\\data\\ClassificationExample\\Samples_id_0'

# Create Initial Chemicals

In [None]:
# --- Sampling configuration -------------------------------------------------
# ROI source directories handed to ChemicalCreator.
# NOTE(review): machine-specific absolute Windows path -- edit before running elsewhere.
ROI_Sources = ["C:\\Users\\joewa\\Work\\data\\beer_t10_simulator_files"]
min_ms1_intensity = 1.75E5  # intensity threshold passed to chems.sample
rt_range = [(400, 800)]     # (min, max) retention time; also bounds the controller runs later on
mz_range = [(100, 400)]     # (min, max) m/z passed to chems.sample
n_peaks = 20                # presumably the number of chemicals to sample -- TODO confirm
roi_rt_range = [20, 40]     # passed through as `roi_rt_range`

# --- Sample the base dataset -------------------------------------------------
chems = ChemicalCreator(ps, ROI_Sources, hmdb)
# NOTE(review): the positional `1` is presumably the MS level -- confirm against
# the signature of ChemicalCreator.sample.
dataset = chems.sample(mz_range, rt_range, min_ms1_intensity, n_peaks, 1,
                       fixed_mz=False, roi_rt_range=roi_rt_range)

# Pass the path components separately so os.path.join chooses the separator;
# the previous 'BaseDataset\\dataset.p' literal only formed a nested path on Windows.
save_obj(dataset, os.path.join(out_dir, 'BaseDataset', 'dataset.p'))

In [None]:
# Print the retention-time span |min_rt - max_rt| of each chemical's chromatogram.
for chem in dataset:
    chromatogram = chem.chromatogram
    print(np.abs(chromatogram.min_rt - chromatogram.max_rt))

# Create Multiple Samples

In [None]:
# Number of files to generate per class (one list entry per class).
n_samples = [100, 100]
# Default class labels "class0", "class1", ... -- one for every entry of n_samples.
classes = ["class%d" % idx for idx, _ in enumerate(n_samples)]
# Noise on max intensity.
intensity_noise_sd = [1000]

In [None]:
# Display the generated class labels as the cell output.
classes

Add intensity changes between different classes

In [None]:
# One entry per class; every value is 0 here.
change_probabilities = [0] * len(n_samples)      # probability of intensity changes between different classes
change_differences_means = [0] * len(n_samples)  # mean of those intensity changes
change_differences_sds = [0] * len(n_samples)    # SD of those intensity changes

Add experimental variables (examples in comments)

In [None]:
# Experimental variables are left unset (None); the trailing comments show example values.
experimental_classes = None # [["male","female"],["Positive","Negative","Unknown"]]
# NOTE(review): the name is misspelled ("probabilitities"), but the MultiSampleCreator call
# further down references exactly this spelling, so both must be renamed together.
experimental_probabilitities = None # [[0.5,0.5],[0.33,0.33,0.34]]
experimental_sds = None # [[250],[250]]

Drop out chemicals in different classes

In [None]:
# Alternative, probability-based dropout (currently disabled):
#dropout_probability = 0.2
#dropout_probabilities = [dropout_probability for i in range(len(n_samples))]
# Active configuration: no dropout probabilities, a fixed dropout count instead.
dropout_probabilities = None
dropout_numbers = 2 # number of chemicals dropped out in each class

Set save location

In [None]:
# Directory passed to MultiSampleCreator as `save_location`; the MS1 loop further down
# reads the per-sample files 'sample_<n>.p' back from it.
save_location = os.path.join(out_dir, 'ChemicalFiles')

In [None]:
# Create the samples for all classes from the base dataset.
# Every argument except `save_location` is positional, so the order below must not change.
multiple_samples = MultiSampleCreator(
    dataset,
    n_samples,
    classes,
    intensity_noise_sd,
    change_probabilities,
    change_differences_means,
    change_differences_sds,
    dropout_probabilities,
    dropout_numbers,
    experimental_classes,
    experimental_probabilitities,
    experimental_sds,
    save_location=save_location,
)

In [None]:
# Total number of samples over all classes, summed from the creator's `n_samples`;
# the bare expression on the last line displays it as the cell output.
total_samples = np.sum(multiple_samples.n_samples)
total_samples

In [None]:
# Save the creator's `missing_chemicals` (indexed by class further down), then display it.
# Path components are passed separately so os.path.join chooses the separator; the
# previous 'MissingChemicals\\missing_chemicals.p' literal only worked on Windows.
save_obj(multiple_samples.missing_chemicals,
         os.path.join(out_dir, 'MissingChemicals', 'missing_chemicals.p'))
multiple_samples.missing_chemicals

Run MS1 controller and save out .mzML files

In [None]:
# Run an MS1 controller over every generated sample and write one .mzML file per sample.
min_rt = rt_range[0][0]
max_rt = rt_range[0][1]
controllers = defaultdict(list)  # class index -> controllers that were run for that class
controller_to_mzml = {}          # controller -> (class index, path of the .mzML it wrote)

mzml_dir = os.path.join(out_dir, 'mzmlFiles')
num_classes = len(n_samples)
# Sample files are numbered consecutively across classes, so this index keeps counting
# while `i` restarts at 0 for each class.
sample_idx = 0
for j, num_samples in enumerate(n_samples):
    for i in range(num_samples):
        fname = os.path.join(save_location, 'sample_%d.p' % sample_idx)
        sample = load_obj(fname)
        sample_idx += 1

        mass_spec = IndependentMassSpectrometer(POSITIVE, sample, density=ps.density_estimator)
        # Build the path with os.path.join rather than a hard-coded '\\' so the file
        # lands inside mzml_dir on every platform; the file name itself is unchanged.
        mzml_filename = os.path.join(
            mzml_dir, 'sample_id_0_number_%d_class_%d.mzML' % (i, j))
        controller = SimpleMs1Controller(mass_spec)
        controller.run(min_rt, max_rt)
        controller.write_mzML('my_analysis', mzml_filename)

        controllers[j].append(controller)
        controller_to_mzml[controller] = (j, mzml_filename)

Write the missing peaks of each sample to a CSV file

In [None]:
def get_chem_to_peaks(controller):
    """Group the peaks of a controller's mass spectrometer by chemical.

    Walks ``controller.mass_spec.fragmentation_events`` in order and collects
    each event's ``peaks`` under the event's ``chem``.

    Args:
        controller: object exposing ``mass_spec.fragmentation_events``; every
            event must provide a hashable ``chem`` and an iterable ``peaks``.

    Returns:
        A ``defaultdict(list)`` mapping each chemical to the concatenation of
        the peaks of all its events. Looking up a chemical without events
        yields an empty list.
    """
    peaks_by_chem = defaultdict(list)
    for event in controller.mass_spec.fragmentation_events:
        chem, peaks = event.chem, event.peaks
        peaks_by_chem[chem].extend(peaks)
    return peaks_by_chem

In [None]:
# For every simulated run, write a CSV with the peaks of the chemicals that are
# recorded as missing in each *other* class. The CSV carries the same base name as
# the run's .mzML file and is stored under <out_dir>/MissingChemicals.
# NOTE(review): the lookup below relies on the chemicals in
# multiple_samples.missing_chemicals comparing/hashing equal to the chemicals of the
# samples that were loaded from disk -- confirm that this holds.
columns = ['formula', 'present_in', 'missing_in', 'mz', 'RT', 'intensity']
missing_dir = os.path.join(out_dir, 'MissingChemicals')

for controller, (current_class, mzml_filename) in controller_to_mzml.items():
    controller_peaks = get_chem_to_peaks(controller)
    # Same file name as the .mzML, with the extension swapped for .csv.
    csv_name = os.path.splitext(os.path.basename(mzml_filename))[0] + '.csv'

    missing_peaks = []
    for other_class in range(num_classes):
        if other_class == current_class:
            continue

        # get the peaks that are present in current_class but missing in other_class
        for chem in multiple_samples.missing_chemicals[other_class]:
            # .get avoids inserting empty entries for chemicals without any peaks
            for peak in controller_peaks.get(chem, []):
                missing_peaks.append((chem.formula.formula_string, current_class,
                                      other_class, peak.mz, peak.rt, peak.intensity))

    # convert to dataframe and write it next to the saved missing-chemicals object
    missing_df = pd.DataFrame(missing_peaks, columns=columns)
    missing_df.to_csv(os.path.join(missing_dir, csv_name))