# Loader + KDE

- this needs to have a link to download data
- create peak_sampler
- download and turn HMDB into compounds (extract_hmdb_compounds notebook in OneDrive)
- turn beer files to chemicals and pickle (example_vinny notebook in OneDrive)
    - these can then be used to run the other notebooks

In [None]:
# Add the parent directory to the import search path — presumably so the
# `vimms` package imported further down resolves from the repository
# checkout (TODO confirm the expected working directory).
import sys
sys.path.append('..')

In [None]:
# IPython autoreload in mode 2: imported modules are reloaded before each
# cell is executed, so edits to library code are picked up without a restart.
%load_ext autoreload
%autoreload 2

In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
import numpy as np
import pandas as pd
import pylab as plt
import os

from vimms.Common import *
from vimms.DataGenerator import *
from vimms.PlotsForPaper import *

## Load fragmentation mzML files containing combined MS1+MS2 data

In [None]:
def get_data_source(mzml_path, max_data, filename, ms_level, min_rt=None, max_rt=None, xcms_filename=None, plot=False):
    """Create a ``DataSource``, load data into it and optionally plot it.

    Args:
        mzml_path: Location handed to ``DataSource.load_data``.
        max_data: Forwarded to ``get_data`` as ``max_data``; only used when
            ``plot`` is true.
        filename: Forwarded to ``load_data`` as ``file_name`` and to the
            data getters used for plotting.
        ms_level: Forwarded to ``get_data``; only used when ``plot`` is true.
        min_rt: Forwarded to ``get_data``; only used when ``plot`` is true.
        max_rt: Forwarded to ``get_data``; only used when ``plot`` is true.
        xcms_filename: When not ``None``, passed to ``load_xcms_output``.
        plot: When true, draw a histogram for every entry of ``data_types``
            (and a boxplot for everything except scan durations).

    Returns:
        The populated ``DataSource`` instance.
    """
    data_source = DataSource()
    data_source.load_data(mzml_path, file_name=filename)
    if xcms_filename is not None:
        data_source.load_xcms_output(xcms_filename)

    # Nothing else to do unless diagnostic plots were requested.
    if not plot:
        return data_source

    for kind in data_types:
        if kind == SCAN_DURATION:
            # Scan durations come from a dedicated getter and only get a histogram.
            durations = data_source.get_scan_durations(filename)
            data_source.plot_histogram(durations, kind)
            continue

        values = data_source.get_data(kind, filename, ms_level, min_rt=min_rt, max_rt=max_rt, max_data=max_data)
        if kind == INTENSITY:
            # Intensities are plotted on a natural-log scale.
            values = np.log(values)
        data_source.plot_histogram(values, kind)
        data_source.plot_boxplot(values, kind)

    return data_source

In [None]:
# Arguments shared by the get_data_source(...) calls in the cells below.
# max_data and ms_level are only consumed by get_data_source when plot=True.
max_data = 50000
# No specific file selected — presumably means "use every loaded file" (TODO confirm).
filename = None
ms_level = 1

## Load fragmentation data

In [None]:
# Root folder of the input data; the fragmentation and fullscan folders are
# resolved relative to it in the next cells.
# NOTE(review): absolute path on one user's machine — adjust before running elsewhere.
data_dir = 'C:\\Users\\Vinny\\OneDrive - University of Glasgow\\CLDS Metabolomics Project\\Data\\multibeers_urine_data'

In [None]:
# Folder with the fragmentation data. Path components are passed to
# os.path.join one by one (instead of embedding '\\' in a literal) so the
# separator is chosen by the platform; on Windows the result is unchanged.
mzml_path = os.path.join(data_dir, 'beers', 'fragmentation')
# CSV handed to get_data_source as xcms_filename (loaded via load_xcms_output).
xcms_output = os.path.join(mzml_path, 'mzML', 'extracted_peaks_ms1.csv')
ds_fragmentation = get_data_source(mzml_path, max_data, filename, ms_level, xcms_filename=xcms_output)

## Load fullscan data

In [None]:
# Folder with the fullscan data. Path components are passed to os.path.join
# one by one (instead of embedding '\\' in a literal) so the separator is
# chosen by the platform; on Windows the result is unchanged.
mzml_path = os.path.join(data_dir, 'beers', 'fullscan')
# CSV handed to get_data_source as xcms_filename (loaded via load_xcms_output).
xcms_output = os.path.join(mzml_path, 'mzML', 'extracted_peaks_ms1.csv')
ds_fullscan = get_data_source(mzml_path, max_data, filename, ms_level, xcms_filename=xcms_output)

# Alternative KDE where we fit RT, m/z and Intensity together

## For beer1pos only

In [None]:
# Show the first rows of the data source's `df` attribute (presumably a
# pandas DataFrame — TODO confirm) as a quick sanity check of the load.
ds_fragmentation.df.head()

In [None]:
# Switch logging to debug level for the cells below (helper presumably
# provided by one of the vimms star imports — TODO confirm).
set_log_level_debug()

In [None]:
# Minimum MS1 / MS2 intensities handed to PeakDensityEstimator in the cells below.
# Stricter alternatives tried earlier: 2.5E5 for MS1 and 5000 for MS2.
min_ms1_intensity = min_ms2_intensity = 0

In [None]:
# Retention-time window handed to PeakDensityEstimator in the cells below
# (presumably in seconds — TODO confirm).
# A narrower window tried earlier: 3*60 to 21*60.
min_rt, max_rt = 0, 24 * 60

In [None]:
# Fit densities on a single fragmentation file and save the resulting sampler.
filename = 'Beer_multibeers_1_T10_POS.mzML'
densities = PeakDensityEstimator(min_ms1_intensity, min_ms2_intensity, min_rt, max_rt, plot=True)
# Two kde passes over the same data source; the third argument (1, then 2)
# is presumably the MS level — TODO confirm against PeakDensityEstimator.kde.
densities.kde(ds_fragmentation, filename, 1, bandwidth_mz_intensity_rt=1.0, bandwidth_n_peaks=1.0)
densities.kde(ds_fragmentation, filename, 2, bandwidth_mz_intensity_rt=1.0, bandwidth_n_peaks=1.0)
ps = PeakSampler(densities)
# NOTE(review): hard-coded absolute output path, under a different user
# profile than data_dir — adjust before running.
save_obj(ps, 'C:\\Users\\joewa\\University of Glasgow\\Vinny Davies - CLDS Metabolomics Project\\Trained Models\\peak_sampler_mz_rt_int_beer1pos_fragmentation.p')

In [None]:
# Quick check of the sampler built in the previous cell. The meaning of the
# arguments is not visible here — presumably (ms_level, n_peaks); TODO confirm.
ps.sample(2, 10)

In [None]:
# Fit densities on a single fullscan file and save the resulting sampler.
filename = 'Beer_multibeers_1_fullscan1.mzML'
densities = PeakDensityEstimator(min_ms1_intensity, min_ms2_intensity, min_rt, max_rt, plot=True)
# Only one kde pass here; the third argument (1) is presumably the MS level — TODO confirm.
densities.kde(ds_fullscan, filename, 1, bandwidth_mz_intensity_rt=1.0, bandwidth_n_peaks=1.0)
ps = PeakSampler(densities)
# NOTE(review): hard-coded absolute output path, under a different user
# profile than data_dir — adjust before running.
save_obj(ps, 'C:\\Users\\joewa\\University of Glasgow\\Vinny Davies - CLDS Metabolomics Project\\Trained Models\\peak_sampler_mz_rt_int_beer1pos_fullscan.p')

In [None]:
# Quick check of the sampler built in the previous cell. The meaning of the
# arguments is not visible here — presumably (ms_level, n_peaks); TODO confirm.
ps.sample(1, 10)

## Train on all 19 beers

In [None]:
# Same procedure as the single-file fragmentation cell, but with
# filename = None — presumably meaning every file loaded into
# ds_fragmentation is used (TODO confirm).
filename = None
densities = PeakDensityEstimator(min_ms1_intensity, min_ms2_intensity, min_rt, max_rt, plot=True)
# Two kde passes; the third argument (1, then 2) is presumably the MS level — TODO confirm.
densities.kde(ds_fragmentation, filename, 1, bandwidth_mz_intensity_rt=1.0, bandwidth_n_peaks=1.0)
densities.kde(ds_fragmentation, filename, 2, bandwidth_mz_intensity_rt=1.0, bandwidth_n_peaks=1.0)
ps = PeakSampler(densities)
# NOTE(review): hard-coded absolute output path; unlike the other save_obj
# calls in this notebook it points at a local vimms data folder.
save_obj(ps, 'C:\\Users\\Vinny\\work\\vimms\\data\\peak_sampler_mz_rt_int_19_beers_fragmentation.p')

In [None]:
# Quick check of the sampler built in the previous cell. The meaning of the
# arguments is not visible here — presumably (ms_level, n_peaks); TODO confirm.
ps.sample(2, 10)

In [None]:
# Same procedure as the single-file fullscan cell, but with filename = None —
# presumably meaning every file loaded into ds_fullscan is used (TODO confirm).
filename = None
densities = PeakDensityEstimator(min_ms1_intensity, min_ms2_intensity, min_rt, max_rt, plot=True)
# Only one kde pass here; the third argument (1) is presumably the MS level — TODO confirm.
densities.kde(ds_fullscan, filename, 1, bandwidth_mz_intensity_rt=1.0, bandwidth_n_peaks=1.0)
ps = PeakSampler(densities)
# NOTE(review): hard-coded absolute output path, under a different user
# profile than data_dir — adjust before running.
save_obj(ps, 'C:\\Users\\joewa\\University of Glasgow\\Vinny Davies - CLDS Metabolomics Project\\Trained Models\\peak_sampler_mz_rt_int_19_beers_fullscan.p')

In [None]:
# Quick check of the sampler built in the previous cell. The meaning of the
# arguments is not visible here — presumably (ms_level, n_peaks); TODO confirm.
ps.sample(1, 10)