# Compute fragmentation performance from mzML

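In this notebook, we compute the fragmentation performance reported in Section 2.4 of the manuscript ("Varying Multiple Parameters in Top-N Simulations"), comparing real and simulated BeerQCB mzML files.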

In [None]:
import sys
# Make the local vimms checkout importable.
# NOTE(review): this is a machine-specific absolute path; change it (or install vimms)
# before running the notebook on another machine.
sys.path.append('C:\\Users\\joewa\\Work\\git\\vimms')

# Reload edited modules automatically and render matplotlib figures inline.
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [None]:
import glob
import math
import os
import sys
from collections import defaultdict

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import cm
from matplotlib.ticker import LinearLocator, FormatStrFormatter
from mpl_toolkits.mplot3d import Axes3D

In [None]:
from vimms.Common import *
from vimms.Roi import *
from vimms.MassSpec import *
from vimms.TopNExperiment import *
from vimms.PlotsForPaper import *

In [None]:
# Root folders used to build every input and output path in this notebook.
# NOTE(review): both are absolute Windows paths on the original author's machine and
# must be edited before the notebook can run elsewhere.
base_dir = 'C:\\Users\\joewa\\University of Glasgow\\Vinny Davies - CLDS Metabolomics Project\\'
manuscript_data_dir = 'C:\\Users\\joewa\\Work\\data\\evaluation'

In [None]:
# Files in the manuscript folder where the real and simulated result dataframes are
# written by save_obj and read back by load_obj further down.
# NOTE(review): the '.p' suffix suggests pickle files -- confirm against save_obj/load_obj.
result_dir = os.path.join(base_dir, 'Manuscript\\2.4. Varying Multiple Parameters in Top-N Simulations') 
real_file = os.path.join(result_dir, 'beerqcb_real_results.p')
simulated_file = os.path.join(result_dir, 'beerqcb_mzml_simulated_results.p')

In [None]:
# Parameter grid: in the simulated section every N is combined with every rt_tol, and the
# same two lists define the rows/columns of the F1 heatmaps.
# rt_tol is labelled 'DEW (s)' in the plots.
Ns = [1, 2, 3, 4, 5, 10, 15, 20, 35, 50]
rt_tols = [15, 30, 60, 120]
# Used to build the folder and file names of the simulated mzML files.
experiment_name = 'beerqcb'

In [None]:
# Helper pulled in by the vimms star-imports above; presumably switches logging to DEBUG
# level -- confirm in vimms.Common.
set_log_level_debug()

### Load ground truth peaks

Count how many ground-truth peaks were found by XCMS in each full-scan mzML file.
- P = peaks picked by XCMS from the MS1 (full-scan) data, used as ground truth
- Q = peaks picked by XCMS from the fragmentation data

In [None]:
# Filters passed to get_df for every peak table loaded below.
min_ms1_intensity = 0
# Alternative, narrower retention-time window that was tried:
# rt_range = [(3*60, 21*60)]
# rt_range[0] is also read by get_params to fill 'min_rt' / 'max_rt'.
# NOTE(review): the 3*60 form above suggests the unit is seconds -- confirm.
rt_range = [(0, 1600)]
mz_range = [(0, math.inf)]

In [None]:
# P: peak table from the ground-truth (full-scan) folder, filtered with the settings above.
# NOTE(review): `results_dir` here is a different variable from `result_dir` defined
# earlier (manuscript output folder); the names are easy to confuse.
results_dir = os.path.join(manuscript_data_dir, 'ground_truth\\mzML')   
csv_file = os.path.join(results_dir, 'extracted_peaks_ms1_alternative_2.csv')
P_peaks_df = get_df(csv_file, min_ms1_intensity, rt_range, mz_range)
# Number of rows (peaks) per value of the 'filename' column.
P_count_df = P_peaks_df.groupby('filename').size().reset_index(name='counts')
P_count_df

In [None]:
# Q (real): peak table stored next to the real fragmentation mzML files, loaded with the
# same filters as P. Note that `csv_file` is re-bound here.
csv_file = os.path.join(base_dir, 'Data\\Fusion_1578_Ronan_Daly_CLP_pHILIC_22May19\\Positive\\fragmentation\\mzML\\extracted_peaks_ms1_alternative_2.csv')
Q_peaks_real_df = get_df(csv_file, min_ms1_intensity, rt_range, mz_range)
# Number of rows (peaks) per value of the 'filename' column; only the first rows are shown.
Q_count_real_df = Q_peaks_real_df.groupby('filename').size().reset_index(name='counts')
Q_count_real_df.head()

In [None]:
# Q (simulated): peak table stored in the simulated experiment's mzML folder, loaded with
# the same filters as P. Note that `csv_file` is re-bound here.
csv_file = os.path.join(manuscript_data_dir, '%s\\mzML\\extracted_peaks_ms1_alternative_2.csv' % experiment_name)
Q_peaks_simulated_df = get_df(csv_file, min_ms1_intensity, rt_range, mz_range)
# Number of rows (peaks) per value of the 'filename' column; only the first rows are shown.
Q_count_simulated_df = Q_peaks_simulated_df.groupby('filename').size().reset_index(name='counts')
Q_count_simulated_df.head()

### Define helper functions

In [None]:
def get_params(N, rt_tol, controller_file, chemicals_file, fragfile, P_peaks_df, Q_peaks_df, scenario):
    """Assemble the parameter dictionary describing one evaluation run.

    The caller-supplied arguments are stored under keys of the same name. The ROI,
    intensity and matching tolerances and the fullscan filename are fixed values,
    and 'min_rt' / 'max_rt' are taken from the first entry of the notebook-level
    ``rt_range``.

    Returns:
        dict: one parameter set, in the key order listed below.
    """
    rt_window = rt_range[0]
    params = dict(
        fragfile=fragfile,
        N=N,
        rt_tol=rt_tol,
        roi_mz_tol=30,
        roi_min_length=1,
        roi_min_ms1_intensity=0,
        fragmentation_min_ms1_intensity=0,
        min_rt=rt_window[0],
        max_rt=rt_window[1],
        fullscan_filename='QCB_22May19_1.mzML',
        P_peaks_df=P_peaks_df,
        Q_peaks_df=Q_peaks_df,
        matching_mz_tol=10,
        matching_rt_tol=30,
        scenario=scenario,
        controller_file=controller_file,
        chemicals_file=chemicals_file,
    )
    return params

### Compute Real Performance

In [None]:
# Glob pattern (despite the variable name) matching every mzML file in the real
# fragmentation folder.
fragfile_dir = os.path.join(base_dir, 'Data\\Fusion_1578_Ronan_Daly_CLP_pHILIC_22May19\\Positive\\fragmentation\\mzML\\*.mzML')
# glob returns matches in arbitrary order; sort so the run order is reproducible.
fragfiles = sorted(glob.glob(fragfile_dir))

# Keep only the QCB Top-N runs. The test looks at the file name alone (not the folder
# part of the path) and no longer parses N / rt_tol here: the parsed values were never
# used, and parsing happened before unrelated files had been filtered out.
filtered_fragfiles = [
    fragfile for fragfile in fragfiles
    if 'QCB_N' in os.path.basename(fragfile)
]

len(filtered_fragfiles)

In [None]:
# Chemicals are taken from the dataset extracted from the fullscan file. The path is the
# same for every fragmentation file, so it is built once rather than inside the loop.
chemicals_file = os.path.join(manuscript_data_dir, experiment_name, 'mzML\\dataset.p')

all_params = []
for fragfile in filtered_fragfiles:
    # N and rt_tol are recovered from the file name.
    N, rt_tol = get_N_rt_tol_from_qcb_filename(fragfile)
    # Fragmentation events are extracted from the fragmentation file itself.
    controller_file = fragfile
    # Scenario 2 is the one the evaluation section filters on; pass 1 as the last
    # argument to evaluate scenario 1 instead.
    all_params.append(get_params(N, rt_tol, controller_file, chemicals_file, fragfile, P_peaks_df, Q_peaks_real_df, 2))
len(all_params)

In [None]:
# Serial alternative to the parallel run in the next cell (uncomment to use, e.g. for debugging):
# real_df = evaluate_serial(all_params)

In [None]:
# Evaluate every real-data parameter set; the result is used as a dataframe below.
# Presumably runs the evaluations in parallel, per the helper's name -- confirm in vimms.
real_df = evaluate_parallel(all_params)

In [None]:
# Preview the first rows of the real-data results.
real_df.head()

In [None]:
# Store the real-data results in real_file; the evaluation section reloads them from there.
save_obj(real_df, real_file)

### Compute Simulated Performance from mzML

In [None]:
fragfile_dir = os.path.join(manuscript_data_dir, experiment_name, 'mzML\\')
# Chemicals are taken from the dataset extracted from the fullscan file. The path does
# not depend on N or rt_tol, so it is built once rather than inside the nested loop.
chemicals_file = os.path.join(fragfile_dir, 'dataset.p')

all_params = []
for N in Ns:
    for rt_tol in rt_tols:
        fragfile = os.path.join(fragfile_dir, 'experiment_%s_N_%d_rttol_%d.mzML' % (experiment_name, N, rt_tol))

        # Fragmentation events are extracted from the simulated mzML file itself.
        # Alternatives that were tried previously:
        #   - chemicals from the mzML too:  chemicals_file = fragfile
        #   - events from the controller:   controller_file = os.path.join(
        #         fragfile_dir, 'experiment_%s_N_%d_rttol_%d.p' % (experiment_name, N, rt_tol))
        controller_file = fragfile

        # Scenario 2 is the one the evaluation section filters on; pass 1 as the last
        # argument to evaluate scenario 1 instead.
        all_params.append(get_params(N, rt_tol, controller_file, chemicals_file, fragfile, P_peaks_df, Q_peaks_simulated_df, 2))

In [None]:
# Evaluate every simulated parameter set; the result is used as a dataframe below.
# Presumably runs the evaluations in parallel, per the helper's name -- confirm in vimms.
simulated_df = evaluate_parallel(all_params)

In [None]:
# Store the simulated results in simulated_file; the evaluation section reloads them from there.
save_obj(simulated_df, simulated_file)

### Evaluate performance

In [None]:
# Base font size for figures.
# NOTE(review): this is overridden to 22 a few cells below, before any figure is drawn.
plt.rcParams.update({'font.size': 14})

In [None]:
# Reload the saved results so this section does not depend on the evaluation cells above
# having been run in the current kernel session.
# NOTE(review): if load_obj unpickles the files, only load files from a trusted source.
real_df = load_obj(real_file)
simulated_df = load_obj(simulated_file)

In [None]:
# Keep only the rows of the scenario being reported. Both names are re-bound to the
# filtered frames, so everything below sees scenario-2 rows only.
scenario = 2
real_df = real_df[real_df['scenario'] == scenario]
simulated_df = simulated_df[simulated_df['scenario'] == scenario]

In [None]:
# Show a single row of the real results to check the available columns.
real_df.head(1)

In [None]:
# Show a single row of the simulated results to check the available columns.
simulated_df.head(1)

In [None]:
# Tag each result set with its origin and stack them into one frame.
# .assign returns new frames instead of writing a column into the boolean-filtered
# slices created by the scenario filter, which avoids pandas' SettingWithCopyWarning and
# keeps the cell safe to re-run.
real_df = real_df.assign(data='real')
simulated_df = simulated_df.assign(data='simulated')
combined_df = pd.concat([real_df, simulated_df])

In [None]:
# Larger font and white-grid seaborn style for the figures below.
plt.rcParams.update({'font.size': 22})
sns.set_style("whitegrid")

In [None]:
# F1 range over both result sets, padded by 0.05 on each side.
# NOTE(review): lineplot() does not take or read `ylim`, and `ylim` is recomputed without
# padding in the heatmap section, so this value appears to be unused.
all_f1 = np.concatenate([real_df['F1'].values, simulated_df['F1'].values])
ylim = [min(all_f1)-0.05, max(all_f1)+0.05]

In [None]:
def lineplot(df, column_name, title):
    """Draw `column_name` against N, with one line per rt_tol value, and show the figure.

    Args:
        df: results dataframe containing the columns 'N', 'rt_tol' and `column_name`.
        column_name: name of the column plotted on the y-axis.
        title: title placed above the plot.
    """
    plt.figure(figsize=(12, 6))

    # One shade of blue per distinct rt_tol value. For four values this yields exactly
    # sns.color_palette("Blues")[0:4] (named palettes default to six colours); for any
    # other count the palette length still matches the number of hue levels.
    n_levels = df['rt_tol'].nunique()
    palette = sns.color_palette("Blues", n_levels + 2)[:n_levels] if n_levels else None
    ax = sns.lineplot(x='N', y=column_name, hue='rt_tol', legend='brief', data=df, palette=palette)

    # Relabel the hue legend as 'DEW (s)'. Older seaborn releases put the hue name in the
    # first legend entry, newer ones put it in the legend title; handle both rather than
    # blindly overwriting the first entry (which may be a real rt_tol value).
    legend = ax.get_legend()
    if legend is not None:
        entries = legend.get_texts()
        if entries and entries[0].get_text() == 'rt_tol':
            entries[0].set_text('DEW (s)')
        else:
            legend.set_title('DEW (s)')

    plt.title(title)
    # Thicken the plotted lines after the legend exists, as before.
    for l in ax.lines:
        plt.setp(l, linewidth=5)
    plt.tight_layout()
    plt.show()

In [None]:
# One pair of line plots (real, then simulated) per metric column, in the same order as
# the six separate copy-pasted cells this loop replaces.
for column_name in ['TP', 'FP', 'FN', 'Prec', 'Rec', 'F1']:
    lineplot(real_df, column_name, 'BeerQCB (Real)')
    lineplot(simulated_df, column_name, 'BeerQCB (Simulated)')

### Heatmaps

In [None]:
def get_value(result_df, column_name, N, rt_tol, scenario):
    """Return the `column_name` value of the row matching (N, rt_tol, scenario).

    Args:
        result_df: dataframe with 'N', 'rt_tol', 'scenario' and `column_name` columns.
        column_name: column whose value is returned.
        N, rt_tol, scenario: values the row must match exactly.

    Returns:
        The value from the first matching row (later duplicates are ignored).

    Raises:
        IndexError: if no row matches; the message names the missing combination.
    """
    idx = (result_df['N'] == N) & (result_df['rt_tol'] == rt_tol) & (result_df['scenario'] == scenario)
    matches = result_df.loc[idx, column_name].values
    if len(matches) == 0:
        # Same exception type as the previous bare `.values[0]`, but it now says which
        # parameter combination is missing from the results.
        raise IndexError('No row in result_df with N=%s, rt_tol=%s, scenario=%s'
                         % (N, rt_tol, scenario))
    return matches[0]

In [None]:
def make_heatmap(mat, xticklabels, yticklabels, title, xlabel, ylabel, vmin, vmax, outfile):
    """Draw `mat` as a seaborn heatmap on a new 6x6 figure and save it to `outfile`.

    Args:
        mat: 2-D array of values to plot.
        xticklabels, yticklabels: tick labels for the columns / rows of `mat`.
        title: figure title, or None for no title.
        xlabel, ylabel: axis labels.
        vmin, vmax: colour-scale limits passed to seaborn (None lets seaborn choose).
        outfile: path the figure is written to (300 dpi).
    """
    plt.figure(figsize=(6, 6))
    sns.heatmap(mat, xticklabels=xticklabels, yticklabels=yticklabels, vmin=vmin, vmax=vmax)
    # Every call in this notebook passes title=None. Skip the call in that case so that
    # no title is drawn (older matplotlib releases may render the literal text 'None').
    if title is not None:
        plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.tight_layout()
    plt.savefig(outfile, dpi=300)

In [None]:
# F1 matrices with one row per N and one column per rt_tol, filled in the next cell.
f1_real = np.zeros((len(Ns), len(rt_tols)))
f1_simulated = np.zeros((len(Ns), len(rt_tols)))

In [None]:
# Look up the F1 score of every (N, rt_tol) combination for the chosen scenario and
# store it at the matching [row, column] position of each matrix.
scenario = 2
for i, N in enumerate(Ns):
    for j, rt_tol in enumerate(rt_tols):
        f1_real[i, j] = get_value(real_df, 'F1', N, rt_tol, scenario)
        f1_simulated[i, j] = get_value(simulated_df, 'F1', N, rt_tol, scenario)

In [None]:
# Shared colour-scale limits (unpadded F1 range over both result sets) so that the real
# and simulated heatmaps below use the same scale. This re-binds `ylim`.
all_f1 = np.concatenate([real_df['F1'].values, simulated_df['F1'].values])
ylim = [min(all_f1), max(all_f1)]

In [None]:
# Two columns (real, simulated) with one row per (N, rt_tol) cell of the F1 matrices.
# The bare `boxplot_data` statement that used to sit mid-cell was removed: only the last
# expression of a cell is displayed, so it had no effect.
boxplot_data = np.array([f1_real.flatten(), f1_simulated.flatten()]).transpose()
boxplot_df = pd.DataFrame(boxplot_data, columns=['Real', 'Simulated'])
boxplot_df

In [None]:
# Box plot of the F1 scores, one box per column of boxplot_df, saved as a 300 dpi PNG in
# the current working directory.
ax = sns.boxplot(data=boxplot_df)
ax.set_ylabel('F1-score')
ax.set_xticklabels(['Real', 'Simulated'])
# Optional title, currently disabled:
# plt.title('Fragmentation Performance (BeerQCB)', fontsize=20)
plt.tight_layout()
plt.savefig('qcb_fragmentation_performance.png', dpi=300)

In [None]:
# F1 heatmap for the real data: rows are Top-N values, columns are DEW values, and the
# colour scale is the shared F1 range in ylim.
make_heatmap(f1_real, rt_tols, Ns, None, 'DEW (s)', 'Top-N', ylim[0], ylim[1],
             'qcb_real_performance_heatmap.png')

In [None]:
# F1 heatmap for the simulated data, on the same colour scale as the real-data heatmap.
make_heatmap(f1_simulated, rt_tols, Ns, None, 'DEW (s)', 'Top-N', ylim[0], ylim[1],
             'qcb_simulated_performance_heatmap.png')

In [None]:
# Five real-data rows with the highest F1, rounded to three decimals.
real_df.nlargest(5, 'F1').round(3)

In [None]:
# Five simulated rows with the highest F1, rounded to three decimals.
simulated_df.nlargest(5, 'F1').round(3)

In [None]:
# Five real-data rows with the lowest F1, rounded to three decimals.
real_df.nsmallest(5, 'F1').round(3)

In [None]:
# Five simulated rows with the lowest F1, rounded to three decimals.
simulated_df.nsmallest(5, 'F1').round(3)

In [None]:
# Element-wise difference (simulated minus real) of the F1 matrices; positive cells mean
# the simulation scored higher. vmin/vmax are None, so seaborn picks the colour scale.
f1_diff = f1_simulated - f1_real
make_heatmap(f1_diff, rt_tols, Ns, None, 'DEW (s)', 'Top-N', None, None,
             'qcb_diff.png')