# Experiment: Varying N in top-N DDA fragmentation

In this notebook, we calculate performance for section 2.3 in the paper.

In [None]:
import sys
sys.path.append('C:\\Users\\joewa\\Work\\git\\vimms')

%load_ext autoreload
%autoreload 2
%matplotlib inline

In [None]:
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
from matplotlib import cm
from matplotlib.ticker import LinearLocator, FormatStrFormatter
import numpy as np
import sys
from collections import defaultdict
import seaborn as sns
import os
import math

In [None]:
from vimms.PlotsForPaper import *
from vimms.Common import *

In [None]:
# Data locations. The defaults are the machine-specific paths this notebook was
# written with; set VIMMS_BASE_DIR / VIMMS_MANUSCRIPT_DATA_DIR in the environment
# to run the notebook on another machine without editing this cell.
base_dir = os.environ.get(
    'VIMMS_BASE_DIR',
    'C:\\Users\\joewa\\University of Glasgow\\Vinny Davies - CLDS Metabolomics Project\\')
manuscript_data_dir = os.environ.get(
    'VIMMS_MANUSCRIPT_DATA_DIR',
    'C:\\Users\\joewa\\Work\\data\\evaluation')

## Scenario 1: both full-scan and Top-N data are available.

Load XCMS peak picking results on the ground truth fullscan data. Peak picking was done using the script `extract_peaks.R` in the folder.

In [None]:
# Load the XCMS peaks picked from the ground-truth full-scan files (the "P" peaks).
min_ms1_intensity = 0  # threshold handed to get_df; 0 presumably disables intensity filtering
rt_range = [(3*60, 21*60)]  # retention-time window, presumably in seconds (3 to 21 minutes)
mz_range = [(0, math.inf)]  # unbounded m/z range
# Path components are passed separately so that os.path.join chooses the separator
# instead of relying on a hard-coded backslash.
results_dir = os.path.join(manuscript_data_dir, 'ground_truth', 'mzML')
csv_file = os.path.join(results_dir, 'extracted_peaks_ms1.csv')
P_peaks_df = get_df(csv_file, min_ms1_intensity, rt_range, mz_range)

Count how many ground-truth peaks XCMS found in each full-scan mzML file.
- P = peaks picked by XCMS from the ms1 data (ground truth)
- Q = peaks picked by XCMS from the fragmentation data

In [None]:
# Number of picked peaks per full-scan file, as a (filename, counts) table.
peaks_per_file = P_peaks_df.groupby('filename').size()
P_count_df = peaks_per_file.reset_index(name='counts')

In [None]:
# Display the per-file peak counts.
P_count_df

Try to load previous evaluation results, if they exist.

In [None]:
# Location of the cached scenario-1 results. Path components are passed separately
# so that os.path.join chooses the separator instead of a hard-coded backslash.
df_file = os.path.join(base_dir, 'Manuscript', '2.3. Varying N in Top-N Simulations', 'result_df.p')
try:
    result_df = load_obj(df_file)
    print(result_df.head())
except (FileNotFoundError, AttributeError):
    # No usable cache. AttributeError presumably covers a cache that cannot be
    # unpickled or has no .head() — TODO confirm. None makes the evaluation cell
    # recompute the results.
    result_df = None

If not found, then compute a new performance evaluation and save it

In [None]:
# Pairs each simulated experiment folder with the full-scan mzML file name used
# to look up its peaks in the picked-peak table.
experiment_to_filename = dict(
    beer1pos='Beer_multibeers_1_fullscan1.mzML',
    beer2pos='Beer_multibeers_2_fullscan1.mzML',
    urine02pos='Urine_StrokeDrugs_02_fullscan.mzML',
    urine03pos='Urine_StrokeDrugs_03_fullscan.mzML',
    beerqcb='QCB_22May19_1.mzML',
)
experiment_names = list(experiment_to_filename)

# Values of N to evaluate: 1..10 in steps of 1, then 15..100 in steps of 5.
Ns = list(range(1, 11)) + list(range(15, 101, 5))
rt_tols = [15]

matching_mz_tol = 10  # ppm
matching_rt_tol = 30  # seconds
min_ms1_intensity = 1.75E5  # should be the same as what's used to run the experiments

In [None]:
# Scenario-1 evaluation over every experiment / N / rt_tol combination.
# Runs only when no cached result was loaded; the new table is then saved to df_file.
if result_df is None:
    results = []
    for experiment_name in experiment_names:
        experiment_out_dir = os.path.join(manuscript_data_dir, experiment_name, 'mzML')
        fullscan_filename = experiment_to_filename[experiment_name]

        # beerqcb is evaluated with a threshold of 0, every other experiment
        # with the global min_ms1_intensity
        min_ms1 = 0 if experiment_name == 'beerqcb' else min_ms1_intensity

        for N in Ns:
            for rt_tol in rt_tols:
                controller = load_controller(experiment_out_dir, experiment_name, N, rt_tol)
                if controller is None:
                    # no controller was loaded for this setting, so there is nothing to score
                    continue

                # compute performance for scenario 1
                chemicals = load_obj(os.path.join(experiment_out_dir, 'dataset.p'))
                scores = compute_performance_scenario_1(
                    controller, chemicals, min_ms1,
                    fullscan_filename, P_peaks_df,
                    matching_mz_tol, matching_rt_tol)
                tp, fp, fn, prec, rec, f1 = scores

                res = (experiment_name, N, rt_tol, tp, fp, fn, prec, rec, f1)
                print('%s N=%d rt_tol=%d tp=%d fp=%d fn=%d prec=%.3f rec=%.3f f1=%.3f' % res)
                results.append(res)

    result_columns = ['experiment', 'N', 'rt_tol', 'TP', 'FP', 'FN', 'Prec', 'Rec', 'F1']
    result_df = pd.DataFrame(results, columns=result_columns)
    save_obj(result_df, df_file)

Add a column to the dataframe for the group (beer or urine).

In [None]:
# The return value is not captured, so add_group_column is presumably expected to
# add the column to result_df in place — TODO confirm against vimms.PlotsForPaper.
add_group_column(result_df)

In [None]:
# Preview the first rows of the scenario-1 results.
result_df.head()

Check which values of N are present in the results

In [None]:
# Distinct N values present in the scenario-1 results.
result_df['N'].unique()

Exclude `beerqcb` from the plot

In [None]:
# Rows belonging to beerqcb are flagged and left out of the plotted frame.
mask = result_df['experiment'].isin(['beerqcb'])
filtered_df = result_df.loc[~mask]

In [None]:
# filtered_df = result_df

Plot precision, recall, f1

In [None]:
# Global matplotlib font size for every figure drawn from here on.
plt.rcParams['font.size'] = 30

In [None]:
# Scenario 1: precision against N, one line per experiment; saved to topN_precision.png.
plt.figure(figsize=(12, 6))
ax = sns.lineplot(data=filtered_df, x='N', y='Prec', hue='experiment', legend='brief')
ax.legend(prop={'size': 14})
# plt.title('Precision (Alternative Case)')
for line in ax.lines:
    line.set_linewidth(5)
ax.set_ylabel('Precision')
ax.set_xlabel(r'Top-$N$')
plt.tight_layout()
plt.savefig('topN_precision.png', dpi=300)

In [None]:
# Scenario 1: recall against N, one line per experiment; saved to topN_recall.png.
plt.figure(figsize=(12, 6))
ax = sns.lineplot(data=filtered_df, x='N', y='Rec', hue='experiment', legend='brief')
ax.legend(prop={'size': 14})
# plt.title('Recall (Alternative Case)')
for line in ax.lines:
    line.set_linewidth(5)
ax.set_ylabel('Recall')
ax.set_xlabel(r'Top-$N$')
plt.tight_layout()
plt.savefig('topN_recall.png', dpi=300)

In [None]:
# Scenario 1: F1 score against N, one line per experiment; saved to topN_f1.png.
plt.figure(figsize=(12, 6))
ax = sns.lineplot(data=filtered_df, x='N', y='F1', hue='experiment', legend='brief')
ax.legend(prop={'size': 14})
# plt.title('Fragmentation Performance (Alternative Case)')
for line in ax.lines:
    line.set_linewidth(5)
ax.set_ylabel(r'$F_{1}\;score$')
ax.set_xlabel(r'Top-$N$')
plt.tight_layout()
plt.savefig('topN_f1.png', dpi=300)

### Plot XCMS peak picking results

We want to evaluate how the quality of peak picking reduces as we increase N.

- P = peaks picked by XCMS from the ms1 data (ground truth)
- Q = peaks picked by XCMS from the fragmentation data

#### Does Q decrease as N increases?

Load peak picking results for all mzMLs generated by the simulator in all experiments (beer1pos, beer2pos, urine02pos, urine03pos).

In [None]:
# Directory of the ground-truth full-scan results. Path components are passed
# separately so that os.path.join chooses the separator instead of a hard-coded backslash.
results_dir = os.path.join(manuscript_data_dir, 'ground_truth', 'mzML')

In [None]:
# Load the XCMS peaks picked from the simulated fragmentation files (the "Q" peaks),
# one table per experiment.
#
# The loading threshold has its own name on purpose: assigning 0 to the global
# `min_ms1_intensity` here would silently replace the evaluation threshold (1.75E5)
# that the scenario-2 evaluation cell reads later in the notebook.
q_min_ms1_intensity = 0
rt_range = [(3*60, 21*60)]
mz_range = [(0, math.inf)]
# beerqcb is deliberately not loaded here
experiment_names = ['beer1pos', 'beer2pos', 'urine02pos', 'urine03pos']
dfs = []
for experiment_name in experiment_names:
    print('Loading %s' % experiment_name)
    # components passed separately so os.path.join chooses the separator
    csv_file = os.path.join(manuscript_data_dir, experiment_name, 'mzML', 'extracted_peaks_ms1.csv')
    df = get_df(csv_file, q_min_ms1_intensity, rt_range, mz_range)
    dfs.append(df)

Combine all the peak picking results and count how many peaks for each file.

In [None]:
# Stack the per-experiment peak tables row-wise into a single frame; with the
# default arguments each table keeps its own index labels, so labels may repeat.
Q_peaks = pd.concat(dfs)

In [None]:
# Preview the combined Q peak table.
Q_peaks.head()

In [None]:
# Peaks per fragmentation file, annotated with the settings derived from each row.
count_df = Q_peaks.groupby('filename').size().reset_index(name='counts')

# N and rt_tol come from the vimms helpers get_N / get_dew and are stored as integers
count_df['N'] = count_df.apply(get_N, axis=1)
count_df['N'] = count_df['N'].astype('int')
count_df['rt_tol'] = count_df.apply(get_dew, axis=1)
count_df['rt_tol'] = count_df['rt_tol'].astype('int')

# the experiment name is the second underscore-separated token of the file name
count_df['experiment_name'] = count_df['filename'].map(lambda name: name.split('_')[1])
count_df['group'] = count_df.apply(experiment_group, axis=1)

In [None]:
# Preview the per-file counts with their N / rt_tol / experiment annotations.
count_df.head()

In [None]:
# actual_count = count_df[count_df['filename'].str.contains('multibeers') | count_df['filename'].str.contains('StrokeDrugs')]
# experimental_count = count_df[~count_df['filename'].str.contains('multibeers') & ~count_df['filename'].str.contains('StrokeDrugs')]
# experimental_count = experimental_count.sort_values(by=['experiment_name', 'N'])

In [None]:
# Keep only the files produced with the chosen rt_tol, ordered by experiment and N.
selected_rt_tol = 15
Q_count = (
    count_df.loc[count_df['rt_tol'] == selected_rt_tol]
    .sort_values(by=['experiment_name', 'N', 'rt_tol'])
)

In [None]:
# Preview the counts selected for the chosen rt_tol.
Q_count.head()

In [None]:
# Leave any beerqcb rows out of the plotted counts.
mask = Q_count['experiment_name'].isin(['beerqcb'])
filtered_Q_count = Q_count.loc[~mask]

In [None]:
# Number of picked peaks per fragmentation file against N; saved to topN_num_peaks.png.
sns.set_style("whitegrid")
plt.figure(figsize=(12, 6))
ax = sns.lineplot(data=filtered_Q_count, x='N', y='counts', hue='experiment_name', markers=True)
# plt.title('Number of MS1 features in fragmentation files')
for line in ax.lines:
    line.set_linewidth(5)
ax.legend(prop={'size': 20})
ax.set_ylabel('Counts')
ax.set_xlabel(r'Top-$N$')
plt.tight_layout()
plt.savefig('topN_num_peaks.png', dpi=300)

### What proportion of P peaks have a match among the Q peaks (and what proportion do not)?

In [None]:
def get_matches(row, mz_tol=10, rt_tol=10):
    """Return the fraction of ground-truth (P) peaks matched by one fragmentation file.

    Args:
        row: a record with a 'filename' entry naming a fragmentation mzML file.
            The second underscore-separated token of that name must be a key of
            ``experiment_to_filename``.
        mz_tol: m/z tolerance handed to ``match`` (default 10; presumably ppm —
            TODO confirm against ``match``).
        rt_tol: retention-time tolerance handed to ``match`` (default 10;
            presumably seconds — TODO confirm against ``match``).

    Returns:
        ``len(matches) / len(P_chemicals)`` as a float, or NaN when no P peaks
        exist for the paired full-scan file (the ratio is undefined in that case).
    """
    # get the fullscan and fragmentation mzML file names for the current row
    Q_filename = row['filename']
    Q_group = Q_filename.split('_')[1]
    P_filename = experiment_to_filename[Q_group]

    # extract peaks picked by XCMS from each file and turn them into Chemicals
    Q_chemicals = df_to_chemicals(Q_peaks, Q_filename)
    P_chemicals = df_to_chemicals(P_peaks_df, P_filename)

    matches = match(P_chemicals, Q_chemicals, mz_tol, rt_tol, verbose=False)

    # Guard the denominator: with no P peaks the division would raise
    # ZeroDivisionError and abort the whole DataFrame.apply that calls this function.
    num_P = len(P_chemicals)
    prop = len(matches) / num_P if num_P > 0 else float('nan')
    print('%s matches = %d/%d (%f)' % (Q_filename, len(matches), num_P, prop))
    return prop

In [None]:
# One get_matches call per row; the returned proportion is stored in 'matches'.
Q_count['matches'] = Q_count.apply(get_matches, axis=1)

In [None]:
# Rebuild the filtered view so that it carries the new 'matches' column.
mask = Q_count['experiment_name'].isin(['beerqcb'])
filtered_Q_count = Q_count.loc[~mask]

In [None]:
# Matched proportion per fragmentation file against N; saved to topN_prop_matched.png.
sns.set_style("whitegrid")
plt.figure(figsize=(12, 6))
ax = sns.lineplot(data=filtered_Q_count, x='N', y='matches', hue='experiment_name', markers=True)
# plt.title('Proportion of MS1 features in fragmentation files that are also found in full-scan files', size=20)
for line in ax.lines:
    line.set_linewidth(5)
ax.legend(prop={'size': 24})
ax.set_ylabel('Matches')
ax.set_xlabel(r'Top-$N$')
plt.tight_layout()
plt.savefig('topN_prop_matched.png', dpi=300)

## Scenario 2:  only Top-N DDA is available.

In [None]:
# Location of the cached scenario-2 results. Path components are passed separately
# so that os.path.join chooses the separator instead of a hard-coded backslash.
df_file_2 = os.path.join(base_dir, 'Manuscript', '2.3. Varying N in Top-N Simulations', 'result_df_2.p')
try:
    result_df_2 = load_obj(df_file_2)
    print(result_df_2.head())
except (FileNotFoundError, AttributeError):
    # No usable cache. AttributeError presumably covers a cache that cannot be
    # unpickled or has no .head() — TODO confirm. None makes the evaluation cell
    # recompute the results.
    result_df_2 = None

If not found, then compute a new performance evaluation and save it

In [None]:
# Scenario-2 evaluation over every experiment / N / rt_tol combination.
# Runs only when no cached result was loaded; the new table is then saved to df_file_2.
if result_df_2 is None:
    results = []
    for experiment_name in experiment_names:
        experiment_out_dir = os.path.join(manuscript_data_dir, experiment_name, 'mzML')
        fullscan_filename = experiment_to_filename[experiment_name]

        # beerqcb is evaluated with a threshold of 0, every other experiment
        # with the global min_ms1_intensity
        min_ms1 = 0 if experiment_name == 'beerqcb' else min_ms1_intensity

        for N in Ns:
            for rt_tol in rt_tols:

                # load chemicals and check for matching
                chemicals = load_obj(os.path.join(experiment_out_dir, 'dataset.p'))
                fragfile_filename = 'experiment_%s_N_%d_rttol_%d.mzML' % (experiment_name, N, rt_tol)

                # load controller and compute performance
                controller = load_controller(experiment_out_dir, experiment_name, N, rt_tol)
                if controller is None:
                    # no controller was loaded for this setting, so there is nothing to score
                    continue

                scores = compute_performance_scenario_2(
                    controller, chemicals, min_ms1,
                    fullscan_filename, fragfile_filename,
                    P_peaks_df, Q_peaks, matching_mz_tol, matching_rt_tol)
                tp, fp, fn, prec, rec, f1 = scores

                res = (experiment_name, N, rt_tol, tp, fp, fn, prec, rec, f1)
                print('%s N=%d rt_tol=%d tp=%d fp=%d fn=%d prec=%.3f rec=%.3f f1=%.3f' % res)
                results.append(res)

    result_columns = ['experiment', 'N', 'rt_tol', 'TP', 'FP', 'FN', 'Prec', 'Rec', 'F1']
    result_df_2 = pd.DataFrame(results, columns=result_columns)
    save_obj(result_df_2, df_file_2)

Add a column to the dataframe for the group (beer or urine).

In [None]:
# The return value is not captured, so add_group_column is presumably expected to
# add the column to result_df_2 in place — TODO confirm against vimms.PlotsForPaper.
add_group_column(result_df_2)

In [None]:
# Preview the first rows of the scenario-2 results.
result_df_2.head()

Exclude `beerqcb` from plotting

In [None]:
# Rows belonging to beerqcb are flagged and left out of the plotted frame.
mask = result_df_2['experiment'].isin(['beerqcb'])
filtered_result_df_2 = result_df_2.loc[~mask]

In [None]:
# Keep only the rows evaluated with rt_tol == 15.
filtered_result_df_2 = filtered_result_df_2.loc[filtered_result_df_2['rt_tol'] == 15]

Plot precision, recall, f1

In [None]:
# Distinct N values present in the filtered scenario-2 results.
filtered_result_df_2['N'].unique()

In [None]:
# Preview the filtered scenario-2 results that feed the plots below.
filtered_result_df_2.head()

In [None]:
# Scenario 2: precision against N, one line per experiment; saved to topN_precision_2.png.
plt.figure(figsize=(12, 6))
ax = sns.lineplot(data=filtered_result_df_2, x='N', y='Prec', hue='experiment', legend='brief')
# plt.title('Precision')
for line in ax.lines:
    line.set_linewidth(5)
ax.set_ylabel('Precision')
ax.set_xlabel(r'Top-$N$')
ax.legend(prop={'size': 20})
plt.tight_layout()
plt.savefig('topN_precision_2.png', dpi=300)

In [None]:
# Scenario 2: recall against N, one line per experiment; saved to topN_recall_2.png.
plt.figure(figsize=(12, 6))
ax = sns.lineplot(data=filtered_result_df_2, x='N', y='Rec', hue='experiment', legend='brief')
# plt.title('Recall')
for line in ax.lines:
    line.set_linewidth(5)
ax.set_ylabel('Recall')
ax.set_xlabel(r'Top-$N$')
ax.legend(prop={'size': 20})
plt.tight_layout()
plt.savefig('topN_recall_2.png', dpi=300)

In [None]:
# Scenario 2: F1 score against N, one line per experiment; saved to topN_f1_2.png.
plt.figure(figsize=(12, 6))
ax = sns.lineplot(data=filtered_result_df_2, x='N', y='F1', hue='experiment', legend='brief')
# plt.title('Fragmentation Performance (F1-score)', fontsize=24)
for line in ax.lines:
    line.set_linewidth(5)
ax.legend(prop={'size': 20})
ax.set_xlabel(r'Top-$N$')
ax.set_ylabel(r'$F_{1}\;score$')
plt.tight_layout()
plt.savefig('topN_f1_2.png', dpi=300)

In [None]:
# plt.figure(figsize=(12, 6))
# ax = sns.lineplot(x='N', y='TP', hue='experiment', legend='brief', data=filtered_result_df_2)
# # plt.title('TP')
# for l in ax.lines:
#     plt.setp(l, linewidth=5)
# plt.legend(prop={'size': 20})
# plt.tight_layout()
# plt.savefig('topN_tp_2.png', dpi=300)

In [None]:
# plt.figure(figsize=(12, 6))
# ax = sns.lineplot(x='N', y='FP', hue='experiment', legend='brief', data=filtered_result_df_2)
# # plt.title('FP')
# for l in ax.lines:
#     plt.setp(l, linewidth=5)
# plt.tight_layout()
# plt.savefig('topN_fp_2.png', dpi=300)

In [None]:
# plt.figure(figsize=(12, 6))
# ax = sns.lineplot(x='N', y='FN', hue='experiment', legend='brief', data=filtered_result_df_2)
# # plt.title('FN')
# for l in ax.lines:
#     plt.setp(l, linewidth=5)
# plt.tight_layout()
# plt.savefig('topN_fn_2.png', dpi=300)

### Make fancy 3D plot

In [None]:
# experiment_name = 'beerqcb'
# experiment_out_dir = os.path.join(manuscript_data_dir, experiment_name, 'mzML')

In [None]:
# plot_data_file = os.path.join(base_dir, 'Manuscript\\2.3. Comparison of Multiple Settings within Top N Simulations\\plot_data.p')
# try:
#     plot_data = load_obj(plot_data_file)
# except FileNotFoundError:
#     plot_data = None
# except AttributeError:
#     plot_data = None

In [None]:
# if plot_data is None:    
#     Ns = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 100]
#     rt_tols = [1, 5, 10, 15, 20, 25, 30, 45, 60, 90, 120]    
    
#     X, Y = np.meshgrid(Ns, rt_tols)
#     Z_precision = np.zeros_like(X).astype(float)
#     Z_recall = np.zeros_like(X).astype(float)
#     Z_f1 = np.zeros_like(X).astype(float)
    
#     for j in range(X.shape[1]):
#         for i in range(X.shape[0]):    
#             N = X[i, j]
#             rt_tol = Y[i, j]            
#             analysis_name = 'experiment_N_%d_rttol_%d' % (N, rt_tol) 
            
#             # load the list of chemicals that we put into the simulator for each experiment        
#             experiment_out_dir = os.path.join(manuscript_data_dir, experiment_name, 'mzML')
#             dataset = load_obj(os.path.join(experiment_out_dir, 'dataset.p'))           
#             fullscan_filename = experiment_to_filename[experiment_name]            

#             # load controller and compute performance
#             fragfile_filename = 'experiment_beer1pos_N_%d_rttol_%d.mzML' % (N, rt_tol)             
#             controller = load_controller(experiment_out_dir, experiment_name, N, rt_tol)
#             if controller is not None:
#                 tp, fp, fn, prec, rec, f1 = compute_performance_scenario_2(controller, chemicals, min_ms1_intensity,
#                                                                            fullscan_filename, fragfile_filename,
#                                                                            P_peaks_df, Q_peaks, matching_mz_tol, matching_rt_tol)
                
#                 print('%s N=%d rt_tol=%d tp=%d fp=%d fn=%d prec=%.3f rec=%.3f f1=%.3f' % (experiment_name, 
#                     N, rt_tol, tp, fp, fn, prec, rec, f1))
#                 Z_precision[i, j] = prec
#                 Z_recall[i, j] = rec
#                 Z_f1[i, j] = f1
                
#     plot_data = {
#         'X': X,
#         'Y': Y,
#         'Z_precision': Z_precision,
#         'Z_recall': Z_recall,
#         'Z_f1': Z_f1
#     }
#     save_obj(plot_data, plot_data_file)

### Make plot

In [None]:
# def make_plot(X, Y, Z, xlabel, ylabel, zlabel, title, out_file=None):
#     # Plot the surface.
#     fig = plt.figure()
#     ax = fig.gca(projection='3d')
#     surf = ax.plot_surface(X, Y, Z, cmap=cm.coolwarm,
#                            linewidth=0, antialiased=False)

#     # Customize the z axis.
#     # ax.set_zlim(-1.01, 1.01)
#     # ax.zaxis.set_major_locator(LinearLocator(10))
#     # ax.zaxis.set_major_formatter(FormatStrFormatter('%.02f'))

#     # Add a color bar which maps values to colors.
#     fig.colorbar(surf, shrink=0.5, aspect=5)
#     ax.set_xlabel(xlabel)
#     ax.set_ylabel(ylabel)
#     ax.set_zlabel(zlabel)    
#     plt.title(title)
#     plt.tight_layout()
#     if out_file is not None:
#         plt.savefig(out_file, dpi=300)
#     plt.show()

In [None]:
# X = plot_data['X']
# Y = plot_data['Y']
# Z_precision = plot_data['Z_precision']
# Z_recall = plot_data['Z_recall']
# Z_f1 = plot_data['Z_f1']

In [None]:
# %matplotlib notebook

In [None]:
# %matplotlib inline

In [None]:
# make_plot(X, Y, Z_precision, 
#           'N', 'Dynamic exclusion window (s)', 'Precision', 'Precision with varying Ns and dynamic exclusion windows',
#          out_file='plot_3d_precision.png')

In [None]:
# make_plot(X, Y, Z_recall, 
#           'N', 'Dynamic exclusion window (s)', 'Recall', 'Recall with varying Ns and dynamic exclusion windows',
#            out_file='plot_3d_recall.png')

In [None]:
# make_plot(X, Y, Z_f1, 
#           'N', 'Dynamic exclusion window (s)', 'F_1', 'F_1 score with varying Ns and dynamic exclusion windows',
#          out_file='plot_3d_f1.png')