# TRAINING NOTEBOOK

## Environment

In [1]:
# (Re)load the autoreload extension; mode 2 re-imports all modules before each
# cell runs, so edits to imported project code are picked up without a restart.
%reload_ext autoreload
%autoreload 2

# Render matplotlib figures inline, at high-DPI ("retina") resolution.
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [2]:
# Change main system path to be able to run code from src folder
import sys


def find_project_root(path):
    """Return `path` with a trailing notebook-folder suffix removed.

    The notebook is started either from `<root>/notebooks` or from
    `<root>/techdoc/content`; both POSIX and Windows separators are handled.
    If `path` ends with none of the known suffixes it is returned unchanged,
    so the caller always receives a usable value.
    """
    known_suffixes = (
        '/notebooks', '/techdoc/content',      # Mac OS / Linux
        '\\notebooks', '\\techdoc\\content',   # Windows OS
    )
    for suffix in known_suffixes:
        if path.endswith(suffix):
            return path[:-len(suffix)]
    return path


p = sys.path[0]
main_path = find_project_root(p)
sys.path[0] = main_path

In [3]:
import os, gc
from termcolor import colored
from tqdm import tqdm

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from sklearn.metrics import auc, log_loss

from scipy.signal import find_peaks
# NOTE(review): the `scipy.ndimage.filters` namespace is deprecated in recent
# SciPy releases; `gaussian_filter1d` is also exposed by `scipy.ndimage`.
from scipy.ndimage.filters import gaussian_filter1d

from src import (config, features, preprocess, training)

  from pandas import MultiIndex, Int64Index


# DATA PREPARATION

In [4]:
# ===== LOAD DATA ======
# All inputs are CSV files read from config.DATA_DIR; the shape of each table
# is printed as a quick sanity check.
metadata = pd.read_csv(config.DATA_DIR + 'metadata.csv')
print(f'Metadata: {metadata.shape}')

train_labels = pd.read_csv(config.DATA_DIR + 'train_labels.csv')
print(f'Train labels: {train_labels.shape}')

valid_labels = pd.read_csv(config.DATA_DIR + 'val_labels.csv')
# Labelled as "Valid" so the two label tables can be told apart in the output
print(f'Valid labels: {valid_labels.shape}')

submission = pd.read_csv(config.DATA_DIR + 'submission_format.csv')
print(f'Submission: {submission.shape}')

Metadata: (1570, 5)
Train labels: (766, 11)
Train labels: (293, 11)
Submission: (804, 11)


In [5]:
# Stack the train and validation label tables row-wise into a single frame
trvl_labels = pd.concat((train_labels, valid_labels), axis='index')

In [6]:
# ===== MODEL SAMPLES ======
# One metadata frame per split, each re-indexed from zero.
train = metadata.loc[metadata['split'] == 'train'].reset_index(drop=True)
print(f'TRAIN: {train.shape}')

valid = metadata.loc[metadata['split'] == 'val'].reset_index(drop=True)
print(f'VALID: {valid.shape}')

test = metadata.loc[metadata['split'] == 'test'].reset_index(drop=True)
print(f'TEST: {test.shape}')

TRAIN: (766, 5)
VALID: (293, 5)
TEST: (511, 5)


In [7]:
# ===== FILE PATHS OF SAMPLES =====
# Each dict maps the metadata row index to that sample's features_path.
train_files = metadata.loc[metadata['split'] == 'train', 'features_path'].to_dict()
valid_files = metadata.loc[metadata['split'] == 'val', 'features_path'].to_dict()
test_files = metadata.loc[metadata['split'] == 'test', 'features_path'].to_dict()
# Train & Valid files
trva_files = {**train_files, **valid_files}
# All files
all_test_files = {**valid_files, **test_files}

# Ion type list: m/z 0.0 .. 99.0 with 4.0 left out
ion_list = [mz for mz in np.arange(0, 100, 1.0) if mz != 4.0]

# Get the names of the target columns in a list
target_labels_list = [col for col in train_labels.columns if col != 'sample_id']
print(target_labels_list)

['basalt', 'carbonate', 'chloride', 'iron_oxide', 'oxalate', 'oxychlorine', 'phyllosilicate', 'silicate', 'sulfate', 'sulfide']


# FEATURE ENGINEERING

In the benchmark notebook the features are discretized into bins of 100 degrees.

In [None]:
#TODO How long does it take for temperature to rise to certain level where we get abundance?
#TODO How to account for the non-linear differences of time vs temp?
#TODO Each type of ion as a feature, make a matrix and append to the training dataset
#TODO Max value of abundance of each ion - take temp reading and time - or duration
#TODO How to define characteristic of peaks in ion abundances; temp at which they occur, the shape of peaks (height, width, area)
#TODO How to deal with overlapping peaks
#TODO Number of peaks, size of the peaks
#TODO Can we do some target encoding?

## Metadata level

**Compute min and max temp for all available samples, including the ion types**

In [None]:
# Get min/max temperature values and unique ion list
# Preprocess the data to remove noise before calculation
# NOTE(review): this rebinds `ion_list`, replacing the hard-coded list built in
# the file-paths cell; later feature cells use whichever assignment ran last.
min_temp, max_temp, ion_list = preprocess.compute_min_max_temp_ion(metadata)
print(colored(f'Min temp = {min_temp}; Max temp = {max_temp}', 'blue'))
print(colored(f'Number of unique ions: {len(ion_list)}', 'blue'))

**Top N ions based on max relative abundance**

In [8]:
# Compute the top-3 ions per sample and expose the sample id as a regular column
topN_ions = (
    features.get_topN_ions(metadata, N=3)
    .rename_axis('sample_id')
    .reset_index()
)
topN_ions.head()

100%|██████████| 1570/1570 [02:09<00:00, 12.10it/s]


Unnamed: 0,sample_id,top_1,top_2,top_3
0,S0000,0.18,0.17,0.16
1,S0001,0.44,0.16,0.02
2,S0002,0.18,0.32,0.17
3,S0003,0.18,0.44,0.17
4,S0004,0.18,0.32,0.17


In [9]:
# Merge to metadata
# Drop any top-N columns left over from a previous run of this cell first, so
# that re-running it does not create suffixed duplicates (top_1_x, top_1_y, ...).
topN_cols = [col for col in topN_ions.columns if col != 'sample_id']
metadata = pd.merge(metadata.drop(columns=topN_cols, errors='ignore'),
                    topN_ions, on='sample_id', how='left')
metadata.head()

Unnamed: 0,sample_id,split,instrument_type,features_path,features_md5_hash,top_1,top_2,top_3
0,S0000,train,commercial,train_features/S0000.csv,017b9a71a702e81a828e6242aa15f049,0.18,0.17,0.16
1,S0001,train,commercial,train_features/S0001.csv,0d09840214054d254bd49436c6a6f315,0.44,0.16,0.02
2,S0002,train,commercial,train_features/S0002.csv,3f58b3c9b001bfed6ed4e4f757083e09,0.18,0.32,0.17
3,S0003,train,commercial,train_features/S0003.csv,e9a12f96114a2fda60b36f4c0f513fb1,0.18,0.44,0.17
4,S0004,train,commercial,train_features/S0004.csv,b67603d3931897bfa796ac42cc16de78,0.18,0.32,0.17


In [10]:
# Persist the enriched metadata (row index is not written)
metadata.to_csv(config.DATA_DIR_OUT + 'metadata.csv', index=False)

## Max relative abundance per temp bin and ion type

Bin the temp from min to max in intervals of 100 degrees and combine with the ion type i.e. `m/z`. For each combination compute the maximum relative abundance.

In [11]:
# ===== TRAIN & VALID SAMPLE =====
# Messages name the train+valid set explicitly so this output cannot be
# confused with the valid+test cell that follows.
print(f'Number of train & valid files: {len(trva_files)}')
fts_maxrelabund_tempion = features.features_iontemp_abun(metadata, trva_files)
print(f'Train-Valid features: {fts_maxrelabund_tempion.shape}')
# Missing values are replaced by zero before saving
fts_maxrelabund_tempion = fts_maxrelabund_tempion.replace(np.nan, 0)
fts_maxrelabund_tempion.to_csv(os.path.join(config.DATA_DIR_OUT + 
                                            'fts_maxrelabund_tempion_trvl.csv'), 
                index=False)

Number of all test files: 1059
Number of samples: 1059
Val-Test features: (1059, 1584)


In [12]:
# ===== VALID & TEST SAMPLE =====
print(f'Number of all test files: {len(all_test_files)}')
fts_maxrelabund_tempion_VT = features.features_iontemp_abun(metadata, all_test_files)
print(f'Val-Test features: {fts_maxrelabund_tempion_VT.shape}')
# Missing values become zero before the table is written out
fts_maxrelabund_tempion_VT = fts_maxrelabund_tempion_VT.replace(to_replace=np.nan, value=0)
fts_maxrelabund_tempion_VT.to_csv(
    config.DATA_DIR_OUT + 'fts_maxrelabund_tempion_VT_trvl.csv',
    index=False,
)

Number of all test files: 804
Number of samples: 804
Val-Test features: (804, 1584)


## Duration to max temperature per ion type

Ion types are presented in columns, rows are samples, and values are the time in seconds to maximum abundance. We should also record the temperature at which this happens.

In [13]:
# ===== TRAIN SAMPLE =====
fts_dur_maxabund_ion = features.features_ion_duration_maxtemp(
    metadata, trva_files, ion_list
)
print(f'Features: {fts_dur_maxabund_ion.shape}')
# NOTE(review): unlike the other feature exports, this one is written WITH the
# row index - confirm whether that difference is intentional.
fts_dur_maxabund_ion.to_csv(config.DATA_DIR_OUT + 'fts_dur_maxabund_ion_trvl.csv')
#TODO Fix the warnings

  fts_df[sample_name] = fts_df['m/z'].map(duration_max_abund)
  fts_df[sample_name] = fts_df['m/z'].map(duration_max_abund)
  fts_df[sample_name] = fts_df['m/z'].map(duration_max_abund)
  fts_df[sample_name] = fts_df['m/z'].map(duration_max_abund)
  fts_df[sample_name] = fts_df['m/z'].map(duration_max_abund)
  fts_df[sample_name] = fts_df['m/z'].map(duration_max_abund)
  fts_df[sample_name] = fts_df['m/z'].map(duration_max_abund)
  fts_df[sample_name] = fts_df['m/z'].map(duration_max_abund)
  fts_df[sample_name] = fts_df['m/z'].map(duration_max_abund)
  fts_df[sample_name] = fts_df['m/z'].map(duration_max_abund)
  fts_df[sample_name] = fts_df['m/z'].map(duration_max_abund)
  fts_df[sample_name] = fts_df['m/z'].map(duration_max_abund)
  fts_df[sample_name] = fts_df['m/z'].map(duration_max_abund)
  fts_df[sample_name] = fts_df['m/z'].map(duration_max_abund)
  fts_df[sample_name] = fts_df['m/z'].map(duration_max_abund)
  fts_df[sample_name] = fts_df['m/z'].map(duration_max_abund)
  fts_df

Features: (1059, 99)


In [14]:
# ===== VALID & TEST SAMPLE =====
fts_dur_maxabund_ion_VT = features.features_ion_duration_maxtemp(
    metadata, all_test_files, ion_list
)
print(f'Features: {fts_dur_maxabund_ion_VT.shape}')
fts_dur_maxabund_ion_VT.to_csv(
    config.DATA_DIR_OUT + 'fts_dur_maxabund_ion_VT_trvl.csv',
    index=False,
)

  fts_df[sample_name] = fts_df['m/z'].map(duration_max_abund)
  fts_df[sample_name] = fts_df['m/z'].map(duration_max_abund)
  fts_df[sample_name] = fts_df['m/z'].map(duration_max_abund)
  fts_df[sample_name] = fts_df['m/z'].map(duration_max_abund)
  fts_df[sample_name] = fts_df['m/z'].map(duration_max_abund)
  fts_df[sample_name] = fts_df['m/z'].map(duration_max_abund)
  fts_df[sample_name] = fts_df['m/z'].map(duration_max_abund)
  fts_df[sample_name] = fts_df['m/z'].map(duration_max_abund)
  fts_df[sample_name] = fts_df['m/z'].map(duration_max_abund)
  fts_df[sample_name] = fts_df['m/z'].map(duration_max_abund)
  fts_df[sample_name] = fts_df['m/z'].map(duration_max_abund)
  fts_df[sample_name] = fts_df['m/z'].map(duration_max_abund)
  fts_df[sample_name] = fts_df['m/z'].map(duration_max_abund)
  fts_df[sample_name] = fts_df['m/z'].map(duration_max_abund)
  fts_df[sample_name] = fts_df['m/z'].map(duration_max_abund)
  fts_df[sample_name] = fts_df['m/z'].map(duration_max_abund)
  fts_df

Features: (804, 99)





In [24]:
# Preview the first rows of the train+valid duration features
fts_dur_maxabund_ion.head()

m/z,Ion_0.0,Ion_1.0,Ion_2.0,Ion_3.0,Ion_5.0,Ion_6.0,Ion_7.0,Ion_8.0,Ion_9.0,Ion_10.0,...,Ion_90.0,Ion_91.0,Ion_92.0,Ion_93.0,Ion_94.0,Ion_95.0,Ion_96.0,Ion_97.0,Ion_98.0,Ion_99.0
S0000,431.06,420.772,451.608,646.673,461.865,441.332,379.675,369.41,451.608,482.409,...,1427.113,677.473,1057.428,872.615,225.726,297.58,903.392,441.332,821.276,656.939
S0001,,2179.0,0.0,0.0,1775.0,2193.0,993.0,2193.0,832.0,1748.0,...,,,,,,,,,,
S0002,250.135,244.92,250.135,354.444,271.001,224.037,208.391,1198.828,239.706,338.81,...,1537.174,1058.248,693.318,787.266,323.175,1469.532,849.85,818.571,265.797,1329.009
S0003,208.427,197.995,208.427,218.85,145.891,312.485,208.427,333.314,1067.88,994.89,...,1177.466,1208.744,1302.667,1250.478,458.154,958.416,291.69,161.526,1443.645,10.425
S0004,223.558,218.36,228.759,426.501,369.18,202.767,233.955,259.948,213.153,192.38,...,1213.207,1525.535,1056.942,447.297,635.024,452.52,525.469,1135.043,775.7,135.219


## Combine max relative abundance and duration

In [15]:
# ===== TRAIN SAMPLE =====
# Place the temp-bin abundance features and the duration features side by side
# (column-wise concat, aligned on the row index).
combo_maxabund_dur = pd.concat(
    [fts_maxrelabund_tempion, fts_dur_maxabund_ion],
    axis='columns',
)
print(combo_maxabund_dur.shape)
combo_maxabund_dur.to_csv(
    config.DATA_DIR_OUT + 'combo_maxabund_dur_trvl.csv',
    index=False,
)

(1059, 1683)


In [16]:
# ===== VALID & TEST SAMPLE =====
# Same column-wise combination for the valid+test feature tables.
combo_maxabund_dur_VT = pd.concat(
    [fts_maxrelabund_tempion_VT, fts_dur_maxabund_ion_VT],
    axis='columns',
)
print(combo_maxabund_dur_VT.shape)
combo_maxabund_dur_VT.to_csv(
    config.DATA_DIR_OUT + 'combo_maxabund_dur_VT_trvl.csv',
    index=False,
)

(804, 1683)


## Analysis of time series

- Percentage change in relative abundance per temp-ion: for each temperature (bin)-ion combination, compute the change in relative abundance.
- find number of peaks
- area under the curve for each ion type

### Hands-On Analysis

- add ion_cnt_peaks
- add time to peak
- first bigger than second if more than one

References:  
- [finding peaks in MS data](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2631518/)
- [SO comment on scipy](https://stackoverflow.com/questions/1713335/peak-finding-algorithm-for-python-scipy)

In [None]:
# Fetch the sample at position 0 of the metadata and run the preprocessing on it
ht = preprocess.preprocess_samples(preprocess.get_sample(metadata, 0))

In [None]:
# Rows of the sample for m/z 9.0, plus a Gaussian-smoothed (sigma=4) copy of
# the scaled abundance signal.
htt = ht.loc[ht['m/z'] == 9.0].assign(
    abun_minsub_scaled_filtered=lambda frame: gaussian_filter1d(
        frame['abun_minsub_scaled'], sigma=4
    )
)

In [None]:
# Left: raw (default colour) and smoothed (red) abundance against temperature.
# Right: box plot of the raw abundance values.
# The axes returned by plt.subplots are used directly instead of re-selecting
# them through plt.subplot, and each panel is labelled so it can stand alone.
fig, ax = plt.subplots(1, 2, figsize=(12, 4))

ax[0].plot(htt['temp'], htt['abun_minsub_scaled'], label='abun_minsub_scaled')
ax[0].plot(htt['temp'], htt['abun_minsub_scaled_filtered'], color='red',
           label='abun_minsub_scaled_filtered')
ax[0].set(xlabel='temp', ylabel='abundance', title='Abundance vs. temp')
ax[0].legend()

ax[1].boxplot(htt['abun_minsub_scaled'])
ax[1].set(ylabel='abun_minsub_scaled', title='Abundance distribution')

plt.show()

In [None]:
# Minimum peak prominence = median of the smoothed signal. The variable name
# and the per-ion loop in this notebook both use the median, so the median
# (not the mean) is computed here.
ion_abund_median = htt['abun_minsub_scaled_filtered'].median()
print(ion_abund_median)
peaks, _ = find_peaks(htt['abun_minsub_scaled_filtered'], prominence=ion_abund_median)
#peaks_s = find_peaks_cwt(htt['abun_minsub_scaled'], noise_perc=0.1)
print(f'Peak idx: {peaks}') # indexes where peaks are found
#print(f'Peak idx: {peaks_s}') # indexes where peaks are found
# Get temp values for peaks (peaks are positional indexes, hence iloc)
for i in peaks:
    print(htt.iloc[i]['temp'])
    
# Compute area under the curve
#xx = htt['temp']
#yy = htt['abun_minsub_scaled']
#print(f'Area under the curve: {np.round(auc(xx,yy),5)}')

In [None]:
# Computation for sample S0000
# For every ion, collect:
#   [number of peaks, max peak time, max peak temp, max peak abundance,
#    area under the abundance-vs-temp curve]
# Peaks are detected on the Gaussian-smoothed signal, using its median as the
# minimum prominence. `auc` comes from sklearn.metrics (imported at the top).
ion_peaks_cnt = {}

for ion in ion_list:
    temp_dt = ht[ht['m/z'] == ion].copy()
    temp_dt['abun_minsub_scaled_filtered'] = gaussian_filter1d(
        temp_dt['abun_minsub_scaled'], sigma=4)
    med = temp_dt['abun_minsub_scaled_filtered'].median()

    peaks, _ = find_peaks(temp_dt['abun_minsub_scaled_filtered'], prominence=med)

    if len(peaks) > 0:
        # `peaks` holds positional indexes, so select the peak rows with iloc.
        # Each maximum is taken independently; they may come from different peaks.
        peak_rows = temp_dt.iloc[peaks]
        peak_time = peak_rows['time'].max()
        peak_temp = peak_rows['temp'].max()
        peak_abund = peak_rows['abun_minsub_scaled'].max()
    else:
        peak_time, peak_temp, peak_abund = 0, 0, 0

    # Compute AUC - the trapezoidal rule needs at least two points
    if len(temp_dt) >= 2:
        area_abund = np.round(auc(temp_dt['temp'], temp_dt['abun_minsub_scaled']), 5)
    else:
        area_abund = 0

    ion_peaks_cnt[ion] = [len(peaks), peak_time, peak_temp, peak_abund, area_abund]

ion_peaks_cnt

In [None]:
# Turn the per-ion dict into a table: one row per ion, one column per statistic
new_cols = ['m/z','peak_cnt', 'peak_time', 'peak_temp', 'peak_abund', 'abund_area']
ion_peaks_stats = (
    pd.DataFrame(ion_peaks_cnt)
    .T
    .reset_index()
    .set_axis(new_cols, axis=1)
    .assign(sample_id='S0000')
)
ion_peaks_stats

In [None]:
# Project helper called for sample position 0 with the same ion list
# NOTE(review): presumably mirrors the manual per-ion loop above - confirm in src/features
features.compute_ion_peaks(metadata, 0, ion_list)

In [None]:
# Try the peak-feature builder on the first five train files only
file_paths = dict(list(train_files.items())[:5])
ita = features.features_ion_peaks(file_paths, metadata, ion_list)
ita

### Abundance Peaks + Stats

In [17]:
# ===== TRAIN SAMPLE =====
# Peak features for every train+valid file (trva_files), one row per sample
fts_ion_peaks = features.features_ion_peaks(trva_files, metadata, ion_list)
print(fts_ion_peaks.shape)
fts_ion_peaks.head()

100%|██████████| 1059/1059 [03:26<00:00,  5.12it/s]

(1059, 396)





Unnamed: 0_level_0,peak_cnt_0.0,peak_cnt_1.0,peak_cnt_2.0,peak_cnt_3.0,peak_cnt_5.0,peak_cnt_6.0,peak_cnt_7.0,peak_cnt_8.0,peak_cnt_9.0,peak_cnt_10.0,...,peak_abund_90.0,peak_abund_91.0,peak_abund_92.0,peak_abund_93.0,peak_abund_94.0,peak_abund_95.0,peak_abund_96.0,peak_abund_97.0,peak_abund_98.0,peak_abund_99.0
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
S0000,2.0,2.0,2.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
S0001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
S0002,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
S0003,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
S0004,2.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
# ===== VALID & TEST SAMPLE =====
# Peak features for every valid+test file (all_test_files), one row per sample
fts_ion_peaks_VT = features.features_ion_peaks(all_test_files, metadata, ion_list)
print(fts_ion_peaks_VT.shape)
fts_ion_peaks_VT.head()

100%|██████████| 804/804 [03:06<00:00,  4.32it/s]

(804, 396)





Unnamed: 0_level_0,peak_cnt_0.0,peak_cnt_1.0,peak_cnt_2.0,peak_cnt_3.0,peak_cnt_5.0,peak_cnt_6.0,peak_cnt_7.0,peak_cnt_8.0,peak_cnt_9.0,peak_cnt_10.0,...,peak_abund_90.0,peak_abund_91.0,peak_abund_92.0,peak_abund_93.0,peak_abund_94.0,peak_abund_95.0,peak_abund_96.0,peak_abund_97.0,peak_abund_98.0,peak_abund_99.0
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
S0766,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
S0767,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
S0768,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
S0769,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
S0770,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


**ANALYZE THE ABUNDANCE CURVE**

## Combine max relative abundance and ion peaks

In [19]:
# ===== TRAIN SAMPLE =====
# Temp-bin abundance features and peak features side by side, aligned on the
# row index.
combo_maxabund_peaks = pd.concat(
    [fts_maxrelabund_tempion, fts_ion_peaks],
    axis='columns',
)
print(combo_maxabund_peaks.shape)
combo_maxabund_peaks.to_csv(
    config.DATA_DIR_OUT + 'combo_maxabund_peaks_trvl.csv',
    index=False,
)

(1059, 1980)


In [20]:
# ===== VALID & TEST SAMPLE =====
# Same column-wise combination for the valid+test feature tables.
combo_maxabund_peaks_VT = pd.concat(
    [fts_maxrelabund_tempion_VT, fts_ion_peaks_VT],
    axis='columns',
)
print(combo_maxabund_peaks_VT.shape)
combo_maxabund_peaks_VT.to_csv(
    config.DATA_DIR_OUT + 'combo_maxabund_peaks_VT_trvl.csv',
    index=False,
)

(804, 1980)


## Slope between time and temp per sample + `combo_maxabund_peaks`

In [21]:
# ===== TRAIN SAMPLE =====
# slope_time_temp returns a dict with sample_id as keys; it is mapped onto the
# row index of a copy of the combined features to add the `slope_tt` column.
fts_slope_timetemp = features.slope_time_temp(trva_files, metadata)
combo_maxabund_peaks_slope = combo_maxabund_peaks.assign(
    slope_tt=lambda frame: frame.index.map(fts_slope_timetemp)
)
combo_maxabund_peaks_slope.to_csv(
    config.DATA_DIR_OUT + 'combo_maxabund_peaks_slope_trvl.csv',
    index=False,
)
combo_maxabund_peaks_slope.head()

100%|██████████| 1059/1059 [01:18<00:00, 13.50it/s]


Unnamed: 0_level_0,Ion_0.0_-100_0,Ion_0.0_0_100,Ion_0.0_100_200,Ion_0.0_200_300,Ion_0.0_300_400,Ion_0.0_400_500,Ion_0.0_500_600,Ion_0.0_600_700,Ion_0.0_700_800,Ion_0.0_800_900,...,peak_abund_91.0,peak_abund_92.0,peak_abund_93.0,peak_abund_94.0,peak_abund_95.0,peak_abund_96.0,peak_abund_97.0,peak_abund_98.0,peak_abund_99.0,slope_tt
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
S0000,0.0,0.004085,0.004641,0.001394,0.000188,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.195083
S0001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.167294
S0002,0.0,0.00227,0.002495,0.001688,0.000636,0.000597,0.000819,0.000155,0.000235,0.000227,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.601609
S0003,0.0,0.001336,0.002464,0.001099,0.000992,0.000676,0.000883,0.000892,0.000631,0.000361,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.609247
S0004,0.0,0.005993,0.012429,0.00938,0.005099,0.006921,0.001966,0.000499,0.00088,0.000767,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.613325


In [22]:
# ===== VALID & TEST SAMPLE =====
# Same `slope_tt` column, built from the valid+test slope dict.
fts_slope_timetemp_VT = features.slope_time_temp(all_test_files, metadata)
combo_maxabund_peaks_slope_VT = combo_maxabund_peaks_VT.assign(
    slope_tt=lambda frame: frame.index.map(fts_slope_timetemp_VT)
)
combo_maxabund_peaks_slope_VT.to_csv(
    config.DATA_DIR_OUT + 'combo_maxabund_peaks_slope_VT_trvl.csv',
    index=False,
)
combo_maxabund_peaks_slope_VT.head()

100%|██████████| 804/804 [01:02<00:00, 12.93it/s]


Unnamed: 0_level_0,Ion_0.0_-100_0,Ion_0.0_0_100,Ion_0.0_100_200,Ion_0.0_200_300,Ion_0.0_300_400,Ion_0.0_400_500,Ion_0.0_500_600,Ion_0.0_600_700,Ion_0.0_700_800,Ion_0.0_800_900,...,peak_abund_91.0,peak_abund_92.0,peak_abund_93.0,peak_abund_94.0,peak_abund_95.0,peak_abund_96.0,peak_abund_97.0,peak_abund_98.0,peak_abund_99.0,slope_tt
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
S0766,0.0,0.4101,0.593334,0.606578,0.678006,0.553591,0.596561,0.606902,0.535487,0.69642,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.609997
S0767,0.0,0.058563,0.073878,0.081069,0.083869,0.085114,0.08781,0.078234,0.079305,0.074327,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.60809
S0768,0.0,0.029256,0.040753,0.037022,0.041347,0.035408,0.036785,0.033979,0.02397,0.023692,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.612946
S0769,0.0,0.000261,0.000842,0.001313,0.001544,0.001405,0.002993,0.001424,0.000768,0.000992,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.541738
S0770,0.0,0.00346,0.003929,0.003286,0.002893,0.010054,0.003877,0.001918,0.000824,0.000323,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.61167


## Target encoding on instrument type

Target encoding on `instrument_type` for each label. Each label is a new feature.

```
# ===== TRAIN SAMPLE =====
ht = metadata[metadata.split.isin(['train', 'val'])][['sample_id','instrument_type']].copy()
ht_y = pd.concat([train_labels, valid_labels], axis = 0).reset_index(drop=True)
assert all(ht.index == ht_y.index)
assert ht_y.shape[0] == ht.shape[0]
temp = pd.merge(ht, ht_y, on='sample_id', how='left')

# ===== VALID & TEST SAMPLE =====
temp_VT = metadata[metadata.split.isin(['val', 'test'])][['sample_id','instrument_type']].copy()
temp_VT.shape

temp, temp_VT, le_dict = features.label_encode_multi(df=temp,
                                                     df_test=temp_VT, 
                                                     feature='instrument_type', 
                                                     target_labels_list=target_labels_list)
```

# MODELS

In [None]:
# Data frame to save local CV results
models_log_loss = pd.DataFrame(index=target_labels_list)

## Logistic Regression - Benchmark

In [None]:
# Train the 'LR_reg' model on the temp-bin abundance features (train+valid) and
# predict on the valid+test features. train_tbl returns three values; the
# first is mapped onto the label index below and the second is later indexed
# per label to obtain a fitted classifier.
train_cv_loss_LR, train_full_clf_LR, submission_LR = training.train_tbl(
    df_train=fts_maxrelabund_tempion,
    df_labels=trvl_labels,
    target_list=target_labels_list,
    df_test=fts_maxrelabund_tempion_VT,
    model_algo='LR_reg',
    sub_name='LR_reg_trvl'
    )
# Store the per-label CV loss as the 'LR_reg' column
models_log_loss['LR_reg'] = models_log_loss.index.map(train_cv_loss_LR)

## XGBoost - optimized
- numerical data needs to be scaled
- categorical data needs to be encoded

In [None]:
# Same feature tables as the logistic-regression benchmark, trained with
# model_algo='XGB_opt'.
train_cv_loss_XGB_opt, train_full_clf_XGB_opt, submission_XGB_opt = training.train_tbl(
    df_train=fts_maxrelabund_tempion,
    df_labels=trvl_labels,
    target_list=target_labels_list,
    df_test=fts_maxrelabund_tempion_VT,
    model_algo='XGB_opt',
    sub_name='XGB_opt_trvl'
    )
# Store the per-label CV loss as the 'XGB_opt' column
models_log_loss['XGB_opt'] = models_log_loss.index.map(train_cv_loss_XGB_opt)

## XGBoost opt - Temp bin + Ion peaks

In [None]:
# 'XGB_opt' trained on the combined temp-bin abundance + ion-peak features
train_cv_loss_XGB_tempb_peaks_opt, train_full_clf_XGB_tempb_peaks_opt, submission_XGB_tempb_peaks_opt =\
    training.train_tbl(
        df_train=combo_maxabund_peaks,
        df_labels=trvl_labels,
        target_list=target_labels_list,
        df_test=combo_maxabund_peaks_VT,
        model_algo='XGB_opt',
        sub_name='XGB_tempb_peaks_opt'
    )
# Store the per-label CV loss as the 'XGB_tempb_peaks_opt' column
models_log_loss['XGB_tempb_peaks_opt'] = models_log_loss.index.map(train_cv_loss_XGB_tempb_peaks_opt)

## XGBopt - Temp bin + Ion peaks + slope_tt

In [None]:
# 'XGB_opt' trained on temp-bin abundance + ion-peak features + `slope_tt`
train_cv_loss_XGB_tempb_peaks_slope_opt, train_full_clf_XGB_tempb_peaks_slope_opt, submission_XGB_tempb_peaks_slope_opt =\
    training.train_tbl(
        df_train=combo_maxabund_peaks_slope,
        df_labels=trvl_labels,
        target_list=target_labels_list,
        df_test=combo_maxabund_peaks_slope_VT,
        model_algo='XGB_opt',
        sub_name='XGB_tempb_peaks_slope_opt'
    )
# Store the per-label CV loss as the 'XGB_tempb_peaks_slope_opt' column
models_log_loss['XGB_tempb_peaks_slope_opt'] = models_log_loss.index.map(train_cv_loss_XGB_tempb_peaks_slope_opt)

In [None]:
#TODO Normalize peaks ?
#TODO Add instrument type as binary

***

## Model Summary

In [None]:
# Highlight, for each label (row), the model column with the lowest log-loss
models_log_loss.style.highlight_min(axis=1, props='color:darkblue; background-color:lightblue;')

In [None]:
# NOTE(review): identical to the preceding cell - candidate for removal
models_log_loss.style.highlight_min(axis=1, props='color:darkblue; background-color:lightblue;')

In [None]:
# Average log-loss per model across all labels, best (lowest) first
models_log_loss.mean(axis=0).sort_values(ascending=True)

# Predict Validation on Individual Labels

Compute predictions for each label depending on which model performed the best for that label. Validate the results on the validation sample only since we have the labels.

In [None]:
# Start from the empty submission template, indexed by sample_id
submission_by_label = pd.read_csv(config.DATA_DIR + 'submission_format.csv',
                                  index_col='sample_id')
print(submission_by_label.shape)

# Model whose predictions are used for each label; any label not listed here
# falls back to the temp-bin + peaks + slope XGBoost model.
best_submission_by_label = {
    'oxalate': submission_LR,
    'silicate': submission_XGB_opt,
    'iron_oxide': submission_XGB_tempb_peaks_opt,
    'sulfide': submission_XGB_tempb_peaks_opt,
}

n_valid = valid_labels.shape[0]
log_loss_ind_label = {}

for label in target_labels_list:
    best_submission = best_submission_by_label.get(
        label, submission_XGB_tempb_peaks_slope_opt)
    submission_by_label[label] = best_submission[label]

    # Compute log-loss
    # NOTE(review): assumes the first `n_valid` rows of the submission are the
    # validation samples, in the same order as `valid_labels` - confirm.
    # `labels=(0, 1)` keeps the metric defined when a validation column holds
    # a single class.
    ll = log_loss(valid_labels[label],
                  submission_by_label[label].iloc[:n_valid],
                  labels=(0, 1))
    log_loss_ind_label[label] = ll

models_log_loss['Ind_labels'] = models_log_loss.index.map(log_loss_ind_label)

submission_by_label.to_csv(config.MODELS_DIR + 'ind_label_trvl_slope' + '.csv')

print(f'Average Log Loss: {np.mean(list(log_loss_ind_label.values()))}')
log_loss_ind_label

In [None]:
# Inspect the last rows of the per-label submission
submission_by_label.tail()

In [None]:
# Summary again, now including the 'Ind_labels' column added above
models_log_loss.style.highlight_min(axis=1, props='color:darkblue; background-color:lightblue;')

# Test SAM-testbed samples

In [None]:
# Select only SAM-testbed samples (from the train and val splits)
in_train_or_val = metadata['split'].isin(['train', 'val'])
is_sam_testbed = metadata['instrument_type'] == 'sam_testbed'
tr_sam = metadata.loc[in_train_or_val & is_sam_testbed].copy()
print(f'Number of SAM samples: {tr_sam.shape}')

# Get the index and sample_id
tr_sam_idx = tr_sam.index
tr_sam_ids = tr_sam['sample_id']

In [None]:
# `tr_sam` is selected from BOTH the train and val splits, so the labels must
# come from the combined train+valid table; using train_labels alone makes the
# row-count assertions below fail as soon as a val sample is SAM-testbed.
tr_sam_y = trvl_labels[trvl_labels.sample_id.isin(tr_sam_ids)]
log_loss_sam = {}
sam_preds = pd.DataFrame(index=tr_sam_idx)

# Labels scored with the XGB_opt model trained on temp-bin features only
xgb_tempbin_labels = ['carbonate', 'iron_oxide', 'silicate', 'sulfate']

for label in target_labels_list:
    #print(colored(label, 'blue'))
    # Pick the feature table and the per-label fitted classifiers to use
    if label == 'oxalate':
        feature_table, clf_by_label = fts_maxrelabund_tempion, train_full_clf_LR
    elif label in xgb_tempbin_labels:
        feature_table, clf_by_label = fts_maxrelabund_tempion, train_full_clf_XGB_opt
    else:
        feature_table, clf_by_label = combo_maxabund_peaks, train_full_clf_XGB_tempb_peaks_opt

    # Keep only the SAM samples; there must be exactly one label row per sample
    df = feature_table[feature_table.index.isin(tr_sam_ids)]
    assert df.shape[0] == tr_sam_y.shape[0]
    clf = clf_by_label[label]

    # Compute predictions (probability of the positive class)
    preds = clf.predict_proba(df)[:,1]
    sam_preds[label] = preds

    # Compute log-loss per label
    y_true = tr_sam_y[label]
    ll = log_loss(y_true, preds, labels=(0,1))
    #print(colored(f'Log-Loss: {label} = {ll}', 'blue'))
    log_loss_sam[label] = ll

In [None]:
# Mean of the per-label log-losses on the SAM samples, then the per-label values
print(colored(f'Avg log-loss: {np.mean(list(log_loss_sam.values()))}', 'blue'))
log_loss_sam

In [None]:
# Positive-class probabilities per label for the SAM samples
sam_preds