# DL TRAINING NOTEBOOK

## Environment

In [2]:
# Reload imported modules automatically before each cell runs, so edits to
# the project's source files are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2

# Render matplotlib figures inline, at high-DPI ("retina") resolution.
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [3]:
# Change main system path to be able to run code from src folder
import sys


def resolve_main_path(path):
    """Return `path` with a trailing notebook sub-folder removed.

    The notebook may be started from the `notebooks` or `techdoc/content`
    folder, written with either POSIX or Windows separators. In those cases
    the parent project folder is returned. Any other path is returned
    unchanged, so the caller always gets a usable value instead of hitting
    an undefined variable.

    Parameters
    ----------
    path : str
        Candidate path, normally `sys.path[0]`.

    Returns
    -------
    str
        The path with the recognised suffix stripped, or `path` itself.
    """
    known_suffixes = (
        '/notebooks',           # Mac OS / Linux
        '/techdoc/content',     # Mac OS / Linux
        '\\notebooks',          # Windows OS
        '\\techdoc\\content',   # Windows OS (backslashes escaped explicitly)
    )
    for suffix in known_suffixes:
        if path.endswith(suffix):
            return path[:-len(suffix)]
    return path


p = sys.path[0]
main_path = resolve_main_path(p)
sys.path[0] = main_path

In [4]:
import gc, itertools
from termcolor import colored
from tqdm import tqdm

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss, f1_score
from sklearn.model_selection import cross_val_predict
from sklearn.neighbors import KNeighborsClassifier
import xgboost as xgb

from src import (config, describe_data, features,
                 preprocess, training, utils)

  from pandas import MultiIndex, Int64Index


# DATA LOAD

In [5]:
# ===== LOAD DATA ======
# Read the four competition CSV files from config.DATA_DIR and report the
# shape of each so the recorded output documents what was loaded.

# One row per sample; later cells use its `split` and `features_path` columns.
metadata = pd.read_csv(config.DATA_DIR + 'metadata.csv')
print(f'Metadata: {metadata.shape}')

train_labels = pd.read_csv(config.DATA_DIR + 'train_labels.csv')
print(f'Train labels: {train_labels.shape}')

# Labelled as "Valid labels" so the output can be told apart from the
# training labels printed just above.
valid_labels = pd.read_csv(config.DATA_DIR + 'val_labels.csv')
print(f'Valid labels: {valid_labels.shape}')

submission = pd.read_csv(config.DATA_DIR + 'submission_format.csv')
print(f'Submission: {submission.shape}')

Metadata: (1570, 5)
Train labels: (766, 11)
Train labels: (293, 11)
Submission: (804, 11)


In [6]:
# ===== MODEL SAMPLES ======
# Slice the metadata into one independent frame per split. Each frame is a
# copy with a fresh 0-based index, and its shape is printed right away.
split_of_row = metadata['split']

train = metadata.loc[split_of_row == 'train'].copy().reset_index(drop=True)
print(f'TRAIN: {train.shape}')

valid = metadata.loc[split_of_row == 'val'].copy().reset_index(drop=True)
print(f'VALID: {valid.shape}')

test = metadata.loc[split_of_row == 'test'].copy().reset_index(drop=True)
print(f'TEST: {test.shape}')

TRAIN: (766, 5)
VALID: (293, 5)
TEST: (511, 5)


In [7]:
# ===== FILE PATHS OF SAMPLES =====
# For every split, map the metadata row index to the sample's features file.
train_files = metadata.loc[metadata['split'] == 'train', 'features_path'].to_dict()
valid_files = metadata.loc[metadata['split'] == 'val', 'features_path'].to_dict()
test_files = metadata.loc[metadata['split'] == 'test', 'features_path'].to_dict()

# Validation and test files merged into one dict (test entries are applied last).
all_test_files = {**valid_files, **test_files}

# Candidate ion values 0.0, 1.0, ..., 99.0
ion_list = list(np.arange(0, 100, 1.0))

# Target columns are all the label columns except the sample identifier
target_labels_list = [col for col in train_labels.columns if col != 'sample_id']
print(target_labels_list)

['basalt', 'carbonate', 'chloride', 'iron_oxide', 'oxalate', 'oxychlorine', 'phyllosilicate', 'silicate', 'sulfate', 'sulfide']


# DEEP LEARNING PREPROCESSING

We need to create a 3D array, where the 1st dimension is the samples, the 2nd dimension is the time steps and the 3rd dimension is the features. Since the raw time steps repeat for each ion type, i.e. `m/z`, we need to construct features with respect to the ion type. So we will have `1.0_temp`, `1.0_abundance`, `2.0_temp`, `2.0_abundance`, etc. Furthermore, the time is measured in seconds, and the time steps are neither uniform nor aligned across samples: different samples are measured at different times. Hence, we will first compute the maximum time present in the training, validation and test samples and then aggregate all the data into 10-second time intervals. This will provide a full data array instead of a sparse one. It will also significantly reduce the size of the training data.

In [8]:
# COMPUTE MAXIMUM TIME ACROSS ALL SAMPLES
# Slow step: the recorded run shows a progress bar over all 1570 metadata
# rows that took about 1m45s.
# The value is reused below as the upper bound when building the time bins.
max_time = preprocess.compute_max_time_samples(metadata)
max_time

100%|██████████| 1570/1570 [01:45<00:00, 14.88it/s]


5248.14

**Are the time steps and `m/z` values unique?** We need only one row per sample given one time step. Hence, we need to ensure that the time steps and the ion types are unique. We should add this check (that one row is being created for one sample and one time period) when we construct the final array.

In the `sam_testbed` samples, the temperature differs per `time_bin` and ion type: within one `time_bin` we get different temperature values for different ion types. This leads to duplicate rows per time step. See sample 765 for example.

```python
ht['check'] = ht.groupby(['time', 'm/z'])['time'].transform('count')
ht[ht['check'] > 1]
```

In [99]:
# ===== CREATE TS DF - TRAIN =====
# Restrict the metadata to the training split, then build the time-binned
# feature frame for those samples.
df_meta = metadata.loc[metadata['split'] == 'train']
fts_dl_ts = features.dl_ts(df_meta, max_time)

# Report the size and preview the first rows
print(fts_dl_ts.shape)
fts_dl_ts.head()

100%|██████████| 766/766 [05:27<00:00,  2.34it/s]


(402150, 104)


Unnamed: 0,time_bin,temp,temp_osc_time,mz_0_abund,mz_1_abund,mz_2_abund,mz_3_abund,mz_5_abund,mz_6_abund,mz_7_abund,...,mz_92_abund,mz_93_abund,mz_94_abund,mz_95_abund,mz_96_abund,mz_97_abund,mz_98_abund,mz_99_abund,sample_id,instrument_type
0,"[0.0, 10.0)",35.289,0.0,0.000172,0.000151,0.000045,0.000126,0.000011,1.914604e-06,1.719285e-06,...,0.000002,1.339071e-06,9.866536e-07,1.572230e-06,0.000002,0.000003,5.738569e-07,2.132724e-06,S0000,commercial
1,"[10.0, 20.0)",35.420,0.0,0.000182,0.000163,0.000042,0.000125,0.000007,3.390866e-06,1.932152e-06,...,0.000001,2.459132e-06,2.169595e-06,2.155304e-06,0.000002,0.000002,1.218669e-06,2.416544e-06,S0000,commercial
2,"[20.0, 30.0)",35.680,0.0,0.000158,0.000112,0.000038,0.000186,0.000007,8.339909e-07,1.657618e-06,...,0.000002,2.039618e-06,1.211452e-06,5.682833e-07,0.000002,0.000002,1.589236e-06,9.542845e-07,S0000,commercial
3,"[30.0, 40.0)",36.329,0.0,0.000184,0.000113,0.000031,0.000170,0.000004,1.631496e-06,5.990105e-07,...,0.000000,7.548532e-07,3.817495e-06,1.514565e-06,0.000002,0.000002,1.403524e-06,9.485680e-07,S0000,commercial
4,"[40.0, 50.0)",37.293,0.0,0.000170,0.000136,0.000062,0.000192,0.000005,2.161267e-06,1.754442e-06,...,0.000002,8.333112e-07,2.654991e-06,2.064771e-06,0.000002,0.000002,1.821823e-06,1.904639e-06,S0000,commercial
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
520,"[5200.0, 5210.0)",,,,,,,,,,...,,,,,,,,,S0765,sam_testbed
521,"[5210.0, 5220.0)",,,,,,,,,,...,,,,,,,,,S0765,sam_testbed
522,"[5220.0, 5230.0)",,,,,,,,,,...,,,,,,,,,S0765,sam_testbed
523,"[5230.0, 5240.0)",,,,,,,,,,...,,,,,,,,,S0765,sam_testbed


We need to label encode all the non numeric features since the neural network handles only numeric values. In the code below we check which features are not numeric and encode them. Note that the `time_bin` and `sample_id` are not features so no need to encode them.

In [111]:
# ===== ENCODE NON NUMERIC FEATURES =====
# Detect non-numeric columns with pandas' dtype helper. Comparing dtypes to
# the strings "int"/"float" only matches the platform default widths, so
# e.g. float32 or int32 columns would be misreported as non-numeric.
non_numeric_cols = [col for col in fts_dl_ts.columns
                    if not pd.api.types.is_numeric_dtype(fts_dl_ts[col])]
print(f'Non numeric features: {non_numeric_cols}')

# Binary-encode the instrument: 1 for 'commercial', 0 for anything else.
# The guard makes the cell safe to re-run: once `instrument_type` has been
# replaced by `instrument`, running it again is a no-op instead of a KeyError.
if 'instrument_type' in fts_dl_ts.columns:
    fts_dl_ts['instrument'] = np.where(fts_dl_ts['instrument_type'] == 'commercial', 1, 0)
    del fts_dl_ts['instrument_type']

Non numeric features: ['time_bin', 'sample_id', 'instrument_type']


In [135]:
# ===== FEATURES LIST =====
# Every column is a model feature except the two identifier columns.
non_feature_cols = ('sample_id', 'time_bin')
features_dl = [col for col in fts_dl_ts.columns if col not in non_feature_cols]
print(f'Number of features: {len(features_dl)}')

Number of features: 102


Also, we need to replace the NaN values. Since zero is a meaningful value in the sample we will assign a value of -1 to all NaN values.

In [114]:
# ===== FIX NaN VALUES =====
# (Only NaN replacement happens in this cell; no normalization is applied here.)
# Show the number of missing values per column before filling.
print(fts_dl_ts.isnull().sum())
# Replace NaN in the feature columns with the sentinel value -1.
# NOTE(review): `temp` can be negative (S0001 starts at about -60 in the
# frame displayed below), so -1 may not be distinguishable from a real
# temperature reading — confirm this is acceptable for the model.
fts_dl_ts[features_dl] = fts_dl_ts[features_dl].fillna(-1)

In [120]:
# Preview the filled frame ordered by time bin and then sample, to eyeball
# the first time bin across the first few samples.
fts_dl_ts[['sample_id', 'time_bin'] + features_dl].sort_values(by=['time_bin', 'sample_id']).head()

Unnamed: 0,sample_id,time_bin,temp,temp_osc_time,mz_0_abund,mz_1_abund,mz_2_abund,mz_3_abund,mz_5_abund,mz_6_abund,...,mz_91_abund,mz_92_abund,mz_93_abund,mz_94_abund,mz_95_abund,mz_96_abund,mz_97_abund,mz_98_abund,mz_99_abund,instrument
0,S0000,"[0.0, 10.0)",35.289,0.0,0.000172,0.000151,4.5e-05,0.000126,1.1e-05,2e-06,...,1.055751e-06,2.298143e-06,1.339071e-06,9.866536e-07,1.57223e-06,1.681341e-06,2.709511e-06,5.738569e-07,2.132724e-06,1
0,S0001,"[0.0, 10.0)",-60.37,0.0,-1.0,0.0,0.061658,0.002596,4e-06,1e-05,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1
0,S0002,"[0.0, 10.0)",29.7725,0.0,0.000266,0.00047,0.00019,0.000279,1.6e-05,1e-06,...,5.866712e-07,4.55586e-07,3.054592e-07,3.556442e-07,4.361277e-07,6.539513e-07,4.645773e-07,4.514163e-07,5.846176e-07,1
0,S0003,"[0.0, 10.0)",29.8905,0.0,0.000493,0.00063,0.000347,0.000582,3.3e-05,3e-06,...,1.901334e-06,3.092897e-06,1.288515e-06,1.090843e-06,2.855537e-06,1.841429e-06,1.96925e-06,1.841522e-06,2.438128e-06,1
0,S0004,"[0.0, 10.0)",32.2585,0.0,0.002611,0.005093,0.002071,0.003169,0.000241,1.3e-05,...,1.425812e-06,2.047351e-06,1.205654e-06,1.481131e-06,7.65147e-07,1.777987e-06,9.578115e-07,2.110925e-06,1.651033e-06,1


In [134]:
# Check that each time bin has all the samples accounted for.
# Two conditions together guarantee exactly one row per (sample, time bin):
#   1. every time bin holds as many rows as there are distinct samples, and
#   2. no (sample_id, time_bin) pair occurs more than once.
# The count check alone would still pass if one sample were duplicated in a
# bin while another sample was missing from it.
expected_rows_per_bin = fts_dl_ts['sample_id'].nunique()
rows_per_bin = fts_dl_ts['time_bin'].value_counts()
assert (rows_per_bin == expected_rows_per_bin).all(), \
    'Some time bins do not contain one row per sample'
assert not fts_dl_ts.duplicated(subset=['sample_id', 'time_bin']).any(), \
    'Duplicate (sample_id, time_bin) rows found'

The input for the LSTM is a 3D tensor with shape `[batch, timesteps, feature]`.

In [136]:
# ===== CREATE 3D ARRAY (samples, time step, features) =====
# ----- Define input layer data -----
# Size of each axis of the network input, taken from the feature frame.
no_samples = fts_dl_ts['sample_id'].nunique()    # distinct samples
no_time_steps = fts_dl_ts['time_bin'].nunique()  # distinct time bins
no_features = len(features_dl)                   # feature columns

print(f'Samples: {no_samples}')
print(f'Time steps: {no_time_steps}')
print(f'Features: {no_features}')

Samples: 766
Time steps: 525
Features: 102


In [129]:
# ===== RESHAPE TO (samples, time steps, features) =====
# reshape() fills the 3D array row by row, so it is only correct when the
# frame holds `no_time_steps` consecutive rows per sample and every sample
# lists its time bins in the same order. Verify that layout explicitly,
# otherwise a mis-ordered frame would be reshaped silently into wrong data.
sample_layout = fts_dl_ts['sample_id'].to_numpy().reshape((no_samples, no_time_steps))
assert (sample_layout == sample_layout[:, [0]]).all(), \
    'Rows are not grouped in contiguous blocks per sample'

time_layout = fts_dl_ts['time_bin'].astype(str).to_numpy().reshape((no_samples, no_time_steps))
assert (time_layout == time_layout[0]).all(), \
    'Time bins are not in the same order for every sample'

# Reshape data
data = np.array(fts_dl_ts[features_dl])
data = data.reshape((no_samples, no_time_steps, no_features))
print(data.shape)

(766, 525, 102)


array([[[ 3.52890000e+01,  0.00000000e+00,  1.72406959e-04, ...,
          5.73856879e-07,  2.13272382e-06,  1.00000000e+00],
        [ 3.54200000e+01,  0.00000000e+00,  1.82146643e-04, ...,
          1.21866863e-06,  2.41654419e-06,  1.00000000e+00],
        [ 3.56800000e+01,  0.00000000e+00,  1.57973097e-04, ...,
          1.58923574e-06,  9.54284461e-07,  1.00000000e+00],
        ...,
        [-1.00000000e+00, -1.00000000e+00, -1.00000000e+00, ...,
         -1.00000000e+00, -1.00000000e+00,  1.00000000e+00],
        [-1.00000000e+00, -1.00000000e+00, -1.00000000e+00, ...,
         -1.00000000e+00, -1.00000000e+00,  1.00000000e+00],
        [-1.00000000e+00, -1.00000000e+00, -1.00000000e+00, ...,
         -1.00000000e+00, -1.00000000e+00,  1.00000000e+00]],

       [[-6.03700000e+01,  0.00000000e+00, -1.00000000e+00, ...,
         -1.00000000e+00, -1.00000000e+00,  1.00000000e+00],
        [-6.03510000e+01,  0.00000000e+00, -1.00000000e+00, ...,
         -1.00000000e+00, -1.00000000e

In [126]:
# Quick look at the shape of `data`.
# NOTE(review): the recorded output (402150, 102) is the 2D, pre-reshape
# shape, so this cell was presumably executed before the reshape cell —
# confirm, and re-run the notebook in order.
data.shape    

(402150, 102)

In [97]:
# Build the time-binned pivot for a single sample (index 765, shown as
# S0765 / sam_testbed in the recorded output) to inspect it in isolation.
ht1 = features.dl_time_pivot(metadata, 765, max_time)
ht1

Unnamed: 0,time_bin,temp,temp_osc_time,mz_0_abund,mz_5_abund,mz_6_abund,mz_7_abund,mz_12_abund,mz_13_abund,mz_14_abund,...,mz_92_abund,mz_93_abund,mz_94_abund,mz_95_abund,mz_96_abund,mz_97_abund,mz_98_abund,mz_99_abund,sample_id,instrument_type
0,"[0.0, 10.0)",35.620765,0.198793,0.0,0.0,0.000002,0.000005,0.011006,0.000707,0.001775,...,0.000051,0.000046,0.000040,0.000034,0.000183,0.000149,0.000131,0.000029,S0765,sam_testbed
1,"[10.0, 20.0)",35.582467,0.259518,,0.0,0.000000,0.000003,0.010468,0.000549,0.001709,...,0.000051,0.000034,0.000074,0.000046,0.000143,0.000137,0.000063,0.000017,S0765,sam_testbed
2,"[20.0, 30.0)",35.657034,0.191759,,0.0,0.000000,0.000003,0.010033,0.000583,0.001732,...,0.000057,0.000046,0.000034,0.000017,0.000143,0.000109,0.000063,0.000000,S0765,sam_testbed
3,"[30.0, 40.0)",35.796819,0.232425,,0.0,0.000007,0.000000,0.009330,0.000520,0.001761,...,0.000060,0.000060,0.000034,0.000043,0.000137,0.000214,0.000077,0.000011,S0765,sam_testbed
4,"[40.0, 50.0)",35.997046,0.266277,,0.0,0.000006,0.000003,0.008610,0.000543,0.001847,...,0.000029,0.000057,0.000051,0.000091,0.000154,0.000149,0.000086,0.000017,S0765,sam_testbed
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
520,"[5200.0, 5210.0)",,,,,,,,,,...,,,,,,,,,S0765,sam_testbed
521,"[5210.0, 5220.0)",,,,,,,,,,...,,,,,,,,,S0765,sam_testbed
522,"[5220.0, 5230.0)",,,,,,,,,,...,,,,,,,,,S0765,sam_testbed
523,"[5230.0, 5240.0)",,,,,,,,,,...,,,,,,,,,S0765,sam_testbed


In [83]:
# Number of distinct time bins in the single-sample pivot (recorded as 525).
ht1.time_bin.nunique()

525

In [93]:
# Load sample 765 and run it through the standard sample preprocessing.
ht = preprocess.preprocess_samples(preprocess.get_sample(metadata, 765))

# Report the total row count and the first distinct per-ion row count.
print(f'Rows: {ht.shape[0]}')
rows_per_ion = ht.groupby("m/z")["m/z"].agg("count")
print(f'Ion rows: {rows_per_ion.unique()[0]}')

# Left-closed 10-unit time intervals covering [0, rounded-up max_time)
time_range = pd.interval_range(
    start=0.0,
    end=utils.roundup(max_time),
    freq=10,
    closed='left',
)

# Replace the raw time stamp with the interval it falls into.
ht['time_bin'] = pd.cut(ht['time'], bins=time_range)
del ht['time']

Rows: 116422
Ion rows: 1


In [96]:
# Average every remaining column per (ion, time bin) pair.
ht_agg = ht.groupby(['m/z', 'time_bin']).mean().reset_index()
print(ht_agg.shape)

# Per time bin, summarise the ion-level temperatures: their spread becomes
# `temp_osc` and their mean replaces `temp`. Both are computed from the
# ion-level values before `temp` is overwritten.
temp_by_bin = ht_agg.groupby('time_bin')['temp']
temp_spread = temp_by_bin.transform('std')
temp_mean = temp_by_bin.transform('mean')
ht_agg['temp_osc'] = temp_spread
ht_agg['temp'] = temp_mean

# Inspect the first time bin across ions
first_bin = pd.Interval(0.0, 10.0, closed='left')
ht_agg.loc[ht_agg['time_bin'] == first_bin].head(50)

(48300, 4)


Unnamed: 0,m/z,time_bin,temp,abun_minsub_scaled,temp_osc
0,0.0,"[0.0, 10.0)",35.620765,0.0,0.198793
525,5.0,"[0.0, 10.0)",35.620765,0.0,0.198793
1050,6.0,"[0.0, 10.0)",35.620765,2e-06,0.198793
1575,7.0,"[0.0, 10.0)",35.620765,5e-06,0.198793
2100,12.0,"[0.0, 10.0)",35.620765,0.011006,0.198793
2625,13.0,"[0.0, 10.0)",35.620765,0.000707,0.198793
3150,14.0,"[0.0, 10.0)",35.620765,0.001775,0.198793
3675,15.0,"[0.0, 10.0)",35.620765,0.009124,0.198793
4200,16.0,"[0.0, 10.0)",35.620765,0.055259,0.198793
4725,17.0,"[0.0, 10.0)",35.620765,0.2711,0.198793


In [55]:
# Alternative aggregation for sample 765: keep one temperature value per
# (time bin, ion) pair instead of one per time bin, to inspect how the
# temperature differs between ions inside the same bin.
ht = preprocess.get_sample(metadata, 765)
ht = preprocess.preprocess_samples(ht)

print(f'Rows: {ht.shape[0]}')
print(f'Ion rows: {ht.groupby("m/z")["m/z"].agg("count").unique()[0]}')

# Left-closed 10-unit time intervals covering [0, rounded-up max_time)
time_range = pd.interval_range(start=0.0,
                               end=utils.roundup(max_time),
                               freq=10,
                               closed='left')
ht['time_bin'] = pd.cut(ht['time'], bins=time_range)
# Take an explicit copy of the column subset: the assignments below would
# otherwise write into a slice of the original frame and trigger pandas'
# SettingWithCopyWarning.
ht = ht[['time_bin', 'temp', 'm/z', 'abun_minsub_scaled']].copy()

# Round the temperature to whole units, then average it per (time bin, ion).
ht['temp'] = np.round(ht['temp'], 0)
ht['temp_agg'] = ht.groupby(['time_bin', 'm/z'])['temp'].transform('mean')
del ht['temp']
ht.drop_duplicates(inplace=True)

# Average the abundance per (time bin, aggregated temperature, ion) and
# collapse the rows that became identical.
ht['abun_agg'] = ht.groupby(['time_bin', 'temp_agg', 'm/z'])['abun_minsub_scaled']\
            .transform('mean')
del ht['abun_minsub_scaled']
ht.drop_duplicates(inplace=True)

# Display the result ordered by time bin and ion
ht.sort_values(['time_bin', 'm/z'])

Rows: 116422
Ion rows: 1


Unnamed: 0,time_bin,m/z,temp_agg,abun_agg
0,"[0.0, 10.0)",0.0,36.000000,0.000000
1,"[0.0, 10.0)",5.0,35.142857,0.000000
2,"[0.0, 10.0)",6.0,35.285714,0.000009
3,"[0.0, 10.0)",7.0,35.142857,0.000009
4,"[0.0, 10.0)",12.0,35.500000,0.011006
...,...,...,...,...
141240,"[4200.0, 4210.0)",95.0,1048.000000,0.000017
141241,"[4200.0, 4210.0)",96.0,1046.000000,0.000017
141242,"[4200.0, 4210.0)",97.0,1048.000000,0.000069
141243,"[4200.0, 4210.0)",98.0,1048.000000,0.000034


## LSTM Model

> LSTMs work better with 200-to-400 time steps (J. Brownlee)