# DL TRAINING NOTEBOOK

## Environment

In [2]:
%reload_ext autoreload
%autoreload 2

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [3]:
# Change main system path to be able to run code from src folder
import sys
p = sys.path[0]
# Mac OS
if sys.path[0].endswith('/notebooks'):
    main_path = p[:-len('/notebooks')]
if sys.path[0].endswith('/techdoc/content'):
    main_path = p[:-len('/techdoc/content')]
    
# Windows OS
if sys.path[0].endswith('\\notebooks'): 
    main_path = p[:-len('\\notebooks')]
if sys.path[0].endswith('\\techdoc\content'): 
    main_path = p[:-len('\\techdoc\content')]

sys.path[0] = main_path

In [4]:
import gc, itertools
from termcolor import colored
from tqdm import tqdm

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss, f1_score
from sklearn.model_selection import cross_val_predict
from sklearn.neighbors import KNeighborsClassifier
import xgboost as xgb

from src import (config, describe_data, features,
                 preprocess, training, utils)

  from pandas import MultiIndex, Int64Index


# DATA LOAD

In [5]:
# ===== LOAD DATA ======
metadata = pd.read_csv(config.DATA_DIR + 'metadata.csv')
print(f'Metadata: {metadata.shape}')

train_labels = pd.read_csv(config.DATA_DIR + 'train_labels.csv')
print(f'Train labels: {train_labels.shape}')

valid_labels = pd.read_csv(config.DATA_DIR + 'val_labels.csv')
print(f'Train labels: {valid_labels.shape}')

submission = pd.read_csv(config.DATA_DIR + 'submission_format.csv')
print(f'Submission: {submission.shape}')

Metadata: (1570, 5)
Train labels: (766, 11)
Train labels: (293, 11)
Submission: (804, 11)


In [6]:
# ===== MODEL SAMPLES ======
train = metadata[metadata.split == 'train'].copy().reset_index(drop=True)
print(f'TRAIN: {train.shape}')

valid = metadata[metadata.split == 'val'].copy().reset_index(drop=True)
print(f'VALID: {valid.shape}')

test = metadata[metadata.split == 'test'].copy().reset_index(drop=True)
print(f'TEST: {test.shape}')

TRAIN: (766, 5)
VALID: (293, 5)
TEST: (511, 5)


In [7]:
# ===== FILE PATHS OF SAMPLES =====
train_files = metadata[metadata.split == 'train']['features_path'].to_dict()
valid_files = metadata[metadata.split == 'val']['features_path'].to_dict()
test_files = metadata[metadata.split == 'test']['features_path'].to_dict()
all_test_files = valid_files.copy()
all_test_files.update(test_files)

ion_list = list(np.arange(0,100,1.0))

# Get the names of the target columns in a list
target_labels_list = [i for i in train_labels.columns if i not in ['sample_id']]
print(target_labels_list)

['basalt', 'carbonate', 'chloride', 'iron_oxide', 'oxalate', 'oxychlorine', 'phyllosilicate', 'silicate', 'sulfate', 'sulfide']


# DEEP LEARNING PREPPROCESSING

We need to create a 3D array, where the 1st dimension are samples, 2nd dimension are features and the 3rd dimension are the time steps. Since the raw time steps repeat given a different ion type, i.e. `m/z` we need to construct features with respect to the ion type. So we will have `1.0_temp`, `1.0_abundance`, `2.0_temp`, `2.0_abundance`, etc. Furthermore, the time is measured in seconds and time steps are not uniform and even across samples. Different time measurements are across samples. Hence, we will first compute the maximum time present in the training, validation and test samples and then aggregate all the data in 10 second time intervals. This will provide a full data array instead of a sparse one. Also it will significantly reduce the training size.

In [8]:
# COMPUTE MAXIMUM TIME ACROSS ALL SAMPLES
max_time = preprocess.compute_max_time_samples(metadata)
max_time

100%|██████████| 1570/1570 [01:45<00:00, 14.88it/s]


5248.14

**Are the time steps and `m/z` values unique?** We need only one row per sample given one time step. Hence, we need to ensure that the time steps and the ion types are unique. We should add this check (that one row is being created for one sample and one time period) when we construct the final array.

```python
ht['check'] = ht.groupby(['time', 'm/z'])['time'].transform('count')
ht[ht['check'] > 1]
```

In [72]:
ht1 = features.dl_time_pivot(metadata, 765, max_time)
ht1

Unnamed: 0_level_0,Unnamed: 1_level_0,mz_0_abund,mz_5_abund,mz_6_abund,mz_7_abund,mz_12_abund,mz_13_abund,mz_14_abund,mz_15_abund,mz_16_abund,mz_17_abund,...,mz_91_abund,mz_92_abund,mz_93_abund,mz_94_abund,mz_95_abund,mz_96_abund,mz_97_abund,mz_98_abund,mz_99_abund,sample_id
time_bin,temp,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
"[0.0, 10.0)",35.227000,,,,,,,,,,,...,,,,,,,,,,S0765
"[0.0, 10.0)",35.227000,,,,,,,,,,,...,,,,,,,,,,S0765
"[0.0, 10.0)",35.265143,,,,0.000005,,,,,,,...,,,,,,,,,,S0765
"[0.0, 10.0)",35.316000,,,,,,,,,,,...,,,,,,,,,,S0765
"[0.0, 10.0)",35.316333,,,,,,,,,,,...,,,0.000046,,,,,,,S0765
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
"[5200.0, 5210.0)",,,,,,,,,,,,...,,,,,,,,,,S0765
"[5210.0, 5220.0)",,,,,,,,,,,,...,,,,,,,,,,S0765
"[5220.0, 5230.0)",,,,,,,,,,,,...,,,,,,,,,,S0765
"[5230.0, 5240.0)",,,,,,,,,,,,...,,,,,,,,,,S0765


In [75]:
df_meta = metadata[metadata.split == 'train']
fts_dl_ts = features.dl_ts(df_meta, max_time)
print(fts_dl_ts.shape)
fts_dl_ts

100%|██████████| 766/766 [06:04<00:00,  2.10it/s]


Unnamed: 0_level_0,Unnamed: 1_level_0,mz_0_abund,mz_1_abund,mz_2_abund,mz_3_abund,mz_5_abund,mz_6_abund,mz_7_abund,mz_8_abund,mz_9_abund,mz_10_abund,...,mz_91_abund,mz_92_abund,mz_93_abund,mz_94_abund,mz_95_abund,mz_96_abund,mz_97_abund,mz_98_abund,mz_99_abund,sample_id
time_bin,temp,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
"[0.0, 10.0)",35.289,0.000172,0.000151,0.000045,0.000126,0.000011,1.914604e-06,1.719285e-06,0.000003,0.000002,0.000004,...,0.000001,0.000002,1.339071e-06,9.866536e-07,1.572230e-06,0.000002,0.000003,5.738569e-07,2.132724e-06,S0000
"[10.0, 20.0)",35.420,0.000182,0.000163,0.000042,0.000125,0.000007,3.390866e-06,1.932152e-06,0.000004,0.000002,0.000004,...,0.000002,0.000001,2.459132e-06,2.169595e-06,2.155304e-06,0.000002,0.000002,1.218669e-06,2.416544e-06,S0000
"[20.0, 30.0)",35.680,0.000158,0.000112,0.000038,0.000186,0.000007,8.339909e-07,1.657618e-06,0.000004,0.000003,0.000001,...,0.000001,0.000002,2.039618e-06,1.211452e-06,5.682833e-07,0.000002,0.000002,1.589236e-06,9.542845e-07,S0000
"[30.0, 40.0)",36.329,0.000184,0.000113,0.000031,0.000170,0.000004,1.631496e-06,5.990105e-07,0.000004,0.000003,0.000003,...,0.000001,0.000000,7.548532e-07,3.817495e-06,1.514565e-06,0.000002,0.000002,1.403524e-06,9.485680e-07,S0000
"[40.0, 50.0)",37.293,0.000170,0.000136,0.000062,0.000192,0.000005,2.161267e-06,1.754442e-06,0.000002,0.000002,0.000002,...,0.000002,0.000002,8.333112e-07,2.654991e-06,2.064771e-06,0.000002,0.000002,1.821823e-06,1.904639e-06,S0000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
"[5200.0, 5210.0)",,,,,,,,,,,,...,,,,,,,,,,S0765
"[5210.0, 5220.0)",,,,,,,,,,,,...,,,,,,,,,,S0765
"[5220.0, 5230.0)",,,,,,,,,,,,...,,,,,,,,,,S0765
"[5230.0, 5240.0)",,,,,,,,,,,,...,,,,,,,,,,S0765


In [65]:
ht = preprocess.get_sample(metadata, 765)
ht = preprocess.preprocess_samples(ht)

print(f'Rows: {ht.shape[0]}')
print(f'Ion rows: {ht.groupby("m/z")["m/z"].agg("count").unique()[0]}')

time_range = pd.interval_range(start=0.0, 
                               end=utils.roundup(max_time), 
                               freq=10, 
                               closed='left')
ht['time_bin'] = pd.cut(ht['time'], bins=time_range)
del ht['time']
ht.sort_values(['m/z', 'time_bin'])

Rows: 116422
Ion rows: 1


Unnamed: 0,temp,m/z,abun_minsub_scaled,time_bin
0,35.763,0.0,0.000000,"[0.0, 10.0)"
1,34.959,5.0,0.000000,"[0.0, 10.0)"
74,34.959,5.0,0.000000,"[0.0, 10.0)"
98,35.227,5.0,0.000000,"[0.0, 10.0)"
171,35.495,5.0,0.000000,"[0.0, 10.0)"
...,...,...,...,...
140804,1047.199,99.0,0.000051,"[4180.0, 4190.0)"
140901,1046.333,99.0,0.000017,"[4190.0, 4200.0)"
141024,1047.632,99.0,0.000034,"[4190.0, 4200.0)"
141121,1047.632,99.0,0.000017,"[4190.0, 4200.0)"


In [70]:
ht_agg = ht.groupby(['m/z', 'time_bin']).agg('mean').reset_index()
print(ht_agg.shape)
ht_agg[ht_agg['m/z'] == 5.0].head(50)

(48300, 4)


Unnamed: 0,m/z,time_bin,temp,abun_minsub_scaled
525,5.0,"[0.0, 10.0)",35.341857,0.0
526,5.0,"[10.0, 20.0)",35.584167,0.0
527,5.0,"[20.0, 30.0)",35.6558,0.0
528,5.0,"[30.0, 40.0)",35.6022,0.0
529,5.0,"[40.0, 50.0)",35.851833,0.0
530,5.0,"[50.0, 60.0)",36.5665,0.0
531,5.0,"[60.0, 70.0)",37.0575,0.0
532,5.0,"[70.0, 80.0)",38.10825,0.0
533,5.0,"[80.0, 90.0)",39.697333,0.0
534,5.0,"[90.0, 100.0)",41.508,0.0


In [55]:
ht = preprocess.get_sample(metadata, 765)
ht = preprocess.preprocess_samples(ht)

print(f'Rows: {ht.shape[0]}')
print(f'Ion rows: {ht.groupby("m/z")["m/z"].agg("count").unique()[0]}')

time_range = pd.interval_range(start=0.0, 
                               end=utils.roundup(max_time), 
                               freq=10, 
                               closed='left')
ht['time_bin'] = pd.cut(ht['time'], bins=time_range)
ht = ht[['time_bin', 'temp', 'm/z', 'abun_minsub_scaled']]


ht['temp'] = np.round(ht['temp'],0)
ht['temp_agg'] = ht.groupby(['time_bin', 'm/z'])['temp'].transform('mean')
del ht['temp']
ht.drop_duplicates(inplace=True)
ht['abun_agg'] = ht.groupby(['time_bin', 'temp_agg', 'm/z'])['abun_minsub_scaled']\
            .transform('mean')
del ht['abun_minsub_scaled']
ht.drop_duplicates(inplace=True)
    
ht.head()
#ht.head(25)
ht.sort_values(['time_bin', 'm/z'])

Rows: 116422
Ion rows: 1


Unnamed: 0,time_bin,m/z,temp_agg,abun_agg
0,"[0.0, 10.0)",0.0,36.000000,0.000000
1,"[0.0, 10.0)",5.0,35.142857,0.000000
2,"[0.0, 10.0)",6.0,35.285714,0.000009
3,"[0.0, 10.0)",7.0,35.142857,0.000009
4,"[0.0, 10.0)",12.0,35.500000,0.011006
...,...,...,...,...
141240,"[4200.0, 4210.0)",95.0,1048.000000,0.000017
141241,"[4200.0, 4210.0)",96.0,1046.000000,0.000017
141242,"[4200.0, 4210.0)",97.0,1048.000000,0.000069
141243,"[4200.0, 4210.0)",98.0,1048.000000,0.000034


In [31]:
ht['check'] = ht.groupby(['time_bin', 'temp', 'm/z'])['abun_minsub_scaled'].transform('nunique')
ht[ht.check > 1]

Unnamed: 0,time_bin,temp,m/z,abun_minsub_scaled,check
0,"[0.0, 10.0)",32.262,0.0,1.891701e-03,2
1,"[0.0, 10.0)",32.262,1.0,2.691521e-03,2
2,"[0.0, 10.0)",32.262,2.0,1.061821e-03,2
3,"[0.0, 10.0)",32.262,3.0,1.571066e-03,2
5,"[0.0, 10.0)",32.262,5.0,1.126072e-04,2
...,...,...,...,...,...
195,"[0.0, 10.0)",32.262,95.0,5.618730e-07,2
196,"[0.0, 10.0)",32.262,96.0,9.059254e-07,2
197,"[0.0, 10.0)",32.262,97.0,5.129891e-07,2
198,"[0.0, 10.0)",32.262,98.0,1.641385e-06,2


In [None]:
data = np.zeros(())
labels = np.empty(())