# DL TRAINING NOTEBOOK

## Environment

In [1]:
# (Re)load the autoreload extension; mode 2 picks up edits to imported modules live
%reload_ext autoreload
%autoreload 2

# Render matplotlib figures inside the notebook, using the 'retina' figure format
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [2]:
# Change main system path to be able to run code from src folder
import sys


def resolve_main_path(path):
    """Return `path` with a trailing notebook-folder suffix stripped.

    The notebook may be launched from `notebooks` or `techdoc/content`
    (with either `/` or `\\` as separator). If `path` ends with one of those
    suffixes, the suffix is removed; otherwise `path` is returned unchanged,
    so `main_path` is always defined.
    """
    suffixes = (
        # Mac OS
        '/notebooks',
        '/techdoc/content',
        # Windows OS (backslashes fully escaped to avoid the invalid `\c` escape)
        '\\notebooks',
        '\\techdoc\\content',
    )
    for suffix in suffixes:
        if path.endswith(suffix):
            return path[:-len(suffix)]
    return path


p = sys.path[0]
main_path = resolve_main_path(p)

sys.path[0] = main_path

In [3]:
import gc, itertools
from termcolor import colored
from tqdm import tqdm

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss, f1_score
from sklearn.model_selection import cross_val_predict
from sklearn.neighbors import KNeighborsClassifier
import xgboost as xgb

from src import (config, describe_data, features,
                 preprocess, training)

  from pandas import MultiIndex, Int64Index


# DATA PREPARATION

In [4]:
# ===== LOAD DATA ======
# Read each CSV from config.DATA_DIR and print its shape as a quick sanity check.
metadata = pd.read_csv(config.DATA_DIR + 'metadata.csv')
print(f'Metadata: {metadata.shape}')

train_labels = pd.read_csv(config.DATA_DIR + 'train_labels.csv')
print(f'Train labels: {train_labels.shape}')

valid_labels = pd.read_csv(config.DATA_DIR + 'val_labels.csv')
# Label fixed: this line reports the validation labels, not the training ones
print(f'Valid labels: {valid_labels.shape}')

submission = pd.read_csv(config.DATA_DIR + 'submission_format.csv')
print(f'Submission: {submission.shape}')

Metadata: (1570, 5)
Train labels: (766, 11)
Train labels: (293, 11)
Submission: (804, 11)


In [5]:
# ===== MODEL SAMPLES ======
train = metadata[metadata.split == 'train'].copy().reset_index(drop=True)
print(f'TRAIN: {train.shape}')

valid = metadata[metadata.split == 'val'].copy().reset_index(drop=True)
print(f'VALID: {valid.shape}')

test = metadata[metadata.split == 'test'].copy().reset_index(drop=True)
print(f'TEST: {test.shape}')

TRAIN: (766, 5)
VALID: (293, 5)
TEST: (511, 5)


In [6]:
# ===== FILE PATHS OF SAMPLES =====
train_files = metadata[metadata.split == 'train']['features_path'].to_dict()
valid_files = metadata[metadata.split == 'val']['features_path'].to_dict()
test_files = metadata[metadata.split == 'test']['features_path'].to_dict()
all_test_files = valid_files.copy()
all_test_files.update(test_files)

ion_list = list(np.arange(0,100,1.0))

# Get the names of the target columns in a list
target_labels_list = [i for i in train_labels.columns if i not in ['sample_id']]
print(target_labels_list)

['basalt', 'carbonate', 'chloride', 'iron_oxide', 'oxalate', 'oxychlorine', 'phyllosilicate', 'silicate', 'sulfate', 'sulfide']


# DEEP LEARNING PREPROCESSING

We need to create a 3D array, where the 1st dimension is the samples, the 2nd dimension is the features and the 3rd dimension is the time steps. Since the raw time steps repeat for each ion type, i.e. `m/z`, we need to construct features with respect to the ion type. So we will have `1.0_temp`, `1.0_abundance`, `2.0_temp`, `2.0_abundance`, etc. Furthermore, time is measured in seconds, and the time steps are neither uniformly spaced nor the same across samples: different samples are measured at different times. Hence, we will first compute the maximum time present in the training, validation and test samples and then aggregate all the data into 10-second time intervals. This will provide a full data array instead of a sparse one. It will also significantly reduce the size of the training data.

In [12]:
# COMPUTE MAXIMUM TIME ACROSS ALL SAMPLES
# The full metadata table is passed to the project helper, which does the computation.
max_time = preprocess.compute_max_time_samples(metadata)
# Bare last expression so the notebook displays the value
max_time

100%|██████████| 2/2 [00:00<00:00, 19.63it/s]

1971.66
2260.0





2260.0

In [47]:
# Load one sample explicitly so this cell (and the inspection cells after it)
# does not depend on an `ht` left over from out-of-order execution.
# NOTE(review): the sample originally inspected is not recorded; the first
# training sample is used here as a representative - confirm.
ht = preprocess.get_sample(metadata, next(iter(train_files)))

# Total number of rows in the sample
print(f'Rows: {ht.shape[0]}')
# Rows per `m/z` value (first distinct per-ion count)
print(f'Ion rows: {ht.groupby("m/z")["m/z"].agg("count").unique()[0]}')

Rows: 19107
Ion rows: 193


**Are the time steps and `m/z` values unique?**

In [48]:
# Show the first rows of the sample after ordering by `time`
ht.sort_values(by = ['time']).head()

Unnamed: 0,time,temp,m/z,abundance,abundance_minsub,abun_minsub_scaled
0,0.0,35.289,0.0,5.550957e-11,5.77582e-12,0.000172
73,0.0,35.289,73.0,2.030804e-14,4.626072e-14,1e-06
72,0.0,35.289,72.0,3.067091e-14,5.876128e-14,2e-06
71,0.0,35.289,71.0,8.983677e-14,8.921078e-14,3e-06
70,0.0,35.289,70.0,6.724622e-14,7.834999e-14,2e-06


In [49]:
# Number of rows sharing each (time, m/z) pair, stored on the frame as `check`
pair_counts = ht.groupby(['time', 'm/z'])['time'].transform('count')
ht['check'] = pair_counts

# Any row listed here belongs to a (time, m/z) pair that occurs more than once
ht.loc[ht['check'] > 1]

Unnamed: 0,time,temp,m/z,abundance,abundance_minsub,abun_minsub_scaled,check


In [35]:
# Scan every training sample, tracking the largest `nrows` seen and collecting
# the samples whose `time` values repeat.
# NOTE(review): uniqueness is tested on `time` alone, not on (`time`, `m/z`)
# pairs - confirm this is the intended check.
no_time_steps = 0
duplicates = []
for s in tqdm(train_files):

    ht = preprocess.get_sample(metadata, s)
    ht1 = ht[['time']].drop_duplicates()
    if ht.shape[0] != ht1.shape[0]:
        # Repeated time values: record the sample and skip the max update, so
        # `nrows` is never read while unbound or stale from an earlier sample.
        duplicates.append(s)
        continue

    # First distinct value among the per-`time` row counts
    nrows = ht.groupby("time")["m/z"].agg("count").unique()[0]
    if nrows > no_time_steps:
        no_time_steps = nrows

print(no_time_steps)
print(duplicates)

100%|██████████| 766/766 [00:17<00:00, 43.42it/s]

1401
[]



