# TRAINING NOTEBOOK

## Environment

In [1]:
# (Re)load the autoreload extension; mode 2 re-imports all modules before each
# cell executes, so edits to the project's src modules are picked up without
# restarting the kernel.
%reload_ext autoreload
%autoreload 2

# Render matplotlib figures inside the notebook, using the high-resolution
# 'retina' figure format.
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [2]:
# Change main system path to be able to run code from src folder
import sys

p = sys.path[0]

# Fall back to the unmodified entry so `main_path` is always defined, even when
# the notebook is launched from a directory that is not listed below.
main_path = p

# Launch directories to strip from the end of the path, in both Mac OS ('/')
# and Windows ('\') separator flavours. Every backslash is escaped explicitly
# so no literal relies on an unrecognised escape sequence.
for launch_dir in ('/notebooks', '/techdoc/content',
                   '\\notebooks', '\\techdoc\\content'):
    if p.endswith(launch_dir):
        # Drop the trailing launch directory to obtain the project root
        main_path = p[:-len(launch_dir)]
        break

sys.path[0] = main_path

In [19]:
import gc, itertools
from termcolor import colored
from tqdm import tqdm

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss, f1_score
from sklearn.model_selection import cross_val_predict
from sklearn.neighbors import KNeighborsClassifier
import xgboost as xgb

from src import (config, describe_data, features,
                 preprocess, training)

# DATA PREPARATION

In [4]:
# ===== LOAD DATA ======
# Read the three competition tables from config.DATA_DIR first ...
metadata = pd.read_csv(config.DATA_DIR + 'metadata.csv')
train_labels = pd.read_csv(config.DATA_DIR + 'train_labels.csv')
submission = pd.read_csv(config.DATA_DIR + 'submission_format.csv')

# ... then report their shapes as a quick sanity check on what was loaded.
print(f'Metadata: {metadata.shape}')
print(f'Train labels: {train_labels.shape}')
print(f'Submission: {submission.shape}')

Metadata: (1570, 5)
Train labels: (766, 11)
Submission: (804, 11)


In [5]:
# ===== MODEL SAMPLES ======
# Slice the metadata by its `split` column into one frame per split; every
# frame is an independent copy with a fresh 0-based index.
train = metadata.loc[metadata['split'] == 'train'].copy().reset_index(drop=True)
valid = metadata.loc[metadata['split'] == 'val'].copy().reset_index(drop=True)
test = metadata.loc[metadata['split'] == 'test'].copy().reset_index(drop=True)

# Row/column counts per split
print(f'TRAIN: {train.shape}')
print(f'VALID: {valid.shape}')
print(f'TEST: {test.shape}')

TRAIN: (766, 5)
VALID: (293, 5)
TEST: (511, 5)


In [10]:
# ===== FILE PATHS OF SAMPLES =====
# Map each metadata row label to its `features_path`, one dict per split.
train_files = metadata.loc[metadata['split'] == 'train', 'features_path'].to_dict()
valid_files = metadata.loc[metadata['split'] == 'val', 'features_path'].to_dict()
test_files = metadata.loc[metadata['split'] == 'test', 'features_path'].to_dict()

# Validation entries first, then test entries, merged into a single mapping
all_test_files = {**valid_files, **test_files}

# Get the names of the target columns in a list
# (every column of the labels table except the sample identifier)
target_labels_list = [col for col in train_labels.columns if col != 'sample_id']
print(target_labels_list)

['basalt', 'carbonate', 'chloride', 'iron_oxide', 'oxalate', 'oxychlorine', 'phyllosilicate', 'silicate', 'sulfate', 'sulfide']


# FEATURE ENGINEERING

In the benchmark notebook the features are discretized into bins of 100 degrees.

In [None]:
#TODO How long does it take for temperature to rise to certain level where we get abundance?
#TODO How to account for the non-linear differences of time vs temp?
#TODO Each type of ion as a feature, make a matrix and append to the training dataset
#TODO Max value of abundance of each ion - take temp reading and time - or duration
#TODO How to define characteristic of peaks in ion abundances; temp at which they occur, the shape of peaks (height, width, area)
#TODO How to deal with overlapping peaks
#TODO Number of peaks, size of the peaks
#TODO Can we do some target encoding?

**Average temperature per sample**

In [None]:
# Average temperature per sample
# Run features.avg_temp_sample on each split together with that split's own
# file map, then rebind the split names to the returned frames.
train, valid, test = (
    features.avg_temp_sample(split_df, split_files)
    for split_df, split_files in (
        (train, train_files),
        (valid, valid_files),
        (test, test_files),
    )
)

**Compute min and max temp for all available samples, including the ion types**

In [None]:
# Get min/max temperature values and unique ion list, computed from the full
# `metadata` table (train, val and test rows alike).
# NOTE(review): the helper presumably preprocesses each sample to remove noise
# before the calculation; confirm in src/preprocess.
# The call is slow: the recorded run needed about 1.5 minutes for 1570 files.
min_temp, max_temp, ion_list = preprocess.compute_min_max_temp_ion(metadata)
print(colored(f'Min temp = {min_temp}; Max temp = {max_temp}', 'blue'))
print(colored(f'Number of unqiue ions: {len(ion_list)}', 'blue'))

100%|██████████| 1570/1570 [01:35<00:00, 16.51it/s]

[34mMin temp = -63.563; Max temp = 1484.13[0m
[34mNumber of unqiue ions: 99[0m





## Max relative abundance per temp bin and ion type

Bin the temp from min to max in intervals of 100 degrees and combine with the ion type i.e. `m/z`. For each combination compute the maximum relative abundance.

In [9]:
# ===== TRAIN SAMPLE =====
# Build the feature matrix for the training files only. The log labels name the
# train split explicitly so this output cannot be confused with the val/test
# cell that follows.
print(f'Number of train files: {len(train_files)}')
fts_maxrelabund_tempion = features.features_iontemp_abun(metadata, train_files)
print(f'Train features: {fts_maxrelabund_tempion.shape}')
# Replace NaN entries with 0 so the models receive a fully numeric matrix
fts_maxrelabund_tempion = fts_maxrelabund_tempion.replace(np.nan, 0)

Number of all test files: 766
Number of samples: 766
Val-Test features: (766, 1584)


In [14]:
# ===== VALID & TEST SAMPLE =====
# Validation and test files are featurised together into one matrix; NaN
# entries are replaced with 0 right after the features are computed.
print(f'Number of all test files: {len(all_test_files)}')
fts_maxrelabund_tempion_VT = (
    features.features_iontemp_abun(metadata, all_test_files)
    .replace(to_replace=np.nan, value=0)
)
print(f'Val-Test features: {fts_maxrelabund_tempion_VT.shape}')

Number of all test files: 804
Number of samples: 804
Val-Test features: (804, 1584)


## Duration to max temperature per ion type

Ion types are presented in columns, rows are samples, and values are the time in seconds to reach the max temperature.

# MODELS

In [21]:
# Data frame to save local CV results: one row per target label. Each model
# evaluated in the cells below adds its own column of per-label log loss.
models_log_loss = pd.DataFrame(index=target_labels_list)

## Logistic Regression - Benchmark

In [16]:
# ===== TRAIN MODEL - CROSS VALIDATION =====
# L1-penalised logistic regression, seeded from the project config.
clf = LogisticRegression(
    penalty="l1",
    solver="liblinear",
    C=10,
    random_state=config.RANDOM_SEED,
)

# 10-fold cross-validation over the target labels, scored with log loss.
# The result maps each label name to its loss.
logloss_LR = training.trainCV_label(
    X=fts_maxrelabund_tempion,
    df_y=train_labels,
    target=target_labels_list,
    cv_folds=10,
    model_metric=log_loss,
    clf=clf,
)

# Log the per-label scores as this model's column in the summary table
models_log_loss['LRbench'] = models_log_loss.index.map(logloss_LR)

# Print results
print(colored(f'Average Log Loss: {np.round(np.mean(list(logloss_LR.values())), 4)}', 'blue'))
print('Log Loss per Label:')
logloss_LR

[34mAverage Log Loss: 0.2759[0m
Log Loss per Label:


{'basalt': 0.27678260590883075,
 'carbonate': 0.2612545003933846,
 'chloride': 0.2842979960325033,
 'iron_oxide': 0.3824699336087238,
 'oxalate': 0.004059239489520996,
 'oxychlorine': 0.2991500982787292,
 'phyllosilicate': 0.4074758334090647,
 'silicate': 0.35168474392654814,
 'sulfate': 0.34471883287139204,
 'sulfide': 0.14713584783404984}

In [17]:
# ===== TRAIN FULL MODEL ===== 
# Presumably fits on the whole training feature matrix, with no held-out fold.
# The result is looked up by target name in the prediction cell, so it holds
# one fitted classifier per label.
# NOTE(review): the estimator behind 'LR' is built inside
# training.train_full_model; confirm that it uses the same hyper-parameters as
# the cross-validated classifier.
clf_LRr = training.train_full_model(X = fts_maxrelabund_tempion,
                                    df_y = train_labels,
                                    target = target_labels_list,
                                    model_algo = 'LR')

In [18]:
# ===== PREDICT SUBMISSION =====
# Start from the submission template (indexed by sample_id) and fill each label
# column with column 1 of that label's predict_proba output on the val/test
# feature matrix.
submission = pd.read_csv(config.DATA_DIR + 'submission_format.csv', index_col='sample_id')
for target, clf in clf_LRr.items():
    submission[target] = clf.predict_proba(fts_maxrelabund_tempion_VT)[:, 1]

# ===== SAVE SUBMISSION =====
submission.to_csv(config.MODELS_DIR + 'LRr.csv')

# Preview the first predictions
submission.head()

Unnamed: 0_level_0,basalt,carbonate,chloride,iron_oxide,oxalate,oxychlorine,phyllosilicate,silicate,sulfate,sulfide
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
S0766,1.868003e-07,6.2e-05,4e-06,1.125625e-07,5.2e-05,9e-05,8.296115e-08,0.998771,2e-06,2e-06
S0767,0.2259753,0.261281,0.007587,0.3353358,0.000132,0.226904,0.01458527,0.879255,0.05794,0.00974
S0768,0.5743717,0.508643,0.000804,0.2181478,0.000763,0.277165,0.8997608,0.685684,0.249768,0.002388
S0769,0.01895572,0.09202,0.042351,0.07053178,8.3e-05,0.97736,0.1079748,0.086535,0.370443,0.003093
S0770,0.0008417039,0.008442,0.048491,0.3099149,0.000358,0.862539,0.5699419,0.003204,0.001448,0.019214


## XGBoost

- numerical data needs to be scaled
- categorical data needs to be encoded

In [24]:
# Train the model
# XGBoost binary classifier evaluated with the same 10-fold, per-label
# cross-validation helper as the logistic-regression benchmark.
clf = xgb.XGBClassifier(
    objective="binary:logistic",
    use_label_encoder=False,
    eval_metric='logloss',
)
logloss_XGB = training.trainCV_label(
    X=fts_maxrelabund_tempion,
    df_y=train_labels,
    target=target_labels_list,
    cv_folds=10,
    model_metric=log_loss,
    clf=clf,
)

# Log the per-label scores as this model's column in the summary table
models_log_loss['XGBoost'] = models_log_loss.index.map(logloss_XGB)

  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.co

In [None]:
# Report the cross-validated log loss of the XGBoost model: first the mean over
# all labels (rounded to 4 decimals), then the per-label values as the cell's
# displayed result.
print(colored(f'Average Log Loss: {np.round(np.mean(list(logloss_XGB.values())), 4)}', 'blue'))
print('Log Loss per Label:')
logloss_XGB

[34mAverage Log Loss: 0.1934[0m
Log Loss per Label:


{'basalt': 0.22565078102787242,
 'carbonate': 0.13225618815649615,
 'chloride': 0.22720553328986837,
 'iron_oxide': 0.254578396456081,
 'oxalate': 0.027906252811262605,
 'oxychlorine': 0.20850854126821824,
 'phyllosilicate': 0.29730870486992084,
 'silicate': 0.2575651139000811,
 'sulfate': 0.2245971262183839,
 'sulfide': 0.07831794841140914}

In [None]:
# ===== TRAIN FULL MODEL =====
# One XGBoost classifier per target label, keyed by the label name.
clf_XGB = training.train_full_model(
    X=fts_maxrelabund_tempion,
    df_y=train_labels,
    target=target_labels_list,
    model_algo='XGB',
)

# ===== PREDICT SUBMISSION =====
# Fill the submission template (indexed by sample_id) label by label with
# column 1 of each model's predict_proba output on the val/test features.
submission_xgb = pd.read_csv(config.DATA_DIR + 'submission_format.csv', index_col='sample_id')
for target, clf in clf_XGB.items():
    submission_xgb[target] = clf.predict_proba(fts_maxrelabund_tempion_VT)[:, 1]

# ===== SAVE SUBMISSION =====
submission_xgb.to_csv(config.MODELS_DIR + 'XGB.csv')

# Preview the first predictions
submission_xgb.head()

## Model Summary

In [23]:
# Per-label log loss of every model logged so far, one column per model
models_log_loss

Unnamed: 0,LRbench
basalt,0.276783
carbonate,0.261255
chloride,0.284298
iron_oxide,0.38247
oxalate,0.004059
oxychlorine,0.29915
phyllosilicate,0.407476
silicate,0.351685
sulfate,0.344719
sulfide,0.147136
