# TRAINING NOTEBOOK

## Environment

In [1]:
%reload_ext autoreload
%autoreload 2

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [2]:
# Change main system path to be able to run code from src folder
import sys
p = sys.path[0]
# Mac OS
if sys.path[0].endswith('/notebooks'):
    main_path = p[:-len('/notebooks')]
if sys.path[0].endswith('/techdoc/content'):
    main_path = p[:-len('/techdoc/content')]
    
# Windows OS
if sys.path[0].endswith('\\notebooks'): 
    main_path = p[:-len('\\notebooks')]
if sys.path[0].endswith('\\techdoc\content'): 
    main_path = p[:-len('\\techdoc\content')]

sys.path[0] = main_path

In [5]:
import os, gc
from termcolor import colored
from tqdm import tqdm

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from sklearn.metrics import log_loss

from scipy.signal import find_peaks
from scipy.ndimage.filters import gaussian_filter1d

from src import (config, features, preprocess, training)

  from pandas import MultiIndex, Int64Index


# DATA PREPARATION

In [6]:
# ===== LOAD DATA ======
metadata = pd.read_csv(config.DATA_DIR_OUT + 'metadata.csv', index_col='sample_id')
print(f'Metadata: {metadata.shape}')

train_labels = pd.read_csv(config.DATA_DIR + 'train_labels.csv')
print(f'Train labels: {train_labels.shape}')

valid_labels = pd.read_csv(config.DATA_DIR + 'val_labels.csv')
print(f'Train labels: {valid_labels.shape}')

# Combine train and valid labels
trvl_labels = pd.concat([train_labels, valid_labels], axis = 0)

submission = pd.read_csv(config.DATA_DIR + 'submission_format.csv')
print(f'Submission: {submission.shape}')

Metadata: (1570, 7)
Train labels: (766, 11)
Train labels: (293, 11)
Submission: (804, 11)


In [7]:
# ===== FILE PATHS OF SAMPLES =====
train_files = metadata[metadata.split == 'train']['features_path'].to_dict()
valid_files = metadata[metadata.split == 'val']['features_path'].to_dict()
test_files = metadata[metadata.split == 'test']['features_path'].to_dict()
# Train & Valid files
trva_files = train_files.copy()
trva_files.update(valid_files)
# All files
all_test_files = valid_files.copy()
all_test_files.update(test_files)

# Ion type list
ion_list = list(np.arange(0,100,1.0))
ion_list.remove(4.0)

# Get the names of the target columns in a list
target_labels_list = [i for i in train_labels.columns if i not in ['sample_id']]
print(target_labels_list)

['basalt', 'carbonate', 'chloride', 'iron_oxide', 'oxalate', 'oxychlorine', 'phyllosilicate', 'silicate', 'sulfate', 'sulfide']


# MODELS

**CHECK NULL BEFORE TRAINING**

In [24]:
# Data frame to save local CV results
models_log_loss = pd.DataFrame(index=target_labels_list)

## LR - `fts_maxrelabund_tempion`

**Temp_bin & Ion -> max relative abundance == 1584 features** 

In [48]:
train_cv_loss_LR, submission_LR = training.train_tbl(
    df_train='fts_maxrelabund_tempion',
    df_labels=train_labels,
    target_list=target_labels_list,
    df_test='fts_maxrelabund_tempion_VT',
    model_algo='LR_reg',
    sub_name='LR_reg'
    )
models_log_loss['LR_reg'] = models_log_loss.index.map(train_cv_loss_LR)

[34m
Average Log Loss: 0.2759[0m
Log Loss per Label:
{'basalt': 0.2767826059088304, 'carbonate': 0.26125450039338877, 'chloride': 0.28429799603250216, 'iron_oxide': 0.38246993360871934, 'oxalate': 0.004059239489521142, 'oxychlorine': 0.29915009827872885, 'phyllosilicate': 0.4074758334090677, 'silicate': 0.3516847439265464, 'sulfate': 0.3447188328713945, 'sulfide': 0.14713584783404265}


In [53]:
train_cv_loss_LR_trvl, train_full_clf_LR_trvl, submission_LR_trvl = training.train_tbl(
    df_train='fts_maxrelabund_tempion_trvl',
    df_labels=trvl_labels,
    target_list=target_labels_list,
    df_test='fts_maxrelabund_tempion_VT',
    model_algo='LR_reg',
    sub_name='LR_reg_trvl'
    )
models_log_loss['LR_reg_trvl'] = models_log_loss.index.map(train_cv_loss_LR_trvl)

[34m
Average Log Loss: 0.2602[0m
Log Loss per Label:
{'basalt': 0.2615261042107989, 'carbonate': 0.22292052739226756, 'chloride': 0.25846765369554275, 'iron_oxide': 0.37216113777026905, 'oxalate': 0.004914672949143357, 'oxychlorine': 0.25263015825669405, 'phyllosilicate': 0.4112433512412318, 'silicate': 0.34987021450556754, 'sulfate': 0.3228680540360715, 'sulfide': 0.14532854619106556}


### Linear Regression

In [49]:
train_cv_loss_LR, submission_LR = training.train_tbl(
    df_train='fts_maxrelabund_tempion_lr',
    df_labels=train_labels,
    target_list=target_labels_list,
    df_test='fts_maxrelabund_tempion_VT_lr',
    model_algo='LR_reg',
    sub_name='LR_reg_lr'
    )
models_log_loss['LR_reg_lr'] = models_log_loss.index.map(train_cv_loss_LR)

[34m
Average Log Loss: 0.2788[0m
Log Loss per Label:
{'basalt': 0.26748988134122215, 'carbonate': 0.24005578135351563, 'chloride': 0.23575384353463966, 'iron_oxide': 0.4164447998202377, 'oxalate': 0.004248954851441749, 'oxychlorine': 0.3199455835682433, 'phyllosilicate': 0.40749725792249, 'silicate': 0.34542547747723484, 'sulfate': 0.4043068189731069, 'sulfide': 0.14690431474216825}


### Polynomial

In [47]:
train_cv_loss_LR, submission_LR = training.train_tbl(
    df_train='fts_maxrelabund_tempion_poly',
    df_labels=train_labels,
    target_list=target_labels_list,
    df_test='fts_maxrelabund_tempion_VT_poly',
    model_algo='LR_reg',
    sub_name='LR_reg_poly'
    )
models_log_loss['LR_reg_poly'] = models_log_loss.index.map(train_cv_loss_LR)

[34m
Average Log Loss: 0.2778[0m
Log Loss per Label:
{'basalt': 0.2660464253551837, 'carbonate': 0.23738946461526642, 'chloride': 0.23622180218788214, 'iron_oxide': 0.4154154449459143, 'oxalate': 0.004547983380019803, 'oxychlorine': 0.3200795785524429, 'phyllosilicate': 0.40705583312477006, 'silicate': 0.3423841732823722, 'sulfate': 0.40278334772404295, 'sulfide': 0.1457341782161859}


## XGBopt - `fts_maxrelabund_tempion`

- numerical data needs to be scaled
- categorical data needs to be encoded

In [27]:
# 8m 37.2s
train_cv_loss_XGB_opt, submission_XGB_opt = training.train_tbl(
    df_train='fts_maxrelabund_tempion',
    df_labels=train_labels,
    target_list=target_labels_list,
    df_test='fts_maxrelabund_tempion_VT',
    model_algo='XGB_opt',
    sub_name='XGB_opt'
    )
models_log_loss['XGB_opt'] = models_log_loss.index.map(train_cv_loss_XGB_opt)

  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.co

[34m
Average Log Loss: 0.1746[0m
Log Loss per Label:
{'basalt': 0.19665137401195665, 'carbonate': 0.12102018407547, 'chloride': 0.2043376334408594, 'iron_oxide': 0.22611550844240597, 'oxalate': 0.029272685140647313, 'oxychlorine': 0.19420209851804687, 'phyllosilicate': 0.25951556825253963, 'silicate': 0.23317893970805786, 'sulfate': 0.20317408057798922, 'sulfide': 0.07880694186687044}


In [50]:
# 8m 37.2s
train_cv_loss_XGB_opt, submission_XGB_opt = training.train_tbl(
    df_train='fts_maxrelabund_tempion_poly',
    df_labels=train_labels,
    target_list=target_labels_list,
    df_test='fts_maxrelabund_tempion_VT_poly',
    model_algo='XGB_opt',
    sub_name='XGB_opt_poly'
    )
models_log_loss['XGB_opt_poly'] = models_log_loss.index.map(train_cv_loss_XGB_opt)

  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.co

[34m
Average Log Loss: 0.1867[0m
Log Loss per Label:
{'basalt': 0.20491202031886369, 'carbonate': 0.15175476310061925, 'chloride': 0.2086519137271178, 'iron_oxide': 0.2521399197596492, 'oxalate': 0.03576979512795852, 'oxychlorine': 0.2047783184810855, 'phyllosilicate': 0.2636692635704544, 'silicate': 0.21231888146239047, 'sulfate': 0.25242325280503286, 'sulfide': 0.08107383658978815}


In [28]:
# 12m2.7s
train_cv_loss_XGB_opt_trvl, submission_XGB_opt_trvl = training.train_tbl(
    df_train='fts_maxrelabund_tempion_trvl',
    df_labels=trvl_labels,
    target_list=target_labels_list,
    df_test='fts_maxrelabund_tempion_VT',
    model_algo='XGB_opt',
    sub_name='XGB_opt_trvl'
    )
models_log_loss['XGB_opt_trvl'] = models_log_loss.index.map(train_cv_loss_XGB_opt_trvl)

  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.co

[34m
Average Log Loss: 0.1493[0m
Log Loss per Label:
{'basalt': 0.16030390276618628, 'carbonate': 0.09299784144611556, 'chloride': 0.17099937928281922, 'iron_oxide': 0.2110446946926557, 'oxalate': 0.016176140634807434, 'oxychlorine': 0.15975673521358977, 'phyllosilicate': 0.2392929120695717, 'silicate': 0.18674152896437207, 'sulfate': 0.17355651791774612, 'sulfide': 0.08166643602891402}


In [51]:
# 12m2.7s
train_cv_loss_XGB_opt_trvl, submission_XGB_opt_trvl = training.train_tbl(
    df_train='fts_maxrelabund_tempion_trvl_poly',
    df_labels=trvl_labels,
    target_list=target_labels_list,
    df_test='fts_maxrelabund_tempion_VT_poly',
    model_algo='XGB_opt',
    sub_name='XGB_opt_trvl_poly'
    )
models_log_loss['XGB_opt_trvl_poly'] = models_log_loss.index.map(train_cv_loss_XGB_opt_trvl)

  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.co

[34m
Average Log Loss: 0.1611[0m
Log Loss per Label:
{'basalt': 0.1809933507075191, 'carbonate': 0.10953866252001529, 'chloride': 0.1838595883222579, 'iron_oxide': 0.22075075528610463, 'oxalate': 0.01758936392041963, 'oxychlorine': 0.14532258846057083, 'phyllosilicate': 0.26300033716493276, 'silicate': 0.2142458518303026, 'sulfate': 0.1911246118167924, 'sulfide': 0.0840874148031043}


## SVC - `fts_maxrelabund_tempion`

In [11]:
train_cv_loss_SVC, submission_SVC = training.train_tbl(
    df_train='fts_maxrelabund_tempion',
    df_labels=train_labels,
    target_list=target_labels_list,
    df_test='fts_maxrelabund_tempion_VT',
    model_algo='SVC',
    sub_name='SVC'
    )
models_log_loss['SVC'] = models_log_loss.index.map(train_cv_loss_SVC)

[34m
Average Log Loss: 0.2405[0m
Log Loss per Label:
{'basalt': 0.25027464139722094, 'carbonate': 0.19458859884948693, 'chloride': 0.28062636163936616, 'iron_oxide': 0.3538731851139628, 'oxalate': 0.00708139225050123, 'oxychlorine': 0.25229495731369245, 'phyllosilicate': 0.36509738998392394, 'silicate': 0.2942221981876629, 'sulfate': 0.29006275212335353, 'sulfide': 0.11674816953592974}


In [12]:
train_cv_loss_SVC_trvl, submission_SVC_trvl = training.train_tbl(
    df_train='fts_maxrelabund_tempion_trvl',
    df_labels=trvl_labels,
    target_list=target_labels_list,
    df_test='fts_maxrelabund_tempion_VT',
    model_algo='SVC',
    sub_name='SVC_trvl'
    )
models_log_loss['SVC_trvl'] = models_log_loss.index.map(train_cv_loss_SVC_trvl)

[34m
Average Log Loss: 0.2292[0m
Log Loss per Label:
{'basalt': 0.23301198473378343, 'carbonate': 0.1804529507851266, 'chloride': 0.2908102573239602, 'iron_oxide': 0.3614053881391047, 'oxalate': 0.0039472315136384025, 'oxychlorine': 0.233017319694651, 'phyllosilicate': 0.34388104053244295, 'silicate': 0.27064062845939874, 'sulfate': 0.2701950402292935, 'sulfide': 0.10447484670029986}


## XGBopt - `combo_maxabund_peaks`

In [22]:
train_cv_loss_XGB_peaks, submission_XGB_peaks = training.train_tbl(
    df_train='combo_maxabund_peaks',
    df_labels=train_labels,
    target_list=target_labels_list,
    df_test='combo_maxabund_peaks_VT',
    model_algo='XGB_opt',
    sub_name='XGB_opt_peaks'
    )
models_log_loss['XGBopt_peaks'] = models_log_loss.index.map(train_cv_loss_XGB_peaks)

  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.co

[34m
Average Log Loss: 0.1707[0m
Log Loss per Label:
{'basalt': 0.19504383100727152, 'carbonate': 0.12725561255970502, 'chloride': 0.20048671679044144, 'iron_oxide': 0.24008311371277063, 'oxalate': 0.012315083744781958, 'oxychlorine': 0.164854814627361, 'phyllosilicate': 0.2399570535617075, 'silicate': 0.23931971397608573, 'sulfate': 0.21280561231900258, 'sulfide': 0.07529303845939923}


In [15]:
train_cv_loss_XGB_tempb_peaks_opt_trvl, submission_XGB_tempb_peaks_opt_trvl =\
    training.train_tbl(
        df_train='combo_maxabund_peaks_trvl',
        df_labels=trvl_labels,
        target_list=target_labels_list,
        df_test='combo_maxabund_peaks_VT',
        model_algo='XGB_opt',
        sub_name='XGB_tempb_peaks_opt'
    )
models_log_loss['XGB_tempb_peaks_opt_trvl'] = models_log_loss.index.map(train_cv_loss_XGB_tempb_peaks_opt_trvl)

  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.co

[34m
Average Log Loss: 0.145[0m
Log Loss per Label:
{'basalt': 0.1498281400392832, 'carbonate': 0.09301901140051114, 'chloride': 0.16665983013362626, 'iron_oxide': 0.20769608078810645, 'oxalate': 0.014033538995205336, 'oxychlorine': 0.148409597606228, 'phyllosilicate': 0.22877625417536848, 'silicate': 0.19691387258807694, 'sulfate': 0.17159940715849445, 'sulfide': 0.07329880544762327}


## XGBopt - `combo_maxabund_peaks_slope`

In [16]:
# 29m 25.5s
train_cv_loss_XGB_tempb_peaks_slope_opt, submission_XGB_tempb_peaks_slope_opt =\
    training.train_tbl(
        df_train='combo_maxabund_peaks_slope',
        df_labels=train_labels,
        target_list=target_labels_list,
        df_test='combo_maxabund_peaks_slope_VT',
        model_algo='XGB_opt',
        sub_name='XGB_tempb_peaks_slope_opt'
    )
models_log_loss['XGB_tempb_peaks_slope_opt'] = models_log_loss.index.map(train_cv_loss_XGB_tempb_peaks_slope_opt)

  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.co

[34m
Average Log Loss: 0.1686[0m
Log Loss per Label:
{'basalt': 0.19520568177743663, 'carbonate': 0.12031231878582822, 'chloride': 0.1963325934969151, 'iron_oxide': 0.24169911216776438, 'oxalate': 0.012315083744781958, 'oxychlorine': 0.16530498244485917, 'phyllosilicate': 0.23245570364568774, 'silicate': 0.2323883510399649, 'sulfate': 0.21482440203037304, 'sulfide': 0.07531181614802744}


In [23]:
# 97m 49.4s
train_cv_loss_XGB_tempb_peaks_slope_opt_trvl, submission_XGB_tempb_peaks_slope_opt_trvl =\
    training.train_tbl(
        df_train='combo_maxabund_peaks_slope_trvl',
        df_labels=trvl_labels,
        target_list=target_labels_list,
        df_test='combo_maxabund_peaks_slope_VT',
        model_algo='XGB_opt',
        sub_name='XGB_tempb_peaks_slope_opt_trvl'
    )
models_log_loss['XGB_tempb_peaks_slope_opt_trvl'] = models_log_loss.index.map(train_cv_loss_XGB_tempb_peaks_slope_opt_trvl)

  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.co

[34m
Average Log Loss: 0.1434[0m
Log Loss per Label:
{'basalt': 0.14909855284917112, 'carbonate': 0.09245368205162423, 'chloride': 0.16391485702179903, 'iron_oxide': 0.2109987667602365, 'oxalate': 0.014033538995205336, 'oxychlorine': 0.14532247398525805, 'phyllosilicate': 0.22222365475676015, 'silicate': 0.19551551843464737, 'sulfate': 0.16645654836764276, 'sulfide': 0.0734885284485847}


## XGBopt - `combo_maxabund_peaks_slope_topions`

In [56]:
# 9m 44.2s
# TRAIN
train_cv_loss_XGB_tempb_peaks_slope_topi_opt, submission_XGB_tempb_peaks_slope_topi_opt =\
    training.train_tbl(
        df_train='combo_maxabund_peaks_slope_topions',
        df_labels=train_labels,
        target_list=target_labels_list,
        df_test='combo_maxabund_peaks_slope_topions_VT',
        model_algo='XGB_opt',
        sub_name='XGB_tempb_peaks_slope_topi_opt'
    )
models_log_loss['XGB_tempb_peaks_slope_topi_opt'] = models_log_loss.index.map(train_cv_loss_XGB_tempb_peaks_slope_topi_opt)

  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.co

[34m
Average Log Loss: 0.1676[0m
Log Loss per Label:
{'basalt': 0.19587480511132652, 'carbonate': 0.11631532745545028, 'chloride': 0.19634522138754695, 'iron_oxide': 0.23932652730115453, 'oxalate': 0.012315083744781958, 'oxychlorine': 0.16100005371269815, 'phyllosilicate': 0.2361309095439051, 'silicate': 0.22990574306708528, 'sulfate': 0.2134879802521291, 'sulfide': 0.07493350768998107}


In [57]:
# 13m 35.5s
# TRAIN & VALID
train_cv_loss_XGB_tempb_peaks_slope_topi_opt_trvl, submission_XGB_tempb_peaks_slope_topi_opt_trvl =\
    training.train_tbl(
        df_train='combo_maxabund_peaks_slope_topions_trvl',
        df_labels=trvl_labels,
        target_list=target_labels_list,
        df_test='combo_maxabund_peaks_slope_topions_VT',
        model_algo='XGB_opt',
        sub_name='XGB_tempb_peaks_slope_topi_opt_trvl'
    )
models_log_loss['XGB_tempb_peaks_slope_topi_opt_trvl'] = models_log_loss.index.map(train_cv_loss_XGB_tempb_peaks_slope_topi_opt_trvl)

  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.co

[34m
Average Log Loss: 0.1428[0m
Log Loss per Label:
{'basalt': 0.14837289364431888, 'carbonate': 0.08935366025632574, 'chloride': 0.1641920493095718, 'iron_oxide': 0.2105470783364448, 'oxalate': 0.014033538995205336, 'oxychlorine': 0.14633956037110943, 'phyllosilicate': 0.22254730416083085, 'silicate': 0.19505924932561924, 'sulfate': 0.16371391431687263, 'sulfide': 0.07384643618882182}


## XGBopt - `combo_maxabund_peaks_slope_topions_te`

**Target encode** top ions for each label. Include only the label target encoding for which the model is trained.

In [73]:
# 9m 44.2s
# TRAIN
train_cv_loss_XGB_tempb_peaks_slope_topi_te_opt,\
    submission_XGB_tempb_peaks_slope_topi_te_opt =\
    training.train_tbl(
        df_train='combo_maxabund_peaks_slope_topions',
        df_labels=train_labels,
        target_list=target_labels_list,
        df_test='combo_maxabund_peaks_slope_topions_VT',
        model_algo='XGB_opt',
        sub_name='XGB_tempb_peaks_slope_topi_te_opt',
        target_encode=True,
        target_encode_fts=['top_1', 'top_2', 'top_3']
    )
models_log_loss['XGB_tempb_peaks_slope_topi_te_opt'] = models_log_loss\
    .index.map(train_cv_loss_XGB_tempb_peaks_slope_topi_te_opt)

  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.co

[34m
Average Log Loss: 0.1678[0m
Log Loss per Label:
{'basalt': 0.19274603907812324, 'carbonate': 0.11563920676838421, 'chloride': 0.2005078832105099, 'iron_oxide': 0.24322337153313112, 'oxalate': 0.012353178938511063, 'oxychlorine': 0.16382996282952728, 'phyllosilicate': 0.23344093215526404, 'silicate': 0.22340114952256385, 'sulfate': 0.21946133703202428, 'sulfide': 0.0732406260807403}


In [76]:
# 
# TRAIN + VALID
train_cv_loss_XGB_tempb_peaks_slope_topi_te_opt_trvl,\
    submission_XGB_tempb_peaks_slope_topi_te_opt_trvl =\
    training.train_tbl(
        df_train='combo_maxabund_peaks_slope_topions_trvl',
        df_labels=trvl_labels,
        target_list=target_labels_list,
        df_test='combo_maxabund_peaks_slope_topions_VT',
        model_algo='XGB_opt',
        sub_name='XGB_tempb_peaks_slope_topi_te_opt_trvl',
        target_encode=True,
        target_encode_fts=['top_1', 'top_2', 'top_3']
    )
models_log_loss['XGB_tempb_peaks_slope_topi_te_opt_trvl'] = models_log_loss\
    .index.map(train_cv_loss_XGB_tempb_peaks_slope_topi_te_opt_trvl)

  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.co

[34m
Average Log Loss: 0.1427[0m
Log Loss per Label:
{'basalt': 0.14601733264823286, 'carbonate': 0.08943117590871308, 'chloride': 0.16551662260299882, 'iron_oxide': 0.2158353513162307, 'oxalate': 0.014033538995205336, 'oxychlorine': 0.14234565957881512, 'phyllosilicate': 0.22102569822708054, 'silicate': 0.19079992532647996, 'sulfate': 0.16972949563545386, 'sulfide': 0.07248662299128468}


## PCA-XGBo - `combo_maxabund_peaks_slope_topions`

In [17]:
# 
# TRAIN + VALID
train_cv_loss_XGB_tempb_peaks_slope_topi_pca_trvl,\
    submission_XGB_tempb_peaks_slope_topi_pca_trvl =\
    training.train_tbl(
        df_train='combo_maxabund_peaks_slope_topions_trvl',
        df_labels=trvl_labels,
        target_list=target_labels_list,
        df_test='combo_maxabund_peaks_slope_topions_VT',
        model_algo='PCA-XGB',
        sub_name='XGB_tempb_peaks_slope_topi_pca_trvl',
        target_encode=False
    )
models_log_loss['XGB_tempb_peaks_slope_topi_pca_trvl'] = models_log_loss\
    .index.map(train_cv_loss_XGB_tempb_peaks_slope_topi_pca_trvl)

[34m
Average Log Loss: 0.2708[0m
Log Loss per Label:
{'basalt': 0.26141569287499017, 'carbonate': 0.18869120319632987, 'chloride': 0.3346256489808875, 'iron_oxide': 0.41581886866870893, 'oxalate': 0.044638523910946575, 'oxychlorine': 0.32388255061460397, 'phyllosilicate': 0.42366090010568797, 'silicate': 0.28816146272501547, 'sulfate': 0.3220774466290682, 'sulfide': 0.10549359269568644}


# MODEL SUMMARY

In [29]:
model_summary.sort_values(by='overall')\
             .style.highlight_min(axis=0, 
                                  props='color:darkblue; background-color:lightblue;',
                                  subset=target_labels_list)\
                    .highlight_min(axis=0, 
                                   props='color:white; background-color:red;',
                                   subset=['overall'])

Unnamed: 0,basalt,carbonate,chloride,iron_oxide,oxalate,oxychlorine,phyllosilicate,silicate,sulfate,sulfide,overall
7,0.149099,0.092454,0.163915,0.210999,0.014034,0.145322,0.222224,0.195516,0.166457,0.073489,0.143351
4,0.149828,0.093019,0.16666,0.207696,0.014034,0.14841,0.228776,0.196914,0.171599,0.073299,0.145023
9,0.160304,0.092998,0.170999,0.211045,0.016176,0.159757,0.239293,0.186742,0.173557,0.081666,0.149254
5,0.195206,0.120312,0.196333,0.241699,0.012315,0.165305,0.232456,0.232388,0.214824,0.075312,0.168615
6,0.195044,0.127256,0.200487,0.240083,0.012315,0.164855,0.239957,0.23932,0.212806,0.075293,0.170741
8,0.196651,0.12102,0.204338,0.226116,0.029273,0.194202,0.259516,0.233179,0.203174,0.078807,0.174628
3,0.233012,0.180453,0.29081,0.361405,0.003947,0.233017,0.343881,0.270641,0.270195,0.104475,0.229184
2,0.250275,0.194589,0.280626,0.353873,0.007081,0.252295,0.365097,0.294222,0.290063,0.116748,0.240487
1,0.261526,0.222921,0.258468,0.372161,0.004915,0.25263,0.411243,0.34987,0.322868,0.145329,0.260193
0,0.276783,0.261255,0.284298,0.38247,0.004059,0.29915,0.407476,0.351685,0.344719,0.147136,0.275903


In [18]:
models_log_loss.style.highlight_min(axis=1, 
                                    props='color:darkblue; background-color:lightblue;')

NameError: name 'models_log_loss' is not defined

In [78]:
model_summary = models_log_loss.T.copy()
model_summary['overall'] = model_summary.mean(axis=1)
model_summary.sort_values(by='overall')\
             .style.highlight_min(axis=0, 
                                  props='color:darkblue; background-color:lightblue;',
                                  subset=target_labels_list)\
                    .highlight_min(axis=0, 
                                   props='color:white; background-color:red;',
                                   subset=['overall'])

Unnamed: 0,basalt,carbonate,chloride,iron_oxide,oxalate,oxychlorine,phyllosilicate,silicate,sulfate,sulfide,overall
XGB_tempb_peaks_slope_topi_te_opt_trvl,0.146017,0.089431,0.165517,0.215835,0.014034,0.142346,0.221026,0.1908,0.169729,0.072487,0.142722
XGB_tempb_peaks_slope_topi_opt_trvl,0.148373,0.089354,0.164192,0.210547,0.014034,0.14634,0.222547,0.195059,0.163714,0.073846,0.142801
XGB_tempb_peaks_slope_opt_trvl,0.149099,0.092454,0.163915,0.210999,0.014034,0.145322,0.222224,0.195516,0.166457,0.073489,0.143351
XGB_tempb_peaks_opt_trvl,0.149828,0.093019,0.16666,0.207696,0.014034,0.14841,0.228776,0.196914,0.171599,0.073299,0.145023
XGB_opt_trvl,0.160304,0.092998,0.170999,0.211045,0.016176,0.159757,0.239293,0.186742,0.173557,0.081666,0.149254
XGB_tempb_peaks_slope_topi_opt,0.195875,0.116315,0.196345,0.239327,0.012315,0.161,0.236131,0.229906,0.213488,0.074934,0.167564
XGB_tempb_peaks_slope_topi_te_opt,0.192746,0.115639,0.200508,0.243223,0.012353,0.16383,0.233441,0.223401,0.219461,0.073241,0.167784
XGB_tempb_peaks_slope_opt,0.195206,0.120312,0.196333,0.241699,0.012315,0.165305,0.232456,0.232388,0.214824,0.075312,0.168615
XGBopt_peaks,0.195044,0.127256,0.200487,0.240083,0.012315,0.164855,0.239957,0.23932,0.212806,0.075293,0.170741
XGB_opt,0.196651,0.12102,0.204338,0.226116,0.029273,0.194202,0.259516,0.233179,0.203174,0.078807,0.174628


In [31]:
model_summary.to_csv(os.path.join(config.MODELS_DIR + 'model_summary.csv'))

# Predict Validation on Individual Labels

Compute predictions for each label depending on which model performed the best for that label. Validate the results on the validation sample only since we have the labels.

In [62]:
submission_by_label = pd.read_csv(config.DATA_DIR + 'submission_format.csv', 
                             index_col='sample_id')
submission_by_label = submission_by_label
print(submission_by_label.shape)

log_loss_ind_label = {}

for label in target_labels_list:
    if label == 'oxalate':
        submission_by_label[label] = submission_LR_trvl[label]
    elif label in ['silicate']:
        submission_by_label[label] = submission_XGB_opt_trvl[label]
    elif label in ['iron_oxide', 'sulfide']:
        submission_by_label[label] = submission_XGB_tempb_peaks_opt_trvl[label]
    elif label in ['chloride', 'oxychlorine', 'phyllosilicate']:
        submission_by_label[label] = submission_XGB_tempb_peaks_slope_opt_trvl[label]
    else:
        submission_by_label[label] = submission_XGB_tempb_peaks_slope_topi_opt_trvl[label]
    
    # Compute log-loss
    ll = log_loss(valid_labels[label], submission_by_label.iloc[:valid_labels.shape[0]][label])
    log_loss_ind_label[label] = ll

submission_by_label.to_csv(config.MODELS_DIR + 'ind_label_trvl_slope_topions' + '.csv')

print(f'Average Log Loss: {np.mean(list(log_loss_ind_label.values()))}')
#log_loss_ind_label

(804, 10)
Average Log Loss: 0.010258730803071135


In [63]:
submission_by_label.head()

Unnamed: 0_level_0,basalt,carbonate,chloride,iron_oxide,oxalate,oxychlorine,phyllosilicate,silicate,sulfate,sulfide
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
S0766,0.001748,0.002859,0.003846,0.002066,9e-06,0.001891,0.007245,0.052359,0.002117,0.001138
S0767,0.014384,0.004987,0.001018,0.00269,3.8e-05,0.003281,0.011698,0.043701,0.007977,0.000664
S0768,0.973089,0.045625,0.001227,0.012095,0.000425,0.004729,0.931023,0.986137,0.017578,0.000879
S0769,0.001931,0.001206,0.00804,0.010399,2.5e-05,0.994446,0.005793,0.001311,0.949095,0.000843
S0770,0.001869,0.001278,0.009605,0.925608,0.00018,0.996009,0.992411,0.000837,0.001732,0.000852


In [64]:
all_models = models_log_loss.copy()
all_models['Ind_labels'] = all_models.index.map(log_loss_ind_label)
all_models = all_models.T
all_models['overall'] = all_models.mean(axis=1)
all_models.style.highlight_min(axis=0, 
                                  props='color:darkblue; background-color:lightblue;',
                                  subset=target_labels_list)\
                    .highlight_min(axis=0, 
                                   props='color:white; background-color:red;',
                                   subset=['overall'])

Unnamed: 0,basalt,carbonate,chloride,iron_oxide,oxalate,oxychlorine,phyllosilicate,silicate,sulfate,sulfide,overall
LR_reg,0.276783,0.261255,0.284298,0.38247,0.004059,0.29915,0.407476,0.351685,0.344719,0.147136,0.275903
LR_reg_trvl,0.261526,0.222921,0.258468,0.372161,0.004915,0.25263,0.411243,0.34987,0.322868,0.145329,0.260193
SVC,0.250275,0.194589,0.280626,0.353873,0.007081,0.252295,0.365097,0.294222,0.290063,0.116748,0.240487
SVC_trvl,0.233012,0.180453,0.29081,0.361405,0.003947,0.233017,0.343881,0.270641,0.270195,0.104475,0.229184
XGB_tempb_peaks_opt_trvl,0.149828,0.093019,0.16666,0.207696,0.014034,0.14841,0.228776,0.196914,0.171599,0.073299,0.145023
XGB_tempb_peaks_slope_opt,0.195206,0.120312,0.196333,0.241699,0.012315,0.165305,0.232456,0.232388,0.214824,0.075312,0.168615
XGBopt_peaks,0.195044,0.127256,0.200487,0.240083,0.012315,0.164855,0.239957,0.23932,0.212806,0.075293,0.170741
XGB_tempb_peaks_slope_opt_trvl,0.149099,0.092454,0.163915,0.210999,0.014034,0.145322,0.222224,0.195516,0.166457,0.073489,0.143351
XGB_opt,0.196651,0.12102,0.204338,0.226116,0.029273,0.194202,0.259516,0.233179,0.203174,0.078807,0.174628
XGB_opt_trvl,0.160304,0.092998,0.170999,0.211045,0.016176,0.159757,0.239293,0.186742,0.173557,0.081666,0.149254


In [65]:
all_models.to_csv(os.path.join(config.MODELS_DIR + 'all_models.csv'), index=False)