# TRAINING NOTEBOOK

## Environment

In [1]:
%reload_ext autoreload
%autoreload 2

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [2]:
# Change main system path to be able to run code from src folder
import sys
p = sys.path[0]
# Mac OS
if sys.path[0].endswith('/notebooks'):
    main_path = p[:-len('/notebooks')]
if sys.path[0].endswith('/techdoc/content'):
    main_path = p[:-len('/techdoc/content')]
    
# Windows OS
if sys.path[0].endswith('\\notebooks'): 
    main_path = p[:-len('\\notebooks')]
if sys.path[0].endswith('\\techdoc\content'): 
    main_path = p[:-len('\\techdoc\content')]

sys.path[0] = main_path

In [3]:
import os, gc
from termcolor import colored
from tqdm import tqdm

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from sklearn.metrics import log_loss

from scipy.signal import find_peaks
from scipy.ndimage.filters import gaussian_filter1d

from src import (config, features, preprocess, training)

  from pandas import MultiIndex, Int64Index


# DATA PREPARATION

In [4]:
# ===== LOAD DATA ======
metadata = pd.read_csv(config.DATA_DIR_OUT + 'metadata.csv', index_col='sample_id')
print(f'Metadata: {metadata.shape}')

train_labels = pd.read_csv(config.DATA_DIR + 'train_labels.csv')
print(f'Train labels: {train_labels.shape}')

valid_labels = pd.read_csv(config.DATA_DIR + 'val_labels.csv')
print(f'Train labels: {valid_labels.shape}')

# Combine train and valid labels
trvl_labels = pd.concat([train_labels, valid_labels], axis = 0)

submission = pd.read_csv(config.DATA_DIR + 'submission_format.csv')
print(f'Submission: {submission.shape}')

Metadata: (1570, 7)
Train labels: (766, 11)
Train labels: (293, 11)
Submission: (804, 11)


In [5]:
# ===== FILE PATHS OF SAMPLES =====
train_files = metadata[metadata.split == 'train']['features_path'].to_dict()
valid_files = metadata[metadata.split == 'val']['features_path'].to_dict()
test_files = metadata[metadata.split == 'test']['features_path'].to_dict()
# Train & Valid files
trva_files = train_files.copy()
trva_files.update(valid_files)
# All files
all_test_files = valid_files.copy()
all_test_files.update(test_files)

# Ion type list
ion_list = list(np.arange(0,100,1.0))
ion_list.remove(4.0)

# Get the names of the target columns in a list
target_labels_list = [i for i in train_labels.columns if i not in ['sample_id']]
print(target_labels_list)

['basalt', 'carbonate', 'chloride', 'iron_oxide', 'oxalate', 'oxychlorine', 'phyllosilicate', 'silicate', 'sulfate', 'sulfide']


## Features

In [12]:
# Temperature bins of 100 for each ion - max relative abundance
fts_maxrelabund_tempion = pd.read_csv(os.path.join(config.DATA_DIR_OUT + 
                                                   'fts_maxrelabund_tempion.csv'))
print(fts_maxrelabund_tempion.shape)
fts_maxrelabund_tempion_VT = pd.read_csv(os.path.join(config.DATA_DIR_OUT + 
                                                   'fts_maxrelabund_tempion_VT.csv'))
print(fts_maxrelabund_tempion_VT.shape)

# ----- TRAIN + VALID -----
fts_maxrelabund_tempion_trvl = pd.read_csv(os.path.join(config.DATA_DIR_OUT + 
                                                   'fts_maxrelabund_tempion_trvl.csv'))
print(fts_maxrelabund_tempion.shape)
fts_maxrelabund_tempion_VT_trvl = pd.read_csv(os.path.join(config.DATA_DIR_OUT + 
                                                   'fts_maxrelabund_tempion_VT_trvl.csv'))
print(fts_maxrelabund_tempion_VT_trvl.shape)


(766, 1584)
(804, 1584)
(766, 1584)
(804, 1584)


# MODELS

In [33]:
# Data frame to save local CV results
models_log_loss = pd.DataFrame(index=target_labels_list)

## Logistic Regression

**Temp_bin & Ion -> max relative abundance == 1584 features** 

In [34]:
train_cv_loss_LR, train_full_clf_LR, submission_LR = training.train_tbl(
    df_train=fts_maxrelabund_tempion,
    df_labels=train_labels,
    target_list=target_labels_list,
    df_test=fts_maxrelabund_tempion_VT,
    model_algo='LR_reg',
    sub_name='LR_reg'
    )
models_log_loss['LR_reg'] = models_log_loss.index.map(train_cv_loss_LR)

[34m
Average Log Loss: 0.2759[0m
Log Loss per Label:
{'basalt': 0.2767826059088304, 'carbonate': 0.26125450039338877, 'chloride': 0.28429799603250216, 'iron_oxide': 0.38246993360871934, 'oxalate': 0.004059239489521142, 'oxychlorine': 0.29915009827872885, 'phyllosilicate': 0.4074758334090677, 'silicate': 0.3516847439265464, 'sulfate': 0.3447188328713945, 'sulfide': 0.14713584783404265}


In [35]:
train_cv_loss_LR_trvl, train_full_clf_LR_trvl, submission_LR_trvl = training.train_tbl(
    df_train=fts_maxrelabund_tempion_trvl,
    df_labels=trvl_labels,
    target_list=target_labels_list,
    df_test=fts_maxrelabund_tempion_VT_trvl,
    model_algo='LR_reg',
    sub_name='LR_reg_trvl'
    )
models_log_loss['LR_reg_trvl'] = models_log_loss.index.map(train_cv_loss_LR_trvl)

[34m
Average Log Loss: 0.2602[0m
Log Loss per Label:
{'basalt': 0.2615261042107989, 'carbonate': 0.22292052739226756, 'chloride': 0.25846765369554275, 'iron_oxide': 0.37216113777026905, 'oxalate': 0.004914672949143357, 'oxychlorine': 0.25263015825669405, 'phyllosilicate': 0.4112433512412318, 'silicate': 0.34987021450556754, 'sulfate': 0.3228680540360715, 'sulfide': 0.14532854619106556}


## XGB

- numerical data needs to be scaled
- categorical data needs to be encoded

In [42]:
train_cv_loss_XGB_opt, train_full_clf_XGB_opt, submission_XGB_opt = training.train_tbl(
    df_train=fts_maxrelabund_tempion,
    df_labels=train_labels,
    target_list=target_labels_list,
    df_test=fts_maxrelabund_tempion_VT,
    model_algo='XGB_opt',
    sub_name='XGB_opt'
    )
models_log_loss['XGB_opt'] = models_log_loss.index.map(train_cv_loss_XGB_opt)

  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.co

[34m
Average Log Loss: 0.1746[0m
Log Loss per Label:
{'basalt': 0.19665137401195665, 'carbonate': 0.12102018407547, 'chloride': 0.2043376334408594, 'iron_oxide': 0.22611550844240597, 'oxalate': 0.029272685140647313, 'oxychlorine': 0.19420209851804687, 'phyllosilicate': 0.25951556825253963, 'silicate': 0.23317893970805786, 'sulfate': 0.20317408057798922, 'sulfide': 0.07880694186687044}


In [43]:
train_cv_loss_XGB_opt_trvl, train_full_clf_XGB_opt_trvl, submission_XGB_opt_trvl = training.train_tbl(
    df_train=fts_maxrelabund_tempion_trvl,
    df_labels=trvl_labels,
    target_list=target_labels_list,
    df_test=fts_maxrelabund_tempion_VT_trvl,
    model_algo='XGB_opt',
    sub_name='XGB_opt_trvl'
    )
models_log_loss['XGB_opt_trvl'] = models_log_loss.index.map(train_cv_loss_XGB_opt_trvl)

  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.co

[34m
Average Log Loss: 0.1493[0m
Log Loss per Label:
{'basalt': 0.16030390276618628, 'carbonate': 0.09299784144611556, 'chloride': 0.17099937928281922, 'iron_oxide': 0.2110446946926557, 'oxalate': 0.016176140634807434, 'oxychlorine': 0.15975673521358977, 'phyllosilicate': 0.2392929120695717, 'silicate': 0.18674152896437207, 'sulfate': 0.17355651791774612, 'sulfide': 0.08166643602891402}


# MODEL SUMMARY

In [44]:
models_log_loss.style.highlight_min(axis=1, 
                                    props='color:darkblue; background-color:lightblue;')

Unnamed: 0,LR_reg,LR_reg_trvl,XGB_opt,XGB_opt_trvl
basalt,0.276783,0.261526,0.196651,0.160304
carbonate,0.261255,0.222921,0.12102,0.092998
chloride,0.284298,0.258468,0.204338,0.170999
iron_oxide,0.38247,0.372161,0.226116,0.211045
oxalate,0.004059,0.004915,0.029273,0.016176
oxychlorine,0.29915,0.25263,0.194202,0.159757
phyllosilicate,0.407476,0.411243,0.259516,0.239293
silicate,0.351685,0.34987,0.233179,0.186742
sulfate,0.344719,0.322868,0.203174,0.173557
sulfide,0.147136,0.145329,0.078807,0.081666


In [50]:
model_summary = models_log_loss.T.copy()
model_summary['overall'] = model_summary.mean(axis=1)
model_summary.style.highlight_min(axis=0, 
                                  props='color:darkblue; background-color:lightblue;',
                                  subset=target_labels_list)\
                    .highlight_min(axis=0, 
                                   props='color:white; background-color:red;',
                                   subset=['overall'])

Unnamed: 0,basalt,carbonate,chloride,iron_oxide,oxalate,oxychlorine,phyllosilicate,silicate,sulfate,sulfide,overall
LR_reg,0.276783,0.261255,0.284298,0.38247,0.004059,0.29915,0.407476,0.351685,0.344719,0.147136,0.275903
LR_reg_trvl,0.261526,0.222921,0.258468,0.372161,0.004915,0.25263,0.411243,0.34987,0.322868,0.145329,0.260193
XGB_opt,0.196651,0.12102,0.204338,0.226116,0.029273,0.194202,0.259516,0.233179,0.203174,0.078807,0.174628
XGB_opt_trvl,0.160304,0.092998,0.170999,0.211045,0.016176,0.159757,0.239293,0.186742,0.173557,0.081666,0.149254


In [46]:
model_summary.to_csv(os.path.join(config.MODELS_DIR + 'model_summary.csv'), index=False)