# TRAINING NOTEBOOK

## Environment

In [1]:
%reload_ext autoreload
%autoreload 2

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [2]:
# Change main system path to be able to run code from src folder
import sys
p = sys.path[0]
# Mac OS
if sys.path[0].endswith('/notebooks'):
    main_path = p[:-len('/notebooks')]
if sys.path[0].endswith('/techdoc/content'):
    main_path = p[:-len('/techdoc/content')]
    
# Windows OS
if sys.path[0].endswith('\\notebooks'): 
    main_path = p[:-len('\\notebooks')]
if sys.path[0].endswith('\\techdoc\content'): 
    main_path = p[:-len('\\techdoc\content')]

sys.path[0] = main_path

In [3]:
import gc, itertools
from termcolor import colored
from tqdm import tqdm

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss

import xgboost as xgb

from scipy.signal import find_peaks
from scipy.ndimage.filters import gaussian_filter1d

from src import (config, describe_data, features,
                 preprocess, training)

  from pandas import MultiIndex, Int64Index


# DATA PREPARATION

In [4]:
# ===== LOAD DATA ======
metadata = pd.read_csv(config.DATA_DIR + 'metadata.csv')
print(f'Metadata: {metadata.shape}')

train_labels = pd.read_csv(config.DATA_DIR + 'train_labels.csv')
print(f'Train labels: {train_labels.shape}')

valid_labels = pd.read_csv(config.DATA_DIR + 'val_labels.csv')
print(f'Train labels: {valid_labels.shape}')

submission = pd.read_csv(config.DATA_DIR + 'submission_format.csv')
print(f'Submission: {submission.shape}')

Metadata: (1570, 5)
Train labels: (766, 11)
Train labels: (293, 11)
Submission: (804, 11)


In [5]:
# Combine train and valid labels
trvl_labels = pd.concat([train_labels, valid_labels], axis = 0)

In [6]:
# ===== MODEL SAMPLES ======
train = metadata[metadata.split == 'train'].copy().reset_index(drop=True)
print(f'TRAIN: {train.shape}')

valid = metadata[metadata.split == 'val'].copy().reset_index(drop=True)
print(f'VALID: {valid.shape}')

test = metadata[metadata.split == 'test'].copy().reset_index(drop=True)
print(f'TEST: {test.shape}')

TRAIN: (766, 5)
VALID: (293, 5)
TEST: (511, 5)


In [7]:
# ===== FILE PATHS OF SAMPLES =====
train_files = metadata[metadata.split == 'train']['features_path'].to_dict()
valid_files = metadata[metadata.split == 'val']['features_path'].to_dict()
test_files = metadata[metadata.split == 'test']['features_path'].to_dict()
# Train & Valid files
trva_files = train_files.copy()
trva_files.update(valid_files)
# All files
all_test_files = valid_files.copy()
all_test_files.update(test_files)

# Ion type list
ion_list = list(np.arange(0,100,1.0))
ion_list.remove(4.0)

# Get the names of the target columns in a list
target_labels_list = [i for i in train_labels.columns if i not in ['sample_id']]
print(target_labels_list)

['basalt', 'carbonate', 'chloride', 'iron_oxide', 'oxalate', 'oxychlorine', 'phyllosilicate', 'silicate', 'sulfate', 'sulfide']


# FEATURE ENGINEERING

In the benchmark notebook the features are discretized into bins of 100 degrees.

In [None]:
#TODO How long does it take for temperature to rise to certain level where we get abundance?
#TODO How to account for the non-linear diferences of time vs temp?
#TODO Each type of ion as a feature, make a matrix and append to the training dataset
#TODO Max value of abundance of each ion - take temp reading and time - or duration
#TODO How to define characteristic of peaks in ion abundances; temp at which they occur, the shape of peaks (height, width, area)
#TODO How to deal with overlapping peaks
#TODO Number of peaks, size of the peaks
#TODO Can we do some target encoding?

**Compute min and max temp for all available samples, including the ion types**

In [8]:
# Get min/max temperature values and unique ion list
# Preprocess the data to remove noise before calculation
min_temp, max_temp, ion_list = preprocess.compute_min_max_temp_ion(metadata)
print(colored(f'Min temp = {min_temp}; Max temp = {max_temp}', 'blue'))
print(colored(f'Number of unqiue ions: {len(ion_list)}', 'blue'))

100%|██████████| 1570/1570 [01:54<00:00, 13.76it/s]

[34mMin temp = -63.563; Max temp = 1484.13[0m
[34mNumber of unqiue ions: 99[0m





## Max relative abundance per temp bin and ion type

Bin the temp from min to max in intervals of 100 degrees and combine with the ion type i.e. `m/z`. For each combination compute the maximum relative abundance.

In [9]:
# ===== TRAIN SAMPLE =====
print(f'Number of all test files: {len(trva_files)}')
fts_maxrelabund_tempion = features.features_iontemp_abun(metadata, trva_files)
print(f'Val-Test features: {fts_maxrelabund_tempion.shape}')
fts_maxrelabund_tempion = fts_maxrelabund_tempion.replace(np.nan, 0)

Number of all test files: 1059
Number of samples: 1059
Val-Test features: (1059, 1584)


In [10]:
# ===== VALID & TEST SAMPLE =====
print(f'Number of all test files: {len(all_test_files)}')
fts_maxrelabund_tempion_VT = features.features_iontemp_abun(metadata, all_test_files)
print(f'Val-Test features: {fts_maxrelabund_tempion_VT.shape}')
fts_maxrelabund_tempion_VT = fts_maxrelabund_tempion_VT.replace(np.nan,0)

Number of all test files: 804
Number of samples: 804
Val-Test features: (804, 1584)


## Duration to max temperature per ion type

Ion types are presented in column, rows are samples and values are time in seconds to max abundance. We should also record at what temperature this happens.

In [11]:
# ===== TRAIN SAMPLE =====
fts_dur_maxabund_ion = features.features_ion_duration_maxtemp(metadata, trva_files, ion_list)
print(f'Features: {fts_dur_maxabund_ion.shape}')
#TODO Fix the warnings

  fts_df[sample_name] = fts_df['m/z'].map(duration_max_abund)
  fts_df[sample_name] = fts_df['m/z'].map(duration_max_abund)
  fts_df[sample_name] = fts_df['m/z'].map(duration_max_abund)
  fts_df[sample_name] = fts_df['m/z'].map(duration_max_abund)
  fts_df[sample_name] = fts_df['m/z'].map(duration_max_abund)
  fts_df[sample_name] = fts_df['m/z'].map(duration_max_abund)
  fts_df[sample_name] = fts_df['m/z'].map(duration_max_abund)
  fts_df[sample_name] = fts_df['m/z'].map(duration_max_abund)
  fts_df[sample_name] = fts_df['m/z'].map(duration_max_abund)
  fts_df[sample_name] = fts_df['m/z'].map(duration_max_abund)
  fts_df[sample_name] = fts_df['m/z'].map(duration_max_abund)
  fts_df[sample_name] = fts_df['m/z'].map(duration_max_abund)
  fts_df[sample_name] = fts_df['m/z'].map(duration_max_abund)
  fts_df[sample_name] = fts_df['m/z'].map(duration_max_abund)
  fts_df[sample_name] = fts_df['m/z'].map(duration_max_abund)
  fts_df[sample_name] = fts_df['m/z'].map(duration_max_abund)
  fts_df

Features: (1059, 99)





In [12]:
# ===== VALID & TEST SAMPLE =====
fts_dur_maxabund_ion_VT = features.features_ion_duration_maxtemp(metadata, all_test_files, ion_list)
print(f'Features: {fts_dur_maxabund_ion.shape}')

  fts_df[sample_name] = fts_df['m/z'].map(duration_max_abund)
  fts_df[sample_name] = fts_df['m/z'].map(duration_max_abund)
  fts_df[sample_name] = fts_df['m/z'].map(duration_max_abund)
  fts_df[sample_name] = fts_df['m/z'].map(duration_max_abund)
  fts_df[sample_name] = fts_df['m/z'].map(duration_max_abund)
  fts_df[sample_name] = fts_df['m/z'].map(duration_max_abund)
  fts_df[sample_name] = fts_df['m/z'].map(duration_max_abund)
  fts_df[sample_name] = fts_df['m/z'].map(duration_max_abund)
  fts_df[sample_name] = fts_df['m/z'].map(duration_max_abund)
  fts_df[sample_name] = fts_df['m/z'].map(duration_max_abund)
  fts_df[sample_name] = fts_df['m/z'].map(duration_max_abund)
  fts_df[sample_name] = fts_df['m/z'].map(duration_max_abund)
  fts_df[sample_name] = fts_df['m/z'].map(duration_max_abund)
  fts_df[sample_name] = fts_df['m/z'].map(duration_max_abund)
  fts_df[sample_name] = fts_df['m/z'].map(duration_max_abund)
  fts_df[sample_name] = fts_df['m/z'].map(duration_max_abund)
  fts_df

Features: (1059, 99)





## Combine mx relative abundance and duration

In [13]:
# ===== TRAIN SAMPLE =====
combo_maxabund_dur = pd.concat([fts_maxrelabund_tempion, fts_dur_maxabund_ion], 
                               axis = 1, 
                               ignore_index=False)
print(combo_maxabund_dur.shape)

(1059, 1683)


In [14]:
# ===== VALID & TEST SAMPLE =====
combo_maxabund_dur_VT = pd.concat([fts_maxrelabund_tempion_VT, fts_dur_maxabund_ion_VT], 
                               axis = 1, 
                               ignore_index=False)
print(combo_maxabund_dur_VT.shape)

(804, 1683)


## Analysis of time series

- Percentage change in relative abundance per temp-ion - For each temperature(bin)-ion combination compute the change in relativen abundance.
- find number of peaks
- area under the curve for each ion type

### Hands-On Analysis

- add ion_cnt_peaks
- add time to peak
- first bigger than second if more than one

Refernces:  
- [finding peaks in MS data](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2631518/)
- [SO comment on scipy](https://stackoverflow.com/questions/1713335/peak-finding-algorithm-for-python-scipy)

In [None]:
ht = preprocess.get_sample(metadata,0)
ht = preprocess.preprocess_samples(ht)

In [None]:
htt = ht[ht['m/z'] == 9.0].copy()
htt['abun_minsub_scaled_filtered'] = gaussian_filter1d(htt['abun_minsub_scaled'], sigma=4)

In [None]:
fig, ax = plt.subplots(1,2, figsize=(12,4))
plt.subplot(1, 2, 1); plt.plot(htt['temp'], htt['abun_minsub_scaled'])
plt.subplot(1, 2, 1); plt.plot(htt['temp'], htt['abun_minsub_scaled_filtered'], color='red')
plt.subplot(1, 2, 2); plt.boxplot(htt['abun_minsub_scaled'])
plt.show()

In [None]:
ion_abund_median = htt['abun_minsub_scaled_filtered'].mean()
print(ion_abund_median)
peaks, _ = find_peaks(htt['abun_minsub_scaled_filtered'], prominence=ion_abund_median)
#peaks_s = find_peaks_cwt(htt['abun_minsub_scaled'], noise_perc=0.1)
print(f'Peak idx: {peaks}') # indexes where peaks are found
#print(f'Peak idx: {peaks_s}') # indexes where peaks are found
# Get temp values for peaks
for i in peaks:
    print(htt.iloc[i]['temp'])
    
# Compute area under the curve
#xx = htt['temp']
#yy = htt['abun_minsub_scaled']
#print(f'Area under the curve: {np.round(auc(xx,yy),5)}')

In [None]:
# Computation for sample S0000
ion_peaks_cnt = {}

for ion in ion_list:
    ion_peaks_info = []
    temp_dt = ht[ht['m/z'] == ion].copy()
    temp_dt['abun_minsub_scaled_filtered'] = gaussian_filter1d(temp_dt['abun_minsub_scaled'], 
                                                               sigma=4)
    med = temp_dt['abun_minsub_scaled_filtered'].median()
    
    peaks, _ = find_peaks(temp_dt['abun_minsub_scaled_filtered'], prominence=med)
    ion_peaks_info.append(len(peaks))
    
    peak_temp = []
    peak_time = []
    peak_abund = []
    for i in peaks:
        tm = temp_dt.iloc[i]['time']; peak_time.append(tm) 
        t = temp_dt.iloc[i]['temp']; peak_temp.append(t)
        a = temp_dt.iloc[i]['abun_minsub_scaled']; peak_abund.append(a)
    
    if len(peak_time)>0 and len(peak_temp)>0 and len(peak_abund)>0:
        peak_time = max(peak_time)
        peak_temp = max(peak_temp)
        peak_abund = max(peak_abund)
    else: 
        peak_time, peak_temp, peak_abund = 0, 0, 0
        
    # Compute AUC
    if not temp_dt.empty:
        area_abund = np.round(auc(temp_dt['temp'],temp_dt['abun_minsub_scaled']),5)
    else: area_abund = 0
    
    # Add values
    ion_peaks_info.append(peak_time)
    ion_peaks_info.append(peak_temp)
    ion_peaks_info.append(peak_abund)
    ion_peaks_info.append(area_abund)
        
    ion_peaks_cnt[ion] = ion_peaks_info
    
ion_peaks_cnt

In [None]:
new_cols = ['m/z','peak_cnt', 'peak_time', 'peak_temp', 'peak_abund', 'abund_area']
ion_peaks_stats = pd.DataFrame(ion_peaks_cnt)
ion_peaks_stats = ion_peaks_stats.T
ion_peaks_stats.reset_index(inplace=True)
ion_peaks_stats.columns = new_cols
ion_peaks_stats['sample_id'] = 'S0000'
ion_peaks_stats

In [None]:
features.compute_ion_peaks(metadata, 0, ion_list)

In [None]:
file_paths = {A:N for (A,N) in [x for x in train_files.items()][:5]}
ita = features.features_ion_peaks(file_paths, metadata, ion_list)
ita

### Abundance Peaks + Stats

In [15]:
# ===== TRAIN SAMPLE =====
fts_ion_peaks = features.features_ion_peaks(trva_files, metadata, ion_list)
print(fts_ion_peaks.shape)
fts_ion_peaks.head()

100%|██████████| 1059/1059 [03:27<00:00,  5.11it/s]

(1059, 396)





Unnamed: 0_level_0,peak_cnt_0.0,peak_cnt_1.0,peak_cnt_2.0,peak_cnt_3.0,peak_cnt_5.0,peak_cnt_6.0,peak_cnt_7.0,peak_cnt_8.0,peak_cnt_9.0,peak_cnt_10.0,...,peak_abund_90.0,peak_abund_91.0,peak_abund_92.0,peak_abund_93.0,peak_abund_94.0,peak_abund_95.0,peak_abund_96.0,peak_abund_97.0,peak_abund_98.0,peak_abund_99.0
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
S0000,2.0,2.0,2.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
S0001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
S0002,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
S0003,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
S0004,2.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
# ===== VALID & TEST SAMPLE =====
fts_ion_peaks_VT = features.features_ion_peaks(all_test_files, metadata, ion_list)
print(fts_ion_peaks_VT.shape)
fts_ion_peaks_VT.head()

100%|██████████| 804/804 [02:40<00:00,  5.01it/s]

(804, 396)





Unnamed: 0_level_0,peak_cnt_0.0,peak_cnt_1.0,peak_cnt_2.0,peak_cnt_3.0,peak_cnt_5.0,peak_cnt_6.0,peak_cnt_7.0,peak_cnt_8.0,peak_cnt_9.0,peak_cnt_10.0,...,peak_abund_90.0,peak_abund_91.0,peak_abund_92.0,peak_abund_93.0,peak_abund_94.0,peak_abund_95.0,peak_abund_96.0,peak_abund_97.0,peak_abund_98.0,peak_abund_99.0
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
S0766,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
S0767,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
S0768,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
S0769,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
S0770,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


**ANALYZE THE ABUNDANCE CURVE**

## Combine mx relative abundance and ion peaks

In [17]:
# ===== TRAIN SAMPLE =====
combo_maxabund_peaks = pd.concat([fts_maxrelabund_tempion, fts_ion_peaks], 
                               axis = 1, 
                               ignore_index=False)
print(combo_maxabund_peaks.shape)

(1059, 1980)


In [18]:
# ===== VALID & TEST SAMPLE =====
combo_maxabund_peaks_VT = pd.concat([fts_maxrelabund_tempion_VT, fts_ion_peaks_VT], 
                               axis = 1, 
                               ignore_index=False)
print(combo_maxabund_peaks_VT.shape)

(804, 1980)


# MODELS

In [19]:
# Data frame to save local CV results
models_log_loss = pd.DataFrame(index=target_labels_list)

## Logistic Regression - Benchmark

In [21]:
train_cv_loss_LR, train_full_clf_LR, submission_LR = training.train_tbl(
    df_train=fts_maxrelabund_tempion,
    df_labels=trvl_labels,
    target_list=target_labels_list,
    df_test=fts_maxrelabund_tempion_VT,
    model_algo='LR_reg',
    sub_name='LR_reg_trvl'
    )
models_log_loss['LR_reg'] = models_log_loss.index.map(train_cv_loss_LR)

[34m
Average Log Loss: 0.2602[0m
Log Loss per Label:
{'basalt': 0.26152610421079714, 'carbonate': 0.22292052739226142, 'chloride': 0.25846765369554403, 'iron_oxide': 0.37216113777027066, 'oxalate': 0.004914672949143167, 'oxychlorine': 0.2526301582566911, 'phyllosilicate': 0.4112433512412358, 'silicate': 0.3498702145055753, 'sulfate': 0.3228680540360762, 'sulfide': 0.1453285461910574}


## XGBoost - optimized
- numerical data needs to be scaled
- categorical data needs to be encoded

In [22]:
train_cv_loss_XGB_opt, train_full_clf_XGB_opt, submission_XGB_opt = training.train_tbl(
    df_train=fts_maxrelabund_tempion,
    df_labels=trvl_labels,
    target_list=target_labels_list,
    df_test=fts_maxrelabund_tempion_VT,
    model_algo='XGB_opt',
    sub_name='XGB_opt_trvl'
    )
models_log_loss['XGB_opt'] = models_log_loss.index.map(train_cv_loss_XGB_opt)

  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.co

[34m
Average Log Loss: 0.1493[0m
Log Loss per Label:
{'basalt': 0.16030390276618628, 'carbonate': 0.09299784144611556, 'chloride': 0.17099937928281922, 'iron_oxide': 0.2110446946926557, 'oxalate': 0.016176140634807434, 'oxychlorine': 0.15975673521358977, 'phyllosilicate': 0.2392929120695717, 'silicate': 0.18674152896437207, 'sulfate': 0.17355651791774612, 'sulfide': 0.08166643602891402}


## XGBoost opt - Temp bin + Ion peaks

In [23]:
train_cv_loss_XGB_tempb_peaks_opt, train_full_clf_XGB_tempb_peaks_opt, submission_XGB_tempb_peaks_opt =\
    training.train_tbl(
        df_train=combo_maxabund_peaks,
        df_labels=trvl_labels,
        target_list=target_labels_list,
        df_test=combo_maxabund_peaks_VT,
        model_algo='XGB_opt',
        sub_name='XGB_tempb_peaks_opt'
    )
models_log_loss['XGB_tempb_peaks_opt'] = models_log_loss.index.map(train_cv_loss_XGB_tempb_peaks_opt)

  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.co

[34m
Average Log Loss: 0.145[0m
Log Loss per Label:
{'basalt': 0.1498281400392832, 'carbonate': 0.09301901140051114, 'chloride': 0.16665983013362626, 'iron_oxide': 0.20769608078810645, 'oxalate': 0.014033538995205336, 'oxychlorine': 0.148409597606228, 'phyllosilicate': 0.22877625417536848, 'silicate': 0.19691387258807694, 'sulfate': 0.17159940715849445, 'sulfide': 0.07329880544762327}


In [32]:
#TODO Normalize peaks ?
#TODO Add instrument type as binary
#TODO Add time-temp slope

***

## Model Summary

In [24]:
models_log_loss.style.highlight_min(axis=1, props='color:darkblue; background-color:lightblue;')

Unnamed: 0,LR_reg,XGB_opt,XGB_tempb_peaks_opt
basalt,0.261526,0.160304,0.149828
carbonate,0.222921,0.092998,0.093019
chloride,0.258468,0.170999,0.16666
iron_oxide,0.372161,0.211045,0.207696
oxalate,0.004915,0.016176,0.014034
oxychlorine,0.25263,0.159757,0.14841
phyllosilicate,0.411243,0.239293,0.228776
silicate,0.34987,0.186742,0.196914
sulfate,0.322868,0.173557,0.171599
sulfide,0.145329,0.081666,0.073299


In [25]:
models_log_loss.agg('mean').sort_values()

XGB_tempb_peaks_opt    0.145023
XGB_opt                0.149254
LR_reg                 0.260193
dtype: float64

# Predict Validation on Individual Labels

Compute predictions for each label depending on which model performed the best for that label. Validate the results on the validation sample only since we have the labels.

In [26]:
submission_by_label = pd.read_csv(config.DATA_DIR + 'submission_format.csv', 
                             index_col='sample_id')
submission_by_label = submission_by_label
print(submission_by_label.shape)

log_loss_ind_label = {}

for label in target_labels_list:
    if label == 'oxalate':
        submission_by_label[label] = submission_LR[label]
    elif label in ['carbonate', 'iron_oxide', 'silicate', 'sulfate']:
        submission_by_label[label] = submission_XGB_opt[label]
    else:
        submission_by_label[label] = submission_XGB_tempb_peaks_opt[label]
    
    # Compute log-loss
    ll = log_loss(valid_labels[label], submission_by_label.iloc[:valid_labels.shape[0]][label])
    log_loss_ind_label[label] = ll

models_log_loss['Ind_labels'] = models_log_loss.index.map(log_loss_ind_label)

submission_by_label.to_csv(config.MODELS_DIR + 'ind_label_trvl' + '.csv')

print(f'Average Log Loss: {np.mean(list(log_loss_ind_label.values()))}')
log_loss_ind_label

(804, 10)
Average Log Loss: 0.01045828412889781


{'basalt': 0.009586783985403897,
 'carbonate': 0.009073093251119378,
 'chloride': 0.010652786032964217,
 'iron_oxide': 0.019375610316677637,
 'oxalate': 0.0009390678833569345,
 'oxychlorine': 0.0104609651547863,
 'phyllosilicate': 0.01679729224014201,
 'silicate': 0.010902846098357793,
 'sulfate': 0.012120110015170299,
 'sulfide': 0.004674286310999645}

In [27]:
submission_by_label.tail()

Unnamed: 0_level_0,basalt,carbonate,chloride,iron_oxide,oxalate,oxychlorine,phyllosilicate,silicate,sulfate,sulfide
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
S1565,0.057673,0.674431,0.028812,0.074945,0.076743,0.010618,0.012503,0.142715,0.013262,0.00397
S1566,0.011901,0.041812,0.002183,0.005466,2.7e-05,0.015552,0.111805,0.002567,0.02311,0.022945
S1567,0.01095,0.138708,0.021253,0.004014,0.000202,0.003496,0.009955,0.004168,0.01949,0.00441
S1568,0.077689,0.042328,0.01303,0.002887,0.00093,0.15218,0.391907,0.003764,0.003403,0.16927
S1569,0.046413,0.620819,0.033543,0.091343,0.002928,0.017474,0.017358,0.142715,0.011138,0.00397


In [28]:
models_log_loss.style.highlight_min(axis=1, props='color:darkblue; background-color:lightblue;')

Unnamed: 0,LR_reg,XGB_opt,XGB_tempb_peaks_opt,Ind_labels
basalt,0.261526,0.160304,0.149828,0.009587
carbonate,0.222921,0.092998,0.093019,0.009073
chloride,0.258468,0.170999,0.16666,0.010653
iron_oxide,0.372161,0.211045,0.207696,0.019376
oxalate,0.004915,0.016176,0.014034,0.000939
oxychlorine,0.25263,0.159757,0.14841,0.010461
phyllosilicate,0.411243,0.239293,0.228776,0.016797
silicate,0.34987,0.186742,0.196914,0.010903
sulfate,0.322868,0.173557,0.171599,0.01212
sulfide,0.145329,0.081666,0.073299,0.004674
