In [1]:
# Load (or reload) the autoreload extension and use mode 2 so that edits to
# imported modules are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Ask the inline backend for the higher-resolution 'retina' figure format.
%config InlineBackend.figure_format = 'retina'

In [24]:
## Environment
# Point the first sys.path entry at the project root so that `src` can be
# imported when the notebook is launched from inside the `models` folder.
import sys
p = sys.path[0]
# Default to the unchanged entry: `main_path` is then always defined, even when
# the notebook is not started from a `models` directory (previously this case
# raised NameError on a fresh kernel, or reused a stale value on a warm one).
main_path = p
# Accept both POSIX ('/models') and Windows ('\\models') separators; both
# suffixes have the same length, so one slice handles either.
if p.endswith(('/models', '\\models')):
    main_path = p[:-len('/models')]
sys.path[0] = main_path

import os, gc, json
from termcolor import colored
from tqdm import tqdm
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import log_loss
from src import (config, fe, features, feature_selection, 
                 preprocess, training)
from src.fe import CreateFeatures

In [5]:
# Read the training labels. The path is built with os.path.join (as the other
# cells do for config.MODELS_DIR) so it does not depend on config.DATA_DIR
# ending with a path separator.
train_labels = pd.read_csv(os.path.join(config.DATA_DIR, 'train_labels.csv'))
# Every column except the sample identifier is treated as a target label.
target_labels_list = [col for col in train_labels.columns if col != 'sample_id']

In this notebook we determine the best base model for each target. Each feature group that has more than one feature is fitted for each target. We analyze the CV loss and the V loss. The feature group for which the CV loss and the V loss are at a minimum is considered the base model for that target.

In [21]:
# Names of all feature groups considered as candidate base models
FTS_GROUPS = [
    'fts_topmz',
    'fts_peak_widths',
    'fts_range_abun_to_temp',
    'fts_mzstats',
    'fts_mra_tempmz',
    'fts_lr_corr_mz4',
    'fts_corr_mz4',
    'fts_cntpk_mratt',
]

Since all individual models are already trained, we simply import the CV and V loglosses for each feature group.

In [22]:
# Pieces of the CV-loss file names, which follow the pattern
#   <feature group>_<algorithm>_<split><feature-set marker>cvloss.csv
# '_' marks the full feature set (per the original comment); '_sfm_' is
# presumably a selected-feature variant -- TODO confirm.
FTS_TYPE = ['_', '_sfm_']
MODEL_ALGO = ['XGB_opt', 'LR_reg', 'XGB']
SPLIT_TYPE = 'tr'

# Gather every CV-loss table found on disk. Frames are collected in a list and
# concatenated once, instead of growing a DataFrame inside the loop (which is
# quadratic and relies on concatenating onto an empty frame).
cvloss_frames = []
for fts in FTS_GROUPS:
    for fts_type in FTS_TYPE:
        for algo in MODEL_ALGO:
            cvloss_file_path = os.path.join(
                config.MODELS_DIR,
                f'{fts}_{algo}_{SPLIT_TYPE}{fts_type}cvloss.csv')

            # Combinations without a saved CV-loss file are skipped.
            if not os.path.exists(cvloss_file_path):
                continue

            # The path already contains config.MODELS_DIR, so it is read
            # directly rather than being joined with the directory again.
            cvloss_frames.append(pd.read_csv(cvloss_file_path,
                                             index_col='target'))

# Keep an empty frame when nothing was found (pd.concat rejects an empty list).
base_models = (pd.concat(cvloss_frames, axis=1)
               if cvloss_frames else pd.DataFrame())

# Transpose so that each row is one loaded table and each column is a target.
base_models = base_models.T.copy()
# Highlight, per target column, the row with the lowest CV loss.
base_models.style.highlight_min(axis=0,
                                props='color:darkblue; background-color:lightblue;',
                                subset=target_labels_list)

target,basalt,carbonate,chloride,iron_oxide,oxalate,oxychlorine,phyllosilicate,silicate,sulfate,sulfide
fts_topmz_XGB_opt_tr,0.338978,0.278419,0.340757,0.493114,0.039228,0.396737,0.539699,0.35648,0.431514,0.170794
fts_peak_widths_XGB_opt_tr,0.242746,0.187358,0.275997,0.347945,0.053747,0.272343,0.372442,0.287244,0.311164,0.101251
fts_peak_widths_XGB_opt_tr_sfm,0.238383,0.188893,0.266738,0.341897,0.046472,0.272343,0.358678,0.287244,0.311164,0.089869
fts_range_abun_to_temp_XGB_opt_tr,0.2089,0.134557,0.211078,0.257936,0.024436,0.187906,0.27847,0.254468,0.243184,0.074498
fts_range_abun_to_temp_XGB_opt_tr_sfm,0.2089,0.128625,0.185301,0.233763,0.02086,0.187906,0.27847,0.204074,0.222138,0.093155
fts_mzstats_XGB_opt_tr,0.232613,0.155973,0.222719,0.298379,0.022603,0.231221,0.286709,0.24907,0.204235,0.084463
fts_mzstats_XGB_opt_tr_sfm,0.200694,0.155973,0.196798,0.270841,0.019582,0.223709,0.246448,0.254593,0.204235,0.068978
fts_mra_tempmz_XGB_opt_tr,0.196651,0.12102,0.204338,0.226116,0.029273,0.194202,0.259516,0.233179,0.203174,0.078807
fts_mra_tempmz_LR_reg_tr,0.276783,0.261255,0.284298,0.38247,0.004059,0.29915,0.407476,0.351685,0.344719,0.147136
fts_mra_tempmz_XGB_tr,0.225651,0.132256,0.227206,0.254578,0.027906,0.208509,0.297309,0.257565,0.224597,0.078318


Extract the name of the model with the minimum CV loss for each label:

In [26]:
# For every target label, pick the row of `base_models` with the lowest CV
# loss. idxmin returns the index label of the first minimum, which is what the
# previous boolean-mask + .index[0] lookup produced, in a single pass.
base_model_label = {
    label: base_models[label].idxmin()
    for label in target_labels_list
}

# Save base models
file_name = os.path.join(config.MODELS_DIR,
                         'models_base_label.txt')
# The mapping is stored as JSON text; the encoding is set explicitly so the
# file does not depend on the platform default.
with open(file_name, 'w', encoding='utf-8') as file:
    json.dump(base_model_label, file)
print(f'Saving {file_name}')

# Base model results
base_model_label

Saving ../models/models_base_label.txt


{'basalt': 'fts_mra_tempmz_XGB_opt_tr',
 'carbonate': 'fts_mra_tempmz_XGB_opt_tr_sfm',
 'chloride': 'fts_range_abun_to_temp_XGB_opt_tr_sfm',
 'iron_oxide': 'fts_mra_tempmz_XGB_opt_tr_sfm',
 'oxalate': 'fts_mra_tempmz_LR_reg_tr',
 'oxychlorine': 'fts_cntpk_mratt_XGB_opt_tr',
 'phyllosilicate': 'fts_mra_tempmz_XGB_opt_tr_sfm',
 'silicate': 'fts_range_abun_to_temp_XGB_opt_tr_sfm',
 'sulfate': 'fts_mra_tempmz_XGB_opt_tr_sfm',
 'sulfide': 'fts_mra_tempmz_XGB_opt_tr_sfm'}