In [1]:
# Reload imported modules automatically before each cell runs, so edits to
# the project code are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2

# Draw matplotlib figures inline, using the high-resolution 'retina' format.
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [2]:
## Environment
# Point sys.path[0] at the project root so that `from src import ...` resolves
# when sys.path[0] is the project's `models/` folder.
import sys, json

p = sys.path[0]
# Strip a trailing '/models' (POSIX-style separator, e.g. macOS). When the
# suffix is absent the path is kept as-is, so `main_path` is always defined
# instead of raising a NameError on the assignment below.
main_path = p[:-len('/models')] if p.endswith('/models') else p
sys.path[0] = main_path

import os, gc
from termcolor import colored
import pandas as pd
import numpy as np
import joblib
from src import config, training

  from pandas import MultiIndex, Int64Index


In [3]:
# DATA PREPARATION
# ===== LOAD DATA ======
# Paths are built with os.path.join, matching how MODELS_DIR paths are built
# elsewhere in this notebook.
# Sample metadata; its `split`, `instrument_type` and `features_path` columns
# are used further down in this cell.
metadata = pd.read_csv(os.path.join(config.DATA_DIR, 'metadata.csv'))
print(f'Metadata: {metadata.shape}')

train_labels = pd.read_csv(os.path.join(config.DATA_DIR, 'train_labels.csv'))
print(f'Train labels: {train_labels.shape}')

valid_labels = pd.read_csv(os.path.join(config.DATA_DIR, 'val_labels.csv'))
# Report the validation set under its own heading (it was printed as "Train labels")
print(f'Valid labels: {valid_labels.shape}')

# Combine train and valid labels
trvl_labels = pd.concat([train_labels, valid_labels], axis = 0)

# Empty submission template
submission = pd.read_csv(os.path.join(config.DATA_DIR, 'submission_format.csv'))
print(f'Submission: {submission.shape}')

# ===== FILE PATHS OF SAMPLES =====
# One {metadata index: features_path} dict per data split
train_files, valid_files, test_files = (
    metadata.loc[metadata['split'] == split_name, 'features_path'].to_dict()
    for split_name in ('train', 'val', 'test')
)
# Train + validation files
trva_files = {**train_files, **valid_files}
# Validation + test files
all_test_files = {**valid_files, **test_files}

# SAM testbed files, restricted to the training split
is_sam_train = (metadata['instrument_type'] == 'sam_testbed') & (metadata['split'] == 'train')
sam_files = metadata.loc[is_sam_train, 'features_path'].to_dict()

# Target columns: every column of the training labels except the sample identifier
target_labels_list = [col for col in train_labels.columns if col != 'sample_id']
print(target_labels_list)

# Labels without the SAM testbed samples: drop the last len(sam_files) rows of
# the training labels, then append the validation labels.
# NOTE(review): assumes the SAM testbed samples are the trailing rows of train_labels — confirm.
n_sam = len(sam_files)
sam_rows = train_labels.tail(n_sam).index
sam_labels = pd.concat([train_labels.drop(sam_rows), valid_labels], axis=0)
print(f'Labels w/o SAM : {sam_labels.shape}')

Metadata: (1570, 5)
Train labels: (766, 11)
Train labels: (293, 11)
Submission: (804, 11)
['basalt', 'carbonate', 'chloride', 'iron_oxide', 'oxalate', 'oxychlorine', 'phyllosilicate', 'silicate', 'sulfate', 'sulfide']
Labels w/o SAM : (1047, 11)


# CURRENT BEST MODEL

In [7]:
# Tag for this label/model combination; passed to training.compute_valid_loss when it is scored
SUB_NAME = 'final_model_1'

In [5]:
# Per-label prediction files, `_tr` variant (presumably models fit on the
# training split only, per the notebook's TR/TRVL naming).
# Every extension is lower-case '.csv': the sulfate entry names the same file
# as the basalt entry, and a '.CSV' spelling would not resolve on a
# case-sensitive file system.
label_models_tr = {
    'basalt': 'fts_mra_tempmz_slope_cntpk_XGB_opt_tr_sfm.csv',
    'carbonate': 'fts_mra_tempmz_XGB_tr.csv',
    'chloride': 'fts_mra_tempmz_slope_cntpk_XGB_opt_tr_sfm.csv',
    'iron_oxide': 'fts_mra_tempmz_cntpk_XGB_opt_tr_sfm.csv',
    'oxalate': 'fts_mra_tempmz_LR_reg_tr.csv',
    'oxychlorine': 'fts_mra_tempmz_slope_cntpk_XGB_opt_tr_sfm.csv',
    'phyllosilicate': 'fts_mra_tempmz_slope_cntpk_XGB_opt_tr_sfm.csv',
    'silicate': 'fts_mra_tempmz_XGB_opt_tr_sfm.csv',
    'sulfate': 'fts_mra_tempmz_slope_cntpk_XGB_opt_tr_sfm.csv',
    'sulfide': 'fts_mra_tempmz_cntpk_XGB_opt_tr_sfm.csv'
}

# Same label -> file mapping for the `_trvl` variant (presumably train + validation).
label_models_trvl = {
    'basalt': 'fts_mra_tempmz_slope_cntpk_XGB_opt_trvl_sfm.csv',
    'carbonate': 'fts_mra_tempmz_XGB_trvl.csv',
    'chloride': 'fts_mra_tempmz_slope_cntpk_XGB_opt_trvl_sfm.csv',
    'iron_oxide': 'fts_mra_tempmz_cntpk_XGB_opt_trvl_sfm.csv',
    'oxalate': 'fts_mra_tempmz_LR_reg_trvl.csv',
    'oxychlorine': 'fts_mra_tempmz_slope_cntpk_XGB_opt_trvl_sfm.csv',
    'phyllosilicate': 'fts_mra_tempmz_slope_cntpk_XGB_opt_trvl_sfm.csv',
    'silicate': 'fts_mra_tempmz_XGB_opt_trvl_sfm.csv',
    'sulfate': 'fts_mra_tempmz_slope_cntpk_XGB_opt_trvl_sfm.csv',
    'sulfide': 'fts_mra_tempmz_cntpk_XGB_opt_trvl_sfm.csv'
}

In [6]:
# Assemble one submission per variant (TR, then TRVL) from the per-label
# prediction files, score its validation rows, and save the TRVL submission.
valid_losses = {}
for split_tag, LABEL_MODELS_DICT in (('TR', label_models_tr), ('TRVL', label_models_trvl)):
    # Start from the empty template and fill in one label column at a time
    submission = pd.read_csv(os.path.join(config.DATA_DIR, 'submission_format.csv'))
    for label in target_labels_list:
        MODEL_SUB_NAME = LABEL_MODELS_DICT[label]

        # Load saved submission
        label_sub = pd.read_csv(os.path.join(config.MODELS_DIR, MODEL_SUB_NAME))

        # Columns are copied by row index, so both files must list the samples
        # in the same order; fail loudly rather than mix up predictions.
        has_ids = 'sample_id' in label_sub.columns and 'sample_id' in submission.columns
        if has_ids and not label_sub['sample_id'].equals(submission['sample_id']):
            raise ValueError(f'{MODEL_SUB_NAME}: sample_id order differs from submission_format.csv')

        submission[label] = label_sub[label]

    # Only the leading len(valid_files) rows are scored.
    # NOTE(review): assumes the validation samples come first in submission_format.csv — confirm.
    per_label_loss, avg_loss = training.compute_valid_loss(submission.iloc[:len(valid_files)],
                                                           valid_files,
                                                           valid_labels,
                                                           target_labels_list,
                                                           SUB_NAME)
    valid_losses[split_tag] = (per_label_loss, avg_loss)
    print(f'{split_tag}: {avg_loss}')

# Keep the result names used by the rest of the notebook
clf_loss_tr, clf_loss_avg_tr = valid_losses['TR']
clf_loss_trvl, clf_loss_avg_trvl = valid_losses['TRVL']

# The output file name encodes both losses; slicing str() truncates the values
# (it does not round them). It is stored separately from SUB_NAME so that
# re-running this cell scores under the same tag.
SUB_FILE_NAME = 'SUB_MODEL_TR_' + str(clf_loss_avg_tr)[:7] + '_TRVL_' + str(clf_loss_avg_trvl)[:9]
print(SUB_FILE_NAME)
# Save submission file (the TRVL one, which was assembled last)
submission.to_csv(os.path.join(config.MODELS_DIR, SUB_FILE_NAME + '.csv'), index=False)
print('SUBMISSION')
submission.head()

TR: 0.1557306712481364
TRVL: 0.009723688101486268
SUB_MODEL_TR_0.15573_TRVL_0.0097236
SUBMISSION


Unnamed: 0,sample_id,basalt,carbonate,chloride,iron_oxide,oxalate,oxychlorine,phyllosilicate,silicate,sulfate,sulfide
0,S0766,0.001987,0.000721,0.00524,0.002605,9e-06,0.002144,0.005939,0.052359,0.001593,0.001867
1,S0767,0.015197,0.000819,0.000948,0.004182,3.8e-05,0.002691,0.011697,0.043701,0.006188,0.000673
2,S0768,0.968272,0.010956,0.000939,0.029527,0.000425,0.004507,0.940964,0.986137,0.017367,0.000724
3,S0769,0.002439,0.000337,0.007204,0.009711,2.5e-05,0.994533,0.008008,0.001311,0.959035,0.000934
4,S0770,0.002307,0.000205,0.007413,0.949029,0.00018,0.994665,0.988824,0.000837,0.001379,0.001128


# MODEL 2

In [20]:
#submitted

For each label, check the validation loss of the TR-trained model used above and see whether another model achieves a lower loss for that particular label; if so, use it here.

In [8]:
# Tag for this label/model combination; passed to training.compute_valid_loss when it is scored
SUB_NAME = 'final_model_2'

In [9]:
# Per-label prediction files, `_tr` variant (presumably models fit on the
# training split only, per the notebook's TR/TRVL naming).
# Every extension is lower-case '.csv'; a '.CSV' spelling for the sulfate file
# would not resolve on a case-sensitive file system.
label_models_tr = {
    'basalt': 'fts_mra_tempmz_slope_cntpk_spectra_mzstats_XGB_opt_tr_sfm.csv',
    'carbonate': 'fts_mra_tempmz_XGB_tr.csv',
    'chloride': 'fts_mra_tempmz_slope_XGB_opt_tr_sfm.csv',
    'iron_oxide': 'fts_mra_tempmz_slope_cntpk_spectra_XGB_opt_tr_sfm.csv',
    'oxalate': 'fts_mra_tempmz_LR_reg_tr.csv',
    'oxychlorine': 'fts_mra_tempmz_slope_topmz_XGB_opt_tr_sfm.csv',
    'phyllosilicate': 'fts_mra_tempmz_slope_cntpk_topmz_XGB_opt_tr_sfm.csv',
    'silicate': 'fts_mra_tempmz_XGB_opt_tr_sfm.csv',
    'sulfate': 'fts_mra_tempmz_slope_cntpk_mzstats_width_XGB_opt_tr_sfm.csv',
    'sulfide': 'fts_cntpk_mratt_XGB_opt_tr_sfm.csv'
}

# Same label -> file mapping for the `_trvl` variant (presumably train + validation).
label_models_trvl = {
    'basalt': 'fts_mra_tempmz_slope_cntpk_spectra_mzstats_XGB_opt_trvl_sfm.csv',
    'carbonate': 'fts_mra_tempmz_XGB_trvl.csv',
    'chloride': 'fts_mra_tempmz_slope_XGB_opt_trvl_sfm.csv',
    'iron_oxide': 'fts_mra_tempmz_slope_cntpk_spectra_XGB_opt_trvl_sfm.csv',
    'oxalate': 'fts_mra_tempmz_LR_reg_trvl.csv',
    'oxychlorine': 'fts_mra_tempmz_slope_topmz_XGB_opt_trvl_sfm.csv',
    'phyllosilicate': 'fts_mra_tempmz_slope_cntpk_topmz_XGB_opt_trvl_sfm.csv',
    'silicate': 'fts_mra_tempmz_XGB_opt_trvl_sfm.csv',
    'sulfate': 'fts_mra_tempmz_slope_cntpk_mzstats_width_XGB_opt_trvl_sfm.csv',
    'sulfide': 'fts_cntpk_mratt_XGB_opt_trvl_sfm.csv'
}

In [10]:
# Assemble one submission per variant (TR, then TRVL) from the per-label
# prediction files, score its validation rows, and save the TRVL submission.
valid_losses = {}
for split_tag, LABEL_MODELS_DICT in (('TR', label_models_tr), ('TRVL', label_models_trvl)):
    # Start from the empty template and fill in one label column at a time
    submission = pd.read_csv(os.path.join(config.DATA_DIR, 'submission_format.csv'))
    for label in target_labels_list:
        MODEL_SUB_NAME = LABEL_MODELS_DICT[label]

        # Load saved submission
        label_sub = pd.read_csv(os.path.join(config.MODELS_DIR, MODEL_SUB_NAME))

        # Columns are copied by row index, so both files must list the samples
        # in the same order; fail loudly rather than mix up predictions.
        has_ids = 'sample_id' in label_sub.columns and 'sample_id' in submission.columns
        if has_ids and not label_sub['sample_id'].equals(submission['sample_id']):
            raise ValueError(f'{MODEL_SUB_NAME}: sample_id order differs from submission_format.csv')

        submission[label] = label_sub[label]

    # Only the leading len(valid_files) rows are scored.
    # NOTE(review): assumes the validation samples come first in submission_format.csv — confirm.
    per_label_loss, avg_loss = training.compute_valid_loss(submission.iloc[:len(valid_files)],
                                                           valid_files,
                                                           valid_labels,
                                                           target_labels_list,
                                                           SUB_NAME)
    valid_losses[split_tag] = (per_label_loss, avg_loss)
    print(f'{split_tag}: {avg_loss}')

# Keep the result names used by the rest of the notebook
clf_loss_tr, clf_loss_avg_tr = valid_losses['TR']
clf_loss_trvl, clf_loss_avg_trvl = valid_losses['TRVL']

# The output file name encodes both losses; slicing str() truncates the values
# (it does not round them). It is stored separately from SUB_NAME so that
# re-running this cell scores under the same tag.
SUB_FILE_NAME = 'SUB_MODEL_TR_' + str(clf_loss_avg_tr)[:7] + '_TRVL_' + str(clf_loss_avg_trvl)[:9]
print(SUB_FILE_NAME)
# Save submission file (the TRVL one, which was assembled last)
submission.to_csv(os.path.join(config.MODELS_DIR, SUB_FILE_NAME + '.csv'), index=False)
print('SUBMISSION')
submission.head(2)

TR: 0.14417060223654504
TRVL: 0.009752600218665
SUB_MODEL_TR_0.14417_TRVL_0.0097526
SUBMISSION


Unnamed: 0,sample_id,basalt,carbonate,chloride,iron_oxide,oxalate,oxychlorine,phyllosilicate,silicate,sulfate,sulfide
0,S0766,0.00279,0.000721,0.007935,0.002778,9e-06,0.003297,0.007457,0.052359,0.001588,0.001337
1,S0767,0.004998,0.000819,0.001804,0.005994,3.8e-05,0.002175,0.013336,0.043701,0.005784,0.001753


# MODEL 3 - best TR VLOSS

In [21]:
# submitted

In [11]:
# Tag for this label/model combination; passed to training.compute_valid_loss when it is scored
SUB_NAME = 'final_model_3_VL'

In [12]:
# Per-label prediction files, `_tr` variant (presumably models fit on the
# training split only, per the notebook's TR/TRVL naming).
# Every extension is lower-case '.csv'; a '.CSV' spelling for the sulfate file
# would not resolve on a case-sensitive file system.
label_models_tr = {
    'basalt': 'fts_mra_tempmz_slope_cntpk_spectra_mzstats_XGB_opt_tr_sfm.csv',
    'carbonate': 'fts_mra_tempmz_slope_spectra_XGB_opt_tr_sfm.csv',
    'chloride': 'fts_mra_tempmz_XGB_opt_tr_sfm.csv',
    'iron_oxide': 'fts_range_abun_to_temp_XGB_opt_tr_sfm.csv',
    'oxalate': 'fts_mz_maxabun_XGB_opt_tr_sfm.csv',
    'oxychlorine': 'fts_mra_tempmz_slope_cntpk_spectra_range_XGB_opt_tr_sfm.csv',
    'phyllosilicate': 'fts_mra_tempmz_slope_cntpk_spectra_range_XGB_opt_tr_sfm.csv',
    'silicate': 'fts_mra_tempmz_slope_spectra_XGB_opt_tr_sfm.csv',
    'sulfate': 'fts_mra_tempmz_slope_cntpk_mzstats_width_XGB_opt_tr_sfm.csv',
    'sulfide': 'fts_cntpk_mratt_XGB_opt_tr_sfm.csv'
}

# Same label -> file mapping for the `_trvl` variant (presumably train + validation).
label_models_trvl = {
    'basalt': 'fts_mra_tempmz_slope_cntpk_spectra_mzstats_XGB_opt_trvl_sfm.csv',
    'carbonate': 'fts_mra_tempmz_slope_spectra_XGB_opt_trvl_sfm.csv',
    'chloride': 'fts_mra_tempmz_XGB_opt_trvl_sfm.csv',
    'iron_oxide': 'fts_range_abun_to_temp_XGB_opt_trvl_sfm.csv',
    'oxalate': 'fts_mz_maxabun_XGB_opt_trvl_sfm.csv',
    'oxychlorine': 'fts_mra_tempmz_slope_cntpk_spectra_range_XGB_opt_trvl_sfm.csv',
    'phyllosilicate': 'fts_mra_tempmz_slope_cntpk_spectra_range_XGB_opt_trvl_sfm.csv',
    'silicate': 'fts_mra_tempmz_slope_spectra_XGB_opt_trvl_sfm.csv',
    'sulfate': 'fts_mra_tempmz_slope_cntpk_mzstats_width_XGB_opt_trvl_sfm.csv',
    'sulfide': 'fts_cntpk_mratt_XGB_opt_trvl_sfm.csv'
}

In [13]:
# Assemble one submission per variant (TR, then TRVL) from the per-label
# prediction files, score its validation rows, and save the TRVL submission.
valid_losses = {}
for split_tag, LABEL_MODELS_DICT in (('TR', label_models_tr), ('TRVL', label_models_trvl)):
    # Start from the empty template and fill in one label column at a time
    submission = pd.read_csv(os.path.join(config.DATA_DIR, 'submission_format.csv'))
    for label in target_labels_list:
        MODEL_SUB_NAME = LABEL_MODELS_DICT[label]

        # Load saved submission
        label_sub = pd.read_csv(os.path.join(config.MODELS_DIR, MODEL_SUB_NAME))

        # Columns are copied by row index, so both files must list the samples
        # in the same order; fail loudly rather than mix up predictions.
        has_ids = 'sample_id' in label_sub.columns and 'sample_id' in submission.columns
        if has_ids and not label_sub['sample_id'].equals(submission['sample_id']):
            raise ValueError(f'{MODEL_SUB_NAME}: sample_id order differs from submission_format.csv')

        submission[label] = label_sub[label]

    # Only the leading len(valid_files) rows are scored.
    # NOTE(review): assumes the validation samples come first in submission_format.csv — confirm.
    per_label_loss, avg_loss = training.compute_valid_loss(submission.iloc[:len(valid_files)],
                                                           valid_files,
                                                           valid_labels,
                                                           target_labels_list,
                                                           SUB_NAME)
    valid_losses[split_tag] = (per_label_loss, avg_loss)
    print(f'{split_tag}: {avg_loss}')

# Keep the result names used by the rest of the notebook
clf_loss_tr, clf_loss_avg_tr = valid_losses['TR']
clf_loss_trvl, clf_loss_avg_trvl = valid_losses['TRVL']

# The output file name encodes both losses; slicing str() truncates the values
# (it does not round them). It is stored separately from SUB_NAME so that
# re-running this cell scores under the same tag.
SUB_FILE_NAME = 'SUB_MODEL_TR_' + str(clf_loss_avg_tr)[:7] + '_TRVL_' + str(clf_loss_avg_trvl)[:9]
print(SUB_FILE_NAME)
# Save submission file (the TRVL one, which was assembled last)
submission.to_csv(os.path.join(config.MODELS_DIR, SUB_FILE_NAME + '.csv'), index=False)
print('SUBMISSION')
submission.head()

TR: 0.13890205609275139
TRVL: 0.010065069070772103
SUB_MODEL_TR_0.13890_TRVL_0.0100650
SUBMISSION


Unnamed: 0,sample_id,basalt,carbonate,chloride,iron_oxide,oxalate,oxychlorine,phyllosilicate,silicate,sulfate,sulfide
0,S0766,0.00279,0.00124,0.007935,0.005267,0.00044,0.003931,0.004392,0.070304,0.001588,0.001337
1,S0767,0.004998,0.003934,0.001804,0.002885,0.00044,0.001861,0.004206,0.040689,0.005784,0.001753
2,S0768,0.979432,0.040569,0.000792,0.009382,0.001673,0.002484,0.932608,0.989231,0.011058,0.001763
3,S0769,0.002182,0.000696,0.013194,0.009617,0.00044,0.997063,0.004484,0.001571,0.971288,0.001009
4,S0770,0.003755,0.001125,0.00432,0.931504,0.000869,0.994373,0.990687,0.001029,0.001558,0.001195


# MODEL 3a

Same as MODEL 3, except that oxalate uses the `fts_mra_tempmz_LR_reg` predictions.

In [7]:
# SUBMITTED

In [4]:
# Tag for this label/model combination; passed to training.compute_valid_loss when it is scored
SUB_NAME = 'final_model_3a_VL'

In [5]:
# Per-label prediction files, `_tr` variant (presumably models fit on the
# training split only, per the notebook's TR/TRVL naming).
# Every extension is lower-case '.csv'; a '.CSV' spelling for the sulfate file
# would not resolve on a case-sensitive file system.
label_models_tr = {
    'basalt': 'fts_mra_tempmz_slope_cntpk_spectra_mzstats_XGB_opt_tr_sfm.csv',
    'carbonate': 'fts_mra_tempmz_slope_spectra_XGB_opt_tr_sfm.csv',
    'chloride': 'fts_mra_tempmz_XGB_opt_tr_sfm.csv',
    'iron_oxide': 'fts_range_abun_to_temp_XGB_opt_tr_sfm.csv',
    'oxalate': 'fts_mra_tempmz_LR_reg_tr.csv',
    'oxychlorine': 'fts_mra_tempmz_slope_cntpk_spectra_range_XGB_opt_tr_sfm.csv',
    'phyllosilicate': 'fts_mra_tempmz_slope_cntpk_spectra_range_XGB_opt_tr_sfm.csv',
    'silicate': 'fts_mra_tempmz_slope_spectra_XGB_opt_tr_sfm.csv',
    'sulfate': 'fts_mra_tempmz_slope_cntpk_mzstats_width_XGB_opt_tr_sfm.csv',
    'sulfide': 'fts_cntpk_mratt_XGB_opt_tr_sfm.csv'
}

# Same label -> file mapping for the `_trvl` variant (presumably train + validation).
label_models_trvl = {
    'basalt': 'fts_mra_tempmz_slope_cntpk_spectra_mzstats_XGB_opt_trvl_sfm.csv',
    'carbonate': 'fts_mra_tempmz_slope_spectra_XGB_opt_trvl_sfm.csv',
    'chloride': 'fts_mra_tempmz_XGB_opt_trvl_sfm.csv',
    'iron_oxide': 'fts_range_abun_to_temp_XGB_opt_trvl_sfm.csv',
    'oxalate': 'fts_mra_tempmz_LR_reg_trvl.csv',
    'oxychlorine': 'fts_mra_tempmz_slope_cntpk_spectra_range_XGB_opt_trvl_sfm.csv',
    'phyllosilicate': 'fts_mra_tempmz_slope_cntpk_spectra_range_XGB_opt_trvl_sfm.csv',
    'silicate': 'fts_mra_tempmz_slope_spectra_XGB_opt_trvl_sfm.csv',
    'sulfate': 'fts_mra_tempmz_slope_cntpk_mzstats_width_XGB_opt_trvl_sfm.csv',
    'sulfide': 'fts_cntpk_mratt_XGB_opt_trvl_sfm.csv'
}

In [6]:
# Assemble one submission per variant (TR, then TRVL) from the per-label
# prediction files, score its validation rows, and save the TRVL submission.
valid_losses = {}
for split_tag, LABEL_MODELS_DICT in (('TR', label_models_tr), ('TRVL', label_models_trvl)):
    # Start from the empty template and fill in one label column at a time
    submission = pd.read_csv(os.path.join(config.DATA_DIR, 'submission_format.csv'))
    for label in target_labels_list:
        MODEL_SUB_NAME = LABEL_MODELS_DICT[label]

        # Load saved submission
        label_sub = pd.read_csv(os.path.join(config.MODELS_DIR, MODEL_SUB_NAME))

        # Columns are copied by row index, so both files must list the samples
        # in the same order; fail loudly rather than mix up predictions.
        has_ids = 'sample_id' in label_sub.columns and 'sample_id' in submission.columns
        if has_ids and not label_sub['sample_id'].equals(submission['sample_id']):
            raise ValueError(f'{MODEL_SUB_NAME}: sample_id order differs from submission_format.csv')

        submission[label] = label_sub[label]

    # Only the leading len(valid_files) rows are scored.
    # NOTE(review): assumes the validation samples come first in submission_format.csv — confirm.
    per_label_loss, avg_loss = training.compute_valid_loss(submission.iloc[:len(valid_files)],
                                                           valid_files,
                                                           valid_labels,
                                                           target_labels_list,
                                                           SUB_NAME)
    valid_losses[split_tag] = (per_label_loss, avg_loss)
    print(f'{split_tag}: {avg_loss}')

# Keep the result names used by the rest of the notebook
clf_loss_tr, clf_loss_avg_tr = valid_losses['TR']
clf_loss_trvl, clf_loss_avg_trvl = valid_losses['TRVL']

# The output file name encodes both losses; slicing str() truncates the values
# (it does not round them). It is stored separately from SUB_NAME so that
# re-running this cell scores under the same tag.
SUB_FILE_NAME = 'SUB_MODEL_TR_' + str(clf_loss_avg_tr)[:7] + '_TRVL_' + str(clf_loss_avg_trvl)[:9]
print(SUB_FILE_NAME)
# Save submission file (the TRVL one, which was assembled last)
submission.to_csv(os.path.join(config.MODELS_DIR, SUB_FILE_NAME + '.csv'), index=False)
print('SUBMISSION')
submission.head()

TR: 0.13904575932720156
TRVL: 0.009940614261717646
SUB_MODEL_TR_0.13904_TRVL_0.0099406
SUBMISSION


Unnamed: 0,sample_id,basalt,carbonate,chloride,iron_oxide,oxalate,oxychlorine,phyllosilicate,silicate,sulfate,sulfide
0,S0766,0.00279,0.00124,0.007935,0.005267,9e-06,0.003931,0.004392,0.070304,0.001588,0.001337
1,S0767,0.004998,0.003934,0.001804,0.002885,3.8e-05,0.001861,0.004206,0.040689,0.005784,0.001753
2,S0768,0.979432,0.040569,0.000792,0.009382,0.000425,0.002484,0.932608,0.989231,0.011058,0.001763
3,S0769,0.002182,0.000696,0.013194,0.009617,2.5e-05,0.997063,0.004484,0.001571,0.971288,0.001009
4,S0770,0.003755,0.001125,0.00432,0.931504,0.00018,0.994373,0.990687,0.001029,0.001558,0.001195


# MODEL 4 - best TR CVLOSS

In [14]:
# Tag for this label/model combination; passed to training.compute_valid_loss when it is scored
SUB_NAME = 'final_model_4_TRCV'

In [15]:
# Per-label prediction files, `_tr` variant (presumably models fit on the
# training split only, per the notebook's TR/TRVL naming).
# Every extension is lower-case '.csv'; a '.CSV' spelling for the sulfate file
# would not resolve on a case-sensitive file system.
label_models_tr = {
    'basalt': 'fts_mra_tempmz_slope_spectra_mzstats_XGB_opt_tr_sfm.csv',
    'carbonate': 'fts_mra_tempmz_slope_topmz_XGB_opt_tr_sfm.csv',
    'chloride': 'fts_mz_maxabun_XGB_opt_tr_sfm.csv',
    'iron_oxide': 'fts_mra_tempmz_slope_spectra_XGB_opt_tr_sfm.csv',
    'oxalate': 'fts_mra_tempmz_LR_reg_tr.csv',
    'oxychlorine': 'fts_mra_tempmz_slope_cntpk_topmz_XGB_opt_tr_sfm.csv',
    'phyllosilicate': 'fts_mra_tempmz_slope_cntpk_spectra_mzstats_XGB_opt_tr_sfm.csv',
    'silicate': 'fts_range_abun_to_temp_XGB_opt_tr_sfm.csv',
    'sulfate': 'fts_mra_tempmz_slope_XGB_opt_tr_sfm.csv',
    'sulfide': 'fts_mra_tempmz_slope_cntpk_spectra_mzstats_XGB_opt_tr_sfm.csv'
}

# Same label -> file mapping for the `_trvl` variant (presumably train + validation).
label_models_trvl = {
    'basalt': 'fts_mra_tempmz_slope_spectra_mzstats_XGB_opt_trvl_sfm.csv',
    'carbonate': 'fts_mra_tempmz_slope_topmz_XGB_opt_trvl_sfm.csv',
    'chloride': 'fts_mz_maxabun_XGB_opt_trvl_sfm.csv',
    'iron_oxide': 'fts_mra_tempmz_slope_spectra_XGB_opt_trvl_sfm.csv',
    'oxalate': 'fts_mra_tempmz_LR_reg_trvl.csv',
    'oxychlorine': 'fts_mra_tempmz_slope_cntpk_topmz_XGB_opt_trvl_sfm.csv',
    'phyllosilicate': 'fts_mra_tempmz_slope_cntpk_spectra_mzstats_XGB_opt_trvl_sfm.csv',
    'silicate': 'fts_range_abun_to_temp_XGB_opt_trvl_sfm.csv',
    'sulfate': 'fts_mra_tempmz_slope_XGB_opt_trvl_sfm.csv',
    'sulfide': 'fts_mra_tempmz_slope_cntpk_spectra_mzstats_XGB_opt_trvl_sfm.csv'
}

In [16]:
# Assemble one submission per variant (TR, then TRVL) from the per-label
# prediction files, score its validation rows, and save the TRVL submission.
valid_losses = {}
for split_tag, LABEL_MODELS_DICT in (('TR', label_models_tr), ('TRVL', label_models_trvl)):
    # Start from the empty template and fill in one label column at a time
    submission = pd.read_csv(os.path.join(config.DATA_DIR, 'submission_format.csv'))
    for label in target_labels_list:
        MODEL_SUB_NAME = LABEL_MODELS_DICT[label]

        # Load saved submission
        label_sub = pd.read_csv(os.path.join(config.MODELS_DIR, MODEL_SUB_NAME))

        # Columns are copied by row index, so both files must list the samples
        # in the same order; fail loudly rather than mix up predictions.
        has_ids = 'sample_id' in label_sub.columns and 'sample_id' in submission.columns
        if has_ids and not label_sub['sample_id'].equals(submission['sample_id']):
            raise ValueError(f'{MODEL_SUB_NAME}: sample_id order differs from submission_format.csv')

        submission[label] = label_sub[label]

    # Only the leading len(valid_files) rows are scored.
    # NOTE(review): assumes the validation samples come first in submission_format.csv — confirm.
    per_label_loss, avg_loss = training.compute_valid_loss(submission.iloc[:len(valid_files)],
                                                           valid_files,
                                                           valid_labels,
                                                           target_labels_list,
                                                           SUB_NAME)
    valid_losses[split_tag] = (per_label_loss, avg_loss)
    print(f'{split_tag}: {avg_loss}')

# Keep the result names used by the rest of the notebook
clf_loss_tr, clf_loss_avg_tr = valid_losses['TR']
clf_loss_trvl, clf_loss_avg_trvl = valid_losses['TRVL']

# The output file name encodes both losses; slicing str() truncates the values
# (it does not round them). It is stored separately from SUB_NAME so that
# re-running this cell scores under the same tag.
SUB_FILE_NAME = 'SUB_MODEL_TR_' + str(clf_loss_avg_tr)[:7] + '_TRVL_' + str(clf_loss_avg_trvl)[:9]
print(SUB_FILE_NAME)
# Save submission file (the TRVL one, which was assembled last)
submission.to_csv(os.path.join(config.MODELS_DIR, SUB_FILE_NAME + '.csv'), index=False)
print('SUBMISSION')
submission.head()

TR: 0.1780506917423813
TRVL: 0.030896078850807425
SUB_MODEL_TR_0.17805_TRVL_0.0308960
SUBMISSION


Unnamed: 0,sample_id,basalt,carbonate,chloride,iron_oxide,oxalate,oxychlorine,phyllosilicate,silicate,sulfate,sulfide
0,S0766,0.003253,0.001279,0.004079,0.002786,9e-06,0.002249,0.004424,0.048152,0.005256,0.00142
1,S0767,0.009362,0.006744,0.001173,0.007463,3.8e-05,0.002734,0.010777,0.025426,0.011236,0.000587
2,S0768,0.981396,0.002065,0.000978,0.013438,0.000425,0.003778,0.943094,0.97231,0.014457,0.000898
3,S0769,0.002069,0.00181,0.037527,0.020186,2.5e-05,0.994361,0.010089,0.001626,0.976882,0.000769
4,S0770,0.001839,0.000614,0.015633,0.94542,0.00018,0.994528,0.988158,0.001147,0.002356,0.000997


# MODEL 5 - best TRVL CVLOSS

In [22]:
# submitted

In [17]:
# Tag for this label/model combination; passed to training.compute_valid_loss when it is scored
SUB_NAME = 'final_model_5_TRVLCV'

In [18]:
# Per-label prediction files, `_tr` variant (presumably models fit on the
# training split only, per the notebook's TR/TRVL naming).
# Every extension is lower-case '.csv': the sulfate entry names the same file
# as the sulfide entry, and a '.CSV' spelling would not resolve on a
# case-sensitive file system.
label_models_tr = {
    'basalt': 'fts_range_abun_to_temp_XGB_opt_tr_sfm.csv',
    'carbonate': 'fts_mra_tempmz_slope_topmz_XGB_opt_tr_sfm.csv',
    'chloride': 'fts_mra_tempmz_slope_spectra_XGB_opt_tr_sfm.csv',
    'iron_oxide': 'fts_mra_tempmz_slope_cntpk_spectra_range_XGB_opt_tr_sfm.csv',
    'oxalate': 'fts_mra_tempmz_LR_reg_tr.csv',
    'oxychlorine': 'fts_mra_tempmz_cntpk_XGB_opt_tr_sfm.csv',
    'phyllosilicate': 'fts_mra_tempmz_slope_cntpk_spectra_range_XGB_opt_tr_sfm.csv',
    'silicate': 'fts_mra_tempmz_slope_cntpk_XGB_opt_tr_sfm.csv',
    'sulfate': 'fts_mra_tempmz_slope_cntpk_spectra_mzstats_width_XGB_opt_tr_sfm.csv',
    'sulfide': 'fts_mra_tempmz_slope_cntpk_spectra_mzstats_width_XGB_opt_tr_sfm.csv'
}

# Same label -> file mapping for the `_trvl` variant (presumably train + validation).
label_models_trvl = {
    'basalt': 'fts_range_abun_to_temp_XGB_opt_trvl_sfm.csv',
    'carbonate': 'fts_mra_tempmz_slope_topmz_XGB_opt_trvl_sfm.csv',
    'chloride': 'fts_mra_tempmz_slope_spectra_XGB_opt_trvl_sfm.csv',
    'iron_oxide': 'fts_mra_tempmz_slope_cntpk_spectra_range_XGB_opt_trvl_sfm.csv',
    'oxalate': 'fts_mra_tempmz_LR_reg_trvl.csv',
    'oxychlorine': 'fts_mra_tempmz_cntpk_XGB_opt_trvl_sfm.csv',
    'phyllosilicate': 'fts_mra_tempmz_slope_cntpk_spectra_range_XGB_opt_trvl_sfm.csv',
    'silicate': 'fts_mra_tempmz_slope_cntpk_XGB_opt_trvl_sfm.csv',
    'sulfate': 'fts_mra_tempmz_slope_cntpk_spectra_mzstats_width_XGB_opt_trvl_sfm.csv',
    'sulfide': 'fts_mra_tempmz_slope_cntpk_spectra_mzstats_width_XGB_opt_trvl_sfm.csv'
}

In [19]:
# Assemble one submission per variant (TR, then TRVL) from the per-label
# prediction files, score its validation rows, and save the TRVL submission.
valid_losses = {}
for split_tag, LABEL_MODELS_DICT in (('TR', label_models_tr), ('TRVL', label_models_trvl)):
    # Start from the empty template and fill in one label column at a time
    submission = pd.read_csv(os.path.join(config.DATA_DIR, 'submission_format.csv'))
    for label in target_labels_list:
        MODEL_SUB_NAME = LABEL_MODELS_DICT[label]

        # Load saved submission
        label_sub = pd.read_csv(os.path.join(config.MODELS_DIR, MODEL_SUB_NAME))

        # Columns are copied by row index, so both files must list the samples
        # in the same order; fail loudly rather than mix up predictions.
        has_ids = 'sample_id' in label_sub.columns and 'sample_id' in submission.columns
        if has_ids and not label_sub['sample_id'].equals(submission['sample_id']):
            raise ValueError(f'{MODEL_SUB_NAME}: sample_id order differs from submission_format.csv')

        submission[label] = label_sub[label]

    # Only the leading len(valid_files) rows are scored.
    # NOTE(review): assumes the validation samples come first in submission_format.csv — confirm.
    per_label_loss, avg_loss = training.compute_valid_loss(submission.iloc[:len(valid_files)],
                                                           valid_files,
                                                           valid_labels,
                                                           target_labels_list,
                                                           SUB_NAME)
    valid_losses[split_tag] = (per_label_loss, avg_loss)
    print(f'{split_tag}: {avg_loss}')

# Keep the result names used by the rest of the notebook
clf_loss_tr, clf_loss_avg_tr = valid_losses['TR']
clf_loss_trvl, clf_loss_avg_trvl = valid_losses['TRVL']

# The output file name encodes both losses; slicing str() truncates the values
# (it does not round them). It is stored separately from SUB_NAME so that
# re-running this cell scores under the same tag.
SUB_FILE_NAME = 'SUB_MODEL_TR_' + str(clf_loss_avg_tr)[:7] + '_TRVL_' + str(clf_loss_avg_trvl)[:9]
print(SUB_FILE_NAME)
# Save submission file (the TRVL one, which was assembled last)
submission.to_csv(os.path.join(config.MODELS_DIR, SUB_FILE_NAME + '.csv'), index=False)
print('SUBMISSION')
submission.head()

TR: 0.16598385728803894
TRVL: 0.029415778323799775
SUB_MODEL_TR_0.16598_TRVL_0.0294157
SUBMISSION


Unnamed: 0,sample_id,basalt,carbonate,chloride,iron_oxide,oxalate,oxychlorine,phyllosilicate,silicate,sulfate,sulfide
0,S0766,0.001631,0.001279,0.006002,0.002679,9e-06,0.001776,0.004392,0.049732,0.001971,0.001284
1,S0767,0.014849,0.006744,0.001223,0.00262,3.8e-05,0.003117,0.004206,0.028484,0.007197,0.000628
2,S0768,0.950114,0.002065,0.000793,0.016389,0.000425,0.003101,0.932608,0.982762,0.012761,0.000999
3,S0769,0.003971,0.00181,0.012454,0.009628,2.5e-05,0.994391,0.004484,0.001526,0.971494,0.000691
4,S0770,0.001149,0.000614,0.003953,0.956826,0.00018,0.994925,0.990687,0.001119,0.001211,0.00103


# MODEL 6 - best TRVL VLOSS

In [8]:
# Tag for this label/model combination; passed to training.compute_valid_loss when it is scored
SUB_NAME = 'final_model_6_TRVLV'

In [9]:
# Per-label prediction files, `_tr` variant (presumably models fit on the
# training split only, per the notebook's TR/TRVL naming).
# Every extension is lower-case '.csv': the sulfate entry names the same file
# as the iron_oxide and silicate entries, and a '.CSV' spelling would not
# resolve on a case-sensitive file system.
label_models_tr = {
    'basalt': 'fts_mra_tempmz_slope_cntpk_spectra_range_mass_XGB_opt_tr_sfm.csv',
    'carbonate': 'fts_mra_tempmz_slope_cntpk_mzstats_XGB_opt_tr_sfm.csv',
    'chloride': 'fts_mra_tempmz_slope_cntpk_mzstats_XGB_opt_tr_sfm.csv',
    'iron_oxide': 'fts_mra_tempmz_slope_cntpk_spectra_mzstats_width_XGB_opt_tr_sfm.csv',
    'oxalate': 'fts_range_abun_to_temp_XGB_opt_tr_sfm.csv',
    'oxychlorine': 'fts_mra_tempmz_slope_cntpk_mzstats_width_XGB_opt_tr_sfm.csv',
    'phyllosilicate': 'fts_mra_tempmz_slope_cntpk_mzstats_width_XGB_opt_tr_sfm.csv',
    'silicate': 'fts_mra_tempmz_slope_cntpk_spectra_mzstats_width_XGB_opt_tr_sfm.csv',
    'sulfate': 'fts_mra_tempmz_slope_cntpk_spectra_mzstats_width_XGB_opt_tr_sfm.csv',
    'sulfide': 'fts_mra_tempmz_slope_cntpk_mzstats_width_XGB_opt_tr_sfm.csv'
}

# Same label -> file mapping for the `_trvl` variant (presumably train + validation).
label_models_trvl = {
    'basalt': 'fts_mra_tempmz_slope_cntpk_spectra_range_mass_XGB_opt_trvl_sfm.csv',
    'carbonate': 'fts_mra_tempmz_slope_cntpk_mzstats_XGB_opt_trvl_sfm.csv',
    'chloride': 'fts_mra_tempmz_slope_cntpk_mzstats_XGB_opt_trvl_sfm.csv',
    'iron_oxide': 'fts_mra_tempmz_slope_cntpk_spectra_mzstats_width_XGB_opt_trvl_sfm.csv',
    'oxalate': 'fts_range_abun_to_temp_XGB_opt_trvl_sfm.csv',
    'oxychlorine': 'fts_mra_tempmz_slope_cntpk_mzstats_width_XGB_opt_trvl_sfm.csv',
    'phyllosilicate': 'fts_mra_tempmz_slope_cntpk_mzstats_width_XGB_opt_trvl_sfm.csv',
    'silicate': 'fts_mra_tempmz_slope_cntpk_spectra_mzstats_width_XGB_opt_trvl_sfm.csv',
    'sulfate': 'fts_mra_tempmz_slope_cntpk_spectra_mzstats_width_XGB_opt_trvl_sfm.csv',
    'sulfide': 'fts_mra_tempmz_slope_cntpk_mzstats_width_XGB_opt_trvl_sfm.csv'
}

In [10]:
# Assemble one submission per variant (TR, then TRVL) from the per-label
# prediction files, score its validation rows, and save the TRVL submission.
valid_losses = {}
for split_tag, LABEL_MODELS_DICT in (('TR', label_models_tr), ('TRVL', label_models_trvl)):
    # Start from the empty template and fill in one label column at a time
    submission = pd.read_csv(os.path.join(config.DATA_DIR, 'submission_format.csv'))
    for label in target_labels_list:
        MODEL_SUB_NAME = LABEL_MODELS_DICT[label]

        # Load saved submission
        label_sub = pd.read_csv(os.path.join(config.MODELS_DIR, MODEL_SUB_NAME))

        # Columns are copied by row index, so both files must list the samples
        # in the same order; fail loudly rather than mix up predictions.
        has_ids = 'sample_id' in label_sub.columns and 'sample_id' in submission.columns
        if has_ids and not label_sub['sample_id'].equals(submission['sample_id']):
            raise ValueError(f'{MODEL_SUB_NAME}: sample_id order differs from submission_format.csv')

        submission[label] = label_sub[label]

    # Only the leading len(valid_files) rows are scored.
    # NOTE(review): assumes the validation samples come first in submission_format.csv — confirm.
    per_label_loss, avg_loss = training.compute_valid_loss(submission.iloc[:len(valid_files)],
                                                           valid_files,
                                                           valid_labels,
                                                           target_labels_list,
                                                           SUB_NAME)
    valid_losses[split_tag] = (per_label_loss, avg_loss)
    print(f'{split_tag}: {avg_loss}')

# Keep the result names used by the rest of the notebook
clf_loss_tr, clf_loss_avg_tr = valid_losses['TR']
clf_loss_trvl, clf_loss_avg_trvl = valid_losses['TRVL']

# The output file name encodes both losses; slicing str() truncates the values
# (it does not round them). It is stored separately from SUB_NAME so that
# re-running this cell scores under the same tag.
SUB_FILE_NAME = 'SUB_MODEL_TR_' + str(clf_loss_avg_tr)[:7] + '_TRVL_' + str(clf_loss_avg_trvl)[:9]
print(SUB_FILE_NAME)
# Save submission file (the TRVL one, which was assembled last)
submission.to_csv(os.path.join(config.MODELS_DIR, SUB_FILE_NAME + '.csv'), index=False)
print('SUBMISSION')
submission.head()

TR: 0.16375682776213923
TRVL: 0.008944540886721936
SUB_MODEL_TR_0.16375_TRVL_0.0089445
SUBMISSION


Unnamed: 0,sample_id,basalt,carbonate,chloride,iron_oxide,oxalate,oxychlorine,phyllosilicate,silicate,sulfate,sulfide
0,S0766,0.001989,0.001812,0.002989,0.001777,0.000961,0.002999,0.005369,0.045934,0.001971,0.001357
1,S0767,0.00993,0.004078,0.000995,0.003643,0.000475,0.00354,0.018704,0.049199,0.007197,0.000553
2,S0768,0.960725,0.030713,0.000865,0.025591,0.000992,0.005781,0.946813,0.975798,0.012761,0.000948
3,S0769,0.003377,0.001511,0.005971,0.005279,0.0008,0.993693,0.012589,0.001328,0.971494,0.000729
4,S0770,0.001372,0.003251,0.015108,0.958674,0.001201,0.993307,0.982456,0.001295,0.001211,0.001083
