In [1]:
# Reload the autoreload extension and switch it to mode 2 so that edits to
# imported project modules are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2

# Render matplotlib figures inline and request the 'retina' figure format.
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [2]:
## Environment
# Make the project root importable so that `from src import ...` works when
# the notebook is launched from inside the `models` sub-folder.
import sys, json

p = sys.path[0]
# The '/models' suffix check uses a POSIX-style separator (e.g. macOS).
if p.endswith('/models'):
    # Strip the trailing '/models' to obtain the project root.
    main_path = p[:-len('/models')]
else:
    # Previously `main_path` stayed undefined here and the assignment below
    # raised NameError; keep the existing entry untouched instead.
    main_path = p
sys.path[0] = main_path

import os, gc
from termcolor import colored
import pandas as pd
import numpy as np
import joblib
from src import config, training

  from pandas import MultiIndex, Int64Index


In [3]:
# DATA PREPARATION
# ===== LOAD DATA ======
metadata = pd.read_csv(config.DATA_DIR + 'metadata.csv')
print(f'Metadata: {metadata.shape}')

train_labels = pd.read_csv(config.DATA_DIR + 'train_labels.csv')
print(f'Train labels: {train_labels.shape}')

valid_labels = pd.read_csv(config.DATA_DIR + 'val_labels.csv')
# This used to be printed as 'Train labels', which made the two shape lines
# indistinguishable in the cell output.
print(f'Valid labels: {valid_labels.shape}')

# Combine train and valid labels (row indices of both frames are kept as-is)
trvl_labels = pd.concat([train_labels, valid_labels], axis=0)

submission = pd.read_csv(config.DATA_DIR + 'submission_format.csv')
print(f'Submission: {submission.shape}')

# ===== FILE PATHS OF SAMPLES =====
# Each dict maps the metadata row index to the sample's features_path.
train_files = metadata[metadata.split == 'train']['features_path'].to_dict()
valid_files = metadata[metadata.split == 'val']['features_path'].to_dict()
test_files = metadata[metadata.split == 'test']['features_path'].to_dict()
# Train & Valid files
trva_files = {**train_files, **valid_files}
# Valid & Test files (everything that is not in the train split)
all_test_files = {**valid_files, **test_files}

# SAM testbed files: train-split samples recorded on the 'sam_testbed' instrument
is_sam_train = (metadata.instrument_type == 'sam_testbed') & (metadata.split == 'train')
sam_files = metadata[is_sam_train]['features_path'].to_dict()

# Get the names of the target columns in a list
target_labels_list = [col for col in train_labels.columns if col != 'sample_id']
print(target_labels_list)

# Labels WITHOUT the SAM testbed samples (despite the variable name):
# drop the last len(sam_files) train rows, then append the validation labels.
# NOTE(review): this assumes the SAM testbed samples are exactly the trailing
# rows of train_labels - confirm against metadata.
sam_labels = train_labels.drop(train_labels.tail(len(sam_files)).index)
sam_labels = pd.concat([sam_labels, valid_labels], axis=0)
print(f'Labels w/o SAM : {sam_labels.shape}')

Metadata: (1570, 5)
Train labels: (766, 11)
Train labels: (293, 11)
Submission: (804, 11)
['basalt', 'carbonate', 'chloride', 'iron_oxide', 'oxalate', 'oxychlorine', 'phyllosilicate', 'silicate', 'sulfate', 'sulfide']
Labels w/o SAM : (1047, 11)


# CURRENT BEST MODEL

In [88]:
# Name passed to training.compute_valid_loss in the scoring cell below; that
# cell later overwrites SUB_NAME with a loss-based output file name.
SUB_NAME = 'final_model_1'

In [89]:
# Per-label choice of prediction file. Each value is a CSV file name that the
# scoring cell resolves under config.MODELS_DIR and reads the label's column from.
# All extensions are lower-case '.csv': the 'sulfate' entry previously used
# '.CSV' for the same file that 'basalt'/'chloride' reference as '.csv', which
# only resolves on case-insensitive filesystems.

# Prediction files whose names carry the '_tr' tag
# (presumably models fitted on the train split only - confirm).
label_models_tr = {
    'basalt': 'fts_mra_tempmz_slope_cntpk_XGB_opt_tr_sfm.csv',
    'carbonate': 'fts_mra_tempmz_XGB_tr.csv',
    'chloride': 'fts_mra_tempmz_slope_cntpk_XGB_opt_tr_sfm.csv',
    'iron_oxide': 'fts_mra_tempmz_cntpk_XGB_opt_tr_sfm.csv',
    'oxalate': 'fts_mra_tempmz_LR_reg_tr.csv',
    'oxychlorine': 'fts_mra_tempmz_slope_cntpk_XGB_opt_tr_sfm.csv',
    'phyllosilicate': 'fts_mra_tempmz_slope_cntpk_XGB_opt_tr_sfm.csv',
    'silicate': 'fts_mra_tempmz_XGB_opt_tr_sfm.csv',
    'sulfate': 'fts_mra_tempmz_slope_cntpk_XGB_opt_tr_sfm.csv',
    'sulfide': 'fts_mra_tempmz_cntpk_XGB_opt_tr_sfm.csv'
}

# Same per-label selection with the '_trvl' tag
# (presumably models fitted on train + validation - confirm).
label_models_trvl = {
    'basalt': 'fts_mra_tempmz_slope_cntpk_XGB_opt_trvl_sfm.csv',
    'carbonate': 'fts_mra_tempmz_XGB_trvl.csv',
    'chloride': 'fts_mra_tempmz_slope_cntpk_XGB_opt_trvl_sfm.csv',
    'iron_oxide': 'fts_mra_tempmz_cntpk_XGB_opt_trvl_sfm.csv',
    'oxalate': 'fts_mra_tempmz_LR_reg_trvl.csv',
    'oxychlorine': 'fts_mra_tempmz_slope_cntpk_XGB_opt_trvl_sfm.csv',
    'phyllosilicate': 'fts_mra_tempmz_slope_cntpk_XGB_opt_trvl_sfm.csv',
    'silicate': 'fts_mra_tempmz_XGB_opt_trvl_sfm.csv',
    'sulfate': 'fts_mra_tempmz_slope_cntpk_XGB_opt_trvl_sfm.csv',
    'sulfide': 'fts_mra_tempmz_cntpk_XGB_opt_trvl_sfm.csv'
}

In [90]:
# Assemble one submission per regime (TR, then TRVL): start from the empty
# submission format and fill every target column from the prediction file that
# the regime's dictionary names for that label. The first len(valid_files)
# rows are then scored against the validation labels.
# NOTE(review): the TRVL files presumably come from models that also saw the
# validation data, which would make the TRVL loss optimistic - confirm.
regime_losses = {}
for regime, LABEL_MODELS_DICT in (('TR', label_models_tr),
                                  ('TRVL', label_models_trvl)):
    submission = pd.read_csv(config.DATA_DIR + 'submission_format.csv')

    # Several labels share one prediction file; read each file once per regime
    # instead of once per label.
    loaded_subs = {}
    for label in target_labels_list:
        MODEL_SUB_NAME = LABEL_MODELS_DICT[label]

        # Load saved submission
        if MODEL_SUB_NAME not in loaded_subs:
            loaded_subs[MODEL_SUB_NAME] = pd.read_csv(
                os.path.join(config.MODELS_DIR, MODEL_SUB_NAME))
        label_sub = loaded_subs[MODEL_SUB_NAME]

        # Assignment aligns on the row index; this assumes the saved files keep
        # the row order of submission_format.csv - TODO confirm.
        submission[label] = label_sub[label]

    clf_loss, clf_loss_avg = training.compute_valid_loss(
        submission.iloc[:len(valid_files)],
        valid_files,
        valid_labels,
        target_labels_list,
        SUB_NAME)
    regime_losses[regime] = (clf_loss, clf_loss_avg)
    print(f'{regime}: {clf_loss_avg}')

# Keep the per-regime results under the names the rest of the notebook uses.
clf_loss_tr, clf_loss_avg_tr = regime_losses['TR']
clf_loss_trvl, clf_loss_avg_trvl = regime_losses['TRVL']

# File name encodes both average losses (string-truncated, not rounded).
SUB_NAME = 'SUB_MODEL_TR_' + str(clf_loss_avg_tr)[:7] + '_TRVL_' + str(clf_loss_avg_trvl)[:9]
print(SUB_NAME)
# Save submission file (at this point `submission` is the TRVL assembly)
submission.to_csv(os.path.join(config.MODELS_DIR, SUB_NAME + '.csv'), index=False)
print('SUBMISSION')
submission.head()

TR: 0.1557306712481364
TRVL: 0.009723688101486267
SUB_MODEL_TR_0.15573_TRVL_0.0097236
SUBMISSION


Unnamed: 0,sample_id,basalt,carbonate,chloride,iron_oxide,oxalate,oxychlorine,phyllosilicate,silicate,sulfate,sulfide
0,S0766,0.001987,0.000721,0.00524,0.002605,9e-06,0.002144,0.005939,0.052359,0.001593,0.001867
1,S0767,0.015197,0.000819,0.000948,0.004182,3.8e-05,0.002691,0.011697,0.043701,0.006188,0.000673
2,S0768,0.968272,0.010956,0.000939,0.029527,0.000425,0.004507,0.940964,0.986137,0.017367,0.000724
3,S0769,0.002439,0.000337,0.007204,0.009711,2.5e-05,0.994533,0.008008,0.001311,0.959035,0.000934
4,S0770,0.002307,0.000205,0.007413,0.949029,0.00018,0.994665,0.988823,0.000837,0.001379,0.001128


# MODEL 1

For each label, check the validation loss of the TR-trained model above and see whether another model achieves a better loss for that particular label.

In [91]:
# Name passed to training.compute_valid_loss in the scoring cell below; that
# cell later overwrites SUB_NAME with a loss-based output file name.
# NOTE(review): the number is offset by one from the section heading ('MODEL 1').
SUB_NAME = 'final_model_2'

In [92]:
# Per-label choice of prediction file. Each value is a CSV file name that the
# scoring cell resolves under config.MODELS_DIR and reads the label's column from.
# All extensions are lower-case '.csv': the 'sulfate' entry previously ended in
# '.CSV', unlike every other entry, which only resolves on case-insensitive
# filesystems.

# Prediction files whose names carry the '_tr' tag
# (presumably models fitted on the train split only - confirm).
label_models_tr = {
    'basalt': 'fts_mra_tempmz_slope_cntpk_spectra_mzstats_XGB_opt_tr_sfm.csv',
    'carbonate': 'fts_mra_tempmz_XGB_tr.csv',
    'chloride': 'fts_mra_tempmz_slope_XGB_opt_tr_sfm.csv',
    'iron_oxide': 'fts_mra_tempmz_slope_cntpk_spectra_XGB_opt_tr_sfm.csv',
    'oxalate': 'fts_mra_tempmz_LR_reg_tr.csv',
    'oxychlorine': 'fts_mra_tempmz_slope_topmz_XGB_opt_tr_sfm.csv',
    'phyllosilicate': 'fts_mra_tempmz_slope_cntpk_topmz_XGB_opt_tr_sfm.csv',
    'silicate': 'fts_mra_tempmz_XGB_opt_tr_sfm.csv',
    'sulfate': 'fts_mra_tempmz_slope_cntpk_mzstats_width_XGB_opt_tr_sfm.csv',
    'sulfide': 'fts_cntpk_mratt_XGB_opt_tr_sfm.csv'
}

# Same per-label selection with the '_trvl' tag
# (presumably models fitted on train + validation - confirm).
label_models_trvl = {
    'basalt': 'fts_mra_tempmz_slope_cntpk_spectra_mzstats_XGB_opt_trvl_sfm.csv',
    'carbonate': 'fts_mra_tempmz_XGB_trvl.csv',
    'chloride': 'fts_mra_tempmz_slope_XGB_opt_trvl_sfm.csv',
    'iron_oxide': 'fts_mra_tempmz_slope_cntpk_spectra_XGB_opt_trvl_sfm.csv',
    'oxalate': 'fts_mra_tempmz_LR_reg_trvl.csv',
    'oxychlorine': 'fts_mra_tempmz_slope_topmz_XGB_opt_trvl_sfm.csv',
    'phyllosilicate': 'fts_mra_tempmz_slope_cntpk_topmz_XGB_opt_trvl_sfm.csv',
    'silicate': 'fts_mra_tempmz_XGB_opt_trvl_sfm.csv',
    'sulfate': 'fts_mra_tempmz_slope_cntpk_mzstats_width_XGB_opt_trvl_sfm.csv',
    'sulfide': 'fts_cntpk_mratt_XGB_opt_trvl_sfm.csv'
}

In [93]:
# Assemble one submission per regime (TR, then TRVL): start from the empty
# submission format and fill every target column from the prediction file that
# the regime's dictionary names for that label. The first len(valid_files)
# rows are then scored against the validation labels.
# NOTE(review): the TRVL files presumably come from models that also saw the
# validation data, which would make the TRVL loss optimistic - confirm.
regime_losses = {}
for regime, LABEL_MODELS_DICT in (('TR', label_models_tr),
                                  ('TRVL', label_models_trvl)):
    submission = pd.read_csv(config.DATA_DIR + 'submission_format.csv')

    # Several labels may share one prediction file; read each file once per
    # regime instead of once per label.
    loaded_subs = {}
    for label in target_labels_list:
        MODEL_SUB_NAME = LABEL_MODELS_DICT[label]

        # Load saved submission
        if MODEL_SUB_NAME not in loaded_subs:
            loaded_subs[MODEL_SUB_NAME] = pd.read_csv(
                os.path.join(config.MODELS_DIR, MODEL_SUB_NAME))
        label_sub = loaded_subs[MODEL_SUB_NAME]

        # Assignment aligns on the row index; this assumes the saved files keep
        # the row order of submission_format.csv - TODO confirm.
        submission[label] = label_sub[label]

    clf_loss, clf_loss_avg = training.compute_valid_loss(
        submission.iloc[:len(valid_files)],
        valid_files,
        valid_labels,
        target_labels_list,
        SUB_NAME)
    regime_losses[regime] = (clf_loss, clf_loss_avg)
    print(f'{regime}: {clf_loss_avg}')

# Keep the per-regime results under the names the rest of the notebook uses.
clf_loss_tr, clf_loss_avg_tr = regime_losses['TR']
clf_loss_trvl, clf_loss_avg_trvl = regime_losses['TRVL']

# File name encodes both average losses (string-truncated, not rounded).
SUB_NAME = 'SUB_MODEL_TR_' + str(clf_loss_avg_tr)[:7] + '_TRVL_' + str(clf_loss_avg_trvl)[:9]
print(SUB_NAME)
# Save submission file (at this point `submission` is the TRVL assembly)
submission.to_csv(os.path.join(config.MODELS_DIR, SUB_NAME + '.csv'), index=False)
print('SUBMISSION')
submission.head()

TR: 0.14417060223654504
TRVL: 0.009752600218664996
SUB_MODEL_TR_0.14417_TRVL_0.0097526
SUBMISSION


Unnamed: 0,sample_id,basalt,carbonate,chloride,iron_oxide,oxalate,oxychlorine,phyllosilicate,silicate,sulfate,sulfide
0,S0766,0.00279,0.000721,0.007935,0.002778,9e-06,0.003297,0.007457,0.052359,0.001588,0.001337
1,S0767,0.004998,0.000819,0.001804,0.005994,3.8e-05,0.002175,0.013336,0.043701,0.005784,0.001753
2,S0768,0.979432,0.010956,0.000792,0.028486,0.000425,0.0088,0.943645,0.986137,0.011058,0.001763
3,S0769,0.002182,0.000337,0.013194,0.010506,2.5e-05,0.983953,0.011526,0.001311,0.971288,0.001009
4,S0770,0.003755,0.000205,0.00432,0.94395,0.00018,0.995167,0.991762,0.000837,0.001558,0.001195


# MODEL 2

In [94]:
# Identifier for this model combination; presumably passed to
# training.compute_valid_loss like the earlier SUB_NAME values - confirm.
# NOTE(review): the number is offset by one from the section heading ('MODEL 2').
SUB_NAME = 'final_model_3'

In [None]:
# Per-label choice of prediction file; each value is a CSV file name
# (presumably resolved under config.MODELS_DIR by a scoring cell - confirm).
# All extensions are lower-case '.csv': the 'sulfate' entry previously ended in
# '.CSV', unlike every other entry, which only resolves on case-insensitive
# filesystems.

# Prediction files whose names carry the '_tr' tag
# (presumably models fitted on the train split only - confirm).
label_models_tr = {
    'basalt': 'fts_mra_tempmz_slope_cntpk_spectra_mzstats_XGB_opt_tr_sfm.csv',
    'carbonate': 'fts_mra_tempmz_XGB_tr.csv',
    'chloride': 'fts_mra_tempmz_slope_XGB_opt_tr_sfm.csv',
    'iron_oxide': 'fts_mra_tempmz_slope_cntpk_spectra_XGB_opt_tr_sfm.csv',
    'oxalate': 'fts_mra_tempmz_LR_reg_tr.csv',
    'oxychlorine': 'fts_mra_tempmz_slope_topmz_XGB_opt_tr_sfm.csv',
    'phyllosilicate': 'fts_mra_tempmz_slope_cntpk_topmz_XGB_opt_tr_sfm.csv',
    'silicate': 'fts_mra_tempmz_XGB_opt_tr_sfm.csv',
    'sulfate': 'fts_mra_tempmz_slope_cntpk_mzstats_width_XGB_opt_tr_sfm.csv',
    'sulfide': 'fts_cntpk_mratt_XGB_opt_tr_sfm.csv'
}

# Same per-label selection with the '_trvl' tag
# (presumably models fitted on train + validation - confirm).
label_models_trvl = {
    'basalt': 'fts_mra_tempmz_slope_cntpk_spectra_mzstats_XGB_opt_trvl_sfm.csv',
    'carbonate': 'fts_mra_tempmz_XGB_trvl.csv',
    'chloride': 'fts_mra_tempmz_slope_XGB_opt_trvl_sfm.csv',
    'iron_oxide': 'fts_mra_tempmz_slope_cntpk_spectra_XGB_opt_trvl_sfm.csv',
    'oxalate': 'fts_mra_tempmz_LR_reg_trvl.csv',
    'oxychlorine': 'fts_mra_tempmz_slope_topmz_XGB_opt_trvl_sfm.csv',
    'phyllosilicate': 'fts_mra_tempmz_slope_cntpk_topmz_XGB_opt_trvl_sfm.csv',
    'silicate': 'fts_mra_tempmz_XGB_opt_trvl_sfm.csv',
    'sulfate': 'fts_mra_tempmz_slope_cntpk_mzstats_width_XGB_opt_trvl_sfm.csv',
    'sulfide': 'fts_cntpk_mratt_XGB_opt_trvl_sfm.csv'
}