In [1]:
# Autoreload mode 2: pick up edits to imported project modules without
# restarting the kernel.
%reload_ext autoreload
%autoreload 2

# Show matplotlib figures inline in the notebook, using the 'retina' figure format.
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [17]:
## Environment
# Change main system path to be able to run code from src folder
import sys, json


def strip_models_suffix(path, suffix='/models'):
    """Return `path` with a trailing `suffix` removed.

    Parameters
    ----------
    path : str
        Directory path to normalise.
    suffix : str
        Trailing component to strip (POSIX-style separator by default).

    Returns
    -------
    str
        `path` without the suffix, or `path` unchanged when it does not end
        with `suffix` (or when `suffix` is empty).
    """
    if suffix and path.endswith(suffix):
        return path[:-len(suffix)]
    return path


p = sys.path[0]
# Previously main_path was only assigned when the notebook was started from
# the '/models' folder, so any other working directory raised a NameError on
# the next line. Fall back to the unchanged path in that case.
main_path = strip_models_suffix(p)
sys.path[0] = main_path

import os, gc
from termcolor import colored
import pandas as pd
import numpy as np
import joblib
from src import config, training

In [3]:
# DATA PREPARATION
# ===== LOAD DATA ======
# Every input below is a CSV file read from config.DATA_DIR.
metadata = pd.read_csv(config.DATA_DIR + 'metadata.csv')
print(f'Metadata: {metadata.shape}')

train_labels = pd.read_csv(config.DATA_DIR + 'train_labels.csv')
print(f'Train labels: {train_labels.shape}')

valid_labels = pd.read_csv(config.DATA_DIR + 'val_labels.csv')
# This line used to print "Train labels" for the validation frame.
print(f'Valid labels: {valid_labels.shape}')

# Combine train and valid labels (rows stacked; original indices are kept)
trvl_labels = pd.concat([train_labels, valid_labels], axis = 0)

submission = pd.read_csv(config.DATA_DIR + 'submission_format.csv')
print(f'Submission: {submission.shape}')

# ===== FILE PATHS OF SAMPLES =====
# One dict per split, mapping the metadata row index to its features_path.
train_files = metadata.loc[metadata['split'] == 'train', 'features_path'].to_dict()
valid_files = metadata.loc[metadata['split'] == 'val', 'features_path'].to_dict()
test_files = metadata.loc[metadata['split'] == 'test', 'features_path'].to_dict()

# Train followed by valid files
trva_files = {**train_files, **valid_files}
# Valid followed by test files
all_test_files = {**valid_files, **test_files}

# SAM testbed files that belong to the train split
sam_files = metadata.loc[
    (metadata['instrument_type'] == 'sam_testbed') & (metadata['split'] == 'train'),
    'features_path',
].to_dict()

# Target columns = every train_labels column except the sample identifier
target_labels_list = [col for col in train_labels.columns if col != 'sample_id']
print(target_labels_list)

# Labels without the SAM testbed samples: drop the last len(sam_files) rows of
# the train labels, then append the valid labels.
# NOTE(review): this presumes the SAM testbed samples are the final rows of
# train_labels - confirm against the data ordering.
sam_labels = pd.concat(
    [train_labels.drop(index=train_labels.tail(len(sam_files)).index), valid_labels],
    axis=0,
)
print(f'Labels w/o SAM : {sam_labels.shape}')

Metadata: (1570, 5)
Train labels: (766, 11)
Train labels: (293, 11)
Submission: (804, 11)
['basalt', 'carbonate', 'chloride', 'iron_oxide', 'oxalate', 'oxychlorine', 'phyllosilicate', 'silicate', 'sulfate', 'sulfide']
Labels w/o SAM : (1047, 11)


In [18]:
# Name of this submission: used as the output file name under config.MODELS_DIR
# and passed on to training.compute_valid_loss.
SUB_NAME = 'ens_2'

In [19]:
def final_model(label):
    """Return the artefact names of the model chosen for `label`.

    Parameters
    ----------
    label : str
        Name of the target label (e.g. 'oxalate', 'silicate').

    Returns
    -------
    tuple
        ``(MODEL_CLF, FTS_NAME, VT_SAMPLE, FTS_COLS)`` where

        * MODEL_CLF - file name of the saved classifier for this label,
        * FTS_NAME  - name of the feature set the model uses,
        * VT_SAMPLE - ``FTS_NAME + '_vlte.csv'``,
        * FTS_COLS  - file name holding the selected feature columns, or
          ``None`` for 'oxalate', which has no such file configured.
          Previously this case raised UnboundLocalError at the return.

    Labels that are not listed explicitly fall back to the 'fts_mzstats'
    model.
    """
    # (labels, feature-set name, selected-columns file) for the XGB models.
    xgb_groups = (
        (('carbonate', 'oxychlorine'),
         'fts_mra_tempmz_slope_cntpk_topmz',
         'fts_mra_tempmz_slope_cntpk_topmz_XGB_opt_tr_SFM_COLS.txt'),
        (('silicate',),
         'fts_mra_tempmz_slope_cntpk',
         'fts_mra_tempmz_slope_cntpk_XGB_opt_tr_SFM_COLS.txt'),
        # NOTE(review): unlike the other groups, this columns file name lacks
        # the '_spectra' part of the feature-set name. Kept exactly as it was;
        # confirm it matches the file on disk.
        (('sulfide', 'phyllosilicate', 'basalt'),
         'fts_mra_tempmz_slope_cntpk_spectra_mzstats',
         'fts_mra_tempmz_slope_cntpk_mzstats_XGB_opt_tr_SFM_COLS.txt'),
        (('iron_oxide', 'sulfate'),
         'fts_mra_tempmz_slope',
         'fts_mra_tempmz_slope_XGB_opt_tr_SFM_COLS.txt'),
    )

    if label == 'oxalate':
        # Logistic-regression model; no selected-columns file is defined.
        FTS_NAME = 'fts_mra_tempmz'
        MODEL_CLF = FTS_NAME + '_LR_reg_trvl_' + label + '.joblib.dat'
        FTS_COLS = None
    else:
        # Default for any label not covered by a group above.
        FTS_NAME = 'fts_mzstats'
        FTS_COLS = 'fts_mzstats_XGB_opt_tr_SFM_COLS.txt'
        for members, fts_name, fts_cols in xgb_groups:
            if label in members:
                FTS_NAME, FTS_COLS = fts_name, fts_cols
                break
        MODEL_CLF = FTS_NAME + '_XGB_opt_tr_sfm_' + label + '.joblib.dat'

    VT_SAMPLE = FTS_NAME + '_vlte.csv'

    return MODEL_CLF, FTS_NAME, VT_SAMPLE, FTS_COLS

In [None]:
for label in ['silicate', 'carbonate', 'oxychlorine']:
    print(label)
    # Artefact names (model file, feature set, sample file, columns file)
    MODEL_CLF, FTS_NAME, VT_SAMPLE, FTS_COLS = final_model(label)

    # Classifier previously saved with joblib under config.CLF_DIR
    clf = joblib.load(os.path.join(config.CLF_DIR, MODEL_CLF))

    # The columns file is parsed as JSON and indexed by label to get the
    # feature names used by this label's model
    fts_path = os.path.join(config.MODELS_DIR, FTS_COLS)
    with open(fts_path) as json_file:
        features_dict = json.load(json_file)
    features = features_dict[label]

    # Valid & test samples, restricted to the model's feature columns
    X_vlte = pd.read_csv(os.path.join(config.DATA_DIR_OUT, VT_SAMPLE))[features].copy()
    print(f'Valid & Test: {X_vlte.shape}')

    # Column 1 of predict_proba (presumably the positive class - confirm)
    submission[label] = clf.predict_proba(X_vlte)[:, 1]

# Persist the submission frame under config.MODELS_DIR
# NOTE(review): the DataFrame index is written as an extra first column and
# the file name has no extension - confirm this is the intended format.
submission.to_csv(os.path.join(config.MODELS_DIR, SUB_NAME))

In [28]:
def final_model_sub(label):
    """Return the submission CSV file name of the model chosen for `label`.

    Labels not listed in any group fall back to the 'fts_mzstats' XGB
    submission file.
    """
    # (labels sharing a model, submission file of that model)
    submission_files = (
        (('oxalate',),
         'fts_mra_tempmz_LR_reg_trvl.csv'),
        (('carbonate', 'oxychlorine'),
         'fts_mra_tempmz_slope_cntpk_topmz_XGB_opt_tr_sfm.csv'),
        (('silicate',),
         'fts_mra_tempmz_slope_cntpk_XGB_opt_tr_sfm.csv'),
        (('sulfide', 'phyllosilicate', 'basalt'),
         'fts_mra_tempmz_slope_cntpk_spectra_mzstats_XGB_opt_tr_sfm.csv'),
        (('iron_oxide', 'sulfate'),
         'fts_mra_tempmz_slope_XGB_opt_tr_sfm.csv'),
    )

    for members, sub_file in submission_files:
        if label in members:
            return sub_file

    return 'fts_mzstats_XGB_opt_tr_sfm.csv'

In [33]:
# Start from a fresh copy of the submission template so this cell can be re-run.
submission = pd.read_csv(config.DATA_DIR + 'submission_format.csv')

# Several labels share one model submission file; read each file only once.
loaded_subs = {}
for label in target_labels_list:
    #print(label)
    MODEL_SUB_NAME = final_model_sub(label)

    # Load saved submission (cached per file name)
    if MODEL_SUB_NAME not in loaded_subs:
        loaded_subs[MODEL_SUB_NAME] = pd.read_csv(os.path.join(config.MODELS_DIR, MODEL_SUB_NAME))
    label_sub = loaded_subs[MODEL_SUB_NAME]

    # Column assignment aligns on the index, so a file with a different number
    # of rows would silently yield NaN or dropped predictions. Fail loudly.
    if len(label_sub) != len(submission):
        raise ValueError(
            f'{MODEL_SUB_NAME}: {len(label_sub)} rows, expected {len(submission)}'
        )

    submission[label] = label_sub[label]

# Save submission file
# NOTE(review): the DataFrame index is written as an extra first column and
# the file name has no extension - confirm this is the intended format.
submission.to_csv(os.path.join(config.MODELS_DIR, SUB_NAME))

In [34]:
# Display the assembled submission frame (one probability column per label)
submission

Unnamed: 0,sample_id,basalt,carbonate,chloride,iron_oxide,oxalate,oxychlorine,phyllosilicate,silicate,sulfate,sulfide
0,S0766,0.002912,0.003799,0.007180,0.003223,0.000009,0.003325,0.007598,0.586737,0.002109,0.001175
1,S0767,0.012440,0.044529,0.001285,0.050387,0.000038,0.002806,0.045761,0.291309,0.004825,0.000705
2,S0768,0.624930,0.058557,0.000758,0.002626,0.000425,0.004802,0.157121,0.906950,0.111087,0.001095
3,S0769,0.009926,0.002231,0.051930,0.052097,0.000025,0.995201,0.014731,0.005204,0.954842,0.002129
4,S0770,0.003402,0.001239,0.008457,0.520125,0.000180,0.996573,0.904139,0.001551,0.002110,0.000758
...,...,...,...,...,...,...,...,...,...,...,...
799,S1565,0.201738,0.044905,0.007144,0.058225,0.076743,0.009005,0.004993,0.039485,0.009111,0.031734
800,S1566,0.096602,0.003928,0.013198,0.003365,0.000027,0.037568,0.157951,0.004899,0.039808,0.024004
801,S1567,0.009221,0.013053,0.003086,0.001395,0.000202,0.002950,0.022676,0.004006,0.031137,0.002057
802,S1568,0.014807,0.093033,0.003980,0.003593,0.000930,0.175069,0.205159,0.035257,0.009476,0.580668


In [35]:
# Score only the first len(valid_files) rows of the submission.
# NOTE(review): presumes those leading rows are the validation samples, in the
# same order as valid_labels - confirm.
valid_predictions = submission.iloc[:len(valid_files)]

clf_loss, clf_loss_avg = training.compute_valid_loss(
    valid_predictions,
    valid_files,
    valid_labels,
    target_labels_list,
    SUB_NAME,
)
print(clf_loss_avg)

0.1679287489909083


In [36]:
# Display the first value returned by training.compute_valid_loss
# (shown below as a dict of per-label losses)
clf_loss

{'basalt': 0.11840654639254344,
 'carbonate': 0.1771306441490242,
 'chloride': 0.2372944708484457,
 'iron_oxide': 0.30114993373308474,
 'oxalate': 0.0009390678833573559,
 'oxychlorine': 0.14727275088179248,
 'phyllosilicate': 0.25410422294648916,
 'silicate': 0.16274793309342409,
 'sulfate': 0.20770438182227657,
 'sulfide': 0.0725375381586451}