In [1]:
%reload_ext autoreload
%autoreload 2

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [10]:
## Environment
# Change main system path to be able to run code from src folder
import sys
p = sys.path[0]
# Mac OS
if sys.path[0].endswith('/models'):
    main_path = p[:-len('/models')]
sys.path[0] = main_path

import os, gc
from termcolor import colored
import pandas as pd
import numpy as np
import joblib
from src import config, training, feature_selection

In [3]:
# DATA PREPARATION
# ===== LOAD DATA ======
metadata = pd.read_csv(config.DATA_DIR + 'metadata.csv')
print(f'Metadata: {metadata.shape}')

train_labels = pd.read_csv(config.DATA_DIR + 'train_labels.csv')
print(f'Train labels: {train_labels.shape}')

valid_labels = pd.read_csv(config.DATA_DIR + 'val_labels.csv')
print(f'Train labels: {valid_labels.shape}')

# Combine train and valid labels
trvl_labels = pd.concat([train_labels, valid_labels], axis = 0)

submission = pd.read_csv(config.DATA_DIR + 'submission_format.csv')
print(f'Submission: {submission.shape}')

# ===== FILE PATHS OF SAMPLES =====
train_files = metadata[metadata.split == 'train']['features_path'].to_dict()
valid_files = metadata[metadata.split == 'val']['features_path'].to_dict()
test_files = metadata[metadata.split == 'test']['features_path'].to_dict()
# Train & Valid files
trva_files = train_files.copy()
trva_files.update(valid_files)
# All files
all_test_files = valid_files.copy()
all_test_files.update(test_files)

# Define SAM testbed files
sam_files = metadata[(metadata.instrument_type == 'sam_testbed') & (metadata.split == 'train')]['features_path']
sam_files = sam_files.to_dict()

# Get the names of the target columns in a list
target_labels_list = [i for i in train_labels.columns if i not in ['sample_id']]
print(target_labels_list)

# SAM testbed labels
sam_labels = train_labels.drop(train_labels.tail(len(sam_files)).index)
sam_labels = pd.concat([sam_labels, valid_labels], axis=0)
print(f'Labels w/o SAM : {sam_labels.shape}')

Metadata: (1570, 5)
Train labels: (766, 11)
Train labels: (293, 11)
Submission: (804, 11)
['basalt', 'carbonate', 'chloride', 'iron_oxide', 'oxalate', 'oxychlorine', 'phyllosilicate', 'silicate', 'sulfate', 'sulfide']
Labels w/o SAM : (1047, 11)


In [33]:
SUB_NAME = 'ens_spectra'

In [28]:
def ens_model(label):
    if label == 'oxalate':
        MODEL_CLF = 'fts_mra_tempmz_LR_reg_trvl_' + label + '.joblib.dat'
        FTS_NAME = 'fts_mra_tempmz'
        return MODEL_CLF, FTS_NAME, None
    
    elif label == ['sulfate', 'iron-oxide']:
        MODEL_CLF = 'fts_mra_tempmz_slope_XGB_opt_tr_sfm_' + label + '.joblib.dat'
        FTS_NAME = 'fts_mra_tempmz_slope'
        path_fts = os.path.join(config.MODELS_DIR, 
                                'fts_mra_tempmz_slope_tr_SFM_COLS.txt')
        SFM_COLUMNS = feature_selection.load_features(path_fts)
        return MODEL_CLF, FTS_NAME, SFM_COLUMNS
    
    elif label in ['oxychlorine']:
        MODEL_CLF = 'fts_mra_tempmz_slope_cntpk_XGB_opt_tr_sfm_' + label + '.joblib.dat'
        FTS_NAME = 'fts_mra_tempmz_slope_cntpk'
        path_fts = os.path.join(config.MODELS_DIR, 
                                'fts_mra_tempmz_slope_cntpk_tr_SFM_COLS.txt')
        SFM_COLUMNS = feature_selection.load_features(path_fts)
        return MODEL_CLF, FTS_NAME, SFM_COLUMNS
    
    elif label == 'phyllosilicate':
        MODEL_CLF = 'fts_mra_tempmz_cntpk_mratt_slope_corrmz4_XGB_opt_tr_' + label + '.joblib.dat'
        FTS_NAME = 'fts_mra_tempmz_cntpk_mratt_slope_corrmz4'
        return MODEL_CLF, FTS_NAME, None
    
    elif label == 'carbonate':
        MODEL_CLF = 'fts_mra_tempmz_XGB_opt_tr_sfm_' + label + '.joblib.dat'
        FTS_NAME = 'fts_mra_tempmz'
        path_fts = os.path.join(config.MODELS_DIR, 
                                'fts_mra_tempmz_tr_SFM_COLS.txt')
        SFM_COLUMNS = feature_selection.load_features(path_fts)
        return MODEL_CLF, FTS_NAME, SFM_COLUMNS
    
    else:
        MODEL_CLF = 'fts_mra_tempmz_cntpk_mratt_slope_spectra_XGB_opt_tr_sfm_' + label + '.joblib.dat'
        FTS_NAME = 'fts_mra_tempmz_cntpk_mratt_slope_spectra'
        
        path_fts = os.path.join(config.MODELS_DIR, 
                                'fts_mra_tempmz_cntpk_mratt_slope_spectra_tr_SFM_COLS.txt')
        SFM_COLUMNS = feature_selection.load_features(path_fts)
        
        return MODEL_CLF, FTS_NAME, SFM_COLUMNS

In [31]:
for label in target_labels_list:
    print(label)
    MODEL_CLF, FTS_NAME, SFM_COLUMNS = ens_model(label)
    VT_SAMPLE = FTS_NAME + '_vlte.csv'
    # Load saved model
    clf = joblib.load(os.path.join(config.MODELS_DIR, MODEL_CLF))
    
    # Load valid & test data set
    X_vlte = pd.read_csv(os.path.join(config.DATA_DIR_OUT,VT_SAMPLE))
    if SFM_COLUMNS:
        X_vlte = X_vlte[SFM_COLUMNS[label]]
    print(f'Valid & Test: {X_vlte.shape}')
    
    submission[label] = clf.predict_proba(X_vlte)[:,1]

basalt
Valid & Test: (804, 143)
carbonate


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


Valid & Test: (804, 126)
chloride


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


Valid & Test: (804, 108)
iron_oxide


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


Valid & Test: (804, 159)
oxalate


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


Valid & Test: (804, 1584)
oxychlorine
Valid & Test: (804, 427)
phyllosilicate


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


Valid & Test: (804, 1982)
silicate


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


Valid & Test: (804, 263)
sulfate


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


Valid & Test: (804, 231)
sulfide


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


Valid & Test: (804, 41)


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


In [34]:
clf_loss, clf_loss_avg = training.compute_valid_loss(submission.iloc[:len(valid_files)],
                       valid_files,
                       valid_labels, 
                       target_labels_list,
                       SUB_NAME)
print(clf_loss_avg)

(293, 11)
0.15123639765849056


In [9]:
# Save submission file
submission.to_csv(os.path.join(config.MODELS_DIR, SUB_NAME))