In [1]:
# Reload the autoreload extension (safe to re-run in a live kernel) and use
# mode 2 so imported modules (e.g. the project's `src` package) are reloaded
# before each cell executes.
%reload_ext autoreload
%autoreload 2

# Render matplotlib figures inline in the notebook, using the high-resolution
# 'retina' figure format.
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [2]:
## Environment
# Point the first entry of the module search path at the project root so that
# `from src import ...` works when the notebook is started from the `models`
# sub-folder.
import sys
p = sys.path[0]
# Mac OS (POSIX-style separators): strip the trailing '/models' to get the
# project root.
if p.endswith('/models'):
    main_path = p[:-len('/models')]
else:
    # Not started from the `models` folder: keep the search path unchanged.
    # Without this fallback `main_path` was undefined here and the assignment
    # below raised a NameError.
    main_path = p
sys.path[0] = main_path

import os, gc
from termcolor import colored
from tqdm import tqdm
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import log_loss
from src import (config, fe, features, feature_selection, 
                 preprocess, training)
from src.fe import CreateFeatures

  from pandas import MultiIndex, Int64Index


In [3]:
# DATA PREPARATION
# ===== LOAD DATA ======
# Paths are built with os.path.join (as is done for config.DATA_DIR_OUT in the
# feature cell) so a missing trailing separator in config.DATA_DIR is harmless.
metadata = pd.read_csv(os.path.join(config.DATA_DIR, 'metadata.csv'))
print(f'Metadata: {metadata.shape}')

train_labels = pd.read_csv(os.path.join(config.DATA_DIR, 'train_labels.csv'))
print(f'Train labels: {train_labels.shape}')

valid_labels = pd.read_csv(os.path.join(config.DATA_DIR, 'val_labels.csv'))
# This message used to read 'Train labels', mislabelling the validation shape.
print(f'Valid labels: {valid_labels.shape}')

# Combine train and valid labels (row-wise; original indices are kept)
trvl_labels = pd.concat([train_labels, valid_labels], axis=0)

submission = pd.read_csv(os.path.join(config.DATA_DIR, 'submission_format.csv'))
print(f'Submission: {submission.shape}')

# ===== FILE PATHS OF SAMPLES =====
# Each dict maps the metadata row index to that sample's `features_path`.
train_files = metadata[metadata.split == 'train']['features_path'].to_dict()
valid_files = metadata[metadata.split == 'val']['features_path'].to_dict()
test_files = metadata[metadata.split == 'test']['features_path'].to_dict()
# Train & Valid files
trva_files = train_files.copy()
trva_files.update(valid_files)
# Valid & Test files (everything that gets predicted)
all_test_files = valid_files.copy()
all_test_files.update(test_files)

# Define SAM testbed files: train-split samples recorded on the 'sam_testbed' instrument
sam_files = metadata[(metadata.instrument_type == 'sam_testbed') & (metadata.split == 'train')]['features_path']
sam_files = sam_files.to_dict()

# Get the names of the target columns in a list (every column except the id)
target_labels_list = [col for col in train_labels.columns if col != 'sample_id']
print(target_labels_list)

# Labels for training WITHOUT the SAM testbed samples, plus the validation labels.
# NOTE(review): this drops the LAST len(sam_files) rows of train_labels, i.e. it
# assumes the SAM testbed samples are the final rows of the train split - confirm.
sam_labels = train_labels.drop(train_labels.tail(len(sam_files)).index)
sam_labels = pd.concat([sam_labels, valid_labels], axis=0)
print(f'Labels w/o SAM : {sam_labels.shape}')

Metadata: (1570, 5)
Train labels: (766, 11)
Train labels: (293, 11)
Submission: (804, 11)
['basalt', 'carbonate', 'chloride', 'iron_oxide', 'oxalate', 'oxychlorine', 'phyllosilicate', 'silicate', 'sulfate', 'sulfide']
Labels w/o SAM : (1047, 11)


In [4]:
# ----- Feature files -----
# Name of the file with base features for TRAINING
FTS_NAME = 'fts_mra_tempmz_slope_cntpk'
# Should the features be recomputed (True) or read from disk when available
COMPUTE_FTS = True
# Also build the SAM testbed train/validation split
COMPUTE_FTS_SAM = False
# Feature sets to combine for training
COMBINE_FTS = ['fts_mra_tempmz_slope', 'fts_cntpk_mratt']
# Name of a data frame with new features to add to the model
NEW_FEATURES = 'fts_cntpk_mratt'
# Name of the feature set whose selected columns are used for training
TRAIN_FTS_SFM = 'fts_mra_tempmz_slope'

# ----- Model -----
# Name of the classifier
MODEL_ALGO = 'XGB_opt'
# Name of the model: <feature file name>_<classifier>
MODEL_NAME = f'{FTS_NAME}_{MODEL_ALGO}'
# Name of the base model: <selected feature set>_<classifier>
BASE_MODEL = f'{TRAIN_FTS_SFM}_{MODEL_ALGO}'

**COMPUTE FEATURES**

- Change the `fe._` method depending on the feature that we wish to calculate

In [8]:
# Load the cached feature tables when all three exist and recomputation was
# not requested; otherwise rebuild them with CreateFeatures.
feature_csvs = {
    suffix: os.path.join(config.DATA_DIR_OUT, FTS_NAME + suffix + '.csv')
    for suffix in ('_tr', '_trvl', '_vlte')
}
# Number of cached feature files found on disk (3 means the cache is complete)
check_file = sum(os.path.exists(csv_path) for csv_path in feature_csvs.values())

if COMPUTE_FTS or check_file != 3:
    print('Computing features ... ')
    # NOTE: `fe` is rebound below to a CreateFeatures instance, which shadows
    # the `fe` module imported from `src` at the top of the notebook.
    # ----- TRAIN -----
    fe = CreateFeatures(metadata, train_files, 'tr', FTS_NAME)
    X_tr = fe.combine_features(COMBINE_FTS)
    print(colored(f'train => {X_tr.shape}', 'blue'))

    # ----- TRAIN & VALID -----
    fe = CreateFeatures(metadata, trva_files, 'trvl', FTS_NAME)
    X_trvl = fe.combine_features(COMBINE_FTS)
    print(colored(f'train & valid => {X_trvl.shape}', 'blue'))

    # ----- VALID & TEST -----
    fe = CreateFeatures(metadata, all_test_files, 'vlte', FTS_NAME)
    X_vlte = fe.combine_features(COMBINE_FTS)
    print(colored(f'valid & test => {X_vlte.shape}', 'blue'))

else:
    print('Reading features ... ')
    X_tr = pd.read_csv(feature_csvs['_tr'])
    print(X_tr.shape)
    X_trvl = pd.read_csv(feature_csvs['_trvl'])
    print(X_trvl.shape)
    X_vlte = pd.read_csv(feature_csvs['_vlte'])
    print(X_vlte.shape)

if COMPUTE_FTS_SAM:
    print('\nCreating SAM testbed samples ...')
    # The last len(sam_files) rows of X_tr are held out as the validation set
    sam_rows = X_tr.tail(len(sam_files))
    # Training without SAM testbed: remaining train rows + the validation part
    # of X_trvl (its rows after the first len(train_files))
    X_tr_sam = X_tr.drop(sam_rows.index).copy()
    X_tr_sam = pd.concat([X_tr_sam, X_trvl.iloc[len(train_files):, :]], axis=0)
    print(f'Train shape: {X_tr_sam.shape}')
    # Validation data
    X_vl_sam = sam_rows.copy()
    print(f'Valid shape: {X_vl_sam.shape}')

Computing features ... 
[34mtrain => (766, 1981)[0m
[34mtrain & valid => (1059, 1981)[0m
[34mvalid & test => (804, 1981)[0m


## MODELS

**TRAIN**

In [9]:
# ===== TRAIN =====
split_type = 'tr'
# Names derived from the configuration, computed once and reused below
base_fitted_model = TRAIN_FTS_SFM + '_' + MODEL_ALGO
model_run_name = MODEL_NAME + '_' + split_type

# Initialize the feature selection class
smf = feature_selection.SelectModelFeatures(
    base_sfm_features_name=TRAIN_FTS_SFM,
    base_fitted_model_name=base_fitted_model,
    target_labels_list=target_labels_list,
    new_features_file_name=NEW_FEATURES,
    fitted_model_name=MODEL_NAME,
    fitted_model_algo=MODEL_ALGO,
    X_tr=X_tr,
    X_vlte=X_vlte,
    split_type=split_type,
    train_labels=train_labels,
    valid_files=valid_files,
    valid_labels=valid_labels,
)

# Columns to train with when training is done on a subset of the input
# columns; None when no selected feature set is configured.
TRAIN_FTS_DICT = smf.load_features() if TRAIN_FTS_SFM else None

# Train the model (per the original note, the feature columns are saved as
# MODEL_NAME_tr_COLS.txt)
cvloss, submission_model = training.train_tbl(
    df_train=X_tr,
    df_labels=train_labels,
    target_list=target_labels_list,
    df_test=X_vlte,
    split_type=split_type,
    model_algo=MODEL_ALGO,
    sub_name=model_run_name,
    base_model_name=base_fitted_model + '_' + split_type + '_sfm',
    fts_select_cols=TRAIN_FTS_DICT,
)

# Compute validation loss when full model is trained
mloss, mloss_avg = training.compute_valid_loss(
    submission_model,
    valid_files,
    valid_labels,
    target_labels_list,
    sub_name=model_run_name,
    fts_select_cols=TRAIN_FTS_DICT,
)
print(colored(f'VAL LogLoss: {np.round(mloss_avg, 5)}', 'green'))

[34mLoading feature column names[0m
Reading fts_mra_tempmz_slope_XGB_opt_tr_SFM_COLS.txt
Adding features from fts_cntpk_mratt
[34mCV training ....[0m
[33mbasalt: LogLoss=0.18535[0m [32m-> -0.00583[0m
[33mcarbonate: LogLoss=0.12326[0m [31m-> 0.01446[0m
[33mchloride: LogLoss=0.21241[0m [31m-> 0.00471[0m
[33miron_oxide: LogLoss=0.22971[0m [31m-> 0.00969[0m
[33moxalate: LogLoss=0.01442[0m [32m-> -0.01156[0m
[33moxychlorine: LogLoss=0.15448[0m [32m-> -0.02339[0m
[33mphyllosilicate: LogLoss=0.23354[0m [32m-> -0.01184[0m
[33msilicate: LogLoss=0.22178[0m [32m-> -0.0114[0m
[33msulfate: LogLoss=0.20057[0m [31m-> 0.01631[0m
[33msulfide: LogLoss=0.06468[0m [32m-> -0.00286[0m
[34mFull training .....[0m
[32mbasalt - nfeatures: 428[0m
[32mcarbonate - nfeatures: 522[0m
[32mchloride - nfeatures: 442[0m
[32miron_oxide - nfeatures: 556[0m
[32moxalate - nfeatures: 403[0m
[32moxychlorine - nfeatures: 427[0m
[32mphyllosilicate - nfeatures: 641[0m


In [25]:
# !!!! RUN ONLY ONCE !!!!
# Re-selects the feature columns from the fitted model, using the CV losses of
# the model trained above, and overwrites the saved SFM_COLS file of that model.
# Original note: to only read the features, use compute_features=False
# (that option is not passed in this call).
SFM_COLUMNS = smf.select_features(cv_new_model=cvloss, fitted_sfm=True)
smf.show_no_fts_label(SFM_COLUMNS)

# Next step: retrain the model with the new final features


New features from fts_cntpk_mratt_tr.csv
Recomputing current model features from fitted model (no fts > 1) fts_mra_tempmz_slope_cntpk_XGB_opt
Computing optimal threshold for each label
basalt - Computing threshold on fts_mra_tempmz_slope_cntpk_XGB_opt_tr_sfm_basalt.joblib.dat
Threshold 0.004
carbonate - Computing threshold on fts_mra_tempmz_slope_cntpk_XGB_opt_tr_sfm_carbonate.joblib.dat
Threshold 0.0
chloride - Computing threshold on fts_mra_tempmz_slope_cntpk_XGB_opt_tr_sfm_chloride.joblib.dat
Threshold 0.0
iron_oxide - Computing threshold on fts_mra_tempmz_slope_cntpk_XGB_opt_tr_sfm_iron_oxide.joblib.dat
Threshold 0.002
oxalate - Computing threshold on fts_mra_tempmz_slope_cntpk_XGB_opt_tr_sfm_oxalate.joblib.dat
Threshold 0.0
oxychlorine - Computing threshold on fts_mra_tempmz_slope_cntpk_XGB_opt_tr_sfm_oxychlorine.joblib.dat
Threshold 0.002
phyllosilicate - Computing threshold on fts_mra_tempmz_slope_cntpk_XGB_opt_tr_sfm_phyllosilicate.joblib.dat
Threshold 0.001
silicate - Computin

  0%|          | 0/10 [00:00<?, ?it/s]

basalt - Computing threshold on fts_mra_tempmz_slope_cntpk_XGB_opt_tr_sfm_basalt.joblib.dat


 10%|█         | 1/10 [00:00<00:03,  2.57it/s]

carbonate - Computing threshold on fts_mra_tempmz_slope_cntpk_XGB_opt_tr_sfm_carbonate.joblib.dat


 20%|██        | 2/10 [00:01<00:06,  1.17it/s]

chloride - Computing threshold on fts_mra_tempmz_slope_cntpk_XGB_opt_tr_sfm_chloride.joblib.dat


 30%|███       | 3/10 [00:02<00:06,  1.10it/s]

iron_oxide - Computing threshold on fts_mra_tempmz_slope_cntpk_XGB_opt_tr_sfm_iron_oxide.joblib.dat


 40%|████      | 4/10 [00:03<00:04,  1.28it/s]

oxalate - Computing threshold on fts_mra_tempmz_slope_cntpk_XGB_opt_tr_sfm_oxalate.joblib.dat


 50%|█████     | 5/10 [00:03<00:03,  1.61it/s]

oxychlorine - Computing threshold on fts_mra_tempmz_slope_cntpk_XGB_opt_tr_sfm_oxychlorine.joblib.dat


 60%|██████    | 6/10 [00:03<00:02,  1.79it/s]

phyllosilicate - Computing threshold on fts_mra_tempmz_slope_cntpk_XGB_opt_tr_sfm_phyllosilicate.joblib.dat


 70%|███████   | 7/10 [00:04<00:01,  1.53it/s]

silicate - Computing threshold on fts_mra_tempmz_slope_cntpk_XGB_opt_tr_sfm_silicate.joblib.dat


 80%|████████  | 8/10 [00:05<00:01,  1.69it/s]

sulfate - Computing threshold on fts_mra_tempmz_slope_cntpk_XGB_opt_tr_sfm_sulfate.joblib.dat


100%|██████████| 10/10 [00:05<00:00,  1.72it/s]

sulfide - Computing threshold on fts_mra_tempmz_slope_cntpk_XGB_opt_tr_sfm_sulfide.joblib.dat
Saving fts_mra_tempmz_slope_cntpk_XGB_opt_tr_SFM_COLS.txt
basalt: 78
carbonate: 522
chloride: 442
iron_oxide: 158
oxalate: 403
oxychlorine: 131
phyllosilicate: 227
silicate: 105
sulfate: 141
sulfide: 5





In [26]:
# Retrain the model with newly selected features
# !!! Note that this will overwrite the initial SFM
# The two names below are rebound from their values in the configuration cell,
# so cells run after this one see the new values.
TRAIN_FTS_SFM = 'fts_mra_tempmz_slope_cntpk'      # Features selected with SMF() for training.
NEW_FEATURES = None

# ===== TRAIN =====
split_type = 'tr'
model_run_name = MODEL_NAME + '_' + split_type

# Initialize the feature selection class
# Need to reinitialize since TRAIN_FTS_SFM was changed above
smf = feature_selection.SelectModelFeatures(
    base_sfm_features_name=TRAIN_FTS_SFM,
    base_fitted_model_name=TRAIN_FTS_SFM + '_' + MODEL_ALGO,
    target_labels_list=target_labels_list,
    new_features_file_name=NEW_FEATURES,
    fitted_model_name=MODEL_NAME,
    fitted_model_algo=MODEL_ALGO,
    X_tr=X_tr,
    X_vlte=X_vlte,
    split_type=split_type,
    train_labels=train_labels,
    valid_files=valid_files,
    valid_labels=valid_labels,
)

# Columns to train with when training is done on a subset of the input
# columns; None when no selected feature set is configured.
TRAIN_FTS_DICT = smf.load_features() if TRAIN_FTS_SFM else None

# Train the model (per the original note, the feature columns are saved as
# MODEL_NAME_tr_COLS.txt); no base model name is passed on this pass
cvloss, submission_model = training.train_tbl(
    df_train=X_tr,
    df_labels=train_labels,
    target_list=target_labels_list,
    df_test=X_vlte,
    split_type=split_type,
    model_algo=MODEL_ALGO,
    sub_name=model_run_name,
    fts_select_cols=TRAIN_FTS_DICT,
)

# Compute validation loss when full model is trained
mloss, mloss_avg = training.compute_valid_loss(
    submission_model,
    valid_files,
    valid_labels,
    target_labels_list,
    sub_name=model_run_name,
    fts_select_cols=TRAIN_FTS_DICT,
)
print(colored(f'VAL LogLoss: {np.round(mloss_avg, 5)}', 'green'))

[34mLoading feature column names[0m
Reading fts_mra_tempmz_slope_cntpk_XGB_opt_tr_SFM_COLS.txt
[34mCV training ....[0m
[33mbasalt[0m
[33mLogLoss 0.22858148855602609[0m
[33mcarbonate[0m
[33mLogLoss 0.146696290363218[0m
[33mchloride[0m
[33mLogLoss 0.21635095820260428[0m
[33miron_oxide[0m
[33mLogLoss 0.27168072229264195[0m
[33moxalate[0m
[33mLogLoss 0.03947637817996841[0m
[33moxychlorine[0m
[33mLogLoss 0.23361781830990927[0m
[33mphyllosilicate[0m
[33mLogLoss 0.2798630786013598[0m
[33msilicate[0m
[33mLogLoss 0.21799369451734965[0m
[33msulfate[0m
[33mLogLoss 0.2635013501540425[0m
[33msulfide[0m
[33mLogLoss 0.20148641343924148[0m
[34mFull training .....[0m
[32mbasalt - nfeatures: 78[0m
[32mcarbonate - nfeatures: 522[0m
[32mchloride - nfeatures: 442[0m
[32miron_oxide - nfeatures: 158[0m
[32moxalate - nfeatures: 403[0m
[32moxychlorine - nfeatures: 131[0m
[32mphyllosilicate - nfeatures: 227[0m
[32msilicate - nfeatures: 105[0m
[32msu

**TRAIN & VALID**

**SAM TRAINING**