In [1]:
# Reload the autoreload extension and use mode 2, so edits to the project
# modules imported below are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2

# Draw matplotlib figures inside the notebook, using the 'retina' figure format.
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [2]:
## Environment
# Change main system path to be able to run code from src folder
import sys
p = sys.path[0]
# Mac OS style path: when the notebook is started from the `models` folder,
# strip the trailing '/models' so that its parent directory is on sys.path.
# In every other case the entry is kept as it is. Previously `main_path` was
# left undefined in that case and the assignment below raised a NameError.
main_path = p[:-len('/models')] if p.endswith('/models') else p
sys.path[0] = main_path

import os, gc
from termcolor import colored
from tqdm import tqdm
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import log_loss
from src import (config, fe, features, feature_selection, 
                 preprocess, training)
from src.fe import CreateFeatures

  from pandas import MultiIndex, Int64Index


In [3]:
# DATA PREPARATION
# ===== LOAD DATA ======
# All tables are read from config.DATA_DIR. Paths are built with os.path.join
# (as done for config.DATA_DIR_OUT in the feature cell) so that they do not
# depend on DATA_DIR ending with a path separator.
metadata = pd.read_csv(os.path.join(config.DATA_DIR, 'metadata.csv'))
print(f'Metadata: {metadata.shape}')

train_labels = pd.read_csv(os.path.join(config.DATA_DIR, 'train_labels.csv'))
print(f'Train labels: {train_labels.shape}')

valid_labels = pd.read_csv(os.path.join(config.DATA_DIR, 'val_labels.csv'))
# Fixed label: this shape used to be reported as 'Train labels'.
print(f'Valid labels: {valid_labels.shape}')

# Combine train and valid labels (train rows first, then valid rows).
# The original row index of each table is kept.
trvl_labels = pd.concat([train_labels, valid_labels], axis = 0)

submission = pd.read_csv(os.path.join(config.DATA_DIR, 'submission_format.csv'))
print(f'Submission: {submission.shape}')

# ===== FILE PATHS OF SAMPLES =====
# One dictionary per split, mapping the metadata row index to `features_path`
train_files = metadata.loc[metadata['split'] == 'train', 'features_path'].to_dict()
valid_files = metadata.loc[metadata['split'] == 'val', 'features_path'].to_dict()
test_files = metadata.loc[metadata['split'] == 'test', 'features_path'].to_dict()
# Train & Valid files (train entries first)
trva_files = {**train_files, **valid_files}
# Valid & Test files (valid entries first)
all_test_files = {**valid_files, **test_files}

# SAM testbed files: rows of the train split whose instrument_type is 'sam_testbed'
sam_files = metadata.loc[
    (metadata['instrument_type'] == 'sam_testbed') & (metadata['split'] == 'train'),
    'features_path',
].to_dict()

# Target columns: every column of the train labels except the sample identifier
target_labels_list = [col for col in train_labels.columns if col != 'sample_id']
print(target_labels_list)

# Labels for the SAM testbed experiment: drop the last len(sam_files) rows of
# the train labels and append the validation labels.
# NOTE(review): this presumes the SAM testbed samples are the final rows of
# train_labels -- confirm against the metadata ordering.
sam_tail_index = train_labels.tail(len(sam_files)).index
sam_labels = pd.concat([train_labels.drop(sam_tail_index), valid_labels], axis=0)
print(f'Labels w/o SAM : {sam_labels.shape}')

Metadata: (1570, 5)
Train labels: (766, 11)
Train labels: (293, 11)
Submission: (804, 11)
['basalt', 'carbonate', 'chloride', 'iron_oxide', 'oxalate', 'oxychlorine', 'phyllosilicate', 'silicate', 'sulfate', 'sulfide']
Labels w/o SAM : (1047, 11)


In [4]:
# ----- Feature set -----
FTS_NAME = 'fts_corr_mz4'    # Name of the file with base features for TRAINING
COMPUTE_FTS = False          # Should the features be recomputed
COMPUTE_FTS_SAM = False      # Compute SAM test bed
COMBINE_FTS = None           # Feature sets to combine for training
NEW_FEATURES = None          # Name of a data frame with new features to add to model

# ----- Model -----
MODEL_ALGO = 'XGB_opt'                     # Name of the classifier
MODEL_NAME = f'{FTS_NAME}_{MODEL_ALGO}'    # Name of the model: <features>_<classifier>
TRAIN_FTS_SFM = None                       # Features selected with SMF() for training.
BASE_MODEL = None                          # Passed to SelectModelFeatures as base_fitted_model_name

**COMPUTE FEATURES**

- Change the `CreateFeatures` method called in the next cell (e.g. `fts_topmz()`) depending on which feature set we wish to compute

In [5]:
# Load the cached feature tables when all three files exist and recomputation
# is not requested; otherwise compute them with CreateFeatures.
fts_cached = all(
    os.path.exists(os.path.join(config.DATA_DIR_OUT, FTS_NAME + suffix + '.csv'))
    for suffix in ['_tr', '_trvl', '_vlte'])

if fts_cached and not COMPUTE_FTS:
    print('Reading features ... ')
    X_tr = pd.read_csv(os.path.join(config.DATA_DIR_OUT, FTS_NAME + '_tr.csv'))
    print(X_tr.shape)
    X_trvl = pd.read_csv(os.path.join(config.DATA_DIR_OUT, FTS_NAME + '_trvl.csv'))
    print(X_trvl.shape)
    X_vlte = pd.read_csv(os.path.join(config.DATA_DIR_OUT, FTS_NAME + '_vlte.csv'))
    print(X_vlte.shape)

else:
    print('Computing features ... ')
    # The builder gets its own name: binding it to `fe` (as before) replaced
    # the `fe` module imported from `src` for the rest of the session.
    # NOTE(review): FTS_NAME is 'fts_corr_mz4' while the method called here is
    # fts_topmz() -- confirm this is the method that produces that feature set.
    # ----- TRAIN -----
    fts_builder = CreateFeatures(metadata, train_files, 'tr', FTS_NAME)
    X_tr = fts_builder.fts_topmz()
    print(colored(f'train => {X_tr.shape}', 'blue'))

    # ----- TRAIN & VALID -----
    fts_builder = CreateFeatures(metadata, trva_files, 'trvl', FTS_NAME)
    X_trvl = fts_builder.fts_topmz()
    # Fixed label: this shape used to be reported as 'train'.
    print(colored(f'train & valid => {X_trvl.shape}', 'blue'))

    # ----- VALID & TEST -----
    fts_builder = CreateFeatures(metadata, all_test_files, 'vlte', FTS_NAME)
    X_vlte = fts_builder.fts_topmz()
    # Fixed label: this shape used to be reported as 'train'.
    print(colored(f'valid & test => {X_vlte.shape}', 'blue'))

if COMPUTE_FTS_SAM:
    print('\nCreating SAM testbed samples ...')
    # Training without SAM testbed: drop the last len(sam_files) rows of the
    # train features, then append the rows of X_trvl that follow the first
    # len(train_files) rows.
    X_tr_sam = X_tr.drop(X_tr.tail(len(sam_files)).index).copy()
    X_tr_sam = pd.concat([X_tr_sam, X_trvl.iloc[len(train_files):,:]], axis=0)
    print(f'Train shape: {X_tr_sam.shape}')
    # Validation data: the last len(sam_files) rows of the train features
    X_vl_sam = X_tr.tail(len(sam_files)).copy()
    print(f'Valid shape: {X_vl_sam.shape}')

Reading features ... 
(766, 197)
(1059, 197)
(804, 197)


## MODELS

**TRAIN**

In [6]:
# ===== TRAIN =====
# Fit the model on the training split with the full feature table and report
# the loss on the validation samples.
split_type = 'tr'
# Name shared by the training run and the validation-loss computation
sub_name = MODEL_NAME + '_' + split_type

# Initialize the feature selection class.
# `split_type` is passed as the variable (it was hard-coded to 'tr' before),
# so that changing it above cannot leave the selector on a different split.
smf = feature_selection.SelectModelFeatures(
    base_sfm_features_name=TRAIN_FTS_SFM,
    base_fitted_model_name=BASE_MODEL,
    target_labels_list=target_labels_list,
    new_features_file_name=NEW_FEATURES,
    fitted_model_name=MODEL_NAME,
    fitted_model_algo=MODEL_ALGO,
    X_tr=X_tr,
    X_vlte=X_vlte,
    split_type=split_type,
    train_labels=train_labels,
    valid_files=valid_files,
    valid_labels=valid_labels)

if TRAIN_FTS_SFM:
    # Load the previously selected columns; used when training should not use
    # the full column set of the input data.
    # NOTE(review): the retraining cells pass file_name=TRAIN_FTS_SFM and the
    # train & valid cell passes no argument -- confirm which one is intended.
    TRAIN_FTS_DICT = smf.load_features(file_name=FTS_NAME)
else:
    TRAIN_FTS_DICT = None

# Train the model; the recorded output shows the column names being saved
# as <MODEL_NAME>_tr_COLS.
cvloss, submission_model = training.train_tbl(
    df_train=X_tr,
    df_labels=train_labels,
    target_list=target_labels_list,
    df_test=X_vlte,
    split_type=split_type,
    model_algo=MODEL_ALGO,
    sub_name=sub_name,
    fts_select_cols=TRAIN_FTS_DICT,
    )

# Compute validation loss when full model is trained
mloss, mloss_avg = training.compute_valid_loss(submission_model,
                                               valid_files, valid_labels,
                                               target_labels_list,
                                               sub_name=sub_name,
                                               fts_select_cols=TRAIN_FTS_DICT)
print(colored(f'VAL LogLoss: {np.round(mloss_avg, 5)}', 'green'))

[34mCV training ....[0m
[33mbasalt[0m
[33mLogLoss 0.30989157107016296[0m
[33mcarbonate[0m
[33mLogLoss 0.3119704315927664[0m
[33mchloride[0m
[33mLogLoss 0.33705787688307487[0m
[33miron_oxide[0m
[33mLogLoss 0.4355351843990496[0m
[33moxalate[0m
[33mLogLoss 0.1079704908146408[0m
[33moxychlorine[0m
[33mLogLoss 0.44082371521873565[0m
[33mphyllosilicate[0m
[33mLogLoss 0.4889370100337249[0m
[33msilicate[0m
[33mLogLoss 0.37608685116909435[0m
[33msulfate[0m
[33mLogLoss 0.3495140722600115[0m
[33msulfide[0m
[33mLogLoss 0.12451133504185566[0m
[34mFull training .....[0m
[32mbasalt - nfeatures: 197[0m
[32mcarbonate - nfeatures: 197[0m
[32mchloride - nfeatures: 197[0m
[32miron_oxide - nfeatures: 197[0m
[32moxalate - nfeatures: 197[0m
[32moxychlorine - nfeatures: 197[0m
[32mphyllosilicate - nfeatures: 197[0m
[32msilicate - nfeatures: 197[0m
[32msulfate - nfeatures: 197[0m
[32msulfide - nfeatures: 197[0m
Saving fts_corr_mz4_XGB_opt_tr_COLS.

In [9]:
# Select a reduced feature set from the model fitted in the previous training
# cell, passing its CV loss (`cvloss`) to the selector.
# The recorded output shows the result being saved as
# fts_corr_mz4_XGB_opt_tr_SFM_COLS.txt, i.e. <MODEL_NAME>_tr_SFM_COLS.txt.
# NOTE(review): the original note says to use `compute_features=False` to only
# read already saved features -- confirm against smf.select_features.
SFM_COLUMNS = smf.select_features(cv_new_model=cvloss)
smf.show_no_fts_label(SFM_COLUMNS)

Recomputing current model features from fitted model fts_corr_mz4_XGB_opt_tr
Computing optimal threshold for each label
basalt - Computing threshold on fts_corr_mz4_XGB_opt_tr_basalt.joblib.dat
carbonate - Computing threshold on fts_corr_mz4_XGB_opt_tr_carbonate.joblib.dat
chloride - Computing threshold on fts_corr_mz4_XGB_opt_tr_chloride.joblib.dat
iron_oxide - Computing threshold on fts_corr_mz4_XGB_opt_tr_iron_oxide.joblib.dat
oxalate - Computing threshold on fts_corr_mz4_XGB_opt_tr_oxalate.joblib.dat
oxychlorine - Computing threshold on fts_corr_mz4_XGB_opt_tr_oxychlorine.joblib.dat
phyllosilicate - Computing threshold on fts_corr_mz4_XGB_opt_tr_phyllosilicate.joblib.dat
silicate - Computing threshold on fts_corr_mz4_XGB_opt_tr_silicate.joblib.dat
sulfate - Computing threshold on fts_corr_mz4_XGB_opt_tr_sulfate.joblib.dat
sulfide - Computing threshold on fts_corr_mz4_XGB_opt_tr_sulfide.joblib.dat
Refinting the model based on the threshold
Saving fts_corr_mz4_XGB_opt_tr_SFM_COLS.txt

In [10]:
# Retrain on the training split, this time restricted to the columns chosen
# by the feature selection step.
TRAIN_FTS_SFM = 'fts_corr_mz4'            # Features selected with SMF() for training.

# ===== TRAIN =====
split_type = 'tr'
sub_name = f'{MODEL_NAME}_{split_type}'

# Selected column names to train with; None means no selection is passed on.
# The recorded output shows <MODEL_NAME>_tr_SFM_COLS.txt being read here.
TRAIN_FTS_DICT = smf.load_features(file_name=TRAIN_FTS_SFM) if TRAIN_FTS_SFM else None

# Cross-validated training followed by a fit on the whole training table
cvloss, submission_model = training.train_tbl(
    df_train=X_tr, df_labels=train_labels, target_list=target_labels_list,
    df_test=X_vlte, split_type=split_type, model_algo=MODEL_ALGO,
    sub_name=sub_name, fts_select_cols=TRAIN_FTS_DICT)

# Loss of the fully trained model on the validation samples
mloss, mloss_avg = training.compute_valid_loss(
    submission_model, valid_files, valid_labels, target_labels_list,
    sub_name=sub_name, fts_select_cols=TRAIN_FTS_DICT)
print(colored(f'VAL LogLoss: {np.round(mloss_avg, 5)}', 'green'))

[34mLoading feature column names[0m
Reading fts_corr_mz4_XGB_opt_tr_SFM_COLS.txt
[34mCV training ....[0m
[33mbasalt[0m
[33mLogLoss 0.2985467787307536[0m
[33mcarbonate[0m
[33mLogLoss 0.2644426503526319[0m
[33mchloride[0m
[33mLogLoss 0.31622337474284534[0m
[33miron_oxide[0m
[33mLogLoss 0.429140161781317[0m
[33moxalate[0m
[33mLogLoss 0.11288276783502534[0m
[33moxychlorine[0m
[33mLogLoss 0.3898933362026771[0m
[33mphyllosilicate[0m
[33mLogLoss 0.44185602059513834[0m
[33msilicate[0m
[33mLogLoss 0.3638202996891974[0m
[33msulfate[0m
[33mLogLoss 0.33164688390070524[0m
[33msulfide[0m
[33mLogLoss 0.11690457414015432[0m
[34mFull training .....[0m
[32mbasalt - nfeatures: 85[0m
[32mcarbonate - nfeatures: 48[0m
[32mchloride - nfeatures: 17[0m
[32miron_oxide - nfeatures: 92[0m
[32moxalate - nfeatures: 21[0m
[32moxychlorine - nfeatures: 27[0m
[32mphyllosilicate - nfeatures: 28[0m
[32msilicate - nfeatures: 7[0m
[32msulfate - nfeatures: 21[0

**TRAIN & VALID**

In [11]:
# Configuration for the TRAIN & VALID run. TRAIN_FTS_SFM is set back to None
# here after the retraining cell above assigned it a feature-set name.
# ----- Feature set -----
FTS_NAME = 'fts_corr_mz4'    # Name of the file with base features for TRAINING
COMPUTE_FTS = False          # Should the features be recomputed
COMPUTE_FTS_SAM = False      # Compute SAM test bed
COMBINE_FTS = None           # Feature sets to combine for training
NEW_FEATURES = None          # Name of a data frame with new features to add to model

# ----- Model -----
MODEL_ALGO = 'XGB_opt'                     # Name of the classifier
MODEL_NAME = f'{FTS_NAME}_{MODEL_ALGO}'    # Name of the model: <features>_<classifier>
TRAIN_FTS_SFM = None                       # Features selected with SMF() for training.
BASE_MODEL = None                          # Passed to SelectModelFeatures as base_fitted_model_name

In [12]:
# ===== TRAIN & VALID =====
# Fit the model on the combined train + valid table.
split_type = 'trvl'
sub_name = f'{MODEL_NAME}_{split_type}'

# Feature selection helper bound to the train + valid data
smf = feature_selection.SelectModelFeatures(
    base_sfm_features_name=TRAIN_FTS_SFM, base_fitted_model_name=BASE_MODEL,
    target_labels_list=target_labels_list, new_features_file_name=NEW_FEATURES,
    fitted_model_name=MODEL_NAME, fitted_model_algo=MODEL_ALGO,
    X_tr=X_trvl, X_vlte=X_vlte, split_type=split_type,
    train_labels=trvl_labels, valid_files=valid_files, valid_labels=valid_labels)

# Selected column names to train with; None means no selection is passed on.
# NOTE(review): load_features() is called without file_name here, unlike the
# other training cells -- confirm the default resolves to the intended file.
TRAIN_FTS_DICT = smf.load_features() if TRAIN_FTS_SFM else None

# Cross-validated training followed by a fit on the whole train + valid table
cvloss, submission_model = training.train_tbl(
    df_train=X_trvl, df_labels=trvl_labels, target_list=target_labels_list,
    df_test=X_vlte, split_type=split_type, model_algo=MODEL_ALGO,
    sub_name=sub_name, fts_select_cols=TRAIN_FTS_DICT)

# Loss on the validation samples. trvl_labels contains the rows of
# valid_labels, so the model was fitted on these labels as well.
mloss, mloss_avg = training.compute_valid_loss(
    submission_model, valid_files, valid_labels, target_labels_list,
    sub_name=sub_name, fts_select_cols=TRAIN_FTS_DICT)
print(colored(f'VAL LogLoss: {np.round(mloss_avg, 5)}', 'green'))

[34mCV training ....[0m
[33mbasalt[0m
[33mLogLoss 0.31049689334866915[0m
[33mcarbonate[0m
[33mLogLoss 0.268192805329645[0m
[33mchloride[0m
[33mLogLoss 0.3097545604587912[0m
[33miron_oxide[0m
[33mLogLoss 0.40397799317828553[0m
[33moxalate[0m
[33mLogLoss 0.09524512113165487[0m
[33moxychlorine[0m
[33mLogLoss 0.3864065647579128[0m
[33mphyllosilicate[0m
[33mLogLoss 0.4437171378604476[0m
[33msilicate[0m
[33mLogLoss 0.3512839055119815[0m
[33msulfate[0m
[33mLogLoss 0.34681812767709436[0m
[33msulfide[0m
[33mLogLoss 0.09590613260185861[0m
[34mFull training .....[0m
[32mbasalt - nfeatures: 197[0m
[32mcarbonate - nfeatures: 197[0m
[32mchloride - nfeatures: 197[0m
[32miron_oxide - nfeatures: 197[0m
[32moxalate - nfeatures: 197[0m
[32moxychlorine - nfeatures: 197[0m
[32mphyllosilicate - nfeatures: 197[0m
[32msilicate - nfeatures: 197[0m
[32msulfate - nfeatures: 197[0m
[32msulfide - nfeatures: 197[0m
Saving fts_corr_mz4_XGB_opt_trvl_COLS

In [13]:
# Select a reduced feature set from the train + valid model fitted in the
# previous cell, passing its CV loss (`cvloss`) to the selector.
# The recorded output shows the thresholds being computed on the
# fts_corr_mz4_XGB_opt_trvl model files; the output stops after the first label.
# NOTE(review): the original note says to use `compute_features=False` to only
# read already saved features, and names the saved file FTS_NAME_tr_SFM_COLS
# although this is the trvl split -- confirm against smf.select_features.
SFM_COLUMNS = smf.select_features(cv_new_model=cvloss)
smf.show_no_fts_label(SFM_COLUMNS)

Recomputing current model features from fitted model fts_corr_mz4_XGB_opt_trvl
Computing optimal threshold for each label
basalt - Computing threshold on fts_corr_mz4_XGB_opt_trvl_basalt.joblib.dat


In [None]:
# Retrain on the train + valid table, restricted to the columns chosen by the
# feature selection step.
# NOTE(review): the output recorded under this cell reads
# fts_mra_tempmz_XGB_opt_tr_SFM_COLS.txt, which does not match the names set
# here -- it appears to come from an earlier run; re-execute to refresh it.
TRAIN_FTS_SFM = 'fts_corr_mz4'            # Features selected with SMF() for training.

# ===== TRAIN & VALID =====
split_type = 'trvl'
sub_name = f'{MODEL_NAME}_{split_type}'

# Selected column names to train with; None means no selection is passed on.
TRAIN_FTS_DICT = smf.load_features(file_name=TRAIN_FTS_SFM) if TRAIN_FTS_SFM else None

# Cross-validated training followed by a fit on the whole train + valid table
cvloss, submission_model = training.train_tbl(
    df_train=X_trvl, df_labels=trvl_labels, target_list=target_labels_list,
    df_test=X_vlte, split_type=split_type, model_algo=MODEL_ALGO,
    sub_name=sub_name, fts_select_cols=TRAIN_FTS_DICT)

# Loss on the validation samples. trvl_labels contains the rows of
# valid_labels, so the model was fitted on these labels as well.
mloss, mloss_avg = training.compute_valid_loss(
    submission_model, valid_files, valid_labels, target_labels_list,
    sub_name=sub_name, fts_select_cols=TRAIN_FTS_DICT)
print(colored(f'VAL LogLoss: {np.round(mloss_avg, 5)}', 'green'))

[34mLoading feature column names[0m
Reading fts_mra_tempmz_XGB_opt_tr_SFM_COLS.txt
[34mCV training ....[0m
[33mbasalt[0m
[33mLogLoss 0.2037926424671946[0m
[33mcarbonate[0m
[33mLogLoss 0.10880419331023979[0m
[33mchloride[0m
[33mLogLoss 0.20993679096199142[0m
[33miron_oxide[0m
[33mLogLoss 0.2242095433609456[0m
[33moxalate[0m
[33mLogLoss 0.025979876300493976[0m
[33moxychlorine[0m
[33mLogLoss 0.18017037229844265[0m
[33mphyllosilicate[0m
[33mLogLoss 0.24537565622910645[0m
[33msilicate[0m
[33mLogLoss 0.23317893970805786[0m
[33msulfate[0m
[33mLogLoss 0.19015973202334008[0m
[33msulfide[0m
[33mLogLoss 0.0675372862361483[0m
[34mFull training .....[0m
[32mbasalt - nfeatures: 31[0m
[32mcarbonate - nfeatures: 126[0m
[32mchloride - nfeatures: 45[0m
[32miron_oxide - nfeatures: 159[0m
[32moxalate - nfeatures: 7[0m
[32moxychlorine - nfeatures: 30[0m
[32mphyllosilicate - nfeatures: 245[0m
[32msilicate - nfeatures: 1584[0m
[32msulfate - nfeat

**SAM TRAINING**