In [1]:
# Load (or reload) the autoreload extension; mode 2 re-imports modules before
# each cell executes, so edits to imported modules are picked up live.
%reload_ext autoreload
%autoreload 2

# Render matplotlib figures inline in the notebook, using the 'retina'
# figure format for the inline backend.
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [2]:
## Environment
# Change main system path to be able to run code from src folder
import sys

# sys.path[0] is the first location searched when importing modules.
p = sys.path[0]
# Mac OS (POSIX-style separator): when the notebook is started from the
# `models` folder, strip that suffix so the first path entry is its parent.
# Otherwise keep the entry unchanged. `main_path` used to be assigned only
# inside the `if`, so any other start directory raised NameError below.
if p.endswith('/models'):
    main_path = p[:-len('/models')]
else:
    main_path = p
sys.path[0] = main_path

import os, gc, json
from termcolor import colored
from tqdm import tqdm
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import log_loss
from src import (config, fe, features, feature_selection, 
                 preprocess, training)
from src.fe import CreateFeatures

  from pandas import MultiIndex, Int64Index


In [3]:
# DATA PREPARATION
# ===== LOAD DATA ======
# Metadata table; later cells read its `split`, `instrument_type` and
# `features_path` columns.
metadata = pd.read_csv(config.DATA_DIR + 'metadata.csv')
print(f'Metadata: {metadata.shape}')

train_labels = pd.read_csv(config.DATA_DIR + 'train_labels.csv')
print(f'Train labels: {train_labels.shape}')

valid_labels = pd.read_csv(config.DATA_DIR + 'val_labels.csv')
# This message used to say 'Train labels', which made the two shape lines in
# the log indistinguishable.
print(f'Valid labels: {valid_labels.shape}')

# Combine train and valid labels (rows of train first, then valid)
trvl_labels = pd.concat([train_labels, valid_labels], axis=0)

submission = pd.read_csv(config.DATA_DIR + 'submission_format.csv')
print(f'Submission: {submission.shape}')

# ===== FILE PATHS OF SAMPLES =====
# One dict per split, mapping the metadata row index to its `features_path`.
train_files = metadata.loc[metadata['split'] == 'train', 'features_path'].to_dict()
valid_files = metadata.loc[metadata['split'] == 'val', 'features_path'].to_dict()
test_files = metadata.loc[metadata['split'] == 'test', 'features_path'].to_dict()
# Train & valid files (train entries first, then valid)
trva_files = {**train_files, **valid_files}
# Valid & test files (valid entries first, then test)
all_test_files = {**valid_files, **test_files}

# SAM testbed files: rows of the train split whose instrument is 'sam_testbed'
sam_files = metadata.loc[
    (metadata['instrument_type'] == 'sam_testbed') & (metadata['split'] == 'train'),
    'features_path',
].to_dict()

# Target columns: every column of the train labels except the sample identifier
target_labels_list = [col for col in train_labels.columns if col != 'sample_id']
print(target_labels_list)

# Labels without SAM testbed rows: drop the last len(sam_files) rows of the
# train labels, then append the validation labels.
# NOTE(review): this assumes the SAM testbed samples are the final rows of
# train_labels — confirm against the data ordering.
sam_labels = pd.concat(
    [train_labels.drop(train_labels.tail(len(sam_files)).index), valid_labels],
    axis=0,
)
print(f'Labels w/o SAM : {sam_labels.shape}')

Metadata: (1570, 5)
Train labels: (766, 11)
Train labels: (293, 11)
Submission: (804, 11)
['basalt', 'carbonate', 'chloride', 'iron_oxide', 'oxalate', 'oxychlorine', 'phyllosilicate', 'silicate', 'sulfate', 'sulfide']
Labels w/o SAM : (1047, 11)


# Base + Slope

In [23]:
# Define a list of all feature groups
FTS_GROUPS = config.FTS_GROUPS
LABEL = 'basalt'                    # Target label modelled in this notebook
target_labels_list = [LABEL]        # Replaces the full label list built during data preparation
SPLIT_TYPE = 'tr'
MODEL_ALGO = 'XGB_opt'              # Name of the classifier

# Read the per-label base model names (JSON content) and keep the entry for LABEL
path_base_fts = os.path.join(config.MODELS_DIR, 'models_base_label.txt')
with open(path_base_fts) as json_file:
    base_features = json.load(json_file)
# Keep only the part of the stored name that precedes '_<MODEL_ALGO>'
BASE_MODEL_FTS = base_features[LABEL].split(f'_{MODEL_ALGO}')[0]

FTS_NAME = 'fts_mra_tempmz_slope'   # Name of the file with base features for TRAINING
COMPUTE_FTS = True                  # Should the features be recomputed

# Name of the model.
# NOTE(review): later sections of this notebook build MODEL_NAME without the
# LABEL suffix — confirm which naming the saved model files should follow.
MODEL_NAME = f'{FTS_NAME}_{MODEL_ALGO}_{LABEL}'
NEW_FEATURES = 'fts_slope_tt'       # Name of a data frame with new features to add to model
COMBINE_FTS = [BASE_MODEL_FTS, NEW_FEATURES]    # Feature sets to combine for training
TRAIN_FTS_SFM = BASE_MODEL_FTS      # Features selected with SMF() for training.
BASE_MODEL = f'{TRAIN_FTS_SFM}_{MODEL_ALGO}'

In [24]:
# Load the three feature tables from CSV when all of them exist and
# recomputation is not requested; otherwise build them from COMBINE_FTS.
feature_csv = {
    split: os.path.join(config.DATA_DIR_OUT, FTS_NAME + '_' + split + '.csv')
    for split in ('tr', 'trvl', 'vlte')
}
# Number of feature files found on disk (3 means all are present)
check_file = sum(os.path.exists(path) for path in feature_csv.values())

# Logical `and` instead of the bitwise `&` used before: both operands are
# plain booleans, and `and` states the intent.
if check_file == 3 and not COMPUTE_FTS:
    print('Reading features ... ')
    X_tr = pd.read_csv(feature_csv['tr'])
    print(X_tr.shape)
    X_trvl = pd.read_csv(feature_csv['trvl'])
    print(X_trvl.shape)
    X_vlte = pd.read_csv(feature_csv['vlte'])
    print(X_vlte.shape)

else:
    print('Computing features ... ')
    # The builder is bound to `feature_builder`, not `fe`, so the `fe` module
    # imported from `src` at the top of the notebook is not overwritten.
    # ----- TRAIN -----
    feature_builder = CreateFeatures(metadata, train_files, 'tr', FTS_NAME)
    X_tr = feature_builder.combine_features(COMBINE_FTS)
    print(colored(f'train => {X_tr.shape}', 'blue'))

    # ----- TRAIN & VALID -----
    feature_builder = CreateFeatures(metadata, trva_files, 'trvl', FTS_NAME)
    X_trvl = feature_builder.combine_features(COMBINE_FTS)
    print(colored(f'train & valid => {X_trvl.shape}', 'blue'))

    # ----- VALID & TEST -----
    feature_builder = CreateFeatures(metadata, all_test_files, 'vlte', FTS_NAME)
    X_vlte = feature_builder.combine_features(COMBINE_FTS)
    print(colored(f'valid & test => {X_vlte.shape}', 'blue'))

Computing features ... 
[34mtrain => (766, 1585)[0m
[34mtrain & valid => (1059, 1585)[0m
[34mvalid & test => (804, 1585)[0m


In [28]:
# ===== TRAIN =====

# Set up the feature-selection helper for the current base model and new feature set
smf = feature_selection.SelectModelFeatures(
    base_sfm_features_name=TRAIN_FTS_SFM,
    base_fitted_model_name=BASE_MODEL,
    new_features_file_name=NEW_FEATURES,
    fitted_model_name=MODEL_NAME,
    fitted_model_algo=MODEL_ALGO,
    target_labels_list=target_labels_list,
    split_type=SPLIT_TYPE,
    X_tr=X_tr,
    X_vlte=X_vlte,
    train_labels=train_labels,
    valid_files=valid_files,
    valid_labels=valid_labels,
)

# Columns to train with: loaded under the TRAIN_FTS_SFM name when one is set,
# otherwise None is passed on.
# NOTE(review): None presumably means "train on the full column length of the
# input data" — confirm in training.train_tbl.
TRAIN_FTS_DICT = smf.load_features(file_name=TRAIN_FTS_SFM) if TRAIN_FTS_SFM else None

# Train the model. The captured output of this cell shows the CV loss and a
# '<sub_name>_COLS_sfm.txt' file being saved.
cvloss, submission_model = training.train_tbl(
    df_train=X_tr,
    df_test=X_vlte,
    df_labels=train_labels,
    target_list=target_labels_list,
    model_algo=MODEL_ALGO,
    split_type=SPLIT_TYPE,
    sub_name='_'.join([MODEL_NAME, SPLIT_TYPE]),
    base_model_name='_'.join([TRAIN_FTS_SFM, MODEL_ALGO, SPLIT_TYPE, 'sfm']),
    fts_select_cols=TRAIN_FTS_DICT,
)

# Compute validation loss when full model is trained
mloss, mloss_avg = training.compute_valid_loss(
    submission_model, valid_files, valid_labels, target_labels_list,
    sub_name='_'.join([MODEL_NAME, SPLIT_TYPE]),
    fts_select_cols=TRAIN_FTS_DICT,
)
print(colored(f'VAL LogLoss: {np.round(mloss_avg, 5)}', 'green'))

[34mLoading feature column names[0m
Reading fts_mra_tempmz_XGB_opt_tr_SFM_COLS.txt
Adding features from fts_slope_tt
[34mCV training ....[0m
Basel model CVloss: ../models/fts_mra_tempmz_XGB_opt_tr_sfm_cvloss.csv
[33mbasalt: LogLoss=0.19118[0m [32m-> -0.01261[0m
[34mFull training .....[0m
[32mbasalt - nfeatures: 32[0m
Saving fts_mra_tempmz_slope_XGB_opt_basalt_tr_COLS_sfm.txt
[33mCV LogLoss: 0.19118[0m
[32mVAL LogLoss: 0.12257[0m


# `lr_corr_mz4`

In [29]:
# Feature sets for this experiment: the base set plus the new 'fts_lr_corr_mz4' features
BASE_MODEL_FTS = 'fts_mra_tempmz_slope'
NEW_FEATURES = 'fts_lr_corr_mz4'                # Name of a data frame with new features to add to model
COMBINE_FTS = [BASE_MODEL_FTS, NEW_FEATURES]    # Feature sets to combine for training
TRAIN_FTS_SFM = BASE_MODEL_FTS                  # Features selected with SMF() for training.

FTS_NAME = 'fts_mra_tempmz_slope_lrcorrmz4'     # Name of the file with base features for TRAINING
COMPUTE_FTS = True                              # Should the features be recomputed

# Model names are '<features>_<algorithm>'; MODEL_ALGO comes from the first config cell
MODEL_NAME = f'{FTS_NAME}_{MODEL_ALGO}'         # Name of the model
BASE_MODEL = f'{TRAIN_FTS_SFM}_{MODEL_ALGO}'

In [30]:
# Load the three feature tables from CSV when all of them exist and
# recomputation is not requested; otherwise build them from COMBINE_FTS.
feature_csv = {
    split: os.path.join(config.DATA_DIR_OUT, FTS_NAME + '_' + split + '.csv')
    for split in ('tr', 'trvl', 'vlte')
}
# Number of feature files found on disk (3 means all are present)
check_file = sum(os.path.exists(path) for path in feature_csv.values())

# Logical `and` instead of the bitwise `&` used before: both operands are
# plain booleans, and `and` states the intent.
if check_file == 3 and not COMPUTE_FTS:
    print('Reading features ... ')
    X_tr = pd.read_csv(feature_csv['tr'])
    print(X_tr.shape)
    X_trvl = pd.read_csv(feature_csv['trvl'])
    print(X_trvl.shape)
    X_vlte = pd.read_csv(feature_csv['vlte'])
    print(X_vlte.shape)

else:
    print('Computing features ... ')
    # The builder is bound to `feature_builder`, not `fe`, so the `fe` module
    # imported from `src` at the top of the notebook is not overwritten.
    # ----- TRAIN -----
    feature_builder = CreateFeatures(metadata, train_files, 'tr', FTS_NAME)
    X_tr = feature_builder.combine_features(COMBINE_FTS)
    print(colored(f'train => {X_tr.shape}', 'blue'))

    # ----- TRAIN & VALID -----
    feature_builder = CreateFeatures(metadata, trva_files, 'trvl', FTS_NAME)
    X_trvl = feature_builder.combine_features(COMBINE_FTS)
    print(colored(f'train & valid => {X_trvl.shape}', 'blue'))

    # ----- VALID & TEST -----
    feature_builder = CreateFeatures(metadata, all_test_files, 'vlte', FTS_NAME)
    X_vlte = feature_builder.combine_features(COMBINE_FTS)
    print(colored(f'valid & test => {X_vlte.shape}', 'blue'))

Computing features ... 
[34mtrain => (766, 1586)[0m
[34mtrain & valid => (1059, 1586)[0m
[34mvalid & test => (804, 1586)[0m


In [31]:
# ===== TRAIN =====

# Set up the feature-selection helper for the current base model and new feature set
smf = feature_selection.SelectModelFeatures(
    base_sfm_features_name=TRAIN_FTS_SFM,
    base_fitted_model_name=BASE_MODEL,
    new_features_file_name=NEW_FEATURES,
    fitted_model_name=MODEL_NAME,
    fitted_model_algo=MODEL_ALGO,
    target_labels_list=target_labels_list,
    split_type=SPLIT_TYPE,
    X_tr=X_tr,
    X_vlte=X_vlte,
    train_labels=train_labels,
    valid_files=valid_files,
    valid_labels=valid_labels,
)

# Columns to train with: loaded under the TRAIN_FTS_SFM name when one is set,
# otherwise None is passed on.
# NOTE(review): None presumably means "train on the full column length of the
# input data" — confirm in training.train_tbl.
TRAIN_FTS_DICT = smf.load_features(file_name=TRAIN_FTS_SFM) if TRAIN_FTS_SFM else None

# Train the model. The captured output of this cell shows the CV loss and a
# '<sub_name>_COLS_sfm.txt' file being saved.
cvloss, submission_model = training.train_tbl(
    df_train=X_tr,
    df_test=X_vlte,
    df_labels=train_labels,
    target_list=target_labels_list,
    model_algo=MODEL_ALGO,
    split_type=SPLIT_TYPE,
    sub_name='_'.join([MODEL_NAME, SPLIT_TYPE]),
    base_model_name='_'.join([TRAIN_FTS_SFM, MODEL_ALGO, SPLIT_TYPE, 'sfm']),
    fts_select_cols=TRAIN_FTS_DICT,
)

# Compute validation loss when full model is trained
mloss, mloss_avg = training.compute_valid_loss(
    submission_model, valid_files, valid_labels, target_labels_list,
    sub_name='_'.join([MODEL_NAME, SPLIT_TYPE]),
    fts_select_cols=TRAIN_FTS_DICT,
)
print(colored(f'VAL LogLoss: {np.round(mloss_avg, 5)}', 'green'))

[34mLoading feature column names[0m
Reading fts_mra_tempmz_slope_XGB_opt_tr_SFM_COLS.txt
Adding features from fts_lr_corr_mz4
[34mCV training ....[0m
Basel model CVloss: ../models/fts_mra_tempmz_slope_XGB_opt_tr_sfm_cvloss.csv
[33mbasalt: LogLoss=0.19294[0m [31m-> 0.00177[0m
[34mFull training .....[0m
[32mbasalt - nfeatures: 33[0m
Saving fts_mra_tempmz_slope_lrcorrmz4_XGB_opt_tr_COLS_sfm.txt
[33mCV LogLoss: 0.19294[0m
[32mVAL LogLoss: 0.13273[0m


# `fts_cntpk_mratt`

In [32]:
# Feature sets for this experiment: the base set plus the new 'fts_cntpk_mratt' features
BASE_MODEL_FTS = 'fts_mra_tempmz_slope'
NEW_FEATURES = 'fts_cntpk_mratt'                # Name of a data frame with new features to add to model
COMBINE_FTS = [BASE_MODEL_FTS, NEW_FEATURES]    # Feature sets to combine for training
TRAIN_FTS_SFM = BASE_MODEL_FTS                  # Features selected with SMF() for training.

FTS_NAME = 'fts_mra_tempmz_slope_cntpk'         # Name of the file with base features for TRAINING
COMPUTE_FTS = True                              # Should the features be recomputed

# Model names are '<features>_<algorithm>'; MODEL_ALGO comes from the first config cell
MODEL_NAME = f'{FTS_NAME}_{MODEL_ALGO}'         # Name of the model
BASE_MODEL = f'{TRAIN_FTS_SFM}_{MODEL_ALGO}'

In [33]:
# Load the three feature tables from CSV when all of them exist and
# recomputation is not requested; otherwise build them from COMBINE_FTS.
feature_csv = {
    split: os.path.join(config.DATA_DIR_OUT, FTS_NAME + '_' + split + '.csv')
    for split in ('tr', 'trvl', 'vlte')
}
# Number of feature files found on disk (3 means all are present)
check_file = sum(os.path.exists(path) for path in feature_csv.values())

# Logical `and` instead of the bitwise `&` used before: both operands are
# plain booleans, and `and` states the intent.
if check_file == 3 and not COMPUTE_FTS:
    print('Reading features ... ')
    X_tr = pd.read_csv(feature_csv['tr'])
    print(X_tr.shape)
    X_trvl = pd.read_csv(feature_csv['trvl'])
    print(X_trvl.shape)
    X_vlte = pd.read_csv(feature_csv['vlte'])
    print(X_vlte.shape)

else:
    print('Computing features ... ')
    # The builder is bound to `feature_builder`, not `fe`, so the `fe` module
    # imported from `src` at the top of the notebook is not overwritten.
    # ----- TRAIN -----
    feature_builder = CreateFeatures(metadata, train_files, 'tr', FTS_NAME)
    X_tr = feature_builder.combine_features(COMBINE_FTS)
    print(colored(f'train => {X_tr.shape}', 'blue'))

    # ----- TRAIN & VALID -----
    feature_builder = CreateFeatures(metadata, trva_files, 'trvl', FTS_NAME)
    X_trvl = feature_builder.combine_features(COMBINE_FTS)
    print(colored(f'train & valid => {X_trvl.shape}', 'blue'))

    # ----- VALID & TEST -----
    feature_builder = CreateFeatures(metadata, all_test_files, 'vlte', FTS_NAME)
    X_vlte = feature_builder.combine_features(COMBINE_FTS)
    print(colored(f'valid & test => {X_vlte.shape}', 'blue'))

Computing features ... 
[34mtrain => (766, 1981)[0m
[34mtrain & valid => (1059, 1981)[0m
[34mvalid & test => (804, 1981)[0m


In [34]:
# ===== TRAIN =====

# Set up the feature-selection helper for the current base model and new feature set
smf = feature_selection.SelectModelFeatures(
    base_sfm_features_name=TRAIN_FTS_SFM,
    base_fitted_model_name=BASE_MODEL,
    new_features_file_name=NEW_FEATURES,
    fitted_model_name=MODEL_NAME,
    fitted_model_algo=MODEL_ALGO,
    target_labels_list=target_labels_list,
    split_type=SPLIT_TYPE,
    X_tr=X_tr,
    X_vlte=X_vlte,
    train_labels=train_labels,
    valid_files=valid_files,
    valid_labels=valid_labels,
)

# Columns to train with: loaded under the TRAIN_FTS_SFM name when one is set,
# otherwise None is passed on.
# NOTE(review): None presumably means "train on the full column length of the
# input data" — confirm in training.train_tbl.
TRAIN_FTS_DICT = smf.load_features(file_name=TRAIN_FTS_SFM) if TRAIN_FTS_SFM else None

# Train the model. The captured output of this cell shows the CV loss and a
# '<sub_name>_COLS_sfm.txt' file being saved.
cvloss, submission_model = training.train_tbl(
    df_train=X_tr,
    df_test=X_vlte,
    df_labels=train_labels,
    target_list=target_labels_list,
    model_algo=MODEL_ALGO,
    split_type=SPLIT_TYPE,
    sub_name='_'.join([MODEL_NAME, SPLIT_TYPE]),
    base_model_name='_'.join([TRAIN_FTS_SFM, MODEL_ALGO, SPLIT_TYPE, 'sfm']),
    fts_select_cols=TRAIN_FTS_DICT,
)

# Compute validation loss when full model is trained
mloss, mloss_avg = training.compute_valid_loss(
    submission_model, valid_files, valid_labels, target_labels_list,
    sub_name='_'.join([MODEL_NAME, SPLIT_TYPE]),
    fts_select_cols=TRAIN_FTS_DICT,
)
print(colored(f'VAL LogLoss: {np.round(mloss_avg, 5)}', 'green'))

[34mLoading feature column names[0m
Reading fts_mra_tempmz_slope_XGB_opt_tr_SFM_COLS.txt
Adding features from fts_cntpk_mratt
[34mCV training ....[0m
Basel model CVloss: ../models/fts_mra_tempmz_slope_XGB_opt_tr_sfm_cvloss.csv
[33mbasalt: LogLoss=0.18535[0m [32m-> -0.00583[0m
[34mFull training .....[0m
[32mbasalt - nfeatures: 428[0m
Saving fts_mra_tempmz_slope_cntpk_XGB_opt_tr_COLS_sfm.txt
[33mCV LogLoss: 0.18535[0m
[32mVAL LogLoss: 0.12594[0m


# `fts_topmz`

In [35]:
# Feature sets for this experiment: the 'cntpk' base set plus the new 'fts_topmz' features
BASE_MODEL_FTS = 'fts_mra_tempmz_slope_cntpk'
NEW_FEATURES = 'fts_topmz'                      # Name of a data frame with new features to add to model
COMBINE_FTS = [BASE_MODEL_FTS, NEW_FEATURES]    # Feature sets to combine for training
TRAIN_FTS_SFM = BASE_MODEL_FTS                  # Features selected with SMF() for training.

FTS_NAME = 'fts_mra_tempmz_slope_cntpk_topmz'   # Name of the file with base features for TRAINING
COMPUTE_FTS = True                              # Should the features be recomputed

# Model names are '<features>_<algorithm>'; MODEL_ALGO comes from the first config cell
MODEL_NAME = f'{FTS_NAME}_{MODEL_ALGO}'         # Name of the model
BASE_MODEL = f'{TRAIN_FTS_SFM}_{MODEL_ALGO}'

In [36]:
# Load the three feature tables from CSV when all of them exist and
# recomputation is not requested; otherwise build them from COMBINE_FTS.
feature_csv = {
    split: os.path.join(config.DATA_DIR_OUT, FTS_NAME + '_' + split + '.csv')
    for split in ('tr', 'trvl', 'vlte')
}
# Number of feature files found on disk (3 means all are present)
check_file = sum(os.path.exists(path) for path in feature_csv.values())

# Logical `and` instead of the bitwise `&` used before: both operands are
# plain booleans, and `and` states the intent.
if check_file == 3 and not COMPUTE_FTS:
    print('Reading features ... ')
    X_tr = pd.read_csv(feature_csv['tr'])
    print(X_tr.shape)
    X_trvl = pd.read_csv(feature_csv['trvl'])
    print(X_trvl.shape)
    X_vlte = pd.read_csv(feature_csv['vlte'])
    print(X_vlte.shape)

else:
    print('Computing features ... ')
    # The builder is bound to `feature_builder`, not `fe`, so the `fe` module
    # imported from `src` at the top of the notebook is not overwritten.
    # ----- TRAIN -----
    feature_builder = CreateFeatures(metadata, train_files, 'tr', FTS_NAME)
    X_tr = feature_builder.combine_features(COMBINE_FTS)
    print(colored(f'train => {X_tr.shape}', 'blue'))

    # ----- TRAIN & VALID -----
    feature_builder = CreateFeatures(metadata, trva_files, 'trvl', FTS_NAME)
    X_trvl = feature_builder.combine_features(COMBINE_FTS)
    print(colored(f'train & valid => {X_trvl.shape}', 'blue'))

    # ----- VALID & TEST -----
    feature_builder = CreateFeatures(metadata, all_test_files, 'vlte', FTS_NAME)
    X_vlte = feature_builder.combine_features(COMBINE_FTS)
    print(colored(f'valid & test => {X_vlte.shape}', 'blue'))

Computing features ... 
[34mtrain => (766, 1984)[0m
[34mtrain & valid => (1059, 1984)[0m
[34mvalid & test => (804, 1984)[0m


In [37]:
# ===== TRAIN =====

# Set up the feature-selection helper for the current base model and new feature set
smf = feature_selection.SelectModelFeatures(
    base_sfm_features_name=TRAIN_FTS_SFM,
    base_fitted_model_name=BASE_MODEL,
    new_features_file_name=NEW_FEATURES,
    fitted_model_name=MODEL_NAME,
    fitted_model_algo=MODEL_ALGO,
    target_labels_list=target_labels_list,
    split_type=SPLIT_TYPE,
    X_tr=X_tr,
    X_vlte=X_vlte,
    train_labels=train_labels,
    valid_files=valid_files,
    valid_labels=valid_labels,
)

# Columns to train with: loaded under the TRAIN_FTS_SFM name when one is set,
# otherwise None is passed on.
# NOTE(review): None presumably means "train on the full column length of the
# input data" — confirm in training.train_tbl.
TRAIN_FTS_DICT = smf.load_features(file_name=TRAIN_FTS_SFM) if TRAIN_FTS_SFM else None

# Train the model. The captured output of this cell shows the CV loss and a
# '<sub_name>_COLS_sfm.txt' file being saved.
cvloss, submission_model = training.train_tbl(
    df_train=X_tr,
    df_test=X_vlte,
    df_labels=train_labels,
    target_list=target_labels_list,
    model_algo=MODEL_ALGO,
    split_type=SPLIT_TYPE,
    sub_name='_'.join([MODEL_NAME, SPLIT_TYPE]),
    base_model_name='_'.join([TRAIN_FTS_SFM, MODEL_ALGO, SPLIT_TYPE, 'sfm']),
    fts_select_cols=TRAIN_FTS_DICT,
)

# Compute validation loss when full model is trained
mloss, mloss_avg = training.compute_valid_loss(
    submission_model, valid_files, valid_labels, target_labels_list,
    sub_name='_'.join([MODEL_NAME, SPLIT_TYPE]),
    fts_select_cols=TRAIN_FTS_DICT,
)
print(colored(f'VAL LogLoss: {np.round(mloss_avg, 5)}', 'green'))

[34mLoading feature column names[0m
Reading fts_mra_tempmz_slope_cntpk_XGB_opt_tr_SFM_COLS.txt
Adding features from fts_topmz
[34mCV training ....[0m
Basel model CVloss: ../models/fts_mra_tempmz_slope_cntpk_XGB_opt_tr_sfm_cvloss.csv
[33mbasalt: LogLoss=0.17807[0m [32m-> -0.00727[0m
[34mFull training .....[0m
[32mbasalt - nfeatures: 79[0m
Saving fts_mra_tempmz_slope_cntpk_topmz_XGB_opt_tr_COLS_sfm.txt
[33mCV LogLoss: 0.17807[0m
[32mVAL LogLoss: 0.12618[0m
