In [1]:
# Load (or reload) the autoreload extension and enable mode 2 so that edits
# to imported project modules are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2

# Show matplotlib figures inline in the notebook, using the 'retina' figure format.
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [2]:
## Environment
# Change main system path to be able to run code from src folder.
# When the notebook is launched from the `models` sub-folder, the first
# sys.path entry is rewritten to its parent directory.
import sys
p = sys.path[0]
# Default to the unmodified entry so `main_path` is always defined. It used to
# be assigned only inside the `if`, which raised NameError whenever the
# notebook was started from any directory other than `.../models`.
main_path = p
# Mac OS (POSIX-style separator)
if p.endswith('/models'):
    main_path = p[:-len('/models')]
sys.path[0] = main_path

import os, gc
from termcolor import colored
from tqdm import tqdm
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import log_loss
from src import (config, fe, features, feature_selection, 
                 preprocess, training)
from src.fe import CreateFeatures

  from pandas import MultiIndex, Int64Index


In [3]:
# DATA PREPARATION
# ===== LOAD DATA ======
metadata = pd.read_csv(config.DATA_DIR + 'metadata.csv')
print(f'Metadata: {metadata.shape}')

train_labels = pd.read_csv(config.DATA_DIR + 'train_labels.csv')
print(f'Train labels: {train_labels.shape}')

valid_labels = pd.read_csv(config.DATA_DIR + 'val_labels.csv')
# Fixed label: this line reports the validation labels (it used to say "Train labels")
print(f'Valid labels: {valid_labels.shape}')

# Combine train and valid labels (row-wise; the original row indices are kept)
trvl_labels = pd.concat([train_labels, valid_labels], axis = 0)

submission = pd.read_csv(config.DATA_DIR + 'submission_format.csv')
print(f'Submission: {submission.shape}')

# ===== FILE PATHS OF SAMPLES =====
# Each dict maps the metadata row index to the sample's `features_path`
train_files = metadata[metadata.split == 'train']['features_path'].to_dict()
valid_files = metadata[metadata.split == 'val']['features_path'].to_dict()
test_files = metadata[metadata.split == 'test']['features_path'].to_dict()
# Train & Valid files
trva_files = train_files.copy()
trva_files.update(valid_files)
# Valid & Test files (despite the name, train files are NOT included)
all_test_files = valid_files.copy()
all_test_files.update(test_files)

# Define SAM testbed files (restricted to the train split)
sam_files = metadata[(metadata.instrument_type == 'sam_testbed') & (metadata.split == 'train')]['features_path']
sam_files = sam_files.to_dict()

# Get the names of the target columns in a list
target_labels_list = [i for i in train_labels.columns if i not in ['sample_id']]
print(target_labels_list)

# Labels WITHOUT the SAM testbed samples: drop the last len(sam_files) train
# rows, then append the validation labels.
# NOTE(review): assumes the SAM testbed samples are exactly the trailing rows
# of train_labels - TODO confirm against metadata ordering.
sam_labels = train_labels.drop(train_labels.tail(len(sam_files)).index)
sam_labels = pd.concat([sam_labels, valid_labels], axis=0)
print(f'Labels w/o SAM : {sam_labels.shape}')

Metadata: (1570, 5)
Train labels: (766, 11)
Train labels: (293, 11)
Submission: (804, 11)
['basalt', 'carbonate', 'chloride', 'iron_oxide', 'oxalate', 'oxychlorine', 'phyllosilicate', 'silicate', 'sulfate', 'sulfide']
Labels w/o SAM : (1047, 11)


In [6]:
# ----- Run configuration -----
# Name of the file with base features for TRAINING
FTS_NAME = 'fts_mra_tempmz_maxabun'
# Should the features be recomputed
COMPUTE_FTS = True
# Compute SAM test bed
COMPUTE_FTS_SAM = False
# Name of the classifier
MODEL_ALGO = 'XGB_opt'
# Name of the model: <feature set>_<classifier>
MODEL_NAME = f'{FTS_NAME}_{MODEL_ALGO}'
# Feature sets to combine for training
COMBINE_FTS = ['fts_mra_tempmz', 'fts_mz_maxabun']
# Name of a data frame with new features to add to model
NEW_FEATURES = None
# Features selected with SMF() for training
TRAIN_FTS_SFM = ['fts_mra_tempmz', 'fts_mz_maxabun']
# Feature set and model name of the base model used for comparison
BASE_MODEL_FTS = 'fts_mra_tempmz'
BASE_MODEL = f'{BASE_MODEL_FTS}_{MODEL_ALGO}'

**COMPUTE FEATURES**

- Change the `fe._` method depending on the feature that we wish to calculate

In [7]:
# Load the cached feature tables when all three CSV files exist and no
# recomputation was requested; otherwise build them from the sample files.
# NOTE(review): `fe` below rebinds the `fe` module imported from `src` to a
# CreateFeatures instance - confirm no later cell needs the module itself.
feature_csv = {
    suffix: os.path.join(config.DATA_DIR_OUT, FTS_NAME + suffix + '.csv')
    for suffix in ('_tr', '_trvl', '_vlte')
}
# Number of cached feature files found on disk (0..3)
check_file = sum(os.path.exists(csv_path) for csv_path in feature_csv.values())

if COMPUTE_FTS or check_file != 3:
    print('Computing features ... ')
    # ----- TRAIN -----
    fe = CreateFeatures(metadata, train_files, 'tr', FTS_NAME)
    X_tr = fe.combine_features(COMBINE_FTS)
    print(colored(f'train => {X_tr.shape}', 'blue'))

    # ----- TRAIN & VALID -----
    fe = CreateFeatures(metadata, trva_files, 'trvl', FTS_NAME)
    X_trvl = fe.combine_features(COMBINE_FTS)
    print(colored(f'train & valid => {X_trvl.shape}', 'blue'))

    # ----- VALID & TEST -----
    fe = CreateFeatures(metadata, all_test_files, 'vlte', FTS_NAME)
    X_vlte = fe.combine_features(COMBINE_FTS)
    print(colored(f'valid & test => {X_vlte.shape}', 'blue'))

else:
    print('Reading features ... ')
    X_tr = pd.read_csv(feature_csv['_tr'])
    print(X_tr.shape)
    X_trvl = pd.read_csv(feature_csv['_trvl'])
    print(X_trvl.shape)
    X_vlte = pd.read_csv(feature_csv['_vlte'])
    print(X_vlte.shape)

if COMPUTE_FTS_SAM:
    print('\nCreating SAM testbed samples ...')
    # The last len(sam_files) rows of X_tr are held out as the validation set
    sam_rows = X_tr.tail(len(sam_files))
    # Training without SAM testbed: remaining train rows + the rows of X_trvl
    # that come after the first len(train_files) rows
    X_tr_sam = X_tr.drop(sam_rows.index).copy()
    X_tr_sam = pd.concat([X_tr_sam, X_trvl.iloc[len(train_files):, :]], axis=0)
    print(f'Train shape: {X_tr_sam.shape}')
    # Validation data
    X_vl_sam = sam_rows.copy()
    print(f'Valid shape: {X_vl_sam.shape}')

Computing features ... 
[34mtrain => (766, 1683)[0m
[34mtrain & valid => (1059, 1683)[0m
[34mvalid & test => (804, 1683)[0m


## MODELS

**TRAIN**

In [8]:
# Combine the SFM feature selections of the base feature sets listed in
# TRAIN_FTS_SFM for the train split, stored under the FTS_NAME model name.
split_type = 'tr'
feature_selection.combine_sfm_features(
    base_sfm_features_name=TRAIN_FTS_SFM,
    fitted_model_name=FTS_NAME,
    fitted_model_algo=MODEL_ALGO,
    split_type=split_type,
    target_labels_list=target_labels_list,
)
#TRAIN_FTS_SFM = FTS_NAME

Adding fts_mra_tempmz
Adding fts_mz_maxabun
Saving fts_mra_tempmz_maxabun_XGB_opt_tr_SFM_COLS.txt


In [9]:
# ===== TRAIN =====
split_type = 'tr'
# Shared name for the submission produced by this run
run_name = MODEL_NAME + '_' + split_type

# Initialize the feature selection class
smf = feature_selection.SelectModelFeatures(
    base_sfm_features_name=TRAIN_FTS_SFM,
    base_fitted_model_name=None,
    target_labels_list=target_labels_list,
    new_features_file_name=NEW_FEATURES,
    fitted_model_name=MODEL_NAME,
    fitted_model_algo=MODEL_ALGO,
    X_tr=X_tr,
    X_vlte=X_vlte,
    split_type=split_type,
    train_labels=train_labels,
    valid_files=valid_files,
    valid_labels=valid_labels,
)

# Load the selected columns to train with (used when training is done
# without the full column length of the input data); None means no
# column selection is passed on.
TRAIN_FTS_DICT = smf.load_features(file_name=FTS_NAME) if TRAIN_FTS_SFM else None

# Train the model - per the original note, features are saved as
# MODEL_NAME_tr_COLS.txt
cvloss, submission_model = training.train_tbl(
    df_train=X_tr,
    df_labels=train_labels,
    target_list=target_labels_list,
    df_test=X_vlte,
    split_type=split_type,
    model_algo=MODEL_ALGO,
    sub_name=run_name,
    base_model_name=f'{BASE_MODEL_FTS}_{MODEL_ALGO}_{split_type}_sfm',
    fts_select_cols=TRAIN_FTS_DICT,
)

# Compute validation loss when full model is trained
mloss, mloss_avg = training.compute_valid_loss(
    submission_model,
    valid_files,
    valid_labels,
    target_labels_list,
    sub_name=run_name,
    fts_select_cols=TRAIN_FTS_DICT,
)
print(colored(f'VAL LogLoss: {np.round(mloss_avg, 5)}', 'green'))

[34mLoading feature column names[0m
Reading fts_mra_tempmz_maxabun_XGB_opt_tr_SFM_COLS.txt
[34mCV training ....[0m
Basel model CVloss: ../models/fts_mra_tempmz_XGB_opt_tr_sfm_cvloss.csv
[33mbasalt: LogLoss=0.19076[0m [32m-> -0.01303[0m
[33mcarbonate: LogLoss=0.10985[0m [31m-> 0.00105[0m
[33mchloride: LogLoss=0.18517[0m [32m-> -0.02477[0m
[33miron_oxide: LogLoss=0.22697[0m [31m-> 0.00276[0m
[33moxalate: LogLoss=0.02271[0m [32m-> -0.00327[0m
[33moxychlorine: LogLoss=0.1644[0m [32m-> -0.01577[0m
[33mphyllosilicate: LogLoss=0.24467[0m [32m-> -0.0007[0m
[33msilicate: LogLoss=0.24195[0m [31m-> 0.00877[0m
[33msulfate: LogLoss=0.20801[0m [31m-> 0.01785[0m
[33msulfide: LogLoss=0.06749[0m [32m-> -4e-05[0m
[34mFull training .....[0m
[32mbasalt - nfeatures: 55[0m
[32mcarbonate - nfeatures: 143[0m
[32mchloride - nfeatures: 91[0m
[32miron_oxide - nfeatures: 249[0m
[32moxalate - nfeatures: 27[0m
[32moxychlorine - nfeatures: 83[0m
[32mphyllosi

**TRAIN & VALID**

In [10]:
# ----- Run configuration (TRAIN & VALID) -----
# Same values as the configuration cell of the TRAIN section, restated here.
# Name of the file with base features for TRAINING
FTS_NAME = 'fts_mra_tempmz_maxabun'
# Should the features be recomputed
COMPUTE_FTS = True
# Compute SAM test bed
COMPUTE_FTS_SAM = False
# Name of the classifier
MODEL_ALGO = 'XGB_opt'
# Name of the model: <feature set>_<classifier>
MODEL_NAME = f'{FTS_NAME}_{MODEL_ALGO}'
# Feature sets to combine for training
COMBINE_FTS = ['fts_mra_tempmz', 'fts_mz_maxabun']
# Name of a data frame with new features to add to model
NEW_FEATURES = None
# Features selected with SMF() for training
TRAIN_FTS_SFM = ['fts_mra_tempmz', 'fts_mz_maxabun']
# Feature set and model name of the base model used for comparison
BASE_MODEL_FTS = 'fts_mra_tempmz'
BASE_MODEL = f'{BASE_MODEL_FTS}_{MODEL_ALGO}'

In [11]:
# Combine the SFM feature selections of the base feature sets listed in
# TRAIN_FTS_SFM for the train & valid split, stored under the FTS_NAME model name.
split_type = 'trvl'
feature_selection.combine_sfm_features(
    base_sfm_features_name=TRAIN_FTS_SFM,
    fitted_model_name=FTS_NAME,
    fitted_model_algo=MODEL_ALGO,
    split_type=split_type,
    target_labels_list=target_labels_list,
)
#TRAIN_FTS_SFM = FTS_NAME

Adding fts_mra_tempmz
Adding fts_mz_maxabun
Saving fts_mra_tempmz_maxabun_XGB_opt_trvl_SFM_COLS.txt


In [12]:
# ===== TRAIN & VALID =====
split_type = 'trvl'
# Shared name for the submission produced by this run
run_name = MODEL_NAME + '_' + split_type

# Initialize the feature selection class
smf = feature_selection.SelectModelFeatures(
    base_sfm_features_name=TRAIN_FTS_SFM,
    base_fitted_model_name=None,
    target_labels_list=target_labels_list,
    new_features_file_name=NEW_FEATURES,
    fitted_model_name=MODEL_NAME,
    fitted_model_algo=MODEL_ALGO,
    X_tr=X_trvl,
    X_vlte=X_vlte,
    split_type=split_type,
    train_labels=trvl_labels,
    valid_files=valid_files,
    valid_labels=valid_labels,
)

# Load the selected columns to train with (used when training is done
# without the full column length of the input data); None means no
# column selection is passed on.
TRAIN_FTS_DICT = smf.load_features(file_name=FTS_NAME) if TRAIN_FTS_SFM else None

# Train the model on the combined train + valid features and labels
cvloss, submission_model = training.train_tbl(
    df_train=X_trvl,
    df_labels=trvl_labels,
    target_list=target_labels_list,
    df_test=X_vlte,
    split_type=split_type,
    model_algo=MODEL_ALGO,
    sub_name=run_name,
    base_model_name=f'{BASE_MODEL_FTS}_{MODEL_ALGO}_{split_type}_sfm',
    fts_select_cols=TRAIN_FTS_DICT,
)

# Compute validation loss when full model is trained.
# NOTE(review): the validation samples are part of the training data in this
# split, so this loss is presumably not an out-of-sample estimate - confirm
# how it is meant to be interpreted.
mloss, mloss_avg = training.compute_valid_loss(
    submission_model,
    valid_files,
    valid_labels,
    target_labels_list,
    sub_name=run_name,
    fts_select_cols=TRAIN_FTS_DICT,
)
print(colored(f'VAL LogLoss: {np.round(mloss_avg, 5)}', 'green'))

[34mLoading feature column names[0m
Reading fts_mra_tempmz_maxabun_XGB_opt_trvl_SFM_COLS.txt
[34mCV training ....[0m
Basel model CVloss: ../models/fts_mra_tempmz_XGB_opt_trvl_sfm_cvloss.csv
[33mbasalt: LogLoss=0.15847[0m [32m-> -0.00183[0m
[33mcarbonate: LogLoss=0.0845[0m [32m-> -0.00498[0m
[33mchloride: LogLoss=0.17325[0m [31m-> 0.00225[0m
[33miron_oxide: LogLoss=0.20251[0m [32m-> -0.00854[0m
[33moxalate: LogLoss=0.01159[0m [32m-> -0.00251[0m
[33moxychlorine: LogLoss=0.15008[0m [32m-> -0.00967[0m
[33mphyllosilicate: LogLoss=0.2372[0m [32m-> -0.0021[0m
[33msilicate: LogLoss=0.18702[0m [31m-> 0.00028[0m
[33msulfate: LogLoss=0.16424[0m [32m-> -0.00932[0m
[33msulfide: LogLoss=0.0811[0m [32m-> -0.00057[0m
[34mFull training .....[0m
[32mbasalt - nfeatures: 1646[0m
[32mcarbonate - nfeatures: 287[0m
[32mchloride - nfeatures: 1667[0m
[32miron_oxide - nfeatures: 1678[0m
[32moxalate - nfeatures: 34[0m
[32moxychlorine - nfeatures: 1676[0m


**SAM TRAINING**