In [1]:
# Notebook setup (IPython magics).
# (Re)load the autoreload extension and use mode 2, which re-imports modules
# before each cell execution so edits to the project's modules are picked up
# without restarting the kernel.
%reload_ext autoreload
%autoreload 2

# Draw matplotlib figures inside the notebook, at high ("retina") resolution.
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [2]:
## Environment
# Change main system path to be able to run code from src folder.
# When the first sys.path entry is the `models` directory, replace it with its
# parent so that `from src import ...` resolves; otherwise leave the entry as is.
import os
import sys
p = sys.path[0]
# Compare path components instead of the literal '/models' suffix so that the
# check also works with Windows separators and with a trailing separator.
if os.path.basename(os.path.normpath(p)) == 'models':
    main_path = os.path.dirname(os.path.normpath(p))
else:
    # Previously `main_path` was left undefined on this branch, which raised a
    # NameError on the assignment below.
    main_path = p
sys.path[0] = main_path

import os, gc, json 
from termcolor import colored
from tqdm import tqdm
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import log_loss
from sklearn.model_selection import StratifiedKFold
from src import (config, fe, features, feature_selection, 
                 preprocess, training, model_selection)
from src.fe import CreateFeatures

  from pandas import MultiIndex, Int64Index


In [3]:
# DATA PREPARATION
# ===== LOAD DATA ======
# Every table is read from config.DATA_DIR; shapes are printed as a sanity check.
metadata = pd.read_csv(config.DATA_DIR + 'metadata.csv')
print(f'Metadata: {metadata.shape}')

train_labels = pd.read_csv(config.DATA_DIR + 'train_labels.csv')
print(f'Train labels: {train_labels.shape}')

valid_labels = pd.read_csv(config.DATA_DIR + 'val_labels.csv')
# This line used to print "Train labels", which mislabelled the validation shape.
print(f'Valid labels: {valid_labels.shape}')

# Combine train and valid labels (row indices of the two frames are kept as-is)
trvl_labels = pd.concat([train_labels, valid_labels], axis = 0)

submission = pd.read_csv(config.DATA_DIR + 'submission_format.csv')
print(f'Submission: {submission.shape}')

# ===== FILE PATHS OF SAMPLES =====
# Each dict maps a metadata row index to that sample's `features_path`.
train_files = metadata[metadata.split == 'train']['features_path'].to_dict()
valid_files = metadata[metadata.split == 'val']['features_path'].to_dict()
test_files = metadata[metadata.split == 'test']['features_path'].to_dict()
# Train & Valid files
trva_files = {**train_files, **valid_files}
# Valid & Test files
all_test_files = {**valid_files, **test_files}

# Define SAM testbed files (train split only)
sam_files = metadata[(metadata.instrument_type == 'sam_testbed') & (metadata.split == 'train')]['features_path']
sam_files = sam_files.to_dict()

# Get the names of the target columns in a list (everything except the id column)
target_labels_list = [col for col in train_labels.columns if col != 'sample_id']
print(target_labels_list)

# Labels without the SAM testbed samples.
# NOTE(review): this drops the LAST len(sam_files) rows of train_labels, i.e. it
# assumes the SAM testbed samples are the trailing rows of the train labels
# - TODO confirm against metadata ordering.
sam_labels = train_labels.drop(train_labels.tail(len(sam_files)).index)
sam_labels = pd.concat([sam_labels, valid_labels], axis=0)
print(f'Labels w/o SAM : {sam_labels.shape}')

Metadata: (1570, 5)
Train labels: (766, 11)
Train labels: (293, 11)
Submission: (804, 11)
['basalt', 'carbonate', 'chloride', 'iron_oxide', 'oxalate', 'oxychlorine', 'phyllosilicate', 'silicate', 'sulfate', 'sulfide']
Labels w/o SAM : (1047, 11)


Define the global variables:

In [4]:
# ----- What is being modelled -----
LABEL = 'basalt'             # target column to fit
DF_TARGET = train_labels     # frame the target column is taken from
MODEL_ALGO = 'XGB_opt'       # key of the estimator to use

# ----- Base model -----
INITIAL_FTS = 'fts_mra_tempmz'            # feature group of the base model
INITIAL_CVLOSS = 0.2037926424671946       # CV loss used as the starting point

# ----- Candidate feature groups to add on top of the base model -----
ALL_FTS_GROUPS = config.FTS_GROUPS

In [5]:
# Suffix of the feature files to load (e.g. '<group>_tr.csv')
SPLIT_TYPE = 'tr'

# State of the best model found so far, i.e. the one with the lowest CV loss
# among the models fitted up to this point.
BEST_MODEL_FTS_GROUPS = [INITIAL_FTS]   # feature groups included in the final model
BEST_MODEL_FTS = []                     # individual features kept (after SFM, if applied)
BEST_MODEL_CVLOSS = BEST_MODEL_VLOSS = 0

In [6]:
# Initial X_train: load the base-model feature table for the current split
base_csv_path = os.path.join(config.DATA_DIR_OUT, f'{INITIAL_FTS}_{SPLIT_TYPE}.csv')
X_train = pd.read_csv(base_csv_path)
print(f'X_train initial: {X_train.shape}')

# If a selected-features file exists for this group/model/split, use the
# features listed under LABEL; otherwise fall back to every column of the table.
initial_fts_path = f'{INITIAL_FTS}_{MODEL_ALGO}_{SPLIT_TYPE}_SFM_COLS.txt'
if not os.path.exists(initial_fts_path):
    base_features = X_train.columns.tolist()
else:
    print('Reading features:', colored(initial_fts_path, 'yellow'))
    with open(initial_fts_path) as json_file:
        initial_fts_dict = json.load(json_file)
    base_features = initial_fts_dict[LABEL]

BEST_MODEL_FTS = BEST_MODEL_FTS + base_features
print('No features added:', colored(str(len(base_features)), 'blue'))

# Restrict X_train to the selected features
X_train = X_train[BEST_MODEL_FTS].copy()
print(colored(f'X_train base: {X_train.shape}', 'green'))

# Validation & test dataset (currently disabled)
# X_vlte = pd.read_csv(os.path.join(config.DATA_DIR_OUT,
#                                   INITIAL_FTS + '_vlte.csv'))
# X_vlte = X_vlte[BEST_MODEL_FTS].copy()
# print(colored(f'X_vlte base: {X_vlte.shape}', 'green'))

X_train initial: (766, 1584)
Reading features: [33mfts_mra_tempmz_XGB_opt_tr_SFM_COLS.txt[0m
No features added: [34m31[0m
[32mX_train base: (766, 31)[0m


In [7]:
# Greedy forward selection over feature groups: each candidate group is appended
# to the current best feature set and kept only if the mean cross-validated
# log loss drops below the best value seen so far.

# Starting cv loss
best_cv_loss = INITIAL_CVLOSS

# The target and the CV splitter do not depend on the feature group,
# so they are built once instead of on every iteration.
y = DF_TARGET[LABEL].copy()
cv = StratifiedKFold(n_splits = config.NO_CV_FOLDS,
                     random_state = config.RANDOM_SEED,
                     shuffle = True)

for FTS_GROUP in ALL_FTS_GROUPS:
    print('-'*55)
    print('Feature group:', colored(f'{FTS_GROUP}', 'yellow'))

    # Nothing to evaluate if the group is already included in the best model
    if FTS_GROUP in BEST_MODEL_FTS_GROUPS:
        continue

    # ----- Read in the features df -----
    # TODO Add check if the fts are already in the df not to add them twice
    new_df = pd.read_csv(os.path.join(config.DATA_DIR_OUT, FTS_GROUP + '_' + SPLIT_TYPE + '.csv'))
    # NOTE(review): loaded but not used anywhere else in this cell.
    new_df_vlte = pd.read_csv(os.path.join(config.DATA_DIR_OUT, FTS_GROUP + '_vlte.csv'))

    # Filter with selected features if they exist
    new_df_fts_path = FTS_GROUP + '_' + MODEL_ALGO + '_' + SPLIT_TYPE + '_SFM_COLS.txt'
    if os.path.exists(new_df_fts_path):
        print(f'Reading features: {new_df_fts_path}')
        with open(new_df_fts_path) as json_file:
            new_df_fts_dict = json.load(json_file)
        new_features_list = new_df_fts_dict[LABEL]
        # Guard against a single feature stored as a bare string
        if isinstance(new_features_list, str):
            new_features_list = [new_features_list]
        new_df = new_df[new_features_list].copy()
    else:
        new_features_list = new_df.columns.tolist()
    no_features = len(new_features_list)
    print('No features:', colored(f'{no_features}', 'blue'))

    # Add the new features to the current best training set
    # (pd.concat with axis=1 aligns the two frames on their row index)
    X_train_temp = pd.concat([X_train, new_df], axis=1)
    print(colored(f'X_train temp: {X_train_temp.shape}', 'green'))

    # ----- CV TRAINING -----
    oof_logloss = [] # Metric for each fold for one label

    # Define the folds and train the model
    for fold, (t_, v_) in enumerate(cv.split(X_train_temp, y)):
        #print(colored(f'FOLD {fold+1}', 'magenta'))
        Xtr = X_train_temp.iloc[t_].copy()
        ytr = y.iloc[t_].values
        Xval = X_train_temp.iloc[v_].copy()
        yval = y.iloc[v_].values

        # NOTE(review): the same estimator object is fetched and refitted on
        # every fold; this presumes fit() retrains from scratch - TODO confirm.
        clf = model_selection.models[MODEL_ALGO]
        clf.fit(Xtr, ytr)

        # Predicted probability of the positive class
        y_preds = clf.predict_proba(Xval)[:,1]

        # Compute model metric
        oof_logloss.append(log_loss(yval, y_preds))

    # Average log loss over the folds
    logloss_avg = np.sum(oof_logloss)/config.NO_CV_FOLDS

    # Compare CV losses
    if logloss_avg < best_cv_loss:
        print('CVloss', colored(f'{(np.round(logloss_avg-best_cv_loss,5))}', 'green'))
        best_cv_loss = logloss_avg

        # Add feature group to BEST_MODEL_FTS_GROUPS
        BEST_MODEL_FTS_GROUPS = BEST_MODEL_FTS_GROUPS + [FTS_GROUP]
        print(f'BEST_MODEL_FTS_GROUPS: {BEST_MODEL_FTS_GROUPS}')

        # Add features to BEST_MODEL_FTS.
        # new_features_list is always a list here, so it is concatenated
        # directly. It used to be wrapped in another list for single-feature
        # groups, which put a nested list inside BEST_MODEL_FTS.
        BEST_MODEL_FTS = BEST_MODEL_FTS + list(new_features_list)
        print(f'No features added: {len(BEST_MODEL_FTS)}')

        # Promote the extended training set to be the new baseline
        X_train = X_train_temp.copy()
    else:
        print('CVloss', colored(f'{(np.round(logloss_avg-best_cv_loss,5))}', 'red'))
        print(f'BEST_MODEL_FTS_GROUPS: {BEST_MODEL_FTS_GROUPS}')

    print(colored(f'BEST CV LOSS: {best_cv_loss}', 'magenta'))

    del X_train_temp

-------------------------------------------------------
Feature group: [33mfts_slope_tt[0m
No features: [34m1[0m
[32mX_train temp: (766, 32)[0m
CVloss [32m-0.01261[0m
BEST_MODEL_FTS_GROUPS: ['fts_mra_tempmz', 'fts_slope_tt']
No features added: 32
[35mBEST CV LOSS: 0.19117839287332034[0m
-------------------------------------------------------
Feature group: [33mfts_mz_spectra[0m
No features: [34m10[0m
[32mX_train temp: (766, 42)[0m
CVloss [32m-0.01712[0m
BEST_MODEL_FTS_GROUPS: ['fts_mra_tempmz', 'fts_slope_tt', 'fts_mz_spectra']
No features added: 42
[35mBEST CV LOSS: 0.17405980386256345[0m
-------------------------------------------------------
Feature group: [33mfts_mzstats[0m
Reading features: fts_mzstats_XGB_opt_tr_SFM_COLS.txt
No features: [34m55[0m
[32mX_train temp: (766, 97)[0m
CVloss [32m-0.00824[0m
BEST_MODEL_FTS_GROUPS: ['fts_mra_tempmz', 'fts_slope_tt', 'fts_mz_spectra', 'fts_mzstats']
No features added: 97
[35mBEST CV LOSS: 0.1658204440848679[0m