In [1]:
%reload_ext autoreload
%autoreload 2

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [2]:
## Environment
# Change main system path to be able to run code from src folder
import sys
p = sys.path[0]
# Mac OS
if sys.path[0].endswith('/models'):
    main_path = p[:-len('/models')]
sys.path[0] = main_path

import os, gc
from termcolor import colored
from tqdm import tqdm
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import log_loss
from src import (config, fe, features, preprocess, training)
from src.fe import CreateFeatures

  from pandas import MultiIndex, Int64Index


In [3]:
# DATA PREPARATION
# ===== LOAD DATA ======
metadata = pd.read_csv(config.DATA_DIR + 'metadata.csv')
print(f'Metadata: {metadata.shape}')

train_labels = pd.read_csv(config.DATA_DIR + 'train_labels.csv')
print(f'Train labels: {train_labels.shape}')

valid_labels = pd.read_csv(config.DATA_DIR + 'val_labels.csv')
print(f'Train labels: {valid_labels.shape}')

# Combine train and valid labels
trvl_labels = pd.concat([train_labels, valid_labels], axis = 0)

submission = pd.read_csv(config.DATA_DIR + 'submission_format.csv')
print(f'Submission: {submission.shape}')

# ===== FILE PATHS OF SAMPLES =====
train_files = metadata[metadata.split == 'train']['features_path'].to_dict()
valid_files = metadata[metadata.split == 'val']['features_path'].to_dict()
test_files = metadata[metadata.split == 'test']['features_path'].to_dict()
# Train & Valid files
trva_files = train_files.copy()
trva_files.update(valid_files)
# All files
all_test_files = valid_files.copy()
all_test_files.update(test_files)

# Define SAM testbed files
sam_files = metadata[(metadata.instrument_type == 'sam_testbed') & (metadata.split == 'train')]['features_path']
sam_files = sam_files.to_dict()

# Ion type list
ion_list = list(np.arange(0,100,1.0))
ion_list.remove(4.0)

# Get the names of the target columns in a list
target_labels_list = [i for i in train_labels.columns if i not in ['sample_id']]
print(target_labels_list)

# SAM testbed labels
sam_labels = train_labels.drop(train_labels.tail(len(sam_files)).index)

Metadata: (1570, 5)
Train labels: (766, 11)
Train labels: (293, 11)
Submission: (804, 11)
['basalt', 'carbonate', 'chloride', 'iron_oxide', 'oxalate', 'oxychlorine', 'phyllosilicate', 'silicate', 'sulfate', 'sulfide']


In [4]:
FTS_NAME = 'fts_mra_tempmz'                 # Name of the file with features
COMPUTE_FTS = False                         # Should the features be recomputed
COMPUTE_FTS_SAM = True                      # Compute SAM test bed
MODEL_ALGO = 'XGB_opt'                      # Name of the classifier
MODEL_NAME = FTS_NAME + '_' + MODEL_ALGO    # Name of the model

**COMPUTE FEATURES**

- Change the `fe._` method depending on the feature that we wish to calculate

In [5]:
# Check if feature is computed and load it or choose to compute it
check_file = 0
for i in ['_tr', '_trvl', '_vlte']:
    check_file += os.path.exists(os.path.join(config.DATA_DIR_OUT, FTS_NAME +
                                              str(i) + '.csv'))

if (check_file == 3) & (not COMPUTE_FTS):
    print('Reading features ... ')
    X_tr = pd.read_csv(os.path.join(config.DATA_DIR_OUT, FTS_NAME + '_tr.csv'))
    print(X_tr.shape)
    X_trvl = pd.read_csv(os.path.join(config.DATA_DIR_OUT, FTS_NAME + '_trvl.csv'))
    print(X_trvl.shape)
    X_vlte = pd.read_csv(os.path.join(config.DATA_DIR_OUT, FTS_NAME + '_vlte.csv'))
    print(X_vlte.shape)
    
else:
    print('Computing features ... ')
    # ----- TRAIN -----
    fe = CreateFeatures(metadata, train_files, 'tr')
    X_tr = fe.fts_mra_tempmz()
    print(colored(f'train => {X_tr.shape}', 'blue'))
    
    # ----- TRAIN & VALID -----
    fe = CreateFeatures(metadata, trva_files, 'trvl')
    X_trvl = fe.fts_mra_tempmz()
    print(colored(f'train => {X_trvl.shape}', 'blue'))
    
    # ----- VALID & TEST -----
    fe = CreateFeatures(metadata, all_test_files, 'vlte')
    X_vlte = fe.fts_mra_tempmz()
    print(colored(f'train => {X_vlte.shape}', 'blue'))
    
if COMPUTE_FTS_SAM:
    print(f'\nCreating SAM testbed samples ...')
    # Training without SAM testbed
    X_tr_sam = X_tr.drop(X_tr.tail(len(sam_files)).index).copy()
    print(f'Train shape: {X_tr_sam.shape}')
    # Validation data
    X_vl_sam = X_tr.tail(len(sam_files)).copy()
    print(f'Valid shape: {X_vl_sam.shape}')

Computing features ... 
Number of samples: 766
[34mtrain => (766, 1584)[0m
Number of samples: 1059
[34mtrain => (1059, 1584)[0m
Number of samples: 804
[34mtrain => (804, 1584)[0m

Creating SAM testbed samples ...
Train shape: (754, 1584)
Valid shape: (12, 1584)


## MODELS

**TRAIN**

In [6]:
# ===== TRAIN =====
cvloss, submission_model = training.train_tbl(
    df_train=X_tr,
    df_labels=train_labels,
    target_list=target_labels_list,
    df_test=X_vlte,
    model_algo=MODEL_ALGO,
    sub_name=MODEL_NAME + '_tr'
    )

# Compute validation loss when full model is trained
mloss, mloss_avg = training.compute_valid_loss(submission_model,
                                               valid_files, valid_labels,
                                               target_labels_list,
                                               sub_name=MODEL_NAME + '_tr')

[34mCV training ....[0m
[35mbasalt[0m
[35mcarbonate[0m
[35mchloride[0m
[35miron_oxide[0m
[35moxalate[0m
[35moxychlorine[0m
[35mphyllosilicate[0m
[35msilicate[0m
[35msulfate[0m
[35msulfide[0m
[34mFull training .....[0m
[34mLABEL: basalt[0m
[34mLABEL: carbonate[0m
[34mLABEL: chloride[0m
[34mLABEL: iron_oxide[0m
[34mLABEL: oxalate[0m
[34mLABEL: oxychlorine[0m
[34mLABEL: phyllosilicate[0m
[34mLABEL: silicate[0m
[34mLABEL: sulfate[0m
[34mLABEL: sulfide[0m
[34m
Average Log Loss: 0.2759[0m
Log Loss per Label:
{'basalt': 0.27678260590883075, 'carbonate': 0.2612545003933846, 'chloride': 0.2842979960325033, 'iron_oxide': 0.3824699336087238, 'oxalate': 0.004059239489520996, 'oxychlorine': 0.2991500982787292, 'phyllosilicate': 0.4074758334090647, 'silicate': 0.35168474392654814, 'sulfate': 0.34471883287139204, 'sulfide': 0.14713584783404984}
(293, 10)


In [7]:
print(f'CV Logloss: {np.mean(list(cvloss.values()))}')
print(f'VALID FM LogLoss: {mloss_avg}')
print(f'CV logLoss Label')
print(cvloss)

CV Logloss: 0.27590296317527474
VALID FM LogLoss: 0.27328136675340103
CV logLoss Label
{'basalt': 0.27678260590883075, 'carbonate': 0.2612545003933846, 'chloride': 0.2842979960325033, 'iron_oxide': 0.3824699336087238, 'oxalate': 0.004059239489520996, 'oxychlorine': 0.2991500982787292, 'phyllosilicate': 0.4074758334090647, 'silicate': 0.35168474392654814, 'sulfate': 0.34471883287139204, 'sulfide': 0.14713584783404984}


**TRAIN & VALID**

In [8]:
# ===== TRAIN =====
cvloss, submission_model = training.train_tbl(
    df_train=X_trvl,
    df_labels=trvl_labels,
    target_list=target_labels_list,
    df_test=X_vlte,
    model_algo=MODEL_ALGO,
    sub_name=MODEL_NAME + '_trvl'
    )

[34mCV training ....[0m
[35mbasalt[0m
[35mcarbonate[0m
[35mchloride[0m
[35miron_oxide[0m
[35moxalate[0m
[35moxychlorine[0m
[35mphyllosilicate[0m
[35msilicate[0m
[35msulfate[0m
[35msulfide[0m
[34mFull training .....[0m
[34mLABEL: basalt[0m
[34mLABEL: carbonate[0m
[34mLABEL: chloride[0m
[34mLABEL: iron_oxide[0m
[34mLABEL: oxalate[0m
[34mLABEL: oxychlorine[0m
[34mLABEL: phyllosilicate[0m
[34mLABEL: silicate[0m
[34mLABEL: sulfate[0m
[34mLABEL: sulfide[0m
[34m
Average Log Loss: 0.2602[0m
Log Loss per Label:
{'basalt': 0.26152610421079714, 'carbonate': 0.22292052739226142, 'chloride': 0.25846765369554403, 'iron_oxide': 0.37216113777027066, 'oxalate': 0.004914672949143167, 'oxychlorine': 0.2526301582566911, 'phyllosilicate': 0.4112433512412358, 'silicate': 0.3498702145055753, 'sulfate': 0.3228680540360762, 'sulfide': 0.1453285461910574}


In [9]:
print(f'CV Logloss: {np.mean(list(cvloss.values()))}')
print(f'CV logLoss Label')
print(cvloss)

CV Logloss: 0.2601930420248652
CV logLoss Label
{'basalt': 0.26152610421079714, 'carbonate': 0.22292052739226142, 'chloride': 0.25846765369554403, 'iron_oxide': 0.37216113777027066, 'oxalate': 0.004914672949143167, 'oxychlorine': 0.2526301582566911, 'phyllosilicate': 0.4112433512412358, 'silicate': 0.3498702145055753, 'sulfate': 0.3228680540360762, 'sulfide': 0.1453285461910574}


**SAM TRAINING**

In [10]:
# ===== TRAIN SAM =====
cvloss, submission_model = training.train_tbl(
    df_train=X_tr_sam,
    df_labels=sam_labels,
    target_list=target_labels_list,
    df_test=X_vl_sam,
    model_algo=MODEL_ALGO,
    sub_name=MODEL_NAME + '_sam',
    test_sam=True
    )

# Compute validation loss when full model is trained
mloss, mloss_avg = training.compute_valid_loss(submission_model,
                                               sam_files, sam_labels,
                                               target_labels_list,
                                               sub_name=MODEL_NAME + '_sam')

[34mCV training ....[0m
[35mbasalt[0m
[35mcarbonate[0m
[35mchloride[0m
[35miron_oxide[0m
[35moxalate[0m
[35moxychlorine[0m
[35mphyllosilicate[0m
[35msilicate[0m
[35msulfate[0m
[35msulfide[0m
[34mFull training .....[0m
[34mLABEL: basalt[0m
[34mLABEL: carbonate[0m
[34mLABEL: chloride[0m
[34mLABEL: iron_oxide[0m
[34mLABEL: oxalate[0m
[34mLABEL: oxychlorine[0m
[34mLABEL: phyllosilicate[0m
[34mLABEL: silicate[0m
[34mLABEL: sulfate[0m
[34mLABEL: sulfide[0m
[34m
Average Log Loss: 0.2704[0m
Log Loss per Label:
{'basalt': 0.2808329837457333, 'carbonate': 0.2396414000642748, 'chloride': 0.2944534149635854, 'iron_oxide': 0.3627412891834739, 'oxalate': 0.003807048659824649, 'oxychlorine': 0.28098086045920684, 'phyllosilicate': 0.4096872796425961, 'silicate': 0.3550628205936525, 'sulfate': 0.32020170372120454, 'sulfide': 0.15613394403395534}
(12, 10)


In [11]:
print(f'CV Logloss: {np.mean(list(cvloss.values()))}')
print(f'VALID FM LogLoss: {mloss_avg}')
print(f'CV logLoss Label')
print(cvloss)

CV Logloss: 0.27035427450675076
VALID FM LogLoss: 1.3671033893386526
CV logLoss Label
{'basalt': 0.2808329837457333, 'carbonate': 0.2396414000642748, 'chloride': 0.2944534149635854, 'iron_oxide': 0.3627412891834739, 'oxalate': 0.003807048659824649, 'oxychlorine': 0.28098086045920684, 'phyllosilicate': 0.4096872796425961, 'silicate': 0.3550628205936525, 'sulfate': 0.32020170372120454, 'sulfide': 0.15613394403395534}
