## Environment

In [1]:
# (Re)load the autoreload extension; mode 2 re-imports modules before each cell
# is executed, so edits to the imported project code are picked up live.
%reload_ext autoreload
%autoreload 2

# Render matplotlib figures inline in the notebook, using the 'retina' figure format.
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [2]:
# Change main system path to be able to run code from src folder.
# The notebook may be launched from `<root>/notebooks` or `<root>/techdoc/content`;
# in both cases the launch sub-directory is stripped from sys.path[0].
import sys

p = sys.path[0]

# Launch sub-directories to strip, in Mac OS (POSIX) and Windows spelling.
# Backslashes are fully escaped: '\c' is not a valid escape sequence and
# triggers a Deprecation/SyntaxWarning on recent Python versions.
launch_suffixes = (
    '/notebooks',
    '/techdoc/content',
    '\\notebooks',
    '\\techdoc\\content',
)

# Default to the unchanged path so that `main_path` is always defined, e.g. when
# the cell is re-run after the suffix was already stripped or the kernel was
# started from another directory (previously this raised a NameError).
main_path = p
for suffix in launch_suffixes:
    if p.endswith(suffix):
        main_path = p[:-len(suffix)]
        break

sys.path[0] = main_path

In [3]:
import os, gc, itertools
from termcolor import colored
from tqdm import tqdm

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss, f1_score
from sklearn.model_selection import cross_val_predict
from sklearn.neighbors import KNeighborsClassifier
import xgboost as xgb
from scipy.signal import find_peaks
from scipy.ndimage.filters import gaussian_filter1d
from sklearn.metrics import auc

from src import (config, describe_data, features,
                 preprocess, training)

  from pandas import MultiIndex, Int64Index


# Data Load

In [4]:
# ===== LOAD DATA ======
# All source tables are read from config.DATA_DIR. Paths are built with
# os.path.join (as in the later cells) so a missing trailing separator in
# the configured directory cannot produce a wrong file name.
metadata = pd.read_csv(os.path.join(config.DATA_DIR, 'metadata.csv'))
print(f'Metadata: {metadata.shape}')

train_labels = pd.read_csv(os.path.join(config.DATA_DIR, 'train_labels.csv'))
print(f'Train labels: {train_labels.shape}')

# Validation labels (the message previously said "Train labels" by mistake)
valid_labels = pd.read_csv(os.path.join(config.DATA_DIR, 'val_labels.csv'))
print(f'Valid labels: {valid_labels.shape}')

submission = pd.read_csv(os.path.join(config.DATA_DIR, 'submission_format.csv'))
print(f'Submission: {submission.shape}')

Metadata: (1570, 5)
Train labels: (766, 11)
Train labels: (293, 11)
Submission: (804, 11)


In [5]:
# ===== MODEL SAMPLES ======
# One metadata frame per value of the `split` column, each re-indexed from 0.
train = metadata.loc[metadata['split'].eq('train')].reset_index(drop=True)
print(f'TRAIN: {train.shape}')

valid = metadata.loc[metadata['split'].eq('val')].reset_index(drop=True)
print(f'VALID: {valid.shape}')

test = metadata.loc[metadata['split'].eq('test')].reset_index(drop=True)
print(f'TEST: {test.shape}')

TRAIN: (766, 5)
VALID: (293, 5)
TEST: (511, 5)


In [6]:
# ===== FILE PATHS OF SAMPLES =====
# Each dict maps the metadata row index to the sample's `features_path`.
train_files = metadata.loc[metadata['split'] == 'train', 'features_path'].to_dict()
valid_files = metadata.loc[metadata['split'] == 'val', 'features_path'].to_dict()
test_files = metadata.loc[metadata['split'] == 'test', 'features_path'].to_dict()
# Train & Valid files
trva_files = {**train_files, **valid_files}
# Valid & Test files
all_test_files = {**valid_files, **test_files}

# Ion type list: 0.0, 1.0, ..., 99.0 with 4.0 left out
ion_list = [ion for ion in np.arange(0, 100, 1.0) if ion != 4.0]

# Names of the target columns: every label column except the sample identifier
target_labels_list = [col for col in train_labels.columns if col != 'sample_id']
print(target_labels_list)

['basalt', 'carbonate', 'chloride', 'iron_oxide', 'oxalate', 'oxychlorine', 'phyllosilicate', 'silicate', 'sulfate', 'sulfide']


# Notes

- How long does it take for the temperature to rise to a certain level where we get abundance?
- How to account for the non-linear differences of time vs temp?
- Each type of ion as a feature, make a matrix and append to the training dataset
- Max value of abundance of each ion - take temp reading and time - or duration
- How to define characteristics of peaks in ion abundances: the temp at which they occur, the shape of peaks (height, width, area)
- How to deal with overlapping peaks?
- Number of peaks, size of the peaks
- Can we do some target encoding?

# SAMPLES

## `combo_maxabund_peaks_slope_topions`

In [27]:
# ===== TRAIN SAMPLE =====
# Processed metadata from DATA_DIR_OUT; its `top_*` columns are appended below.
# NOTE: this rebinds `metadata`, replacing the frame loaded from config.DATA_DIR.
metadata = pd.read_csv(os.path.join(config.DATA_DIR_OUT, 'metadata.csv'))
# Positional slice: assumes the first `train.shape[0]` rows are the train samples,
# in the same order as the rows of the feature file - TODO confirm.
metadata_upt = metadata.iloc[:train.shape[0], :].copy()
combo_maxabund_peaks_slope = pd.read_csv(os.path.join(config.DATA_DIR_OUT, 'combo_maxabund_peaks_slope.csv'))
assert metadata_upt.shape[0] == combo_maxabund_peaks_slope.shape[0]
print(f'Shape: {metadata_upt.shape}')

# Append the `top_*` metadata columns to the feature matrix. The metadata slice
# is re-indexed from 0 so the column-wise concat aligns rows by position.
fts = [col for col in metadata_upt if col.startswith('top_')]
combo_maxabund_peaks_slope_topions = pd.concat(
    [combo_maxabund_peaks_slope, metadata_upt[fts].reset_index(drop=True)],
    axis=1,
)
# Save the sample. Directory and file name are passed to os.path.join as separate
# arguments (they were concatenated before, so no separator was ever inserted).
combo_maxabund_peaks_slope_topions.to_csv(
    os.path.join(config.DATA_DIR_OUT, 'combo_maxabund_peaks_slope_topions.csv'),
    index=False,
)
print(f'Sample shape: {combo_maxabund_peaks_slope_topions.shape}')
combo_maxabund_peaks_slope_topions.head(2)

Shape: (766, 8)
Sample shape: (766, 1984)


Unnamed: 0,Ion_0.0_-100_0,Ion_0.0_0_100,Ion_0.0_100_200,Ion_0.0_200_300,Ion_0.0_300_400,Ion_0.0_400_500,Ion_0.0_500_600,Ion_0.0_600_700,Ion_0.0_700_800,Ion_0.0_800_900,...,peak_abund_94.0,peak_abund_95.0,peak_abund_96.0,peak_abund_97.0,peak_abund_98.0,peak_abund_99.0,slope_tt,top_1,top_2,top_3
0,0.0,0.004085,0.004641,0.001394,0.000188,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.195083,0.18,0.17,0.16
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.167294,0.44,0.16,0.02


In [39]:
# ===== TRAIN & VALID SAMPLE (_trvl) =====
# Processed metadata from DATA_DIR_OUT; its `top_*` columns are appended below.
# NOTE: this rebinds `metadata`, replacing the frame loaded from config.DATA_DIR.
metadata = pd.read_csv(os.path.join(config.DATA_DIR_OUT, 'metadata.csv'))
# Positional slice: assumes the first `len(trva_files)` rows are the train and
# valid samples, in the same order as the rows of the feature file - TODO confirm.
metadata_upt = metadata.iloc[:len(trva_files), :].copy()
combo_maxabund_peaks_slope_trvl = pd.read_csv(os.path.join(config.DATA_DIR_OUT, 'combo_maxabund_peaks_slope_trvl.csv'))
assert metadata_upt.shape[0] == combo_maxabund_peaks_slope_trvl.shape[0]
print(f'Shape: {metadata_upt.shape}')

# Append the `top_*` metadata columns to the feature matrix. The metadata slice
# is re-indexed from 0 so the column-wise concat aligns rows by position.
fts = [col for col in metadata_upt if col.startswith('top_')]
combo_maxabund_peaks_slope_topions_trvl = pd.concat(
    [combo_maxabund_peaks_slope_trvl, metadata_upt[fts].reset_index(drop=True)],
    axis=1,
)
# Save the sample. Directory and file name are passed to os.path.join as separate
# arguments (they were concatenated before, so no separator was ever inserted).
combo_maxabund_peaks_slope_topions_trvl.to_csv(
    os.path.join(config.DATA_DIR_OUT, 'combo_maxabund_peaks_slope_topions_trvl.csv'),
    index=False,
)
print(f'Sample shape: {combo_maxabund_peaks_slope_topions_trvl.shape}')
combo_maxabund_peaks_slope_topions_trvl.head(2)

Shape: (1059, 8)
Sample shape: (1059, 1984)


Unnamed: 0,Ion_0.0_-100_0,Ion_0.0_0_100,Ion_0.0_100_200,Ion_0.0_200_300,Ion_0.0_300_400,Ion_0.0_400_500,Ion_0.0_500_600,Ion_0.0_600_700,Ion_0.0_700_800,Ion_0.0_800_900,...,peak_abund_94.0,peak_abund_95.0,peak_abund_96.0,peak_abund_97.0,peak_abund_98.0,peak_abund_99.0,slope_tt,top_1,top_2,top_3
0,0.0,0.004085,0.004641,0.001394,0.000188,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.195083,0.18,0.17,0.16
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.167294,0.44,0.16,0.02


In [41]:
# ===== VALID & TEST SAMPLE (_VT) =====
# Load the processed metadata here as well (same file the train cells read), so
# this cell does not silently depend on `metadata` having been rebound earlier.
metadata = pd.read_csv(os.path.join(config.DATA_DIR_OUT, 'metadata.csv'))
# Positional slice: assumes every row after the first `train.shape[0]` belongs to
# the valid/test samples, in the same order as the feature file - TODO confirm.
metadata_upt = metadata.iloc[train.shape[0]:, :].copy()
combo_maxabund_peaks_VT = pd.read_csv(os.path.join(config.DATA_DIR_OUT, 'combo_maxabund_peaks_slope_VT.csv'))
assert metadata_upt.shape[0] == combo_maxabund_peaks_VT.shape[0]
print(f'Shape: {metadata_upt.shape}')

# Append the `top_*` metadata columns to the feature matrix. The metadata slice
# keeps its original row labels, so it is re-indexed from 0 to align by position.
fts = [col for col in metadata_upt if col.startswith('top_')]
combo_maxabund_peaks_slope_topions_VT = pd.concat(
    [combo_maxabund_peaks_VT, metadata_upt[fts].reset_index(drop=True)],
    axis=1,
)
assert combo_maxabund_peaks_slope_topions_VT.shape[0] == len(all_test_files)
print(f'Sample shape: {combo_maxabund_peaks_slope_topions_VT.shape}')
# Save the sample. Directory and file name are passed to os.path.join as separate
# arguments (they were concatenated before, so no separator was ever inserted).
combo_maxabund_peaks_slope_topions_VT.to_csv(
    os.path.join(config.DATA_DIR_OUT, 'combo_maxabund_peaks_slope_topions_VT.csv'),
    index=False,
)
combo_maxabund_peaks_slope_topions_VT.head(2)

Shape: (804, 8)
Sample shape: (804, 1984)


Unnamed: 0,Ion_0.0_-100_0,Ion_0.0_0_100,Ion_0.0_100_200,Ion_0.0_200_300,Ion_0.0_300_400,Ion_0.0_400_500,Ion_0.0_500_600,Ion_0.0_600_700,Ion_0.0_700_800,Ion_0.0_800_900,...,peak_abund_94.0,peak_abund_95.0,peak_abund_96.0,peak_abund_97.0,peak_abund_98.0,peak_abund_99.0,slope_tt,top_1,top_2,top_3
0,0.0,0.4101,0.593334,0.606578,0.678006,0.553591,0.596561,0.606902,0.535487,0.69642,...,0.0,0.0,0.0,0.0,0.0,0.0,0.609997,0.18,0.01,0.0
1,0.0,0.058563,0.073878,0.081069,0.083869,0.085114,0.08781,0.078234,0.079305,0.074327,...,0.0,0.0,0.0,0.0,0.0,0.0,0.60809,0.18,0.17,0.32
