## Environment

In [1]:
%reload_ext autoreload
%autoreload 2

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [2]:
# Change main system path to be able to run code from src folder
import sys
p = sys.path[0]
# Mac OS
if sys.path[0].endswith('/notebooks'):
    main_path = p[:-len('/notebooks')]
if sys.path[0].endswith('/techdoc/content'):
    main_path = p[:-len('/techdoc/content')]
    
# Windows OS
if sys.path[0].endswith('\\notebooks'): 
    main_path = p[:-len('\\notebooks')]
if sys.path[0].endswith('\\techdoc\content'): 
    main_path = p[:-len('\\techdoc\content')]

sys.path[0] = main_path

In [3]:
import os, gc, itertools
from termcolor import colored
from tqdm import tqdm

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss, f1_score
from sklearn.model_selection import cross_val_predict
from sklearn.neighbors import KNeighborsClassifier
import xgboost as xgb
from scipy.signal import find_peaks
from scipy.ndimage.filters import gaussian_filter1d
from sklearn.metrics import auc

from src import (config, describe_data, features,
                 preprocess, training)

  from pandas import MultiIndex, Int64Index


# Data Load

In [4]:
# ===== LOAD DATA ======
metadata = pd.read_csv(config.DATA_DIR + 'metadata.csv')
print(f'Metadata: {metadata.shape}')

train_labels = pd.read_csv(config.DATA_DIR + 'train_labels.csv')
print(f'Train labels: {train_labels.shape}')

valid_labels = pd.read_csv(config.DATA_DIR + 'val_labels.csv')
print(f'Train labels: {valid_labels.shape}')

submission = pd.read_csv(config.DATA_DIR + 'submission_format.csv')
print(f'Submission: {submission.shape}')

Metadata: (1570, 5)
Train labels: (766, 11)
Train labels: (293, 11)
Submission: (804, 11)


In [5]:
# ===== MODEL SAMPLES ======
train = metadata[metadata.split == 'train'].copy().reset_index(drop=True)
print(f'TRAIN: {train.shape}')

valid = metadata[metadata.split == 'val'].copy().reset_index(drop=True)
print(f'VALID: {valid.shape}')

test = metadata[metadata.split == 'test'].copy().reset_index(drop=True)
print(f'TEST: {test.shape}')

TRAIN: (766, 5)
VALID: (293, 5)
TEST: (511, 5)


In [6]:
# ===== FILE PATHS OF SAMPLES =====
train_files = metadata[metadata.split == 'train']['features_path'].to_dict()
valid_files = metadata[metadata.split == 'val']['features_path'].to_dict()
test_files = metadata[metadata.split == 'test']['features_path'].to_dict()
# Train & Valid files
trva_files = train_files.copy()
trva_files.update(valid_files)
# All files
all_test_files = valid_files.copy()
all_test_files.update(test_files)

# Ion type list
ion_list = list(np.arange(0,100,1.0))
ion_list.remove(4.0)

# Get the names of the target columns in a list
target_labels_list = [i for i in train_labels.columns if i not in ['sample_id']]
print(target_labels_list)

['basalt', 'carbonate', 'chloride', 'iron_oxide', 'oxalate', 'oxychlorine', 'phyllosilicate', 'silicate', 'sulfate', 'sulfide']


# Notes

- How long does it take for temperature to rise to certain level where we get abundance?
- How to account for the non-linear diferences of time vs temp?
- Max value of abundance of each ion - take temp reading and time - or duration
- How to define characteristic of peaks in ion abundances; temp at which they occur, the shape of peaks (height, width, area)
- How to deal with overlapping peaks
- Size of the peaks

# SAMPLES

## `metadata` with top 3 ions
- not normalized since it messes up the target encoding later on
    - *if used as a feature normalize prior to training*

In [44]:
metadata = pd.read_csv(config.DATA_DIR + 'metadata.csv')
topN_ions = features.get_topN_ions(metadata, N=3, normalize=False)
topN_ions.index = topN_ions.index.set_names('sample_id')
topN_ions = topN_ions.reset_index()
topN_ions.head()

100%|██████████| 1570/1570 [01:55<00:00, 13.55it/s]


Unnamed: 0,sample_id,top_1,top_2,top_3
0,S0000,18.0,17.0,16.0
1,S0001,44.0,16.0,2.0
2,S0002,18.0,32.0,17.0
3,S0003,18.0,44.0,17.0
4,S0004,18.0,32.0,17.0


In [45]:
# Merge to metadata
metadata = pd.merge(metadata, topN_ions, on='sample_id', how='left')
metadata[[i for i in metadata if i.startswith('top_')]] = metadata[[i for i in metadata if i.startswith('top_')]].astype('int')
print(metadata.shape)
print(metadata.head(2))

metadata.to_csv(os.path.join(config.DATA_DIR_OUT + 'metadata.csv'), 
                index=False)

(1570, 8)
  sample_id  split instrument_type             features_path  \
0     S0000  train      commercial  train_features/S0000.csv   
1     S0001  train      commercial  train_features/S0001.csv   
2     S0002  train      commercial  train_features/S0002.csv   
3     S0003  train      commercial  train_features/S0003.csv   
4     S0004  train      commercial  train_features/S0004.csv   

                  features_md5_hash  top_1  top_2  top_3  
0  017b9a71a702e81a828e6242aa15f049     18     17     16  
1  0d09840214054d254bd49436c6a6f315     44     16      2  
2  3f58b3c9b001bfed6ed4e4f757083e09     18     32     17  
3  e9a12f96114a2fda60b36f4c0f513fb1     18     44     17  
4  b67603d3931897bfa796ac42cc16de78     18     32     17  


## `fts_maxrelabund_tempion` - Max relative abundance per temp bin and ion type

Bin the temp from min to max in intervals of 100 degrees and combine with the ion type i.e. `m/z`. For each combination compute the maximum relative abundance.

In [13]:
# ===== TRAIN SAMPLE =====
print(f'Number of files: {len(train_files)}')
fts_maxrelabund_tempion = features.features_iontemp_abun(metadata, train_files, 
                                                         detrend_method='min')
print(f'Shape: {fts_maxrelabund_tempion.shape}')
assert fts_maxrelabund_tempion.shape[0] == len(train_files)
fts_maxrelabund_tempion = fts_maxrelabund_tempion.replace(np.nan, 0)
fts_maxrelabund_tempion.to_csv(os.path.join(config.DATA_DIR_OUT + 
                                            'fts_maxrelabund_tempion.csv'), 
                               index=False)

"""
fts_maxrelabund_tempion = features.features_iontemp_abun(metadata, train_files, 
                                                         detrend_method='lin_reg')
print(f'Shape: {fts_maxrelabund_tempion.shape}')
assert fts_maxrelabund_tempion.shape[0] == len(train_files)
fts_maxrelabund_tempion = fts_maxrelabund_tempion.replace(np.nan, 0)
fts_maxrelabund_tempion.to_csv(os.path.join(config.DATA_DIR_OUT + 
                                            'fts_maxrelabund_tempion_lr.csv'), 
                               index=False)

fts_maxrelabund_tempion = features.features_iontemp_abun(metadata, train_files, 
                                                         detrend_method='poly')
print(f'Shape: {fts_maxrelabund_tempion.shape}')
assert fts_maxrelabund_tempion.shape[0] == len(train_files)
fts_maxrelabund_tempion = fts_maxrelabund_tempion.replace(np.nan, 0)
fts_maxrelabund_tempion.to_csv(os.path.join(config.DATA_DIR_OUT + 
                                            'fts_maxrelabund_tempion_poly.csv'), 
                               index=False)
"""

Number of files: 766
Number of samples: 766
Shape: (766, 1584)


"\nfts_maxrelabund_tempion = features.features_iontemp_abun(metadata, train_files, \n                                                         detrend_method='lin_reg')\nprint(f'Shape: {fts_maxrelabund_tempion.shape}')\nassert fts_maxrelabund_tempion.shape[0] == len(train_files)\nfts_maxrelabund_tempion = fts_maxrelabund_tempion.replace(np.nan, 0)\nfts_maxrelabund_tempion.to_csv(os.path.join(config.DATA_DIR_OUT + \n                                            'fts_maxrelabund_tempion_lr.csv'), \n                               index=False)\n\nfts_maxrelabund_tempion = features.features_iontemp_abun(metadata, train_files, \n                                                         detrend_method='poly')\nprint(f'Shape: {fts_maxrelabund_tempion.shape}')\nassert fts_maxrelabund_tempion.shape[0] == len(train_files)\nfts_maxrelabund_tempion = fts_maxrelabund_tempion.replace(np.nan, 0)\nfts_maxrelabund_tempion.to_csv(os.path.join(config.DATA_DIR_OUT + \n                                          

In [14]:
# ===== TRAIN & VALID SAMPLE (_trvl) =====
print(f'Number of files: {len(trva_files)}')
fts_maxrelabund_tempion_trvl = features.features_iontemp_abun(metadata, trva_files, 
                                                              detrend_method='min')
print(f'Shape: {fts_maxrelabund_tempion_trvl.shape}')
assert fts_maxrelabund_tempion_trvl.shape[0] == len(trva_files)
fts_maxrelabund_tempion_trvl = fts_maxrelabund_tempion_trvl.replace(np.nan, 0)
fts_maxrelabund_tempion_trvl.to_csv(os.path.join(config.DATA_DIR_OUT + 
                                            'fts_maxrelabund_tempion_trvl.csv'), 
                index=False)

"""
fts_maxrelabund_tempion_trvl = features.features_iontemp_abun(metadata, trva_files, 
                                                              detrend_method='lin_reg')
print(f'Shape: {fts_maxrelabund_tempion_trvl.shape}')
assert fts_maxrelabund_tempion_trvl.shape[0] == len(trva_files)
fts_maxrelabund_tempion_trvl = fts_maxrelabund_tempion_trvl.replace(np.nan, 0)
fts_maxrelabund_tempion_trvl.to_csv(os.path.join(config.DATA_DIR_OUT + 
                                            'fts_maxrelabund_tempion_trvl_lr.csv'), 
                index=False)

print(f'Number of all test files: {len(trva_files)}')
fts_maxrelabund_tempion_trvl = features.features_iontemp_abun(metadata, trva_files, 
                                                              detrend_method='poly')
print(f'Shape: {fts_maxrelabund_tempion_trvl.shape}')
assert fts_maxrelabund_tempion_trvl.shape[0] == len(trva_files)
fts_maxrelabund_tempion_trvl = fts_maxrelabund_tempion_trvl.replace(np.nan, 0)
fts_maxrelabund_tempion_trvl.to_csv(os.path.join(config.DATA_DIR_OUT + 
                                            'fts_maxrelabund_tempion_trvl_poly.csv'), 
                index=False)
"""

Number of files: 1059
Number of samples: 1059
Shape: (1059, 1584)


"\nfts_maxrelabund_tempion_trvl = features.features_iontemp_abun(metadata, trva_files, \n                                                              detrend_method='lin_reg')\nprint(f'Shape: {fts_maxrelabund_tempion_trvl.shape}')\nassert fts_maxrelabund_tempion_trvl.shape[0] == len(trva_files)\nfts_maxrelabund_tempion_trvl = fts_maxrelabund_tempion_trvl.replace(np.nan, 0)\nfts_maxrelabund_tempion_trvl.to_csv(os.path.join(config.DATA_DIR_OUT + \n                                            'fts_maxrelabund_tempion_trvl_lr.csv'), \n                index=False)\n\nprint(f'Number of all test files: {len(trva_files)}')\nfts_maxrelabund_tempion_trvl = features.features_iontemp_abun(metadata, trva_files, \n                                                              detrend_method='poly')\nprint(f'Shape: {fts_maxrelabund_tempion_trvl.shape}')\nassert fts_maxrelabund_tempion_trvl.shape[0] == len(trva_files)\nfts_maxrelabund_tempion_trvl = fts_maxrelabund_tempion_trvl.replace(np.nan, 0)\nfts_

In [15]:
# ===== VALID & TEST SAMPLE =====
print(f'Number of all test files: {len(all_test_files)}')
fts_maxrelabund_tempion_VT = features.features_iontemp_abun(metadata, all_test_files, 
                                                            detrend_method='min')
print(f'Shape: {fts_maxrelabund_tempion_VT.shape}')
assert fts_maxrelabund_tempion_VT.shape[0] == len(all_test_files)
fts_maxrelabund_tempion_VT = fts_maxrelabund_tempion_VT.replace(np.nan,0)
fts_maxrelabund_tempion_VT.to_csv(os.path.join(config.DATA_DIR_OUT + 
                                            'fts_maxrelabund_tempion_VT.csv'),
                index=False)

"""
fts_maxrelabund_tempion_VT = features.features_iontemp_abun(metadata, all_test_files, 
                                                            detrend_method='lin_reg')
print(f'Shape: {fts_maxrelabund_tempion_VT.shape}')
assert fts_maxrelabund_tempion_VT.shape[0] == len(all_test_files)
fts_maxrelabund_tempion_VT = fts_maxrelabund_tempion_VT.replace(np.nan,0)
fts_maxrelabund_tempion_VT.to_csv(os.path.join(config.DATA_DIR_OUT + 
                                            'fts_maxrelabund_tempion_VT_lr.csv'),
                index=False)

fts_maxrelabund_tempion_VT = features.features_iontemp_abun(metadata, all_test_files, 
                                                            detrend_method='poly')
print(f'Shape: {fts_maxrelabund_tempion_VT.shape}')
assert fts_maxrelabund_tempion_VT.shape[0] == len(all_test_files)
fts_maxrelabund_tempion_VT = fts_maxrelabund_tempion_VT.replace(np.nan,0)
fts_maxrelabund_tempion_VT.to_csv(os.path.join(config.DATA_DIR_OUT + 
                                            'fts_maxrelabund_tempion_VT_poly.csv'),
                index=False)
"""

Number of all test files: 804
Number of samples: 804
Shape: (804, 1584)


"\nfts_maxrelabund_tempion_VT = features.features_iontemp_abun(metadata, all_test_files, \n                                                            detrend_method='lin_reg')\nprint(f'Shape: {fts_maxrelabund_tempion_VT.shape}')\nassert fts_maxrelabund_tempion_VT.shape[0] == len(all_test_files)\nfts_maxrelabund_tempion_VT = fts_maxrelabund_tempion_VT.replace(np.nan,0)\nfts_maxrelabund_tempion_VT.to_csv(os.path.join(config.DATA_DIR_OUT + \n                                            'fts_maxrelabund_tempion_VT_lr.csv'),\n                index=False)\n\nfts_maxrelabund_tempion_VT = features.features_iontemp_abun(metadata, all_test_files, \n                                                            detrend_method='poly')\nprint(f'Shape: {fts_maxrelabund_tempion_VT.shape}')\nassert fts_maxrelabund_tempion_VT.shape[0] == len(all_test_files)\nfts_maxrelabund_tempion_VT = fts_maxrelabund_tempion_VT.replace(np.nan,0)\nfts_maxrelabund_tempion_VT.to_csv(os.path.join(config.DATA_DIR_OUT + \n   

## `fts_ion_peaks` - peak analysis and stats

In [8]:
# ===== TRAIN SAMPLE =====
fts_ion_peaks = features.features_ion_peaks(train_files, metadata, ion_list, detrend_method='min')
print(fts_ion_peaks.shape)
fts_ion_peaks.head()
fts_ion_peaks.to_csv(os.path.join(config.DATA_DIR_OUT + 
                                       'fts_ion_peaks.csv'),
                index=False)

100%|██████████| 766/766 [02:19<00:00,  5.50it/s]


(766, 396)


In [10]:
# ===== TRAIN & VALID SAMPLE (_trvl) =====
fts_ion_peaks_trvl = features.features_ion_peaks(trva_files, metadata, ion_list, detrend_method='min')
print(fts_ion_peaks_trvl.shape)
fts_ion_peaks_trvl.head()
fts_ion_peaks_trvl.to_csv(os.path.join(config.DATA_DIR_OUT + 
                                       'fts_ion_peaks_trvl.csv'),
                index=False)

100%|██████████| 1059/1059 [03:07<00:00,  5.64it/s]


(1059, 396)


In [11]:
# ===== VALID & TEST SAMPLE (_VT) =====
fts_ion_peaks_VT = features.features_ion_peaks(all_test_files, metadata, ion_list, detrend_method='min')
print(fts_ion_peaks_VT.shape)
fts_ion_peaks_VT.head()
fts_ion_peaks_VT.to_csv(os.path.join(config.DATA_DIR_OUT + 
                                       'fts_ion_peaks_VT.csv'),
                index=False)

100%|██████████| 804/804 [02:23<00:00,  5.62it/s]


(804, 396)


## `combo_maxabund_peaks` - Combine max rel abund per temp_ion bin and peak analysis

In [16]:
# ===== TRAIN SAMPLE =====
combo_maxabund_peaks = pd.concat([fts_maxrelabund_tempion, fts_ion_peaks], 
                               axis = 1, 
                               ignore_index=False)
print(combo_maxabund_peaks.shape)
combo_maxabund_peaks.to_csv(os.path.join(config.DATA_DIR_OUT + 
                                               'combo_maxabund_peaks.csv'), 
                                  index=False)

(766, 1980)


In [18]:
# ===== TRAIN & VALID SAMPLE (_trvl) =====
combo_maxabund_peaks_trvl = pd.concat([fts_maxrelabund_tempion_trvl, fts_ion_peaks_trvl], 
                               axis = 1, 
                               ignore_index=False)
print(combo_maxabund_peaks_trvl.shape)
combo_maxabund_peaks_trvl.to_csv(os.path.join(config.DATA_DIR_OUT + 
                                               'combo_maxabund_peaks_trvl.csv'), 
                                  index=False)

(1059, 1980)


In [17]:
# ===== VALID & TEST SAMPLE =====
combo_maxabund_peaks_VT = pd.concat([fts_maxrelabund_tempion_VT, fts_ion_peaks_VT], 
                               axis = 1, 
                               ignore_index=False)
print(combo_maxabund_peaks_VT.shape)
combo_maxabund_peaks_VT.to_csv(os.path.join(config.DATA_DIR_OUT + 
                                               'combo_maxabund_peaks_VT.csv'), 
                                  index=False)

(804, 1980)


## `combo_maxabund_peaks_slope`

In [20]:
# ===== TRAIN SAMPLE =====
# Dict with sample_id as keys
fts_slope_timetemp = features.slope_time_temp(train_files, metadata, detrend_method='min')
combo_maxabund_peaks_slope = combo_maxabund_peaks.copy()
combo_maxabund_peaks_slope['slope_tt'] = combo_maxabund_peaks_slope.index.map(fts_slope_timetemp)
combo_maxabund_peaks_slope.to_csv(os.path.join(config.DATA_DIR_OUT + 'combo_maxabund_peaks_slope.csv'), 
                                  index=False)
combo_maxabund_peaks_slope.head()

100%|██████████| 766/766 [00:54<00:00, 13.98it/s]


Unnamed: 0_level_0,Ion_0.0_-100_0,Ion_0.0_0_100,Ion_0.0_100_200,Ion_0.0_200_300,Ion_0.0_300_400,Ion_0.0_400_500,Ion_0.0_500_600,Ion_0.0_600_700,Ion_0.0_700_800,Ion_0.0_800_900,...,peak_abund_91.0,peak_abund_92.0,peak_abund_93.0,peak_abund_94.0,peak_abund_95.0,peak_abund_96.0,peak_abund_97.0,peak_abund_98.0,peak_abund_99.0,slope_tt
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
S0000,0.0,0.004085,0.004641,0.001394,0.000188,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.195083
S0001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.167294
S0002,0.0,0.00227,0.002495,0.001688,0.000636,0.000597,0.000819,0.000155,0.000235,0.000227,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.601609
S0003,0.0,0.001336,0.002464,0.001099,0.000992,0.000676,0.000883,0.000892,0.000631,0.000361,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.609247
S0004,0.0,0.005993,0.012429,0.00938,0.005099,0.006921,0.001966,0.000499,0.00088,0.000767,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.613325


In [21]:
# ===== TRAIN & VALID SAMPLE (_trvl) =====
fts_slope_timetemp_trvl = features.slope_time_temp(trva_files, metadata, detrend_method='min')
combo_maxabund_peaks_slope_trvl = combo_maxabund_peaks_trvl.copy()
combo_maxabund_peaks_slope_trvl['slope_tt'] = combo_maxabund_peaks_slope_trvl.index.map(fts_slope_timetemp_trvl)
combo_maxabund_peaks_slope_trvl.to_csv(os.path.join(config.DATA_DIR_OUT + 'combo_maxabund_peaks_slope_trvl.csv'), 
                                  index=False)
combo_maxabund_peaks_slope_trvl.head()

100%|██████████| 1059/1059 [01:10<00:00, 15.05it/s]


Unnamed: 0_level_0,Ion_0.0_-100_0,Ion_0.0_0_100,Ion_0.0_100_200,Ion_0.0_200_300,Ion_0.0_300_400,Ion_0.0_400_500,Ion_0.0_500_600,Ion_0.0_600_700,Ion_0.0_700_800,Ion_0.0_800_900,...,peak_abund_91.0,peak_abund_92.0,peak_abund_93.0,peak_abund_94.0,peak_abund_95.0,peak_abund_96.0,peak_abund_97.0,peak_abund_98.0,peak_abund_99.0,slope_tt
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
S0000,0.0,0.004085,0.004641,0.001394,0.000188,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.195083
S0001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.167294
S0002,0.0,0.00227,0.002495,0.001688,0.000636,0.000597,0.000819,0.000155,0.000235,0.000227,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.601609
S0003,0.0,0.001336,0.002464,0.001099,0.000992,0.000676,0.000883,0.000892,0.000631,0.000361,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.609247
S0004,0.0,0.005993,0.012429,0.00938,0.005099,0.006921,0.001966,0.000499,0.00088,0.000767,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.613325


In [23]:
# ===== VALID & TEST SAMPLE (_VT) =====
fts_slope_timetemp_VT = features.slope_time_temp(all_test_files, metadata, detrend_method='min')
combo_maxabund_peaks_slope_VT = combo_maxabund_peaks_VT.copy()
combo_maxabund_peaks_slope_VT['slope_tt'] = combo_maxabund_peaks_slope_VT.index.map(fts_slope_timetemp_VT)
combo_maxabund_peaks_slope_VT.to_csv(os.path.join(config.DATA_DIR_OUT +
                                            'combo_maxabund_peaks_slope_VT.csv'),
                index=False)
combo_maxabund_peaks_slope_VT.head()

100%|██████████| 804/804 [00:57<00:00, 13.96it/s]


Unnamed: 0_level_0,Ion_0.0_-100_0,Ion_0.0_0_100,Ion_0.0_100_200,Ion_0.0_200_300,Ion_0.0_300_400,Ion_0.0_400_500,Ion_0.0_500_600,Ion_0.0_600_700,Ion_0.0_700_800,Ion_0.0_800_900,...,peak_abund_91.0,peak_abund_92.0,peak_abund_93.0,peak_abund_94.0,peak_abund_95.0,peak_abund_96.0,peak_abund_97.0,peak_abund_98.0,peak_abund_99.0,slope_tt
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
S0766,0.0,0.4101,0.593334,0.606578,0.678006,0.553591,0.596561,0.606902,0.535487,0.69642,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.609997
S0767,0.0,0.058563,0.073878,0.081069,0.083869,0.085114,0.08781,0.078234,0.079305,0.074327,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.60809
S0768,0.0,0.029256,0.040753,0.037022,0.041347,0.035408,0.036785,0.033979,0.02397,0.023692,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.612946
S0769,0.0,0.000261,0.000842,0.001313,0.001544,0.001405,0.002993,0.001424,0.000768,0.000992,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.541738
S0770,0.0,0.00346,0.003929,0.003286,0.002893,0.010054,0.003877,0.001918,0.000824,0.000323,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.61167


## `combo_maxabund_peaks_slope_topions`

(temp bins & ions at max abundance) + (#peaks, time to, temp at, abund at) + slope + (top 3 ions)

In [46]:
#TODO Normalize topions before training - integers now!!!!
# ===== TRAIN SAMPLE =====
metadata = pd.read_csv(os.path.join(config.DATA_DIR_OUT, 'metadata.csv'))
metadata_upt = metadata.iloc[:train.shape[0],:].copy()
combo_maxabund_peaks_slope = pd.read_csv(os.path.join(config.DATA_DIR_OUT, 'combo_maxabund_peaks_slope.csv'))
assert metadata_upt.shape[0] == combo_maxabund_peaks_slope.shape[0]
print(f'Shape: {metadata_upt.shape}')

combo_maxabund_peaks_slope_topions = combo_maxabund_peaks_slope.copy()
fts = [i for i in metadata_upt if i.startswith('top_')]
combo_maxabund_peaks_slope_topions = pd.concat([combo_maxabund_peaks_slope_topions, 
                                                metadata_upt[fts]],
                                               axis=1)
combo_maxabund_peaks_slope_topions.to_csv(os.path.join(config.DATA_DIR_OUT +
                                            'combo_maxabund_peaks_slope_topions.csv'),
                index=False)
print(f'Sample shape: {combo_maxabund_peaks_slope_topions.shape}')
combo_maxabund_peaks_slope_topions.head(2)

Shape: (766, 8)
Sample shape: (766, 1984)


Unnamed: 0,Ion_0.0_-100_0,Ion_0.0_0_100,Ion_0.0_100_200,Ion_0.0_200_300,Ion_0.0_300_400,Ion_0.0_400_500,Ion_0.0_500_600,Ion_0.0_600_700,Ion_0.0_700_800,Ion_0.0_800_900,...,peak_abund_94.0,peak_abund_95.0,peak_abund_96.0,peak_abund_97.0,peak_abund_98.0,peak_abund_99.0,slope_tt,top_1,top_2,top_3
0,0.0,0.004085,0.004641,0.001394,0.000188,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.195083,18,17,16
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.167294,44,16,2


In [47]:
# ===== TRAIN & VALID SAMPLE (_trvl) =====
metadata = pd.read_csv(os.path.join(config.DATA_DIR_OUT, 'metadata.csv'))
metadata_upt = metadata.iloc[:len(trva_files),:].copy()
combo_maxabund_peaks_slope_trvl = pd.read_csv(os.path.join(config.DATA_DIR_OUT, 'combo_maxabund_peaks_slope_trvl.csv'))
assert metadata_upt.shape[0] == combo_maxabund_peaks_slope_trvl.shape[0]
print(f'Shape: {metadata_upt.shape}')

combo_maxabund_peaks_slope_topions_trvl = combo_maxabund_peaks_slope_trvl.copy()
fts = [i for i in metadata_upt if i.startswith('top_')]
combo_maxabund_peaks_slope_topions_trvl = pd.concat([combo_maxabund_peaks_slope_topions_trvl, 
                                                metadata_upt[fts]],
                                               axis=1)
combo_maxabund_peaks_slope_topions_trvl.to_csv(os.path.join(config.DATA_DIR_OUT +
                                            'combo_maxabund_peaks_slope_topions_trvl.csv'),
                index=False)
print(f'Sample shape: {combo_maxabund_peaks_slope_topions_trvl.shape}')
combo_maxabund_peaks_slope_topions_trvl.head(2)

Shape: (1059, 8)
Sample shape: (1059, 1984)


Unnamed: 0,Ion_0.0_-100_0,Ion_0.0_0_100,Ion_0.0_100_200,Ion_0.0_200_300,Ion_0.0_300_400,Ion_0.0_400_500,Ion_0.0_500_600,Ion_0.0_600_700,Ion_0.0_700_800,Ion_0.0_800_900,...,peak_abund_94.0,peak_abund_95.0,peak_abund_96.0,peak_abund_97.0,peak_abund_98.0,peak_abund_99.0,slope_tt,top_1,top_2,top_3
0,0.0,0.004085,0.004641,0.001394,0.000188,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.195083,18,17,16
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.167294,44,16,2


In [48]:
# ===== VALID & TEST SAMPLE (_VT) =====
metadata_upt = metadata.iloc[train.shape[0]:,:].copy()
combo_maxabund_peaks_VT = pd.read_csv(os.path.join(config.DATA_DIR_OUT, 'combo_maxabund_peaks_slope_VT.csv'))
assert metadata_upt.shape[0] == combo_maxabund_peaks_VT.shape[0]
print(f'Shape: {metadata_upt.shape}')

combo_maxabund_peaks_slope_topions_VT = combo_maxabund_peaks_VT.copy()
fts = [i for i in metadata_upt if i.startswith('top_')]
combo_maxabund_peaks_slope_topions_VT = pd.concat([combo_maxabund_peaks_slope_topions_VT, 
                                                metadata_upt[fts].reset_index(drop=True)],
                                               axis=1)
assert combo_maxabund_peaks_slope_topions_VT.shape[0]== len(all_test_files)
print(f'Sample shape: {combo_maxabund_peaks_slope_topions_VT.shape}')
# Save the sample
combo_maxabund_peaks_slope_topions_VT.to_csv(os.path.join(config.DATA_DIR_OUT +
                                            'combo_maxabund_peaks_slope_topions_VT.csv'),
                index=False)
combo_maxabund_peaks_slope_topions_VT.head(2)

Shape: (804, 8)
Sample shape: (804, 1984)


Unnamed: 0,Ion_0.0_-100_0,Ion_0.0_0_100,Ion_0.0_100_200,Ion_0.0_200_300,Ion_0.0_300_400,Ion_0.0_400_500,Ion_0.0_500_600,Ion_0.0_600_700,Ion_0.0_700_800,Ion_0.0_800_900,...,peak_abund_94.0,peak_abund_95.0,peak_abund_96.0,peak_abund_97.0,peak_abund_98.0,peak_abund_99.0,slope_tt,top_1,top_2,top_3
0,0.0,0.4101,0.593334,0.606578,0.678006,0.553591,0.596561,0.606902,0.535487,0.69642,...,0.0,0.0,0.0,0.0,0.0,0.0,0.609997,18,1,0
1,0.0,0.058563,0.073878,0.081069,0.083869,0.085114,0.08781,0.078234,0.079305,0.074327,...,0.0,0.0,0.0,0.0,0.0,0.0,0.60809,18,17,32


## next