This notebook closely follows the [benchmark notebook](https://www.drivendata.co/blog/mars-spectrometry-benchmark/) provided by the competition organizers. It serves as a starting point for understanding the problem.

# Environment

In [1]:
%reload_ext autoreload
%autoreload 2

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [2]:
# Point sys.path[0] at the project root so that code from the `src` folder can
# be imported when the notebook is run from one of the project's sub-folders.
import sys


def find_project_root(path):
    """Return `path` with a trailing notebook sub-folder stripped off.

    Parameters
    ----------
    path : str
        Directory the notebook is executed from (normally ``sys.path[0]``).

    Returns
    -------
    str
        `path` without its trailing ``notebooks`` or ``techdoc/content``
        component (POSIX or Windows separators). If `path` ends with none of
        them it is returned unchanged, so the caller never ends up with an
        undefined project root.
    """
    known_suffixes = (
        # Mac OS / POSIX separators
        '/notebooks',
        '/techdoc/content',
        # Windows separators (every backslash escaped explicitly)
        '\\notebooks',
        '\\techdoc\\content',
    )
    for suffix in known_suffixes:
        if path.endswith(suffix):
            return path[:-len(suffix)]
    # Not inside a known sub-folder: keep the path as it is instead of failing.
    return path


p = sys.path[0]
main_path = find_project_root(p)
sys.path[0] = main_path

In [5]:
import pandas as pd
import numpy as np

from src import config

# Load the Data

In [12]:
# Load the three top-level CSV files from the data directory and report the
# (rows, columns) shape of each one.
metadata = pd.read_csv(config.DATA_DIR + 'metadata.csv')
print(f'Metadata: {metadata.shape}')

train_labels = pd.read_csv(config.DATA_DIR + 'train_labels.csv')
print(f'Train labels: {train_labels.shape}')

# This file is the submission template, not the training features, so it is
# labelled accordingly in the printed summary.
submission = pd.read_csv(config.DATA_DIR + 'submission_format.csv')
print(f'Submission format: {submission.shape}')

Metadata: (1570, 5)
Train labels: (766, 11)
Train features: (804, 11)


Check whether every `sample_id` appears only once in the metadata table.

In [13]:
# Compare the number of distinct sample ids with the number of rows;
# prints True when they match.
n_rows = len(metadata)
n_distinct_ids = metadata['sample_id'].nunique()
print(n_distinct_ids == n_rows)

True


In [14]:
# Preview the first five rows of the metadata table.
metadata.head(n=5)

Unnamed: 0,sample_id,split,instrument_type,features_path,features_md5_hash
0,S0000,train,commercial,train_features/S0000.csv,017b9a71a702e81a828e6242aa15f049
1,S0001,train,commercial,train_features/S0001.csv,0d09840214054d254bd49436c6a6f315
2,S0002,train,commercial,train_features/S0002.csv,3f58b3c9b001bfed6ed4e4f757083e09
3,S0003,train,commercial,train_features/S0003.csv,e9a12f96114a2fda60b36f4c0f513fb1
4,S0004,train,commercial,train_features/S0004.csv,b67603d3931897bfa796ac42cc16de78


In [15]:
# Number of samples in each data split.
metadata['split'].value_counts()

train    766
test     511
val      293
Name: split, dtype: int64

In [16]:
# Number of samples recorded per instrument type.
metadata['instrument_type'].value_counts()

commercial     1494
sam_testbed      76
Name: instrument_type, dtype: int64

In [15]:
# Features-file path of the first sample, selected by column name rather than
# by a positional column index so it does not depend on the column order.
file_path = metadata['features_path'].iloc[0]
file_path

'train_features/S0000.csv'

In [17]:
# Read the features file of the first sample. `file_path` is appended to the
# data directory, which is taken from the imported `config` module (a bare
# `DATA_DIR` is never defined in this notebook).
f = pd.read_csv(config.DATA_DIR + file_path)
f.head()

Unnamed: 0,time,temp,m/z,abundance
0,0.0,35.289,0.0,5.550957e-11
1,0.0,35.289,1.0,5.318589e-11
2,0.0,35.289,2.0,2.040361e-11
3,0.0,35.289,3.0,3.989464e-11
4,0.0,35.289,4.0,1.594648e-08


In [20]:
# Summary statistics (count, mean, std, min, quartiles, max) of the sample's columns.
f.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,time,temp,m/z,abundance
count,38600.0,38600.0,38600.0,38600.0
mean,985.59787,199.44213,99.5,1.297623e-10
std,572.120885,99.268459,57.735053,1.277154e-09
min,0.0,35.289,0.0,-5.866912e-14
25%,492.677,112.251,49.75,1.797645e-14
50%,985.548,199.971,99.5,4.00631e-14
75%,1478.434,285.339,149.25,1.53865e-13
max,1971.66,370.383,199.0,3.604555e-08
