In [1]:
import numpy as np
import pandas as pd

## The various CSV files in the data folder

### Metadata
We load the metadata file. It contains a list of all available samples as well as paths to the corresponding csv files containing features

In [2]:
metadata = pd.read_csv('../Data/metadata.csv')

metadata.head()

Unnamed: 0,sample_id,split,instrument_type,features_path,features_md5_hash
0,S0000,train,commercial,train_features/S0000.csv,017b9a71a702e81a828e6242aa15f049
1,S0001,train,commercial,train_features/S0001.csv,0d09840214054d254bd49436c6a6f315
2,S0002,train,commercial,train_features/S0002.csv,3f58b3c9b001bfed6ed4e4f757083e09
3,S0003,train,commercial,train_features/S0003.csv,e9a12f96114a2fda60b36f4c0f513fb1
4,S0004,train,commercial,train_features/S0004.csv,b67603d3931897bfa796ac42cc16de78


### Submission 
This file shows the expected format of a submission to the competition. All values corresponding to ions are placehoders set to 0.5, so this is not a  useful file for understanding our data.

In [3]:
submission = pd.read_csv('../Data/submission_format.csv')
submission

Unnamed: 0,sample_id,basalt,carbonate,chloride,iron_oxide,oxalate,oxychlorine,phyllosilicate,silicate,sulfate,sulfide
0,S0766,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5
1,S0767,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5
2,S0768,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5
3,S0769,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5
4,S0770,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5
...,...,...,...,...,...,...,...,...,...,...,...
735,S1501,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5
736,S1502,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5
737,S1503,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5
738,S1504,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5


### Labels
The files whose title contains 'labels' are used to train and later validate the model. They tell you what a given sample actually consists of. The measurements associated to each sample are contained in the features folders. We combine the labels files into one and try to understand their properties.

In [4]:
train_labels = pd.read_csv('../Data/train_labels.csv')
supplemental = pd.read_csv('../Data/supplemental_metadata.csv')
val_labels = pd.read_csv('../Data/val_labels.csv')
train_labels_2 = pd.read_csv('../Data/train_labels2.csv')
val_labels2 = pd.read_csv('../Data/val_labels2.csv')
submission_format = pd.read_csv("../Data/submission_format.csv")


We will check that val_labels and val_labels_2 are the same:

In [5]:
val_labels.equals(val_labels2)

True

Same for train_labels and train_labels_2

In [6]:
train_labels.equals(train_labels_2)

True

So we may ignore the '2' versions. We join the two files.

In [7]:
frames = [train_labels,val_labels]
combined_labels = pd.concat(frames)

Check that there are no duplicates

In [8]:
for i in combined_labels.duplicated(subset='sample_id',keep = 'first'):
    if i == True:
        print("found duplicate")

So it appears that there aren't any.

In [9]:
print("The shape of the combined labels files is",combined_labels.shape)

n = 5

print("The last",n,"lines of the combined file are")
combined_labels.tail(n)

The shape of the combined labels files is (1047, 11)
The last 5 lines of the combined file are


Unnamed: 0,sample_id,basalt,carbonate,chloride,iron_oxide,oxalate,oxychlorine,phyllosilicate,silicate,sulfate,sulfide
288,S1054,0,0,0,0,0,0,0,0,0,0
289,S1055,0,0,0,0,0,0,0,1,0,0
290,S1056,0,0,0,0,1,0,0,0,0,0
291,S1057,0,1,0,0,0,0,0,0,0,0
292,S1058,0,0,0,0,0,0,1,0,0,0


Note that the sample at the end is marked as 'S1058' but we only have 1047 rows, so not all numbers between S0000 and S1058 are here.

In [10]:
from sklearn.model_selection import train_test_split
labels_train, labels_test = train_test_split(combined_labels, test_size=.2,random_state=42,shuffle=True)

print (labels_test.shape)

(210, 11)


In [11]:
print ("The numbers of nulls in labels_test is",labels_test.isnull().sum().sum())
print ("The numbers of nulls in labels_train is",labels_train.isnull().sum().sum())

The numbers of nulls in labels_test is 0
The numbers of nulls in labels_train is 0


### The actual measurement files
We load one of the measurement files just to see what is in there. 

In [12]:
S0749 = pd.read_csv('../Data/train_features/S0749.csv')
S0749

Unnamed: 0,time,temp,m/z,abundance
0,0.000,30.121,0.0,1.323153e-10
1,0.000,30.121,1.0,2.051331e-10
2,0.000,30.121,2.0,5.915878e-11
3,0.000,30.121,3.0,1.324948e-10
4,0.000,30.121,4.0,1.228752e-07
...,...,...,...,...
31795,1653.308,998.628,95.0,2.211615e-13
31796,1653.308,998.628,96.0,1.769045e-13
31797,1653.308,998.628,97.0,2.203859e-13
31798,1653.308,998.628,98.0,1.788698e-13


Then we check if there are null values

In [13]:
S0749.isnull().sum(axis = 0)

time         0
temp         0
m/z          0
abundance    0
dtype: int64

So there aren't any