In [161]:
pip -q install ISLP

Note: you may need to restart the kernel to use updated packages.


In [162]:
pip -q install l0bnb

Note: you may need to restart the kernel to use updated packages.


In [163]:
import numpy as np
import pandas as pd
from matplotlib.pyplot import subplots
from statsmodels.api import OLS
import sklearn.model_selection as skm
import sklearn.linear_model as skl
from sklearn.preprocessing import StandardScaler
from ISLP import load_data
from ISLP.models import ModelSpec as MS
from functools import partial
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.cross_decomposition import PLSRegression
from ISLP.models import \
     (Stepwise,
      sklearn_selected,
      sklearn_selection_path)
# from l0bnb import fit_path


In [164]:
metadata = pd.read_csv('../Data/metadata.csv')

metadata.head()

Unnamed: 0,sample_id,split,instrument_type,features_path,features_md5_hash
0,S0000,train,commercial,train_features/S0000.csv,017b9a71a702e81a828e6242aa15f049
1,S0001,train,commercial,train_features/S0001.csv,0d09840214054d254bd49436c6a6f315
2,S0002,train,commercial,train_features/S0002.csv,3f58b3c9b001bfed6ed4e4f757083e09
3,S0003,train,commercial,train_features/S0003.csv,e9a12f96114a2fda60b36f4c0f513fb1
4,S0004,train,commercial,train_features/S0004.csv,b67603d3931897bfa796ac42cc16de78


### Labels
The files whose title contains 'labels' are presumably used to train and validate the model. They tell you what a given sample actually consists of. The measurements associated to each sample are contained in the features folders. We combine the labels files into one and try to understand their properties.

In [165]:
train_labels = pd.read_csv('../Data/train_labels.csv')
supplemental = pd.read_csv('../Data/supplemental_metadata.csv')
val_labels = pd.read_csv('../Data/val_labels.csv')
train_labels_2 = pd.read_csv('../Data/train_labels2.csv')
val_labels2 = pd.read_csv('../Data/val_labels2.csv')
submission_format = pd.read_csv("../Data/submission_format.csv")


We will check that val_labels and val_labels_2 are the same:

In [166]:
val_labels.equals(val_labels2)

True

Same for train_labels and train_labels_2

In [167]:
train_labels.equals(train_labels_2)

True

So we may ignore the '2' versions. We join the two files.

In [168]:
frames = [train_labels,val_labels]
combined_labels = pd.concat(frames)

Check that there are no duplicates

In [169]:
for i in combined_labels.duplicated(subset='sample_id',keep = 'first'):
    if i == True:
        print("found duplicate")

So it appears that there aren't any.

In [170]:
print("The shape of the combined labels files is",combined_labels.shape)

n = 5

print("The last",n,"lines of the combined file are")
combined_labels.tail(n)

The shape of the combined labels files is (1047, 11)
The last 5 lines of the combined file are


Unnamed: 0,sample_id,basalt,carbonate,chloride,iron_oxide,oxalate,oxychlorine,phyllosilicate,silicate,sulfate,sulfide
288,S1054,0,0,0,0,0,0,0,0,0,0
289,S1055,0,0,0,0,0,0,0,1,0,0
290,S1056,0,0,0,0,1,0,0,0,0,0
291,S1057,0,1,0,0,0,0,0,0,0,0
292,S1058,0,0,0,0,0,0,1,0,0,0


Note that the sample at the end is marked as 'S1058' but we only have 1047 rows, so not all numbers between S0000 and S1058 are here.

In [171]:
from sklearn.model_selection import train_test_split
labels_train, labels_test = train_test_split(combined_labels, test_size=.2,random_state=42,shuffle=True)

print (labels_test.shape)

(210, 11)


In [172]:
print ("The numbers of nulls in labels_test is",labels_test.isnull().sum().sum())
print ("The numbers of nulls in labels_train is",labels_train.isnull().sum().sum())



The numbers of nulls in labels_test is 0
The numbers of nulls in labels_train is 0


### The actual measurement files
We load one of the measurement files just to see what is in there. 

In [173]:
S0749 = pd.read_csv('../Data/train_features/S0749.csv')

Then we check if there are null values

In [174]:
S0749.isnull().sum(axis = 0)

time         0
temp         0
m/z          0
abundance    0
dtype: int64

So there aren't any

#### Loading sample files
The following code attempts to load various files into a Pandas series consisting of dataframes. It runs, although with some warnings that I do not quite understand at the moment.

In [175]:
# Choose the number of samples you want to load. It might make sense to not load all of them at once if there are possible memory issues.
num_samples = 10

# Create a series, which intially consists of integers 0,..,num_samples and is indexed by the names of the samples.
df_collection = pd.Series(range(num_samples), index = labels_train.sample_id.values[0:num_samples])

# The index i corresponds to the i-th sample

for i,sample_id_no in enumerate(labels_train.sample_id.head(num_samples)):
      
    # There are two folders containing samples named as SXXXX, presumably corresponding to the inital splitting in train/test. 
    # This code first checks the first folder and then the second.
    # It might make sense to combine both of them into one and avoid the "try" command
    try:
        df_collection[i] = pd.read_csv("../Data/train_features/"+str(sample_id_no)+".csv")
    except:
        df_collection[i] = pd.read_csv("../Data/val_features/"+str(sample_id_no)+".csv")

# Check the top few lines of, say, the 3rd dataframe
df_collection.iloc[3].head(10)

  df_collection[i] = pd.read_csv("../Data/train_features/"+str(sample_id_no)+".csv")
0         0.00   30.005   0.0  1.359419e-10
1         0.00   30.005   1.0  2.143780e-10
2         0.00   30.005   2.0  5.823261e-11
3         0.00   30.005   3.0  1.288442e-10
4         0.00   30.005   4.0  1.195885e-07
...        ...      ...   ...           ...
31895  1657.15  999.651  95.0  2.310026e-13
31896  1657.15  999.651  96.0  2.573921e-13
31897  1657.15  999.651  97.0  2.233591e-13
31898  1657.15  999.651  98.0  2.203500e-13
31899  1657.15  999.651  99.0  2.261479e-13

[31900 rows x 4 columns]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  df_collection[i] = pd.read_csv("../Data/train_features/"+str(sample_id_no)+".csv")
  df_collection[i] = pd.read_csv("../Data/val_features/"+str(sample_id_no)+".csv")
  df_collection[i] = pd.read_csv("../Data/train_features/"+str(sample_id_no)+".csv")
  df_collection[i] = pd.read_csv("../Data/train_features/"+str(s

Unnamed: 0,time,temp,m/z,abundance
0,0.0,29.965,0.0,6.142533e-09
1,0.0,29.965,1.0,9.506487e-09
2,0.0,29.965,2.0,3.685148e-09
3,0.0,29.965,3.0,6.109329e-09
4,0.0,29.965,4.0,4.062397e-07
5,0.0,29.965,5.0,5.997939e-10
6,0.0,29.965,6.0,8.601567e-12
7,0.0,29.965,7.0,9.934129e-13
8,0.0,29.965,8.0,3.433641e-12
9,0.0,29.965,9.0,5.151772e-13
