Following the DrivenData benchmark post, I will create a dummy classifier as a baseline and then use log-loss to evaluate the relative performance of the models.

In [1]:
# Import the necessary libraries 
import numpy as np
import pandas as pd 
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import make_scorer, log_loss
from sklearn.model_selection import StratifiedKFold, cross_val_score

This is a multi-label classification problem with 10 label classes. To start, we take the "one vs. all" approach: we fit a separate binary classifier for each label class and then combine the per-label results. We evaluate each model with K-fold cross-validation and choose a fixed random seed so that every model is compared on exactly the same folds.

In [37]:
# Scorer that wraps log-loss so it can be passed as `scoring=` to the
# cross-validation helpers. It is computed from predicted probabilities, and
# greater_is_better=False makes scikit-learn negate the loss, so the reported
# scores are negative and values closer to 0 are better.
# NOTE(review): recent scikit-learn releases replace `needs_proba` with
# `response_method="predict_proba"` - confirm against the installed version.
log_loss_scorer = make_scorer(
    log_loss,
    needs_proba=True,
    greater_is_better=False,
)

# Function to check the cross-validated log-loss score of a model, label by label
def log_loss_check(clf, X, y, cv=None):
    """Cross-validate ``clf`` separately on every label column of ``y``.

    Parameters
    ----------
    clf : estimator
        Unfitted classifier handed to ``cross_val_score``; it is scored with
        the module-level ``log_loss_scorer``.
    X : pandas.DataFrame
        Feature table. It is converted to a numpy array with ``.values``
        before being passed to scikit-learn.
    y : pandas.DataFrame
        One column per label; each column is used in turn as the target of
        its own binary problem.
    cv : int, cross-validation splitter, iterable or None, default=None
        Forwarded unchanged to ``cross_val_score``. ``None`` keeps
        scikit-learn's default splitting, which is what this function always
        used before the parameter existed. Pass a seeded splitter such as
        ``StratifiedKFold(n_splits=5, shuffle=True, random_state=0)`` to
        evaluate several models on exactly the same folds.

    Returns
    -------
    dict
        Maps each column name of ``y`` to the mean of its fold scores. The
        values are negative because ``log_loss_scorer`` is created with
        ``greater_is_better=False``.
    """
    # The feature matrix is the same for every label, so convert it once.
    features = X.values

    # Go through the labels one by one and average the per-fold scores.
    return {
        label: np.mean(
            cross_val_score(clf, features, y[label], scoring=log_loss_scorer, cv=cv)
        )
        for label in y.columns
    }




In [40]:
# Make a new train_features CSV file from the new labels that Nikolas generated in the EDA
# Load the augmented train labels file
train_labels_aug = pd.read_csv("Data/train_labels_augmented.csv")
# Take the last 20 columns (the values from feature engineering) and index them
# by sample_id so that they share the same index as the train labels.
# set_index returns a new frame here, so the column slice of train_labels_aug
# is never mutated in place.
train_features = train_labels_aug.iloc[:, -20:].set_index(train_labels_aug["sample_id"])
# Write new csv with just the train features in it
train_features.to_csv("Data/train_features_EDA.csv")
train_features.head()


Unnamed: 0_level_0,1_ratio,1_ratio_peak_temp,1_ratio_peak_abund,1_ratio_meanstd_abund,2_ratio,2_ratio_peak_temp,2_ratio_peak_abund,2_ratio_meanstd_abund,3_ratio,3_ratio_peak_temp,3_ratio_peak_abund,3_ratio_meanstd_abund,4_ratio,4_ratio_peak_temp,4_ratio_peak_abund,4_ratio_meanstd_abund,5_ratio,5_ratio_peak_temp,5_ratio_peak_abund,5_ratio_meanstd_abund
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
S0000,18.0,103.666,1.0,0.380121,17.0,103.666,0.276675,0.10522,16.0,103.666,0.024563,0.009736,28.0,35.42,0.010877,0.009101,19.0,105.458,0.0066,0.002841
S0001,44.0,497.404,1.0,0.218379,2.0,-60.37,0.061658,0.028689,32.0,-60.37,0.033209,0.024006,16.0,497.404,0.093082,0.020585,18.0,-60.37,0.030234,0.019214
S0002,18.0,145.673,1.0,0.344421,17.0,145.673,0.254002,0.08777,32.0,545.304,0.294738,0.07585,16.0,545.304,0.040446,0.013012,1.0,145.673,0.004003,0.001801
S0003,18.0,120.72,1.0,0.237834,44.0,593.222,0.485557,0.147348,17.0,120.72,0.260027,0.061956,28.0,593.222,0.081701,0.031606,16.0,593.222,0.083077,0.028375
S0004,18.0,142.874,1.0,0.344214,32.0,471.733,0.595652,0.164283,17.0,142.874,0.23829,0.082294,16.0,471.733,0.084732,0.026218,44.0,333.273,0.029998,0.013378


In [38]:
# Create the dummy classifier that serves as the baseline model
dummy_clf = DummyClassifier(strategy="prior")

# Load the train labels, indexed by sample_id
train_labels = pd.read_csv("Data/train_labels.csv", index_col="sample_id")

# Make sure the train label and train features indices match, so that each
# feature row is scored against the labels of the same sample
assert train_features.index.equals(train_labels.index), (
    "train_features and train_labels must share the same sample_id index"
)

# Use the function to get the (negated) log-loss for each label
log_loss_check(dummy_clf, train_features, train_labels)

{'basalt': -0.36831286060392643,
 'carbonate': -0.38382649881939435,
 'chloride': -0.37354720816378506,
 'iron_oxide': -0.5320955856383575,
 'oxalate': -0.16727014935915463,
 'oxychlorine': -0.548065082036397,
 'phyllosilicate': -0.6275978315252589,
 'silicate': -0.4011910670361959,
 'sulfate': -0.4952211648059204,
 'sulfide': -0.1918610162489621}