# Multiclass logarithmic loss

In [14]:
import pandas as pd

# Read the training data from the working directory; the bare last
# expression displays the raw (string) label column.
train = pd.read_csv('train.csv')
train.loc[:, 'target']

0        Class_2
1        Class_1
2        Class_1
3        Class_4
4        Class_2
          ...   
99995    Class_1
99996    Class_2
99997    Class_3
99998    Class_2
99999    Class_3
Name: target, Length: 100000, dtype: object

In [15]:
from sklearn.preprocessing import LabelEncoder

# Fit the encoder on the distinct label strings; `fit` returns the encoder
# itself, so construction and fitting can be chained.
le = LabelEncoder().fit(train['target'].unique())
le.classes_

array(['Class_1', 'Class_2', 'Class_3', 'Class_4'], dtype=object)

In [16]:
# Encode every training label with the fitted encoder; each string is
# replaced by its integer position in `le.classes_`.
target = le.transform(train['target'])
target

array([1, 0, 0, ..., 2, 1, 2])

The metric used in the competition is defined exactly like `sklearn.metrics.log_loss`.

# Baseline model: majority class

In [17]:
from sklearn.metrics import log_loss

In [26]:
import numpy as np

# Distinct encoded labels and the number of training rows carrying each one.
labels, frequencies = np.unique(target, return_counts=True)
labels, frequencies

(array([0, 1, 2, 3]), array([ 8490, 57497, 21420, 12593]))

In [51]:
# Share of the training rows that belongs to each encoded class.
frequencies / train.shape[0]

array([0.0849 , 0.57497, 0.2142 , 0.12593])

Calculated value:
* y = class 1 (57.5% of cases): logloss = -log(1) = 0
* y != class 1 (42.5% of cases): logloss = -log(10^-15) = 34.5 (the predicted probability 0 is clipped to 10^-15)

In [69]:
# `math` is imported here because no earlier cell visible in this notebook
# imports it; without it this cell fails with NameError on a fresh kernel.
import math

# Expected loss when always predicting encoded class 1 with probability 1:
# rows of class 1 contribute 0, every other row contributes -log(1e-15),
# where 1e-15 stands in for a predicted probability of exactly 0.
# NOTE(review): the clipping value used by sklearn's log_loss may depend on
# the installed version — confirm it still matches the sklearn result below.
print(f'logloss:{-math.log(10**-15)*(1-frequencies[1]/len(train))}')

logloss:14.68001613112889


Calculate with sklearn

In [71]:
# Every row puts the full probability mass on encoded class 1.
majority_pred = [[0, 1, 0, 0]] * len(train)
log_loss(target, majority_pred, labels=[0, 1, 2, 3])

14.68001613112889

# Baseline model: a priori probabilities

Calculated value:
* y = class 0 (8.49% of cases): logloss = -log(0.0849) = 2.47
* y = class 1 (57.5% of cases): logloss = -log(0.57497) = 0.55
* ..    

In [81]:
# Predicted distribution = relative class frequencies (one value per class).
pred = frequencies / train.shape[0]

In [82]:
# Entropy of the class-frequency vector: each class contributes -log(p),
# weighted by its own frequency p. Expects `pred` to be the 1-D vector of
# class frequencies.
entropy = -np.sum(pred * np.log(pred))
print(f'logloss:{entropy}')

logloss:1.118576829482654


In [83]:
# Repeat the class-frequency vector once per training row so that it has the
# (n_samples, n_classes) layout passed to log_loss.
# Note: this rebinds `pred` from a 1-D vector to a 2-D matrix.
n_rows = len(train)
pred = np.tile(frequencies / n_rows, (n_rows, 1))
pred

array([[0.0849 , 0.57497, 0.2142 , 0.12593],
       [0.0849 , 0.57497, 0.2142 , 0.12593],
       [0.0849 , 0.57497, 0.2142 , 0.12593],
       ...,
       [0.0849 , 0.57497, 0.2142 , 0.12593],
       [0.0849 , 0.57497, 0.2142 , 0.12593],
       [0.0849 , 0.57497, 0.2142 , 0.12593]])

In [84]:
# Cross-check the hand-computed value with sklearn's implementation.
log_loss(y_true=target, y_pred=pred, labels=[0, 1, 2, 3])

1.1185768294826535