# Freesound Audio Tagging 2019

## StratifiedKFold using randomized algolitim

reference : https://www.kaggle.com/osciiart/multilabel-stratifiedkfold-by-randomized-algorithm

In [1]:
# ============= #
# import module #
# ============= #

# sys
import gc
import os
import warnings

# util
import time
import logging
import datetime
import tqdm
from pathlib import Path
from IPython.display import display
from itertools import chain

# numerical
import numpy as np
import pandas as pd
from scipy import stats

# machine learning
from sklearn.model_selection import KFold, StratifiedKFold

# visualize
import matplotlib.pyplot as plt
import seaborn as sns

# addiction
warnings.filterwarnings("ignore")

## Load Data

In [2]:
is_curated = False

In [3]:
ROOT_PATH = Path(".").absolute().parents[0]

In [4]:
train_curated_df = pd.read_csv(ROOT_PATH / "input" / "train_curated.csv")
train_curated_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4970 entries, 0 to 4969
Data columns (total 2 columns):
fname     4970 non-null object
labels    4970 non-null object
dtypes: object(2)
memory usage: 77.7+ KB


In [5]:
train_noisy_df = pd.read_csv(ROOT_PATH / "input" / "train_noisy.csv")
train_noisy_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19815 entries, 0 to 19814
Data columns (total 2 columns):
fname     19815 non-null object
labels    19815 non-null object
dtypes: object(2)
memory usage: 309.7+ KB


In [6]:
test_df = pd.read_csv(ROOT_PATH / "input" / "sample_submission.csv")
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1120 entries, 0 to 1119
Data columns (total 81 columns):
fname                                 1120 non-null object
Accelerating_and_revving_and_vroom    1120 non-null int64
Accordion                             1120 non-null int64
Acoustic_guitar                       1120 non-null int64
Applause                              1120 non-null int64
Bark                                  1120 non-null int64
Bass_drum                             1120 non-null int64
Bass_guitar                           1120 non-null int64
Bathtub_(filling_or_washing)          1120 non-null int64
Bicycle_bell                          1120 non-null int64
Burping_and_eructation                1120 non-null int64
Bus                                   1120 non-null int64
Buzz                                  1120 non-null int64
Car_passing_by                        1120 non-null int64
Cheering                              1120 non-null int64
Chewing_and_masticatio

## Split Fold

### Set parameters

In [7]:
NUM_FOLD = 5
SEED = 1116
LABELS = test_df.columns[1:].tolist()
NUM_CLASS = len(LABELS)

In [8]:
np.random.seed(SEED)

### Split Curated Train

In [9]:
train_df = train_curated_df if is_curated else train_noisy_df

In [10]:
for label in LABELS:
    train_df[label] = train_df['labels'].apply(lambda x: label in x)
display(train_df.shape)
train_df.head(10)

(19815, 82)

Unnamed: 0,fname,labels,Accelerating_and_revving_and_vroom,Accordion,Acoustic_guitar,Applause,Bark,Bass_drum,Bass_guitar,Bathtub_(filling_or_washing),...,Toilet_flush,Traffic_noise_and_roadway_noise,Trickle_and_dribble,Walk_and_footsteps,Water_tap_and_faucet,Waves_and_surf,Whispering,Writing,Yell,Zipper_(clothing)
0,00097e21.wav,Bathtub_(filling_or_washing),False,False,False,False,False,False,False,True,...,False,False,False,False,False,False,False,False,False,False
1,000b6cfb.wav,Motorcycle,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,00116cd2.wav,"Marimba_and_xylophone,Glockenspiel",False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,00127d14.wav,"Water_tap_and_faucet,Sink_(filling_or_washing)",False,False,False,False,False,False,False,False,...,False,False,False,False,True,False,False,False,False,False
4,0019adae.wav,Raindrop,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
5,001b819d.wav,Bass_guitar,False,False,False,False,False,False,True,False,...,False,False,False,False,False,False,False,False,False,False
6,001c054e.wav,"Raindrop,Trickle_and_dribble",False,False,False,False,False,False,False,False,...,False,False,True,False,False,False,False,False,False,False
7,001ceaf1.wav,"Strum,Acoustic_guitar",False,False,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
8,001f3501.wav,"Bass_drum,Hi-hat",False,False,False,False,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False
9,0020becb.wav,Harmonica,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


### Stratified KFold

#### KFold

・No Use

In [11]:
# try KFold
folds = list(KFold(n_splits=NUM_FOLD, shuffle=True, random_state=SEED).split(np.arange(len(train_df))))
train_df['fold'] = 0
for i in range(NUM_FOLD):
    train_df['fold'][folds[i][1]] = i
display(train_df.head(5))

Unnamed: 0,fname,labels,Accelerating_and_revving_and_vroom,Accordion,Acoustic_guitar,Applause,Bark,Bass_drum,Bass_guitar,Bathtub_(filling_or_washing),...,Traffic_noise_and_roadway_noise,Trickle_and_dribble,Walk_and_footsteps,Water_tap_and_faucet,Waves_and_surf,Whispering,Writing,Yell,Zipper_(clothing),fold
0,00097e21.wav,Bathtub_(filling_or_washing),False,False,False,False,False,False,False,True,...,False,False,False,False,False,False,False,False,False,0
1,000b6cfb.wav,Motorcycle,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,0
2,00116cd2.wav,"Marimba_and_xylophone,Glockenspiel",False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,4
3,00127d14.wav,"Water_tap_and_faucet,Sink_(filling_or_washing)",False,False,False,False,False,False,False,False,...,False,False,False,True,False,False,False,False,False,4
4,0019adae.wav,Raindrop,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,4


In [12]:
# Check how well the folds are stratified.
def check_fold(df):
    print("fold                                         1    2    3    4    5   total")
    print("==========================================================================")
    for label in LABELS:
        label_padded = label + " "*(40-len(label))
        dist = ": "
        for i in range(NUM_FOLD):
            dist += "{:4d} ".format(df[label][folds[i][1]].sum())
        dist += "{:4d} ".format(df[label].sum())
        print(label_padded + dist)
    label_padded = "total" + " "*(40-len("total"))
    dist = ": "
    for i in range(NUM_FOLD):
        dist += "{:4d} ".format(df[df['fold']==i].shape[0])
    dist += "{:4d} ".format(df.shape[0])
    print(label_padded + dist)

check_fold(train_df)

fold                                         1    2    3    4    5   total
Accelerating_and_revving_and_vroom      :   67   63   64   51   55  300 
Accordion                               :   57   60   68   60   55  300 
Acoustic_guitar                         :   67   66   40   60   67  300 
Applause                                :   61   56   48   58   77  300 
Bark                                    :   71   55   60   55   59  300 
Bass_drum                               :   58   54   70   62   56  300 
Bass_guitar                             :   48   71   65   57   59  300 
Bathtub_(filling_or_washing)            :   54   68   52   60   66  300 
Bicycle_bell                            :   59   65   53   72   51  300 
Burping_and_eructation                  :   54   57   66   65   58  300 
Bus                                     :   53   59   61   59   68  300 
Buzz                                    :   52   54   71   64   59  300 
Car_passing_by                          :   65   

#### Only Single KFold

In [13]:
# calculate number of positive label for each sample
train_df['num_labels'] = train_df[LABELS].values.sum(axis=1)
train_df.loc[:,['labels', 'num_labels']].head(5)

Unnamed: 0,labels,num_labels
0,Bathtub_(filling_or_washing),1
1,Motorcycle,1
2,"Marimba_and_xylophone,Glockenspiel",2
3,"Water_tap_and_faucet,Sink_(filling_or_washing)",2
4,Raindrop,1


In [14]:
# extract data sample with single label and do StratifiedKFold
train_single_df = train_df[train_df['num_labels']==1].reset_index(drop=True)
single_folds = list(StratifiedKFold(n_splits=NUM_FOLD, shuffle=True, random_state=SEED).split(
    np.arange(len(train_single_df)), train_single_df[LABELS].values.argmax(axis=1)))
train_single_df['fold'] = 0
for i in range(NUM_FOLD):
    train_single_df['fold'][single_folds[i][1]] = i
check_fold(train_single_df)

fold                                         1    2    3    4    5   total
Accelerating_and_revving_and_vroom      :   51   62   41   49   47  250 
Accordion                               :   49   43   58   59   52  261 
Acoustic_guitar                         :   51   45   52   47   54  249 
Applause                                :   47   59   47   48   53  254 
Bark                                    :   51   50   51   50   55  257 
Bass_drum                               :   37   50   44   39   30  200 
Bass_guitar                             :   42   56   46   47   44  235 
Bathtub_(filling_or_washing)            :   55   38   38   41   37  209 
Bicycle_bell                            :   41   47   46   48   39  221 
Burping_and_eructation                  :   42   44   47   47   41  221 
Bus                                     :   53   65   38   53   60  269 
Buzz                                    :   55   40   44   39   41  219 
Car_passing_by                          :   33   

In [15]:
# extract data sample with multi labels
train_multi_df = train_df[train_df['num_labels']!=1].reset_index(drop=True)

# count each label
label_counts = []
for i in range(NUM_CLASS):    
    label = LABELS[i] + " "*(40-len(LABELS[i]))
    label_counts.append(train_multi_df[LABELS[i]].sum())
    print("{:2d} {} {}".format(i, label, label_counts[i]))

 0 Accelerating_and_revving_and_vroom       50
 1 Accordion                                39
 2 Acoustic_guitar                          51
 3 Applause                                 46
 4 Bark                                     43
 5 Bass_drum                                100
 6 Bass_guitar                              65
 7 Bathtub_(filling_or_washing)             91
 8 Bicycle_bell                             79
 9 Burping_and_eructation                   79
10 Bus                                      31
11 Buzz                                     81
12 Car_passing_by                           91
13 Cheering                                 55
14 Chewing_and_mastication                  149
15 Child_speech_and_kid_speaking            49
16 Chink_and_clink                          100
17 Chirp_and_tweet                          37
18 Church_bell                              41
19 Clapping                                 52
20 Computer_keyboard                        105
21 Crackl

In [16]:
# concatenate single-label data and multi-label data
train_all_df = pd.concat([train_single_df, train_multi_df]).reset_index(drop=True)
check_fold(train_all_df)

fold                                         1    2    3    4    5   total
Accelerating_and_revving_and_vroom      :   63   74   50   57   56  300 
Accordion                               :   55   52   64   66   63  300 
Acoustic_guitar                         :   61   58   63   51   67  300 
Applause                                :   58   67   59   57   59  300 
Bark                                    :   65   57   60   52   66  300 
Bass_drum                               :   59   73   63   57   48  300 
Bass_guitar                             :   57   66   56   56   65  300 
Bathtub_(filling_or_washing)            :   76   51   63   58   52  300 
Bicycle_bell                            :   50   69   63   66   52  300 
Burping_and_eructation                  :   59   58   65   57   61  300 
Bus                                     :   61   70   42   59   68  300 
Buzz                                    :   71   53   59   56   61  300 
Car_passing_by                          :   53   

#### Optimize

In [17]:
def calc_score(df):
    score = np.zeros([5,NUM_CLASS+1])
    for i in range(5):
        score[i] = df.loc[df.fold==i, LABELS+['num_labels']].values.sum(axis=0)
    score = score.std(axis=0).mean()
    return score
score = calc_score(train_df)
print("KFold score: {:.6f}".format(calc_score(train_df)))
print("StratifiedKFold score: {:.6f}".format(calc_score(train_all_df)))

KFold score: 6.546316
StratifiedKFold score: 3.964405


In [18]:
def do_optimize(df, size, steps):
    """
    df: dataframe to optimize folds
    size: number of data to change fold
    steps: number of for loop
    """
    starttime = time.time()
    score = calc_score(df)
    for i in range(steps):
        # select index to change fold
        change_idx = np.random.choice(np.arange(df.shape[0]), size, replace=False)
        # change fold randomly
        change_fold = np.random.randint(0, NUM_FOLD, size)
        df_new = df.copy()
        df_new['fold'][change_idx] = change_fold

        score_new = calc_score(df_new)
        if score_new < score: # if score getting small, folds will be update
            score = score_new
            df = df_new
        if i%500==0:
            print("step: {:4d}, change size: {:2d}, score: {:.6f}, sec: {:.1f}".format(
                i, size, score, time.time()-starttime))
    return df

In [19]:
# Let's do optimization with randomized algorithm.
train_opt_df = train_all_df.copy()
train_opt_df = do_optimize(train_opt_df, size=64, steps=2000)
train_opt_df = do_optimize(train_opt_df, size=32, steps=2000)
train_opt_df = do_optimize(train_opt_df, size=16, steps=2000)
train_opt_df = do_optimize(train_opt_df, size=8, steps=2000)
train_opt_df = do_optimize(train_opt_df, size=4, steps=2000)
train_opt_df = do_optimize(train_opt_df, size=2, steps=2000)
train_opt_df = do_optimize(train_opt_df, size=1, steps=20000)

print("StratifiedKFold with randomized algorithm score: {:.6f}".format(calc_score(train_opt_df)))

step:    0, change size: 64, score: 3.964405, sec: 0.1
step:  500, change size: 64, score: 3.014818, sec: 38.9
step: 1000, change size: 64, score: 2.759164, sec: 74.2
step: 1500, change size: 64, score: 2.746977, sec: 108.7
step:    0, change size: 32, score: 2.696284, sec: 0.1
step:  500, change size: 32, score: 2.478926, sec: 33.5
step: 1000, change size: 32, score: 2.398053, sec: 72.0
step: 1500, change size: 32, score: 2.251907, sec: 108.0
step:    0, change size: 16, score: 2.148352, sec: 0.1
step:  500, change size: 16, score: 2.068905, sec: 29.7
step: 1000, change size: 16, score: 1.987804, sec: 59.5
step: 1500, change size: 16, score: 1.908925, sec: 89.4
step:    0, change size:  8, score: 1.795690, sec: 0.1
step:  500, change size:  8, score: 1.590419, sec: 34.8
step: 1000, change size:  8, score: 1.426721, sec: 73.2
step: 1500, change size:  8, score: 1.347338, sec: 109.1
step:    0, change size:  4, score: 1.266438, sec: 0.1
step:  500, change size:  4, score: 1.158996, sec:

In [20]:
# Check how well the folds are stratified.
print("fold                                         1    2    3    4    5   total")
print("==========================================================================")
for label in LABELS:
    label_padded = label + " "*(40-len(label))
    dist = ": "
    for i in range(NUM_FOLD):
        dist += "{:4d} ".format(train_all_df[label][train_all_df['fold']==i].sum())
    dist += "{:4d} ".format(train_all_df[label].sum())
    print(label_padded + dist)
label_padded = "total" + " "*(40-len("total"))
dist = ": "
for i in range(5):
    dist += "{:4d} ".format(train_all_df[train_all_df['fold']==i].shape[0])
dist += "{:4d} ".format(train_all_df.shape[0])
print(label_padded + dist)

fold                                         1    2    3    4    5   total
Accelerating_and_revving_and_vroom      :   64   66   59   56   55  300 
Accordion                               :   60   57   59   63   61  300 
Acoustic_guitar                         :   62   59   55   64   60  300 
Applause                                :   59   61   56   57   67  300 
Bark                                    :   62   61   60   60   57  300 
Bass_drum                               :   53   60   65   65   57  300 
Bass_guitar                             :   61   61   57   61   60  300 
Bathtub_(filling_or_washing)            :   63   58   54   64   61  300 
Bicycle_bell                            :   63   62   59   61   55  300 
Burping_and_eructation                  :   60   59   60   62   59  300 
Bus                                     :   57   60   62   58   63  300 
Buzz                                    :   57   58   66   63   56  300 
Car_passing_by                          :   73   

In [21]:
# save
train_kind = "curated" if is_curated else "noisy"
train_opt_df.to_csv(ROOT_PATH / "data" / "fold" / "train_{}_sfk.csv".format(train_kind), index=None)