In [2]:
import os, random, math

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import lightgbm as lgb

import librosa
import librosa.display

from scipy.stats import skew, kurtosis
from sklearn.cross_validation import StratifiedKFold
from prettytable import PrettyTable
from tqdm import tqdm_notebook, tqdm_pandas
tqdm_notebook().pandas(smoothing=0.7)

import IPython
import IPython.display as ipd

import matplotlib as mpl
import kaggle_util
from util import *

# Debug switch: any truthy value shrinks the run to a quick smoke test.
DEBUG = 1

# (number of CV folds, max boosting rounds): full-size values unless DEBUG is on.
nfold, nround = (2, 5) if DEBUG else (5, 1500)

%matplotlib inline



HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.





In [3]:
# Debug runs read only the first 1000 rows of each CSV; None reads the whole file.
nrows = 1000 if DEBUG else None

train = kaggle_util.reduce_mem_usage(pd.read_csv('../data/train_mel.csv', nrows=nrows))
test = kaggle_util.reduce_mem_usage(pd.read_csv('../data/test_mel.csv', nrows=nrows))

# One-hot encode the targets. The column order of `y` defines the integer
# class ids: class id k corresponds to column k of `y`.
y = pd.get_dummies(train.label)
labels = y.columns.values
y_label = list(y.values.argmax(axis=1))

# LABELS has to use the SAME ordering as the class ids. The previous
# `train.label.unique()` returned labels in order of first appearance, which
# does not match the get_dummies column order, so LABELS[k] did not name
# class k. NOTE(review): ensemble() presumably maps prediction column k to
# LABELS[k] -- confirm in util.
LABELS = list(labels)
n_categories = len(LABELS)

# Strip the non-feature columns. The test frame is dropped once and reused
# for both the feature names and the raw matrix.
train = train.drop(['fname', 'label', 'manually_verified'], axis=1)
test_features = test.drop(['fname', 'label'], axis=1)
feature_names = list(test_features.columns.values)
test = test_features.values

 34%|███▍      | 138/402 [00:00<00:00, 1370.65it/s]

Memory usage of dataframe is 3.07 MB


100%|██████████| 402/402 [00:00<00:00, 1557.74it/s]
 34%|███▍      | 137/401 [00:00<00:00, 1361.35it/s]

Memory usage after optimization is: 0.78 MB
Decreased by 74.4%
Memory usage of dataframe is 3.06 MB


100%|██████████| 401/401 [00:00<00:00, 1583.03it/s]


Memory usage after optimization is: 0.78 MB
Decreased by 74.4%


In [4]:
PREDICTION_FOLDER = '../result/predictions/lgb'
# makedirs also creates missing parent directories (os.mkdir raised when
# '../result/predictions' did not exist yet) and exist_ok makes re-runs a no-op.
os.makedirs(PREDICTION_FOLDER, exist_ok=True)

# The hyper-parameters are the same for every fold, so build them once.
params = {
    'boosting_type': 'gbdt',
    'objective': 'multiclass',
    'metric': 'multi_logloss',
    'max_depth': 5,
    'num_leaves': 31,
    'learning_rate': 0.025,
    'feature_fraction': 0.85,
    'bagging_fraction': 0.85,
    'bagging_freq': 5,
    'num_threads': os.cpu_count(),
    'lambda_l2': 1.0,
    'min_gain_to_split': 0,
    'num_class': n_categories,
}

# y_label already holds the argmax of every one-hot row of `y`; index it per
# fold instead of recomputing the argmax row by row.
class_ids = np.asarray(y_label)

cvscores = []
# NOTE(review): this is the legacy sklearn.cross_validation call style
# (labels passed to the constructor, n_folds=...); it only works with the
# scikit-learn version that still ships that module.
skf = StratifiedKFold(y_label, n_folds=nfold)
for i, (train_split, val_split) in enumerate(skf):
    X_train = train.iloc[train_split].values
    y_train = list(class_ids[train_split])
    X_valid = train.iloc[val_split].values
    y_valid = list(class_ids[val_split])

    print(X_train.shape, X_valid.shape)

    d_train = lgb.Dataset(X_train, label=y_train, feature_name=feature_names)
    d_valid = lgb.Dataset(X_valid, label=y_valid, feature_name=feature_names)

    # The held-out fold drives early stopping and is also the fold that is scored.
    clf = lgb.train(params, d_train, num_boost_round=nround,
                    valid_sets=d_valid, verbose_eval=100,
                    early_stopping_rounds=100)
    p = clf.predict(X_valid, num_iteration=clf.best_iteration)

    # get_valid_score is provided by util (star import); its metric is not
    # visible in this notebook.
    valid_score = get_valid_score(y_valid, p)
    print("Score = {:.4f}".format(valid_score))
    cvscores.append(valid_score)

    # Save this fold's test-set class probabilities as p<fold>.npy.
    pre_test = clf.predict(test, num_iteration=clf.best_iteration)
    np.save(os.path.join(PREDICTION_FOLDER, 'p{}.npy'.format(i)), pre_test)

(489, 399) (511, 399)
Training until validation scores don't improve for 100 rounds.
Did not meet early stopping. Best iteration is:
[5]	valid_0's multi_logloss: 3.56912
Score = 0.3092
(511, 399) (489, 399)
Training until validation scores don't improve for 100 rounds.
Did not meet early stopping. Best iteration is:
[5]	valid_0's multi_logloss: 3.5467
Score = 0.3231


In [5]:
# Summarise the per-fold validation scores collected by the training loop.
fold_scores = np.asarray(cvscores)
cvmean = fold_scores.mean()
cvstd = fold_scores.std()
print('mean {0:.3f} std {1:.3f}'.format(cvmean, cvstd))

# The rounded CV mean/std string is passed to ensemble() as its prefix argument.
actual_prefix = '{:.2f}_{:.2f}'.format(cvmean, cvstd)
ensemble(LABELS, nfold, [PREDICTION_FOLDER], actual_prefix, 'lgb', False)

mean 0.316 std 0.007
ensemble...
      sub0
sub0   1.0
save result


In [None]:
# Display the number of label categories (also used as LightGBM's num_class).
n_categories

In [None]:
def mel_spectral_features(fname=None, root=None, n_mels=32, return_fnames=False):
    """Summarise one audio file as per-band statistics of its mel spectrogram.

    Parameters
    ----------
    fname : str, optional
        File name, appended to ``root`` by plain string concatenation.
    root : str, optional
        Path prefix for ``fname`` (it must already end with a separator).
    n_mels : int
        Number of mel bands.
    return_fnames : bool
        If True, no audio is loaded and only the feature names are returned.

    Returns
    -------
    list of str
        When ``return_fnames`` is True: ``6 * n_mels`` names of the form
        ``mel_<band>_<stat>``, grouped by statistic in the order
        mean, std, min, max, skew, kurt.
    pandas.Series
        Otherwise: the ``6 * n_mels`` feature values in the same order, or a
        Series of zeros of that length if the file could not be processed.
    """
    stat_names = ('mean', 'std', 'min', 'max', 'skew', 'kurt')
    # Statistic-major order, matching the hstack layout built below.
    feature_names = ['mel_{}_{}'.format(band, stat)
                     for stat in stat_names
                     for band in range(n_mels)]

    if return_fnames:
        return feature_names

    try:
        # sr=None keeps the file's native sampling rate.
        data, fs = librosa.core.load(root + fname, sr=None)
        n_fft = 2048
        stft = librosa.stft(data, n_fft=n_fft, hop_length=512)
        # Keyword arguments: newer librosa releases no longer accept these
        # positionally, while older releases accept both forms.
        mel_basis = librosa.filters.mel(sr=fs, n_fft=n_fft, n_mels=n_mels)
        s = np.dot(mel_basis, np.abs(stft)**2.0)
        M = librosa.power_to_db(s, ref=np.max)

        # axis=1 reduces across the columns of M, leaving one value per mel band.
        data_row = np.hstack((np.mean(M, axis=1), np.std(M, axis=1), np.min(M, axis=1),
                              np.max(M, axis=1), skew(M, axis=1), kurtosis(M, axis=1)))

        return pd.Series(data_row)

    except Exception:
        # Best effort: one unreadable file must not abort a whole apply() run.
        # Exception (rather than a bare except) lets KeyboardInterrupt and
        # SystemExit propagate, so a long run can still be cancelled.
        print("Bad file at {}".format(fname))
        return pd.Series([0]*len(feature_names))

In [None]:
# Load only the first 10 rows of ../data/train_seg.csv for a quick look.
train_seg = pd.read_csv('../data/train_seg.csv', nrows=10)

In [None]:
# NOTE(review): train_df is not defined in any cell visible here; on a fresh
# kernel this cell raises NameError unless it is created elsewhere -- confirm.
train_df

In [None]:
# Run mel_spectral_features on every file name, with a tqdm progress bar.
# NOTE(review): neither train_df nor train_root is defined in any visible cell -- confirm.
train_df['fname'].progress_apply(mel_spectral_features, root=train_root)

In [None]:
# Show the generated feature names; no audio file is loaded on this path.
mel_spectral_features(return_fnames=True)