In [1]:
import numpy as np
import pandas as pd
import torch
from torch.utils import data

In [2]:
from scipy.io import wavfile

In [3]:
from functools import lru_cache

In [4]:
from pathlib import Path

In [5]:
from loguru import logger

In [6]:
import utils

In [7]:
def get_dataset_meta(filename):
    """
    Collect the class names found in a metadata CSV.

    :param filename: path of a CSV file that has a ``label`` column.
    :return: ``(labels, label_dct)`` -- the distinct labels as a tuple, in
        order of first appearance, and a dict mapping each label to its
        position in that tuple.
    """
    label_column = pd.read_csv(filename)['label']
    labels = tuple(label_column.unique())
    label_dct = dict(zip(labels, range(len(labels))))
    return labels, label_dct

In [8]:
def get_test_data(filename, seed=None):
    """
    Load the metadata CSV with a boolean ``test`` column marking a hold-out
    split of one tenth of the rows.

    The split is cached next to ``filename`` as ``train_validate.csv``. When
    that file already exists it is read and returned unchanged (``seed`` is
    then ignored), so a cache written by an earlier run is never regenerated.

    :param filename: path of the metadata CSV to split.
    :param seed: optional seed for the random split; ``None`` draws from
        NumPy's global random state.
    :return: DataFrame with the rows of ``filename`` plus a ``test`` column
        that is True for exactly ``len(data) // 10`` distinct rows.
    """
    res_filename = Path(filename).parent / 'train_validate.csv'
    if res_filename.exists():
        logger.info(f'Reading from {res_filename}')
        return pd.read_csv(res_filename)

    data = pd.read_csv(filename)
    num_data = len(data)
    num_test = num_data // 10
    rng = np.random if seed is None else np.random.RandomState(seed)
    # Take a prefix of a permutation so the chosen rows are distinct. The
    # earlier np.random.choice call sampled with replacement, so repeated
    # draws made the hold-out set smaller than requested.
    test_index = rng.permutation(num_data)[:num_test]
    test_bool = np.zeros(num_data, dtype=bool)
    test_bool[test_index] = True
    data['test'] = test_bool
    # index=False keeps the cached file's columns identical to the returned
    # frame; writing the index would add an "Unnamed: 0" column on re-read.
    data.to_csv(res_filename, index=False)
    return data

In [9]:
class Dataset(data.Dataset):
    """
    Map-style dataset pairing each audio file listed in a metadata frame
    with its integer class id.

    :param root: directory that contains the audio files.
    :param data_frame: DataFrame with ``fname`` and ``label`` columns.
    :param input_length: forwarded unchanged as the second argument of
        ``data_loader``.
    :param label_dct: mapping from label name to integer class id.
    :param data_loader: callable ``(path, input_length) -> sample``; when
        omitted, ``utils.data_loader`` is used.
    """

    def __init__(self,
                 root,
                 data_frame,
                 input_length,
                 label_dct,
                 data_loader=None):
        if data_loader is None:
            # NOTE(review): the previous default was a bare `data_loader`
            # name that no visible cell defines (NameError on a fresh
            # kernel). It is assumed to live in the imported `utils`
            # module -- confirm. Resolving it here, not in the signature,
            # keeps the class definable either way.
            data_loader = utils.data_loader
        self.root = Path(root)
        self.input_length = input_length
        self.__raw_data = data_frame
        self.label_dct = label_dct
        self.data_loader = data_loader

    @property
    def filenames(self):
        """
        :return: the ``fname`` column of the metadata frame.
        """
        # Deliberately not wrapped in lru_cache: caching a method keys the
        # cache on `self` and keeps every instance alive for the life of the
        # process, while the column lookup itself is cheap.
        return self.__raw_data.fname

    def __len__(self):
        """
        :return: number of rows in the metadata frame.
        """
        return self.__raw_data.shape[0]

    def __getitem__(self, index):
        """
        :param index: zero-based row position.
        :return: ``(sample, label)`` where ``sample`` is whatever
            ``data_loader`` returns for the row's file and ``label`` is a
            scalar ``torch.long`` tensor holding the class id.
        :raises IndexError: if ``index`` is out of range.
        """
        # Positional lookup: works whatever index the frame carries, and an
        # out-of-range position raises IndexError, which is what Python's
        # sequence iteration protocol needs in order to stop.
        record = self.__raw_data.iloc[index]
        file_name = self.root / record.fname
        sample = self.data_loader(file_name, self.input_length)
        label = self.label_dct[record.label]
        # torch.long is the dtype torch expects for class-index targets and
        # is not limited to 256 classes the way uint8 is.
        return sample, torch.tensor(label, dtype=torch.long)

In [10]:
# Single place to repoint the notebook at a different copy of FSDKaggle2018.
DATA_DIR = Path('/data/FSDKaggle2018')
# Directory holding the training audio files.
root = DATA_DIR / 'FSDKaggle2018.audio_train'
# Metadata CSV for the training set.
train_fname = DATA_DIR / 'FSDKaggle2018.meta' / 'train_post_competition.csv'

In [11]:
# Class names and the name -> integer id mapping, read from the training metadata CSV.
labels, label_dct = get_dataset_meta(train_fname)

In [12]:
# Metadata with a boolean `test` column; read from the cached train_validate.csv when it exists.
full_df = get_test_data(train_fname)

2019-06-20 14:12:45.243 | INFO     | __main__:get_test_data:7 - Reading from /data/FSDKaggle2018/FSDKaggle2018.meta/train_validate.csv


In [13]:
# Training rows are the ones not flagged for the hold-out split, renumbered 0..n-1.
train_df = full_df.loc[full_df['test'] == False].reset_index(drop=True)

In [14]:
# Hold-out rows are the ones flagged by the `test` column, renumbered 0..n-1.
test_df = full_df.loc[full_df['test'] == True].reset_index(drop=True)

In [15]:
# input_length is 2 * 44100 -- presumably two seconds at 44.1 kHz; confirm against data_loader.
train_dataset = Dataset(root, train_df, input_length=2 * 44100, label_dct=label_dct)

In [16]:
# Same input_length as the training dataset (2 * 44100), applied to the hold-out rows.
test_dataset =  Dataset(root, test_df, input_length=2 * 44100, label_dct=label_dct)

In [17]:
# NOTE(review): batch_size is not referenced by any later visible cell; the DataLoader calls use len(...) directly.
batch_size = len(train_dataset)

In [18]:
# size_list = [train_dataset[i][0].shape for i in range(len(train_dataset))]

In [19]:
# Batch size equals the dataset size, so a single iteration yields every training sample.
train_data_iter = data.DataLoader(train_dataset, batch_size=len(train_dataset))

In [41]:
# Batch size equals the dataset size, so a single iteration yields every hold-out sample.
test_data_iter = data.DataLoader(test_dataset, batch_size=len(test_dataset))

In [38]:
def get_data(dataset):
    """
    Convert the first batch produced by ``dataset`` to NumPy arrays.

    :param dataset: iterable yielding ``(data, label)`` pairs whose members
        expose ``.numpy()``.
    :return: ``(data, label)`` of the first pair as NumPy arrays, or ``None``
        when ``dataset`` yields nothing.
    """
    nothing = object()
    first = next(iter(dataset), nothing)
    if first is nothing:
        return None
    dt, label = first
    return dt.numpy(), label.numpy()

In [40]:
# Pull the single full-size training batch out of the loader as NumPy arrays.
train_dt, train_label = get_data(train_data_iter)

In [42]:
# Pull the single full-size hold-out batch out of the loader as NumPy arrays.
test_dt, test_label = get_data(test_data_iter)

In [22]:
import lightgbm as lgb

In [30]:
# Wrap the training arrays and their labels in a LightGBM Dataset.
train_data = lgb.Dataset(train_dt, label=train_label)

In [31]:
# One class per distinct label found in the metadata.
num_class = len(labels)

In [32]:
# Display the class count (41 in the recorded run).
num_class

41

In [33]:
# LightGBM training parameters, written as one literal.
param = {
    'num_leaves': 31,
    'objective': 'multiclass',
    'num_class': num_class,
    'metric': 'multi_logloss',
}

In [84]:
# Passed to lgb.train as its third positional argument.
num_round = 100

In [85]:
# Train on the training set only; no validation set is supplied to lgb.train.
bst = lgb.train(param, train_data, num_round)

In [90]:
from sklearn.metrics import accuracy_score

In [91]:
from sklearn.metrics import f1_score

In [87]:
# Model output for the hold-out samples; the recorded shape is (913, 41), one column per class.
prediction = bst.predict(test_dt)

In [88]:
# Predicted class id per row: the column holding the largest value.
max_index = np.argmax(prediction, axis=1)

In [89]:
# scikit-learn metrics take (y_true, y_pred). Accuracy is symmetric, so the
# value is unchanged, but the conventional order keeps this call consistent
# with asymmetric metrics such as f1_score.
accuracy_score(test_label, max_index)

0.18948521358159912

In [99]:
# Display the prediction array's shape: (samples, classes).
prediction.shape

(913, 41)

In [95]:
# Same predictions as a torch tensor, used below to recount correct predictions with torch ops.
prediction_tensor = torch.from_numpy(prediction)

In [100]:
# torch.max along dim=1 returns the per-row maximum values and their column indices.
probs, indices =  torch.max(prediction_tensor, dim=1)

In [104]:
# Display the dtype of the predicted indices (torch.int64 in the recorded run).
indices.dtype

torch.int64

In [109]:
# Cast the labels to int64 so they share the dtype of `indices` before comparing.
label_tensor = torch.from_numpy(test_label.astype(np.int64))

In [112]:
# Number of hold-out samples whose predicted class equals the label (173 in the recorded run).
torch.sum(indices == label_tensor)

tensor(173)

In [None]:
# train_dataset.   <- unfinished scratch expression (a SyntaxError if executed); commented out.