In [83]:
import numpy as np
import pandas as pd
import torch
from torch.utils import data

In [84]:
from scipy.io import wavfile

In [85]:
from functools import lru_cache

In [86]:
from pathlib import Path

In [275]:
from loguru import logger

In [150]:
def data_loader(fname, input_length):
    """Load a WAV file as a peak-normalised float32 tensor of fixed length.

    The samples are read with ``scipy.io.wavfile.read``. A clip longer than
    ``input_length`` is cropped to a randomly positioned window; a shorter one
    is zero-padded, with the padding split at random between head and tail.
    The result is divided by the peak absolute amplitude of the whole file
    (measured before cropping); an all-zero clip is returned unscaled.

    NOTE(review): the padding call assumes mono (1-D) audio -- confirm before
    feeding multi-channel files.

    :param fname: path of the WAV file to read.
    :param input_length: number of samples in the returned tensor.
    :return: ``torch.float32`` tensor holding ``input_length`` samples.
    """
    _, samples = wavfile.read(fname)
    # Convert before taking the absolute value: in int16, abs(-32768)
    # overflows back to -32768 and would corrupt the peak estimate.
    samples = samples.astype(np.float32)
    # ``initial`` keeps the reduction defined when the file holds no samples.
    amp = np.max(np.abs(samples), initial=0.0)

    num_samples = len(samples)
    if num_samples > input_length:
        # Every start position in [0, max_index] is valid, so the bound
        # passed to randint (exclusive) must be max_index + 1.
        max_index = num_samples - input_length
        offset = np.random.randint(max_index + 1)
        samples = samples[offset: input_length + offset]
    elif num_samples < input_length:
        total_pad = input_length - num_samples
        # Inclusive upper bound so that "all padding at the head" is possible.
        head_pad = np.random.randint(total_pad + 1)
        tail_pad = total_pad - head_pad
        samples = np.pad(samples, (head_pad, tail_pad), 'constant')

    # A silent clip has amp == 0; dividing would fill the tensor with NaN.
    if amp > 0:
        samples = samples / amp
    return torch.from_numpy(samples.copy()).float()

In [196]:
def get_dataset_meta(filename):
    """Collect the distinct class labels listed in a metadata CSV.

    :param filename: path of a CSV file that has a ``label`` column.
    :return: ``(labels, label_dct)`` -- the unique labels as a tuple, and a
        dict mapping each label to its position in that tuple.
    """
    label_column = pd.read_csv(filename).label
    labels = tuple(label_column.unique())
    # Pair every label with its position to get the name -> integer id lookup.
    label_dct = dict(zip(labels, range(len(labels))))
    return labels, label_dct

In [276]:
def get_test_data(filename):
    """Attach a random hold-out flag to the metadata, cached on disk.

    If ``train_validate.csv`` already exists next to ``filename`` it is read
    and returned unchanged. Otherwise ``filename`` is loaded, one tenth of its
    rows (rounded down) are picked at random and marked ``True`` in a new
    boolean ``test`` column, and the result is written to
    ``train_validate.csv`` so that later calls reuse the same split.

    :param filename: path of the source metadata CSV.
    :return: ``pandas.DataFrame`` carrying a boolean ``test`` column.
    """
    res_filename = Path(filename).parent / 'train_validate.csv'
    if res_filename.exists():
        logger.info(f'Reading from {res_filename}')
        return pd.read_csv(res_filename)

    frame = pd.read_csv(filename)
    num_data = len(frame)
    num_test = num_data // 10
    # Take the head of a permutation so the chosen rows are distinct; sampling
    # with replacement could hit a row twice and flag fewer than num_test rows.
    test_index = np.random.permutation(num_data)[:num_test]
    test_bool = np.zeros(num_data, dtype=bool)
    test_bool[test_index] = True
    frame['test'] = test_bool
    # Leave the index out so the cached file has the same columns as the frame
    # returned here (otherwise it reloads with an extra "Unnamed: 0" column).
    frame.to_csv(res_filename, index=False)
    return frame

In [284]:
class Dataset(data.Dataset):
    """Map-style dataset yielding ``(sample, label)`` pairs for audio clips.

    Each row of ``data_frame`` supplies a file name (``fname`` column) and a
    class name (``label`` column). The file is resolved under ``root`` and
    handed to ``data_loader`` together with ``input_length``.
    """

    def __init__(self,
                 root,
                 data_frame,
                 input_length,
                 label_dct,
                 data_loader=data_loader):
        """
        :param root: directory that contains the files named in ``fname``.
        :param data_frame: table with ``fname`` and ``label`` columns.
        :param input_length: passed to ``data_loader`` as its second argument.
        :param label_dct: mapping from label name to integer class id.
        :param data_loader: callable invoked as
            ``data_loader(path, input_length)`` to produce one sample.
        """
        self.root = Path(root)
        self.input_length = input_length
        self.__raw_data = data_frame
        self.label_dct = label_dct
        self.data_loader = data_loader

    @property
    def filenames(self):
        """The ``fname`` column of the underlying data frame."""
        # Deliberately a plain property: wrapping a method in lru_cache keeps
        # a strong reference to every instance in a module-level cache, and
        # there is nothing expensive to memoise in an attribute lookup.
        return self.__raw_data.fname

    def __len__(self):
        """
        :return: number of rows in the underlying data frame.
        """
        return len(self.__raw_data)

    def __getitem__(self, index):
        """
        :param index: zero-based position of the row to load.
        :return: ``(sample, label)`` where ``sample`` is whatever
            ``data_loader`` returns and ``label`` is a ``torch.uint8`` scalar
            tensor holding the class id.
        """
        # Positional lookup, so valid positions are exactly range(len(self))
        # even when the frame's index labels are not 0..n-1 (e.g. a filtered
        # frame whose index was never reset).
        record = self.__raw_data.iloc[index]
        file_name = self.root / record.fname
        sample = self.data_loader(file_name, self.input_length)
        label = self.label_dct[record.label]
        # NOTE(review): uint8 only represents ids 0..255; the dtype is kept so
        # the returned tensor type stays what existing callers receive.
        return sample, torch.tensor(label, dtype=torch.uint8)

In [285]:
# Base directory of the local FSDKaggle2018 copy. Both paths below derive from
# it, so relocating the dataset means editing this one line.
DATA_DIR = Path('C:/Datasets')
# Directory the audio file names in the metadata are resolved against.
root = DATA_DIR / 'FSDKaggle2018.audio_train'
# Metadata CSV used for both the label set and the train/hold-out split.
train_fname = DATA_DIR / 'FSDKaggle2018.meta' / 'train_post_competition.csv'

In [286]:
# Distinct class labels from the training metadata, plus the
# label-name -> integer-id lookup built from them.
labels, label_dct = get_dataset_meta(train_fname)

In [287]:
# Metadata with the boolean `test` hold-out column. The split is read from the
# cached train_validate.csv when that file exists next to train_fname.
full_df = get_test_data(train_fname)

2019-06-20 11:19:04.173 | INFO     | __main__:get_test_data:7 - Reading from C:\Datasets\FSDKaggle2018.meta\train_validate.csv


In [306]:
# Rows not flagged for the hold-out set, renumbered 0..n-1.
train_df = full_df.loc[full_df.test.eq(False)].reset_index(drop=True)

In [308]:
# Rows flagged for the hold-out set, renumbered 0..n-1.
test_df = full_df.loc[full_df.test.eq(True)].reset_index(drop=True)

In [309]:
# Every sample is cropped/padded to 2 * 44100 values.
# NOTE(review): presumably 2 s at 44.1 kHz -- confirm against the files' sample rate.
train_dataset = Dataset(root, train_df, input_length=2 * 44100, label_dct=label_dct)

In [310]:
# Hold-out counterpart, built with the same input_length and label mapping as
# the training dataset.
test_dataset =  Dataset(root, test_df, input_length=2 * 44100, label_dct=label_dct)

In [311]:
# Size of a single batch covering the entire training set.
# NOTE(review): no visible cell reads `batch_size`; the DataLoader cell calls
# len(train_dataset) directly.
batch_size = len(train_dataset)

In [312]:
# size_list = [train_dataset[i][0].shape for i in range(len(train_dataset))]

In [313]:
# batch_size equals the dataset length, so the loader's first batch already
# contains every training sample.
train_data_iter = data.DataLoader(train_dataset, batch_size=len(train_dataset))

In [314]:
def get_data():
    """Return the first batch of ``train_data_iter`` as NumPy arrays.

    :return: ``(data, labels)``, each converted with ``.numpy()``; ``None``
        when the loader yields no batch at all.
    """
    no_batch = object()
    first_batch = next(iter(train_data_iter), no_batch)
    if first_batch is no_batch:
        return None
    features, targets = first_batch
    return features.numpy(), targets.numpy()

In [318]:
# Materialise the first batch from the training loader as NumPy arrays
# (samples and their integer class ids).
train_dt, train_label = get_data()

In [207]:
import lightgbm as lgb

In [319]:
# LightGBM expects exactly one weight per training row, so the vector is sized
# from the labels instead of a fixed count.
# NOTE(review): the weights are random -- confirm that is intended rather than
# a placeholder.
w = np.random.rand(len(train_label))
train_data = lgb.Dataset(train_dt, label=train_label, weight=w)

In [320]:
param = {
    'num_leaves': 31,
    'objective': 'multiclass',
    # The multiclass objective requires the number of classes; take it from
    # the label set so it always matches the ids produced by label_dct.
    'num_class': len(labels),
    'metric': 'multi_logloss',
}

In [None]:
# Number of boosting iterations to run.
num_round = 10
bst = lgb.train(params=param, train_set=train_data, num_boost_round=num_round)