In [1]:
from glob import glob
import librosa
import librosa.display
import soundfile as sf
import numpy as np
import pandas as pd
import sys
import time
import datetime
from tqdm import tqdm
from concurrent import futures

import random

import torch
import torch.nn as nn
import sys
sys.path.append('..')
from libs import transform as tr
import warnings
warnings.filterwarnings("ignore")

In [2]:
train_tp = pd.read_csv('../../data/train_tp.csv')
train_fp = pd.read_csv('../../data/train_fp.csv')
tp_files = ['../../data/train/'+i+'.flac' for i in train_tp.recording_id]
test_files = glob('../../data/test/*.flac')

In [3]:
test_data_params = {
    'melspectrogram_parameters': {
        'n_mels': 128,
        'fmin': 20,
        'fmax': 24000,
    },
    'pcen_parameters': {
        'gain': 0.98,
        'bias': 2,
        'power': 0.5,
        'time_constant': 0.4,
        'eps': 0.000001,
    }
}

In [4]:
def normalize_melspec(X: np.ndarray):
    eps = 1e-6
    mean = X.mean()
    X = X - mean
    std = X.std()
    Xstd = X / (std + eps)
    norm_min, norm_max = Xstd.min(), Xstd.max()
    if (norm_max - norm_min) > eps:
        V = Xstd
        V[V < norm_min] = norm_min
        V[V > norm_max] = norm_max
        V = 255 * (V - norm_min) / (norm_max - norm_min)
        V = V.astype(np.uint8)
    else:
        # Just zero
        V = np.zeros_like(Xstd, dtype=np.uint8)
    return V

In [5]:
def create_mel(y, sr, fp, tp, params):
    img_size = 168
    
    melspec = librosa.feature.melspectrogram(
        y,
        sr=sr,
        fmin=0,
        fmax=15000,
        n_mels=128,
        n_fft=2048, 
        hop_length=512,
    )
    
    f, m = melspec.shape
    time_rate = m / 60
    freq_rate = f / 15000
    
    tp_f_min = int(tp.iloc[0]['f_min'] * freq_rate)
    tp_f_max = int(tp.iloc[0]['f_max'] * freq_rate)
    tp_t_min = int(tp.iloc[0]['t_min'] * time_rate)
    tp_t_max = int(tp.iloc[0]['t_max'] * time_rate)
    
    for i in range(len(fp)):
        sample = fp.iloc[i]
        f_min = int(sample['f_min'] * freq_rate)
        f_max = int(sample['f_max'] * freq_rate)
        t_min = int(sample['t_min'] * time_rate)
        t_max = int(sample['t_max'] * time_rate)
        
        if tp_f_min < f_min < tp_f_max or tp_f_min < f_max < tp_f_max:
            melspec[f_min:f_max, t_min:t_max] = 1e-4
        else:
            melspec[f_min:f_max, :] = 1e-4
        
    pcen = librosa.pcen(melspec, sr=sr, **params)
    clean_mel = librosa.power_to_db(melspec ** 1.5)
    melspec = librosa.power_to_db(melspec)
    
    norm_melspec = normalize_melspec(melspec)
    norm_pcen = normalize_melspec(pcen)
    norm_clean_mel = normalize_melspec(clean_mel)
    
    image = np.stack([norm_melspec, norm_pcen, norm_clean_mel], axis=-1)
        
    return image

In [6]:
params = {
    'gain': 0.98,
    'bias': 2,
    'power': 0.5,
    'time_constant': 0.4,
    'eps': 0.000001,
}


def convert_tensor(path):
    recording_id = path.split('.flac')[0].split('/')[-1]
    
    fp = train_fp.query(f'recording_id == "{recording_id}"')
    tp = train_tp.query(f'recording_id == "{recording_id}"')
    
    y, sr = librosa.load(path, sr=None)
    
    transform = tr.Compose([
        tr.OneOf([
            tr.GaussianNoiseSNR(min_snr=10),
            tr.PinkNoiseSNR(min_snr=10)
        ]),
        #tr.PitchShift(max_steps=2, sr=sr),
        #tr.TimeStretch(),
        #tr.TimeShift(sr=sr),
        tr.VolumeControl(mode="sine")
    ])
    
    y = transform(y)
    
    mel = create_mel(y, sr, fp, tp, params)
    
    torch.save(mel, path.replace('/train/', '/train_noise_v2/').replace('.flac', '.tensor'))
    
def convert_freq(path, target='train'):
    new_path = path.replace(f"/{target}/", f"/{target}_32000/")
    y, orig_sr = sf.read(path)
    y = librosa.resample(y, orig_sr=orig_sr, target_sr=32000, res_type="kaiser_best")
    sf.write(new_path, y, 32000)

In [7]:
with futures.ProcessPoolExecutor(max_workers=15) as executor:
    c = 0
    for f in executor.map(convert_freq, tp_files):
        c += 1
        print(f'\r{c}/{len(train_tp)}', end='')

1216/1216

In [7]:
transform = tr.Compose([
        #tr.OneOf([
        #    tr.GaussianNoiseSNR(min_snr=5),
        #    tr.PinkNoiseSNR(min_snr=5)
        #]),
        tr.PitchShift(max_steps=2, sr=48000),
        #tr.TimeStretch(),
        #tr.TimeShift(sr=sr),
        tr.VolumeControl(mode="sine")
    ])

def convert_freq(path, target='test'):
    for i in range(3):
        new_path = path.replace(f"/{target}/", f"/test_tta/tta_{i}/")
        y, orig_sr = sf.read(path)
        
        y = transform(y)

        sf.write(new_path, y, orig_sr)

with futures.ProcessPoolExecutor(max_workers=15) as executor:
    c = 0
    for f in executor.map(convert_freq, test_files):
        c += 1
        print(f'\r{c}/{len(test_files)}', end='')

1992/1992

In [None]:
def create_test_mel(y, sr, params):
    img_size = 168
    
    melspec = librosa.feature.melspectrogram(
        y,
        sr=sr,
        fmin=0,
        fmax=15000,
        n_mels=128
    )
    
    pcen = librosa.pcen(melspec, sr=sr, **params)
    clean_mel = librosa.power_to_db(melspec ** 1.5)
    melspec = librosa.power_to_db(melspec)
    
    norm_melspec = normalize_melspec(melspec)
    norm_pcen = normalize_melspec(pcen)
    norm_clean_mel = normalize_melspec(clean_mel)
    
    image = np.stack([norm_melspec, norm_pcen, norm_clean_mel], axis=-1)
    return image

In [9]:
params = {
    'gain': 0.98,
    'bias': 2,
    'power': 0.5,
    'time_constant': 0.4,
    'eps': 0.000001,
}

def convert_test_tensor(path):
    recording_id = path.split('.flac')[0].split('/')[-1]
    
    y, sr = librosa.load(path, sr=None)
    
    mel = create_test_mel(y, sr, params)
    
    torch.save(mel, path.replace('/test/', '/test_wo_fp/').replace('.flac', '.tensor'))

In [10]:
with futures.ProcessPoolExecutor(max_workers=15) as executor:
    c = 0
    for f in executor.map(convert_test_tensor, test_files):
        c += 1
        print(f'\r{c}/{len(test_files)}', end='')

1992/1992

In [11]:
y, sr = librosa.load(tp_files[0], sr=None)
melspec = librosa.feature.melspectrogram(y[0:sr*10], sr=sr, **test_data_params['melspectrogram_parameters'])
pcen = librosa.pcen(melspec, sr=sr, **test_data_params['pcen_parameters'])
clean_mel = librosa.power_to_db(melspec ** 1.5)
melspec = librosa.power_to_db(melspec)
        
norm_melspec = normalize_melspec(melspec)
norm_pcen = normalize_melspec(pcen)
norm_clean_mel = normalize_melspec(clean_mel)
image = np.stack([norm_melspec, norm_pcen, norm_clean_mel], axis=-1)
        
image.shape

(128, 938, 3)

In [12]:
torch.tensor(melspec).size()

torch.Size([128, 938])

In [13]:
1216 / 24

50.666666666666664

In [14]:
train_tp['is_nocall'] = False
train_tp#['f_max'].hist()

Unnamed: 0,recording_id,species_id,songtype_id,t_min,f_min,t_max,f_max,is_nocall
0,003bec244,14,1,44.5440,2531.250,45.1307,5531.25,False
1,006ab765f,23,1,39.9615,7235.160,46.0452,11283.40,False
2,007f87ba2,12,1,39.1360,562.500,42.2720,3281.25,False
3,0099c367b,17,4,51.4206,1464.260,55.1996,4565.04,False
4,009b760e6,10,1,50.0854,947.461,52.5293,10852.70,False
...,...,...,...,...,...,...,...,...
1211,fe8d9ac40,13,1,53.4720,93.750,54.0960,843.75,False
1212,fea6b438a,4,1,43.5787,2531.250,45.7653,4031.25,False
1213,ff2eb9ce5,0,1,15.2267,5906.250,16.0213,8250.00,False
1214,ffb8d8391,5,1,14.3467,4781.250,16.6987,10406.20,False


In [15]:
no_call = train_tp.sample(frac=0.04)
no_call['is_nocall'] = True
no_call['species_id'] = 24
pd.concat([train_tp, no_call]).to_csv('add_no_call.csv')

In [16]:
train_tp = train_tp.drop(['is_none'], axis=1)

KeyError: "['is_none'] not found in axis"

In [None]:
def get_10sec(path, t_min, t_max):
    y, sr = librosa.load(path, sr=None)
    
    annotated_duration = t_max - t_min
        
    if annotated_duration > 10:
        limit_sec = t_max - 10
        start_sec = random.randint(t_min, limit_sec)
        end_sec = start_sec + 10
            
        start = start_sec * sr
        end = end_sec * sr
    else:
        res_time = 10 - annotated_duration
        front_limit = res_time if res_time < t_min else t_min
        
        front_time = random.randint(0, front_limit)
            
        back_limit = 60 - t_max
        
        tmp_time = res_time - front_time
        back_time = tmp_time if tmp_time < back_limit else back_limit
        
        if not tmp_time < back_limit:
            front_time += tmp_time - back_limit
            
        start = (t_min - front_time) * sr
        end = (t_max + back_time) * sr
        
    return end - start
        
for i in range(len(train_tp)):
    df = train_tp.iloc[i,:]
    path = '../../data/train/'+ df['recording_id'] + '.flac'
    t_min = int(round(df['t_min']))
    t_max = int(round(df['t_max']))
    
    print(t_min, t_max)
    
    print(get_10sec(path, t_min, t_max))

In [None]:
import torch
a = torch.tensor([
    [
        [1,2,3]
    ],
    [
        [1,2,3]
    ],
    [
        [1,2,3]
    ]
])

In [None]:
a.size()

In [None]:
import numpy as np
mixup_alpha = 0.2
random_state = np.random.RandomState(123)

for i in range(10):
    print(random_state.beta(mixup_alpha, mixup_alpha, 1)[0])

In [None]:
a = list(range(64))
random.shuffle(a)