In [1]:
import pandas as pd
import numpy as np
import librosa
import torch

In [2]:
# Root directories of the audio recordings and of the metadata CSV files.
# NOTE(review): absolute, machine-specific paths — adjust before running elsewhere.
root_audio = "/home/data/kbh/DCASE2022_SELD_synth_data/foa/"
root_meta = "/home/data/kbh/DCASE2022_SELD_synth_data/metadata/"

# Base file name (without extension) shared by the .wav and .csv inspected below.
target = "fold1_room10_mix001"

In [3]:
# Metadata hop.
# NOTE(review): the "_s" suffix suggests seconds, but 100 reads as milliseconds;
# the align_meta_* functions default to hop_meta_s=0.1 and are never given this
# global — confirm the intended unit before passing it to them.
hop_meta_s = 100
# Frame shift (hop) in samples; equals n_fft // 4.
shift = 256
# FFT size passed to the STFT.
n_fft = 1024
# Sample rate in Hz used when loading the audio.
fs = 24000

In [4]:
# Read the metadata; column names are supplied through `names`, so the first
# line of the CSV is treated as data rather than as a header.
df = pd.read_csv(root_meta+target + ".csv",names=["idx","class","order","azimuth","elevation"])
display(df.head())

# Load the recording at `fs` and keep every channel (mono=False), giving an
# array shaped (channels, samples).
raw,_ = librosa.load(root_audio+target+".wav",sr=fs,mono=False)
print("raw : {}".format(raw.shape))

# STFT of the first channel only. The hop is passed explicitly so the number of
# spectrogram frames stays tied to `shift` (the hop the label alignment uses)
# instead of relying on librosa's implicit default hop.
spec = librosa.stft(raw[0,:],n_fft=n_fft,hop_length=shift,center=True)
print("spec : {}".format(spec.shape))

Unnamed: 0,idx,class,order,azimuth,elevation
0,10,4,0,-131,0
1,11,4,0,-135,0
2,12,4,0,-139,0
3,13,4,0,-143,-1
4,14,4,0,-147,-1


raw : (4, 1440000)
spec : (513, 5626)


## frame별 라벨로 변환

In [8]:
def align_meta_pt(df,
               n_sample,
               shift=256,
               n_fft=1024,
               hop_meta_s=0.1,
               fs=24000,
               max_n_target = 6
              ):
    """Expand metadata rows into a per-frame label tensor.

    A metadata row with frame index ``i`` is written to the output frames
    ``[int(i * ratio), int((i + 1) * ratio))`` where
    ``ratio = fs * hop_meta_s / shift``. Rows that share a metadata frame are
    stacked into consecutive target slots of the same output frames.

    Args:
        df: DataFrame read by column position: column 0 is the metadata frame
            index, column 1 the class, column 3 the azimuth and column 4 the
            elevation. Rows are visited in positional order, so the index
            labels of ``df`` do not matter.
        n_sample: Number of audio samples; determines the number of frames.
        shift: Frame hop in samples.
        n_fft: Unused; kept so existing callers keep working.
        hop_meta_s: Metadata hop in seconds.
        fs: Sample rate in Hz.
        max_n_target: Number of target slots per frame.

    Returns:
        Float tensor of shape ``(n_frame, max_n_target, 3)`` whose last axis
        is ``[class, azimuth, elevation]``. Unused slots keep class ``-1``
        and zero angles.

    Raises:
        IndexError: If a frame already holds ``max_n_target`` events, or if a
            row starts beyond the last output frame.
    """
    hop_meta = fs*hop_meta_s
    ratio = hop_meta/shift
    # NOTE(review): when n_sample is not a multiple of shift this is one frame
    # more than 1 + n_sample // shift — confirm against the STFT framing in use.
    n_frame = int(np.ceil(n_sample/shift)+1)

    out = torch.zeros(n_frame,max_n_target,3) # 3[class,azimuth,elevation]
    out[:,:,0] = -1 # class -1 marks an empty slot

    # Pull the needed columns out once, by position, instead of calling
    # df.iloc with index labels for every row.
    rows = df.iloc[:, [0, 1, 3, 4]].to_numpy()

    for frame, cls, azimuth, elevation in rows:
        idx_start = int(frame*ratio)
        idx_end = int((frame+1)*ratio)

        # First slot that is still empty at the starting frame.
        free = torch.nonzero(out[idx_start,:,0] == -1)
        if free.numel() == 0:
            raise IndexError(
                "more than {} simultaneous targets at frame {}".format(max_n_target, idx_start)
            )
        slot = int(free[0, 0])

        out[idx_start:idx_end,slot,0] = float(cls)
        out[idx_start:idx_end,slot,1] = float(azimuth)
        out[idx_start:idx_end,slot,2] = float(elevation)

    return out
# Build the per-frame label tensor for the loaded recording
# (raw.shape[1] is the number of samples per channel) and show frames 370-379.
output = align_meta_pt(df,raw.shape[1])
display(output[370:380])

tensor([[[   4.,  113.,   -1.],
         [  -1.,    0.,    0.],
         [  -1.,    0.,    0.],
         [  -1.,    0.,    0.],
         [  -1.,    0.,    0.],
         [  -1.,    0.,    0.]],

        [[   4.,  113.,   -1.],
         [  -1.,    0.,    0.],
         [  -1.,    0.,    0.],
         [  -1.,    0.,    0.],
         [  -1.,    0.,    0.],
         [  -1.,    0.,    0.]],

        [[   4.,  113.,   -1.],
         [  -1.,    0.,    0.],
         [  -1.,    0.,    0.],
         [  -1.,    0.,    0.],
         [  -1.,    0.,    0.],
         [  -1.,    0.,    0.]],

        [[   4.,  113.,   -1.],
         [  -1.,    0.,    0.],
         [  -1.,    0.,    0.],
         [  -1.,    0.,    0.],
         [  -1.,    0.,    0.],
         [  -1.,    0.,    0.]],

        [[   4.,  113.,   -1.],
         [  -1.,    0.,    0.],
         [  -1.,    0.,    0.],
         [  -1.,    0.,    0.],
         [  -1.,    0.,    0.],
         [  -1.,    0.,    0.]],

        [[   4.,  109.,   -1.]

## 입력 포맷과 유사하게 변환

In [20]:
def align_meta_csv(df,
               n_sample,
               shift=256,
               n_fft=1024,
               hop_meta_s=0.1,
               fs=24000
              ):
    """Expand metadata rows into one row per frame, keeping a CSV-like layout.

    A metadata row with frame index ``i`` is repeated for every frame in
    ``[int(i * ratio), int((i + 1) * ratio))`` where
    ``ratio = fs * hop_meta_s / shift``; the remaining columns are copied
    unchanged.

    Args:
        df: DataFrame whose column 0 is the metadata frame index, followed by
            the class, order, azimuth and elevation columns. Rows are visited
            in positional order, so the index labels of ``df`` do not matter.
        n_sample: Unused; kept so existing callers keep working.
        shift: Frame hop in samples.
        n_fft: Unused; kept so existing callers keep working.
        hop_meta_s: Metadata hop in seconds.
        fs: Sample rate in Hz.

    Returns:
        DataFrame with columns ``frame, class, order, azimuth, elevation`` and
        a fresh 0..N-1 index. The row position is NOT the frame number: frames
        without events are absent and frames with several events occupy
        several rows.
    """
    columns = ["frame","class","order","azimuth","elevation"]
    hop_meta = fs*hop_meta_s
    ratio = hop_meta/shift

    # Collect plain tuples and build the frame once; concatenating inside the
    # loop copies the accumulated result on every iteration.
    records = []
    for row in df.itertuples(index=False, name=None):
        idx_start = int(row[0]*ratio)
        idx_end = int((row[0]+1)*ratio)
        for frame in range(idx_start, idx_end):
            records.append((frame, *row[1:]))

    return pd.DataFrame(records, columns=columns)
# Expand the metadata of the loaded recording to one row per frame and show
# rows 270-299 of the result.
output = align_meta_csv(df,raw.shape[1])
display(output.iloc[270:300])

Unnamed: 0,frame,class,order,azimuth,elevation
270,363,4,0,117,-1
271,364,4,0,117,-1
272,365,4,0,113,-1
273,366,4,0,113,-1
274,367,4,0,113,-1
275,368,4,0,113,-1
276,369,4,0,113,-1
277,370,4,0,113,-1
278,371,4,0,113,-1
279,372,4,0,113,-1


index와 frame이 일치하지 않아서(예: index 270 → frame 363) 라벨로 바로 쓸 수는 없게 됨

In [24]:
# Append a copy of `output` to itself and renumber the index from 0.
# NOTE(review): this rebinds `output`, so every re-run of this cell doubles the
# row count again — confirm whether a separately named result is intended.
output = pd.concat((output,output),ignore_index=True)
output

Unnamed: 0,frame,class,order,azimuth,elevation
0,93,4,0,-131,0
1,94,4,0,-131,0
2,95,4,0,-131,0
3,96,4,0,-131,0
4,97,4,0,-131,0
...,...,...,...,...,...
56755,5620,9,15,64,0
56756,5621,9,15,64,0
56757,5622,9,15,64,0
56758,5623,9,15,64,0
