In [1]:
import pickle
import os
import numpy as np
import librosa
from python_speech_features import fbank
import shutil
import pandas as pd

In [2]:
def normalize_frames(m, Scale=True):
    """Normalize `m` along axis 0.

    The mean over axis 0 is always subtracted. When `Scale` is true the
    centered values are additionally divided by the standard deviation over
    axis 0; a small constant (2e-12) is added to the divisor so that a
    zero standard deviation does not cause a division by zero.

    Args:
        m: numpy array to normalize.
        Scale: if True (default) apply mean and variance normalization,
            otherwise only subtract the mean.

    Returns:
        The normalized array, same shape as `m`.
    """
    centered = m - np.mean(m, axis=0)
    if not Scale:
        return centered
    spread = np.std(m, axis=0) + 2e-12
    return centered / spread

In [3]:
def compute_and_store_features(df_path, out_path, sample_rate=16000, nfilt=40, winlen=0.025):
    """Compute filter-bank features for every utterance in a CSV and pickle them.

    The CSV at `df_path` must provide the columns `spk_id` and `utter_path`.
    For each row the audio file is loaded with librosa (resampled to
    `sample_rate`, mono), filter-bank energies are computed with
    `python_speech_features.fbank`, converted to a log scale
    (20 * log10, floored at 1e-5) and mean-normalized with
    `normalize_frames(..., Scale=False)`. The result is written to
    `<out_path>/<spk_id>/<utterance name>.p` as a pickled dict with the keys
    `'label'` (speaker id as a string) and `'feat'` (the feature array).

    Args:
        df_path: path of the CSV file listing the utterances.
        out_path: directory under which one sub-directory per speaker is
            created.
        sample_rate: sampling rate used both for loading and for `fbank`.
        nfilt: number of filters passed to `fbank`.
        winlen: analysis window length passed to `fbank`.

    Returns:
        pandas.DataFrame with one row per utterance and the columns
        `spk_id`, `flac_path` (the `utter_path` value from the CSV) and
        `feature_path` (absolute path of the pickle that was written).
    """
    df = pd.read_csv(df_path)
    users = df.spk_id.unique()

    print(f'Number of users: {len(users)}')

    for user in users:
        # exist_ok=True keeps the function re-runnable: without it a second
        # run against an existing output tree raises FileExistsError.
        os.makedirs(os.path.join(out_path, str(user)), exist_ok=True)

    ######################################################
    # Compute features
    tot_rows = []

    for idx, row in df.iterrows():

        spk_id = row['spk_id']
        _path = row['utter_path']

        # splitext strips only the final extension, so names that contain
        # additional dots ('a.1.flac', 'a.2.flac') no longer collapse onto
        # the same pickle file and overwrite each other.
        utter_name = os.path.splitext(os.path.basename(_path))[0]

        # The returned sampling rate equals `sample_rate` because librosa
        # resamples on load, so it is not needed here.
        audio, _ = librosa.load(_path, sr=sample_rate, mono=True)

        filter_banks, energies = fbank(audio, samplerate=sample_rate, nfilt=nfilt, winlen=winlen)

        # Floor at 1e-5 before taking the logarithm to avoid log10(0).
        filter_banks = 20 * np.log10(np.maximum(filter_banks, 1e-5))

        # Mean normalization only (no variance scaling).
        feature = normalize_frames(filter_banks, Scale=False)

        out = {'label': str(spk_id),
               'feat': feature}

        pickle_file = os.path.join(out_path, str(spk_id), f'{utter_name}.p')

        tot_rows.append([spk_id, _path, os.path.abspath(pickle_file)])

        with open(pickle_file, 'wb') as f:
            pickle.dump(out, f)

    return pd.DataFrame(tot_rows, columns=['spk_id', 'flac_path', 'feature_path'])

### Compute features and store them

In [5]:
# Prepare a clean output tree with one sub-directory per split
base_out = '/cas/DeepLearn/elperu/tmp/speech_datasets/LibriSpeech/train_test_split/'
train_out_path, test_out_path = (
    os.path.join(base_out, split_name) for split_name in ('train', 'test')
)

# Remove the output of any previous run (a missing directory is not an error)
shutil.rmtree(base_out, ignore_errors=True)

# Recreate the empty split directories
os.makedirs(train_out_path)
os.makedirs(test_out_path)

In [6]:
# CSV files listing the utterances of each split; they are read by
# compute_and_store_features, which expects the columns `spk_id` and `utter_path`.
train_config_path = '/cas/DeepLearn/elperu/tmp/speech_datasets/LibriSpeech/train_config.csv'
test_config_path = '/cas/DeepLearn/elperu/tmp/speech_datasets/LibriSpeech/test_config.csv'

In [7]:
# Compute and pickle the features of the training split; train_df has one row
# per utterance with the columns spk_id, flac_path and feature_path.
train_df = compute_and_store_features(train_config_path, train_out_path)

Number of users: 200


In [8]:
# Compute and pickle the features of the test split; test_df has one row
# per utterance with the columns spk_id, flac_path and feature_path.
test_df = compute_and_store_features(test_config_path, test_out_path)

Number of users: 50


In [10]:
# Final check: print each training speaker directory with its number of stored files
for sp in os.listdir(train_out_path):
    speaker_dir = os.path.join(train_out_path, sp)
    n_ft = len(os.listdir(speaker_dir))
    print(sp, n_ft)

374 40
7800 40
2514 40
1088 40
5456 40
5750 40
8238 40
1263 40
7505 40
587 40
226 40
1743 40
4214 40
5789 40
7635 40
5390 40
307 40
4362 40
233 40
1624 40
4297 40
6181 40
6367 40
3723 40
6563 40
403 40
5778 40
3112 40
7367 40
7078 40
32 40
5322 40
6818 40
5104 40
8226 40
3830 40
8324 40
163 40
6476 40
1069 40
3983 40
1183 40
4788 40
311 40
2196 40
103 40
446 40
1502 40
8975 40
8770 40
1992 40
5678 40
2182 40
7178 40
201 40
1034 40
5703 40
1363 40
6836 40
3168 40
1553 40
5163 40
19 40
5393 40
4481 40
4160 40
6415 40
87 40
7067 40
5688 40
2843 40
909 40
40 40
322 40
8797 40
6848 40
3947 40
4014 40
6531 40
3259 40
4441 40
5049 40
4018 40
4088 40
4853 40
7226 40
4859 40
78 40
3440 40
460 40
2893 40
4680 40
2518 40
4898 40
7780 40
1926 40
1963 40
3526 40
254 40
1970 40
6209 40
458 40
831 40
839 40
8425 40
200 40
1723 40
6019 40
4813 40
1455 40
2391 40
2910 40
7302 40
2817 40
445 40
8468 40
2384 40
8630 40
4267 40
26 40
118 40
328 40
3374 40
5022 40
8108 40
6081 40
8095 40
5514 40
2007 40
77

In [11]:
# Same check for the test split: speaker directory and number of stored files
for sp in os.listdir(test_out_path):
    speaker_dir = os.path.join(test_out_path, sp)
    n_ft = len(os.listdir(speaker_dir))
    print(sp, n_ft)

5808 40
3664 40
125 40
1867 40
2989 40
426 40
6385 40
150 40
5192 40
3857 40
2159 40
3242 40
289 40
7859 40
3214 40
3235 40
8419 40
250 40
6272 40
332 40
3240 40
302 40
1841 40
6925 40
8465 40
196 40
7113 40
1246 40
2416 40
1898 40
7148 40
1594 40
2911 40
4830 40
405 40
89 40
6147 40
6529 40
83 40
3807 40
8014 40
8838 40
481 40
3486 40
8312 40
1334 40
7447 40
6000 40
5339 40
2764 40
