In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import numpy as np
import random
import itertools
import librosa
import IPython.display as ipd
import matplotlib.pyplot as plt
import soundfile as sf
import wave

%matplotlib inline

In [2]:
import os

original_root = './original'

path_list = []

for path, subdirs, files in os.walk(original_root):
    for name in files:
        if name.endswith(".wav"):
            # print(os.path.join(path, name))
            path_list.append(os.path.join(path, name))

In [3]:
len(path_list)

1132

In [4]:
angry_root = original_root+'/生氣-大聲'
calm_root =  original_root+'/溫柔-平靜'
taunt_root = original_root+'/威脅-挑釁-惹對方生氣'
upset_root = original_root+'/求饒-難過'

angry_padding_root = './padding/angry'
calm_padding_root =  './padding/calm'
taunt_padding_root = './padding/taunt'
upset_padding_root = './padding/upset'

angry_augmentation_root = './augmented/angry'
calm_augmentation_root =  './augmented/calm'
taunt_augmentation_root = './augmented/taunt'
upset_augmentation_root = './augmented/upset'


In [5]:
def get_wav_path_list(root):
    path_list = []

    for path, subdirs, files in os.walk(root):
        for name in files:
            if name.endswith(".wav"):
                # print(os.path.join(path, name))
                path_list.append(os.path.join(path, name))
    return path_list

def remove_duration_too_Long_path(path_list):
    durations = [librosa.get_duration(filename=p) for p in path_list]
    tooLong = [idx for idx, ele in enumerate(durations) if ele > 10.0]
    tooLong_path = [path_list[i] for i in tooLong]
    new_path_list = list(set(path_list) - set(tooLong_path))
    return new_path_list


In [6]:
angry_path_list = get_wav_path_list(angry_root)
print(len(angry_path_list))
angry_path_list = remove_duration_too_Long_path(angry_path_list)
print(len(angry_path_list))

calm_path_list = get_wav_path_list(calm_root)
print(len(calm_path_list))
calm_path_list = remove_duration_too_Long_path(calm_path_list)
print(len(calm_path_list))

taunt_path_list = get_wav_path_list(taunt_root)
print(len(taunt_path_list))
taunt_path_list = remove_duration_too_Long_path(taunt_path_list)
print(len(taunt_path_list))

upset_path_list = get_wav_path_list(upset_root)
print(len(upset_path_list))
upset_path_list = remove_duration_too_Long_path(upset_path_list)
print(len(upset_path_list))

print(len(angry_path_list)+len(calm_path_list)+len(taunt_path_list)+len(upset_path_list))

357
347
491
490
120
119
164
99
1055


In [7]:
def data_padding(padding_root, path_list):
    for path in path_list:

        f = wave.open(path)
        SampleRate = f.getframerate()
        frames = f.getnframes()
        Duration = wav_time = frames / float(SampleRate)
        wav, sr = librosa.load(path, sr=16000)
        if Duration <= 10.0:
            n = 160000-wav.shape[0]
            ndarray = np.pad(wav, (0, n), 'constant', constant_values=(0,0.0000000e+00))
            sf.write(os.path.join(padding_root, 'padding_'+os.path.basename(path)), ndarray, 16000)

## Data Padding

In [9]:
data_padding(angry_padding_root, angry_path_list)
data_padding(calm_padding_root, calm_path_list)
data_padding(taunt_padding_root, taunt_path_list)
data_padding(upset_padding_root, upset_path_list)


In [10]:
angry_padding_path_list = get_wav_path_list(angry_padding_root)
print(len(angry_padding_path_list))
angry_padding_durations = [librosa.get_duration(filename=p) for p in angry_padding_path_list]

angry_padding_stats = {
    'mean': np.mean(angry_padding_durations),
    'max': np.max(angry_padding_durations),
    'min': np.min(angry_padding_durations),
}

print(angry_padding_stats)

344
{'mean': 10.0, 'max': 10.0, 'min': 10.0}


In [11]:
def Noise_Injection(data, noise_factor):
    # Adding white noise 
    noise = np.random.randn(len(data))
    augmented_data = data + noise_factor * noise
    # Cast back to same data type
    augmented_data = augmented_data.astype(type(data[0]))
    return augmented_data

In [12]:
def Shifting_Time(data, shift=16000):
    augmented_data = np.roll(data, shift)
    # Set to silence for heading/ tailing
    if shift > 0:
        augmented_data[:shift] = 0
    else:
        augmented_data[shift:] = 0
    return augmented_data

In [13]:
def ChangingPitch(data, sampling_rate, pitch_factor):
    return librosa.effects.pitch_shift(data, sampling_rate, pitch_factor)

def ChangingSpeed(data, speed_factor):
    
    input_length = 160000
    data = librosa.effects.time_stretch(data, speed_factor)
    if len(data)>input_length:
        data = data[:input_length]
    else:
        data = np.pad(data, (0, max(0, input_length - len(data))), "constant")
        
    return data

## Data Augmentation

In [14]:
def data_augmentation(augmentation_root, path_list):
    
    for path in path_list:
        data, _ = librosa.load(path, sr=16000)
        
        data_noise = Noise_Injection(data, 0.005)
        sf.write(os.path.join(augmentation_root, 'data_noise005_'+os.path.basename(path)), data_noise, 16000)
        data_noise = Noise_Injection(data, 0.01)
        sf.write(os.path.join(augmentation_root, 'data_noise01_'+os.path.basename(path)), data_noise, 16000)
        data_noise = Noise_Injection(data, 0.05)
        sf.write(os.path.join(augmentation_root, 'data_noise05_'+os.path.basename(path)), data_noise, 16000)
        
        data_shift = Shifting_Time(data, 16000)
        sf.write(os.path.join(augmentation_root, 'data_shift16_'+os.path.basename(path)), data_shift, 16000)
        data_shift = Shifting_Time(data, 32000)
        sf.write(os.path.join(augmentation_root, 'data_shift32_'+os.path.basename(path)), data_shift, 16000)
        data_shift = Shifting_Time(data, 48000)
        sf.write(os.path.join(augmentation_root, 'data_shift48_'+os.path.basename(path)), data_shift, 16000)
        
        
        data_pitch = ChangingPitch(data, sampling_rate=16000, pitch_factor=0.8)
        sf.write(os.path.join(augmentation_root, 'data_pitch08_'+os.path.basename(path)), data_pitch, 16000)
        data_pitch = ChangingPitch(data, sampling_rate=16000, pitch_factor=0.9)
        sf.write(os.path.join(augmentation_root, 'data_pitch09_'+os.path.basename(path)), data_pitch, 16000)
        data_pitch = ChangingPitch(data, sampling_rate=16000, pitch_factor=1.1)
        sf.write(os.path.join(augmentation_root, 'data_pitch11_'+os.path.basename(path)), data_pitch, 16000)
        data_pitch = ChangingPitch(data, sampling_rate=16000, pitch_factor=1.2)
        sf.write(os.path.join(augmentation_root, 'data_pitch12_'+os.path.basename(path)), data_pitch, 16000)
        
        data_speed = ChangingSpeed(data, speed_factor=0.8)
        sf.write(os.path.join(augmentation_root, 'data_speed08_'+os.path.basename(path)), data_speed, 16000)
        data_speed = ChangingSpeed(data, speed_factor=0.9)
        sf.write(os.path.join(augmentation_root, 'data_speed09_'+os.path.basename(path)), data_speed, 16000)
        data_speed = ChangingSpeed(data, speed_factor=1.1)
        sf.write(os.path.join(augmentation_root, 'data_speed11_'+os.path.basename(path)), data_speed, 16000)
        data_speed = ChangingSpeed(data, speed_factor=1.2)
        sf.write(os.path.join(augmentation_root, 'data_speed12_'+os.path.basename(path)), data_speed, 16000)


In [15]:
angry_padding_path_list = get_wav_path_list(angry_padding_root)
data_augmentation(angry_augmentation_root, angry_padding_path_list)
calm_padding_path_list = get_wav_path_list(calm_padding_root)
data_augmentation(calm_augmentation_root, calm_padding_path_list)
taunt_padding_path_list = get_wav_path_list(taunt_padding_root)
data_augmentation(taunt_augmentation_root, taunt_padding_path_list)
upset_padding_path_list = get_wav_path_list(upset_padding_root)
data_augmentation(upset_augmentation_root, upset_padding_path_list)

  return librosa.effects.pitch_shift(data, sampling_rate, pitch_factor)
  return librosa.effects.pitch_shift(data, sampling_rate, pitch_factor)
  return librosa.effects.pitch_shift(data, sampling_rate, pitch_factor)
  return librosa.effects.pitch_shift(data, sampling_rate, pitch_factor)
  data = librosa.effects.time_stretch(data, speed_factor)
  data = librosa.effects.time_stretch(data, speed_factor)
  data = librosa.effects.time_stretch(data, speed_factor)
  data = librosa.effects.time_stretch(data, speed_factor)


In [16]:
angry_augmentation_path_list = get_wav_path_list(angry_augmentation_root)

print(len(angry_augmentation_path_list))
angry_augmentation_durations = [librosa.get_duration(filename=p) for p in angry_augmentation_path_list]

angry_augmentation_stats = {
    'mean': np.mean(angry_augmentation_durations),
    'max': np.max(angry_augmentation_durations),
    'min': np.min(angry_augmentation_durations),
}

print(angry_augmentation_stats)

calm_augmentation_path_list = get_wav_path_list(calm_augmentation_root)

print(len(calm_augmentation_path_list))
calm_augmentation_durations = [librosa.get_duration(filename=p) for p in calm_augmentation_path_list]

calm_augmentation_stats = {
    'mean': np.mean(calm_augmentation_durations),
    'max': np.max(calm_augmentation_durations),
    'min': np.min(calm_augmentation_durations),
}

print(calm_augmentation_stats)

taunt_augmentation_path_list = get_wav_path_list(taunt_augmentation_root)

print(len(taunt_augmentation_path_list))
taunt_augmentation_durations = [librosa.get_duration(filename=p) for p in taunt_augmentation_path_list]

taunt_augmentation_stats = {
    'mean': np.mean(taunt_augmentation_durations),
    'max': np.max(taunt_augmentation_durations),
    'min': np.min(taunt_augmentation_durations),
}

print(taunt_augmentation_stats)

upset_augmentation_path_list = get_wav_path_list(upset_augmentation_root)

print(len(upset_augmentation_path_list))
upset_augmentation_durations = [librosa.get_duration(filename=p) for p in upset_augmentation_path_list]

upset_augmentation_stats = {
    'mean': np.mean(upset_augmentation_durations),
    'max': np.max(upset_augmentation_durations),
    'min': np.min(upset_augmentation_durations),
}

print(upset_augmentation_stats)

4816
{'mean': 10.0, 'max': 10.0, 'min': 10.0}
6846
{'mean': 10.0, 'max': 10.0, 'min': 10.0}
1652
{'mean': 10.0, 'max': 10.0, 'min': 10.0}
1372
{'mean': 10.0, 'max': 10.0, 'min': 10.0}
