In [3]:
# !pip install numpy requests nlpaug

import nlpaug.augmenter.audio as naa
import IPython.display as ipd

import librosa
import librosa.display
import soundfile as sf

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os
import glob
import random


In [4]:
# One seed shared by every random source this notebook touches.
SEED = 42

# Seed Python's built-in RNG and NumPy's legacy global RNG.
random.seed(SEED)
np.random.seed(SEED)

# Export the seed-related environment switches in one go.
# NOTE(review): PYTHONHASHSEED is read when an interpreter starts, so this
# presumably only affects processes launched after this cell — confirm.
# NOTE(review): TF_DETERMINISTIC_OPS looks like a TensorFlow switch; TensorFlow
# is not imported in the visible part of the notebook.
os.environ.update({
    'PYTHONHASHSEED': str(SEED),
    'TF_DETERMINISTIC_OPS': '1',
})

In [5]:
# Root of the split audio clips; one sub-folder per class.
data_split_dir = './datasets/KAGGLE/AUDIO/data_split/'
new_fake = f'{data_split_dir}FAKE/'
new_real = f'{data_split_dir}REAL/'

# Folders for CSV output. csv_path receives the feature tables written below;
# eval_csv_path is not used in the visible cells.
csv_path = './datasets/csv/'
eval_csv_path = './datasets/eval_csv/'

# Minimum clip duration in seconds; the generator functions skip shorter files.
clip_len = 10

# aug_fake = data_split_dir + 'aug_fake/'
# aug_real = data_split_dir + 'aug_real/'

In [10]:
def simulateAug(augModel, audio, sr, flag) :
    """Preview one augmentation: overlay both waveforms and embed both players.

    Parameters
    ----------
    augModel : object exposing ``augment(audio)``
    audio : waveform samples to augment
    sr : sampling rate used for plotting and playback
    flag : caption shown next to the augmented audio player

    Returns
    -------
    The value returned by ``augModel.augment(audio)``, unmodified.
    """
    augmented = augModel.augment(audio)

    # Augmented signal first (blue), original drawn on top of it (red).
    overlays = (
        (np.array(augmented), 0.5, "blue"),
        (audio, 0.25, "red"),
    )
    for samples, opacity, tint in overlays :
        librosa.display.waveshow(samples, sr=sr, alpha=opacity, color=tint)

    # Audio players: original first, then the augmented version.
    for caption, samples in (('origin', audio), (flag, augmented)) :
        display(caption, ipd.Audio(samples, rate=sr))

    plt.tight_layout()
    plt.show()

    return augmented

# Feature column order used by getFeature: 20 MFCC means, five scalar
# descriptors, then 12 chroma means (37 columns in total).
col_name = (
    [f'mfcc{i}' for i in range(1, 21)]
    + ['zcr', 'rms', 'spec_cent', 'spec_bw', 'rolloff']
    + [f'chroma_stft{i}' for i in range(1, 13)]
)

def getFeature(audio, fileName, flag, sr=22050) :
    """Summarise one clip as a single-row DataFrame of frame-averaged features.

    Every librosa feature is computed per frame and then averaged over the
    frames, giving one value per entry of ``col_name``. Two bookkeeping
    columns, ``fileName`` and ``label``, are appended.

    Parameters
    ----------
    audio : np.ndarray
        Waveform samples (callers in this notebook pass a squeezed array).
    fileName : str
        Value stored in the ``fileName`` column.
    flag : str
        ``'fake'`` produces label 1; any other value produces label 0.
    sr : int, optional
        Sampling rate of ``audio``. The default of 22050 is the value librosa
        assumes when no rate is given, so existing three-argument calls
        produce the same features as before.

    Returns
    -------
    pd.DataFrame
        One row with columns ``col_name + ['fileName', 'label']``.

    Raises
    ------
    ValueError
        Raised by pandas when the number of extracted values does not match
        ``len(col_name)``.
    """
    def frameMean(frames) :
        # Average each feature row over all frames.
        return np.mean(frames.T, axis=0)

    # MFCC: DCT of the mel-spectrogram, giving a compact spectral envelope.
    mfcc = frameMean(librosa.feature.mfcc(y=audio, sr=sr))

    # Zero-crossing rate: how often the signal changes sign within a frame.
    zcr = frameMean(librosa.feature.zero_crossing_rate(y=audio))

    # Root-mean-square energy per frame.
    rms = frameMean(librosa.feature.rms(y=audio))

    # Spectral centroid: mean (centre) frequency of each frame.
    spec_cent = frameMean(librosa.feature.spectral_centroid(y=audio, sr=sr))

    # Spectral bandwidth: magnitude-weighted spread of frequencies around the centroid.
    spec_bw = frameMean(librosa.feature.spectral_bandwidth(y=audio, sr=sr))

    # Spectral roll-off: frequency below which roll_percent (default 0.85)
    # of the spectral energy lies.
    rolloff = frameMean(librosa.feature.spectral_rolloff(y=audio, sr=sr))

    # Chroma: energy per pitch class, computed from the STFT magnitude.
    # NOTE(review): librosa documents S as a power spectrogram; the magnitude
    # is passed here exactly as before so existing feature values stay stable.
    stft = np.abs(librosa.stft(audio))
    chroma_stft = frameMean(librosa.feature.chroma_stft(S=stft, sr=sr))

    # Single concatenation in col_name order. The explicit float64 cast keeps
    # the dtype the previous hstack-onto-an-empty-array chain produced.
    result = np.concatenate(
        (mfcc, zcr, rms, spec_cent, spec_bw, rolloff, chroma_stft)
    ).astype(np.float64)

    features_df = pd.DataFrame(result.reshape(1, -1), columns=col_name)
    features_df['fileName'] = fileName
    features_df['label'] = 1 if flag == 'fake' else 0

    return features_df

def generateVtlpAugAudio(dir, flag) :
    """Apply VTLP augmentation to every clip in ``dir`` and extract features.

    Each file is loaded with ``librosa.load``; files shorter than the global
    ``clip_len`` (seconds) are skipped. A ``naa.VtlpAug`` is built per file
    because it needs that file's sampling rate. The augmented signal is passed
    to ``getFeature`` under the name ``<stem>_vtlp.wav``.

    Parameters
    ----------
    dir : str
        Folder containing the audio files (with or without a trailing slash).
    flag : str
        Class tag forwarded to ``getFeature`` (``'fake'`` or ``'real'``).

    Returns
    -------
    pd.DataFrame
        One row per processed file; an empty DataFrame when no file qualifies.
    """
    rows = []

    # os.listdir returns entries in arbitrary order; sorting keeps the row
    # order (and the order files are augmented in) the same across platforms.
    for fileName in sorted(os.listdir(dir)) :
        origin, sr = librosa.load(os.path.join(dir, fileName))
        audio_len = origin.shape[0]/float(sr)
        print('[{}] : audio shape : {}, sr:{}, length:{} secs'.format(fileName, origin.shape, sr, audio_len))
        if audio_len < clip_len :
            continue

        vtlpAug = naa.VtlpAug(sampling_rate=sr, zone=(0.0, 1.0), coverage=1, factor=(1, 1.003))
        # augment() may hand back a list wrapping the signal; squeeze drops
        # the extra axis so getFeature receives a flat array.
        augmented_vtlp_data = np.squeeze(np.array(vtlpAug.augment(origin)))

        rows.append(getFeature(augmented_vtlp_data, os.path.splitext(fileName)[0]+'_vtlp.wav', flag))

        # output_file = os.path.join(aug_real+"vtlp", f'{os.path.splitext(fileName)[0]}_vtlp.wav')
        # if (flag == 'fake') :
        #     output_file = os.path.join(aug_fake+"vtlp", f'{os.path.splitext(fileName)[0]}_vtlp.wav')
        # sf.write(output_file, np.ravel(augmented_vtlp_data), sr)

    # Concatenate once instead of growing a DataFrame inside the loop.
    if not rows :
        return pd.DataFrame()
    return pd.concat(rows, ignore_index=True)


def _augSuffix(augModel) :
    """Derive a file-name tag such as '_noise' from the augmenter's class name."""
    kind = type(augModel).__name__
    if kind.endswith('Aug') and len(kind) > 3 :
        kind = kind[:-3]
    return '_' + kind.lower()


def generateAugAudio(dir, augModel, flag, suffix=None) :
    """Apply ``augModel`` to every clip in ``dir`` and extract features.

    Each file is loaded with ``librosa.load``; files shorter than the global
    ``clip_len`` (seconds) are skipped. The augmented signal is passed to
    ``getFeature`` under the name ``<stem><suffix>.wav``.

    Parameters
    ----------
    dir : str
        Folder containing the audio files (with or without a trailing slash).
    augModel : object exposing ``augment(audio)``
        A ready-built augmenter, reused for every file.
    flag : str
        Class tag forwarded to ``getFeature`` (``'fake'`` or ``'real'``).
    suffix : str, optional
        Tag inserted before ``.wav`` in the ``fileName`` column. When omitted
        it is derived from the augmenter class (``NoiseAug`` -> ``'_noise'``).
        Rows used to be tagged ``'_vtlp'`` for every augmenter, which
        mislabelled the noise, mask, speed and normalize outputs.

    Returns
    -------
    pd.DataFrame
        One row per processed file; an empty DataFrame when no file qualifies.
    """
    if suffix is None :
        suffix = _augSuffix(augModel)

    rows = []

    # os.listdir returns entries in arbitrary order; sorting keeps the row
    # order (and the order files are augmented in) the same across platforms.
    for fileName in sorted(os.listdir(dir)) :
        origin, sr = librosa.load(os.path.join(dir, fileName))
        audio_len = origin.shape[0]/float(sr)
        print('[{}] : audio shape : {}, sr:{}, length:{} secs'.format(fileName, origin.shape, sr, audio_len))
        if audio_len < clip_len :
            continue

        # augment() may hand back a list wrapping the signal; squeeze drops
        # the extra axis so getFeature receives a flat array.
        augmented_data = np.squeeze(np.array(augModel.augment(origin)))

        rows.append(getFeature(augmented_data, os.path.splitext(fileName)[0]+suffix+'.wav', flag))

    # Concatenate once instead of growing a DataFrame inside the loop.
    if not rows :
        return pd.DataFrame()
    return pd.concat(rows, ignore_index=True)

In [44]:
# VTLP augmentation. The augmenter is built inside generateVtlpAugAudio as:
# naa.VtlpAug(sampling_rate=sr, zone=(0.0, 1.0), coverage=1, factor=(1, 1.003))
real_vtlp_df = generateVtlpAugAudio(dir=new_real, flag='real')
fake_vtlp_df = generateVtlpAugAudio(dir=new_fake, flag='fake')

# Real rows first, fake rows after, written out as one feature table.
merged_df = pd.concat((real_vtlp_df, fake_vtlp_df), ignore_index=True)
merged_df.to_csv(f'{csv_path}feature_extracting_vtlp.csv', index=False)

[biden-original_clip_0.wav] : audio shape : (220500,), sr:22050, length:10.0 secs
[biden-original_clip_1.wav] : audio shape : (220500,), sr:22050, length:10.0 secs
[biden-original_clip_10.wav] : audio shape : (220500,), sr:22050, length:10.0 secs
[biden-original_clip_11.wav] : audio shape : (220500,), sr:22050, length:10.0 secs
[biden-original_clip_12.wav] : audio shape : (220500,), sr:22050, length:10.0 secs
[biden-original_clip_13.wav] : audio shape : (220500,), sr:22050, length:10.0 secs
[biden-original_clip_14.wav] : audio shape : (220500,), sr:22050, length:10.0 secs
[biden-original_clip_15.wav] : audio shape : (220500,), sr:22050, length:10.0 secs
[biden-original_clip_16.wav] : audio shape : (220500,), sr:22050, length:10.0 secs
[biden-original_clip_17.wav] : audio shape : (220500,), sr:22050, length:10.0 secs
[biden-original_clip_18.wav] : audio shape : (220500,), sr:22050, length:10.0 secs
[biden-original_clip_19.wav] : audio shape : (220500,), sr:22050, length:10.0 secs
[biden

In [32]:
# White-noise augmentation over the full clip (zone 0.0-1.0).
whiteNoiseAug = naa.NoiseAug(color='white', zone=(0.0, 1.0))

real_w_noise_df = generateAugAudio(dir=new_real, augModel=whiteNoiseAug, flag='real')
fake_w_noise_df = generateAugAudio(dir=new_fake, augModel=whiteNoiseAug, flag='fake')

# Real rows first, fake rows after, written out as one feature table.
merged_df = pd.concat((real_w_noise_df, fake_w_noise_df), ignore_index=True)
merged_df.to_csv(f'{csv_path}feature_extracting_w_noise.csv', index=False)

[biden-original_clip_0.wav] : audio shape : (220500,), sr:22050, length:10.0 secs
[biden-original_clip_1.wav] : audio shape : (220500,), sr:22050, length:10.0 secs
[biden-original_clip_10.wav] : audio shape : (220500,), sr:22050, length:10.0 secs
[biden-original_clip_11.wav] : audio shape : (220500,), sr:22050, length:10.0 secs
[biden-original_clip_12.wav] : audio shape : (220500,), sr:22050, length:10.0 secs
[biden-original_clip_13.wav] : audio shape : (220500,), sr:22050, length:10.0 secs
[biden-original_clip_14.wav] : audio shape : (220500,), sr:22050, length:10.0 secs
[biden-original_clip_15.wav] : audio shape : (220500,), sr:22050, length:10.0 secs
[biden-original_clip_16.wav] : audio shape : (220500,), sr:22050, length:10.0 secs
[biden-original_clip_17.wav] : audio shape : (220500,), sr:22050, length:10.0 secs
[biden-original_clip_18.wav] : audio shape : (220500,), sr:22050, length:10.0 secs
[biden-original_clip_19.wav] : audio shape : (220500,), sr:22050, length:10.0 secs
[biden

In [33]:
# Brown noise: a variant of white noise with the higher frequencies attenuated.
brownNoiseAug = naa.NoiseAug(color='brown', zone=(0.0, 1.0))

real_b_noise_df = generateAugAudio(dir=new_real, augModel=brownNoiseAug, flag='real')
fake_b_noise_df = generateAugAudio(dir=new_fake, augModel=brownNoiseAug, flag='fake')

# Real rows first, fake rows after, written out as one feature table.
merged_df = pd.concat((real_b_noise_df, fake_b_noise_df), ignore_index=True)
merged_df.to_csv(f'{csv_path}feature_extracting_b_noise.csv', index=False)

[biden-original_clip_0.wav] : audio shape : (220500,), sr:22050, length:10.0 secs
[biden-original_clip_1.wav] : audio shape : (220500,), sr:22050, length:10.0 secs
[biden-original_clip_10.wav] : audio shape : (220500,), sr:22050, length:10.0 secs
[biden-original_clip_11.wav] : audio shape : (220500,), sr:22050, length:10.0 secs
[biden-original_clip_12.wav] : audio shape : (220500,), sr:22050, length:10.0 secs
[biden-original_clip_13.wav] : audio shape : (220500,), sr:22050, length:10.0 secs
[biden-original_clip_14.wav] : audio shape : (220500,), sr:22050, length:10.0 secs
[biden-original_clip_15.wav] : audio shape : (220500,), sr:22050, length:10.0 secs
[biden-original_clip_16.wav] : audio shape : (220500,), sr:22050, length:10.0 secs
[biden-original_clip_17.wav] : audio shape : (220500,), sr:22050, length:10.0 secs
[biden-original_clip_18.wav] : audio shape : (220500,), sr:22050, length:10.0 secs
[biden-original_clip_19.wav] : audio shape : (220500,), sr:22050, length:10.0 secs
[biden

In [48]:
# Mask augmentation without noise fill.
# Earlier setting, kept for reference:
# maskAug = naa.MaskAug(zone=(0.2, 0.3), coverage=0.2, mask_with_noise=False)
# NOTE(review): zone=(0.2, 0.3) with coverage=1 presumably masks the whole
# 20%-30% span, i.e. about 1 s of a 10 s clip (hence "1sec" in the file name).
maskAug = naa.MaskAug(mask_with_noise=False, coverage=1, zone=(0.2, 0.3))

real_mask_df = generateAugAudio(dir=new_real, augModel=maskAug, flag='real')
fake_mask_df = generateAugAudio(dir=new_fake, augModel=maskAug, flag='fake')

# Real rows first, fake rows after, written out as one feature table.
merged_df = pd.concat((real_mask_df, fake_mask_df), ignore_index=True)
merged_df.to_csv(f'{csv_path}feature_extracting_mask_1sec.csv', index=False)

[biden-original_clip_0.wav] : audio shape : (220500,), sr:22050, length:10.0 secs
[biden-original_clip_1.wav] : audio shape : (220500,), sr:22050, length:10.0 secs
[biden-original_clip_10.wav] : audio shape : (220500,), sr:22050, length:10.0 secs
[biden-original_clip_11.wav] : audio shape : (220500,), sr:22050, length:10.0 secs
[biden-original_clip_12.wav] : audio shape : (220500,), sr:22050, length:10.0 secs
[biden-original_clip_13.wav] : audio shape : (220500,), sr:22050, length:10.0 secs
[biden-original_clip_14.wav] : audio shape : (220500,), sr:22050, length:10.0 secs
[biden-original_clip_15.wav] : audio shape : (220500,), sr:22050, length:10.0 secs
[biden-original_clip_16.wav] : audio shape : (220500,), sr:22050, length:10.0 secs
[biden-original_clip_17.wav] : audio shape : (220500,), sr:22050, length:10.0 secs
[biden-original_clip_18.wav] : audio shape : (220500,), sr:22050, length:10.0 secs
[biden-original_clip_19.wav] : audio shape : (220500,), sr:22050, length:10.0 secs
[biden

In [46]:
# Slight speed-up over the full clip, factor range (1, 1.007).
speedAug = naa.SpeedAug(factor=(1, 1.007), coverage=1, zone=(0,1))

real_speed_df = generateAugAudio(dir=new_real, augModel=speedAug, flag='real')
fake_speed_df = generateAugAudio(dir=new_fake, augModel=speedAug, flag='fake')

# Real rows first, fake rows after, written out as one feature table.
merged_df = pd.concat((real_speed_df, fake_speed_df), ignore_index=True)
merged_df.to_csv(f'{csv_path}feature_extracting_speed.csv', index=False)

[biden-original_clip_0.wav] : audio shape : (220500,), sr:22050, length:10.0 secs
[biden-original_clip_1.wav] : audio shape : (220500,), sr:22050, length:10.0 secs
[biden-original_clip_10.wav] : audio shape : (220500,), sr:22050, length:10.0 secs
[biden-original_clip_11.wav] : audio shape : (220500,), sr:22050, length:10.0 secs
[biden-original_clip_12.wav] : audio shape : (220500,), sr:22050, length:10.0 secs
[biden-original_clip_13.wav] : audio shape : (220500,), sr:22050, length:10.0 secs
[biden-original_clip_14.wav] : audio shape : (220500,), sr:22050, length:10.0 secs
[biden-original_clip_15.wav] : audio shape : (220500,), sr:22050, length:10.0 secs
[biden-original_clip_16.wav] : audio shape : (220500,), sr:22050, length:10.0 secs
[biden-original_clip_17.wav] : audio shape : (220500,), sr:22050, length:10.0 secs
[biden-original_clip_18.wav] : audio shape : (220500,), sr:22050, length:10.0 secs
[biden-original_clip_19.wav] : audio shape : (220500,), sr:22050, length:10.0 secs
[biden

In [49]:
# Speed-up over the full clip with the factor pinned to 1.1 (range (1.1, 1.1)).
speedAug = naa.SpeedAug(factor=(1.1, 1.1), coverage=1, zone=(0,1))

real_speed_df = generateAugAudio(dir=new_real, augModel=speedAug, flag='real')
fake_speed_df = generateAugAudio(dir=new_fake, augModel=speedAug, flag='fake')

# Real rows first, fake rows after, written out as one feature table.
merged_df = pd.concat((real_speed_df, fake_speed_df), ignore_index=True)
merged_df.to_csv(f'{csv_path}feature_extracting_speed_1.csv', index=False)

[biden-original_clip_0.wav] : audio shape : (220500,), sr:22050, length:10.0 secs
[biden-original_clip_1.wav] : audio shape : (220500,), sr:22050, length:10.0 secs
[biden-original_clip_10.wav] : audio shape : (220500,), sr:22050, length:10.0 secs
[biden-original_clip_11.wav] : audio shape : (220500,), sr:22050, length:10.0 secs
[biden-original_clip_12.wav] : audio shape : (220500,), sr:22050, length:10.0 secs
[biden-original_clip_13.wav] : audio shape : (220500,), sr:22050, length:10.0 secs
[biden-original_clip_14.wav] : audio shape : (220500,), sr:22050, length:10.0 secs
[biden-original_clip_15.wav] : audio shape : (220500,), sr:22050, length:10.0 secs
[biden-original_clip_16.wav] : audio shape : (220500,), sr:22050, length:10.0 secs
[biden-original_clip_17.wav] : audio shape : (220500,), sr:22050, length:10.0 secs
[biden-original_clip_18.wav] : audio shape : (220500,), sr:22050, length:10.0 secs
[biden-original_clip_19.wav] : audio shape : (220500,), sr:22050, length:10.0 secs
[biden

In [53]:
# Slow-down over the full clip with the factor pinned to 0.9 (range (0.9, 0.9)).
speedAug = naa.SpeedAug(factor=(0.9, 0.9), coverage=1, zone=(0,1))

real_speed_df = generateAugAudio(dir=new_real, augModel=speedAug, flag='real')
fake_speed_df = generateAugAudio(dir=new_fake, augModel=speedAug, flag='fake')

# Real rows first, fake rows after, written out as one feature table.
merged_df = pd.concat((real_speed_df, fake_speed_df), ignore_index=True)
merged_df.to_csv(f'{csv_path}feature_extracting_speed_09.csv', index=False)

[biden-original_clip_0.wav] : audio shape : (220500,), sr:22050, length:10.0 secs
[biden-original_clip_1.wav] : audio shape : (220500,), sr:22050, length:10.0 secs
[biden-original_clip_10.wav] : audio shape : (220500,), sr:22050, length:10.0 secs
[biden-original_clip_11.wav] : audio shape : (220500,), sr:22050, length:10.0 secs
[biden-original_clip_12.wav] : audio shape : (220500,), sr:22050, length:10.0 secs
[biden-original_clip_13.wav] : audio shape : (220500,), sr:22050, length:10.0 secs
[biden-original_clip_14.wav] : audio shape : (220500,), sr:22050, length:10.0 secs
[biden-original_clip_15.wav] : audio shape : (220500,), sr:22050, length:10.0 secs
[biden-original_clip_16.wav] : audio shape : (220500,), sr:22050, length:10.0 secs
[biden-original_clip_17.wav] : audio shape : (220500,), sr:22050, length:10.0 secs
[biden-original_clip_18.wav] : audio shape : (220500,), sr:22050, length:10.0 secs
[biden-original_clip_19.wav] : audio shape : (220500,), sr:22050, length:10.0 secs
[biden

In [47]:
# Min-max normalisation applied over the full clip.
normalizeAug = naa.NormalizeAug(coverage=1, zone=(0,1), method='minmax')

real_normalize_df = generateAugAudio(dir=new_real, augModel=normalizeAug, flag='real')
fake_normalize_df = generateAugAudio(dir=new_fake, augModel=normalizeAug, flag='fake')

# Real rows first, fake rows after, written out as one feature table.
merged_df = pd.concat((real_normalize_df, fake_normalize_df), ignore_index=True)
merged_df.to_csv(f'{csv_path}feature_extracting_normalize.csv', index=False)

[biden-original_clip_0.wav] : audio shape : (220500,), sr:22050, length:10.0 secs
[biden-original_clip_1.wav] : audio shape : (220500,), sr:22050, length:10.0 secs
[biden-original_clip_10.wav] : audio shape : (220500,), sr:22050, length:10.0 secs
[biden-original_clip_11.wav] : audio shape : (220500,), sr:22050, length:10.0 secs
[biden-original_clip_12.wav] : audio shape : (220500,), sr:22050, length:10.0 secs
[biden-original_clip_13.wav] : audio shape : (220500,), sr:22050, length:10.0 secs
[biden-original_clip_14.wav] : audio shape : (220500,), sr:22050, length:10.0 secs
[biden-original_clip_15.wav] : audio shape : (220500,), sr:22050, length:10.0 secs
[biden-original_clip_16.wav] : audio shape : (220500,), sr:22050, length:10.0 secs
[biden-original_clip_17.wav] : audio shape : (220500,), sr:22050, length:10.0 secs
[biden-original_clip_18.wav] : audio shape : (220500,), sr:22050, length:10.0 secs
[biden-original_clip_19.wav] : audio shape : (220500,), sr:22050, length:10.0 secs
[biden