In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os

import numpy as np
import pandas as pd

from tqdm import tqdm
from joblib import Parallel, delayed

In [3]:
from utils import load_config, perform_statistical_tests
from data.load.data_loader import get_wavefake_audio_id_list, load_parquet_data
from features import (
    LowLevelFeatureExtractor, 
    HighLevelFeatureExtractor, 
    plot_low_level_feature_dist, 
    plot_high_level_feature_dist, 
    perform_pca_and_plot
)

In [5]:
config    = load_config()
cache_dir = config['data_paths']['wavefake']['cache_files']
features_dir = config['data_paths']['features']
audio_ids = get_wavefake_audio_id_list()

In [6]:
len(audio_ids)

13100

#### Codecfake - Partitions: 0, 1, ... 130

ajay: np.arange(0, 35) --> 0, 1, ..., 34

keerthana: np.arange(35, 67) --> 35, 36, ..., 66

Ruohe: np.arange(67, 99) --> 67, 68, ..., 98

Prudhvi: np.arange(99, 131) --> 99, 100, ..., 130

In [22]:
partitions = np.arange(0, 35)
len(partitions)

35

In [23]:
def generate_dataframe(iterable_ds):
    rows = []
    for audio in iterable_ds:
        audio_id  = audio['audio_id']
        audio_arr = audio['audio']['array']
        srate     = audio['audio']['sampling_rate']
        real_fake = audio['real_or_fake']
    
        rows.append({
            'audio_id': audio_id,
            'audio_arr': audio_arr,
            'srate': srate,
            'real_or_fake': real_fake
        })
    df = pd.DataFrame(rows)
    return df

In [24]:
partition_id = partitions[0]
iterable_ds = load_parquet_data(partition_id=partition_id)    
partition_df = generate_dataframe(iterable_ds)
partition_df

Unnamed: 0,audio_id,audio_arr,srate,real_or_fake
0,p225_002,"[-0.0045166015625, -0.00665283203125, -0.00607...",48000,R
1,p225_002,"[0.001953125, 0.001556396484375, 0.00164794921...",16000,F03
2,p225_002,"[-0.001220703125, -0.001129150390625, -0.00103...",24000,F04
3,p225_002,"[0.001861572265625, 0.001922607421875, 0.00195...",16000,F01
4,p225_002,"[-0.00335693359375, -0.0032958984375, -0.00320...",48000,F05
...,...,...,...,...
646,p225_191,"[0.00762939453125, 0.007568359375, 0.007446289...",48000,F05
647,p225_191,"[0.00787353515625, 0.00799560546875, 0.0081176...",16000,F01
648,p225_191,"[0.002777099609375, 0.001678466796875, 0.00183...",24000,F04
649,p225_191,"[0.00311279296875, 0.003204345703125, 0.002960...",16000,F03


#### Extract Features - Sample 2 from each partitions

just to make sure everything works fine

In [25]:
audio_processor = LowLevelFeatureExtractor(target_sr=16000, include_only=['spectral', 'prosodic', 'voice_quality'])
feature_computer = HighLevelFeatureExtractor()

In [30]:
features_df_list = []

for partition_id in partitions[:20]:
    iterable_ds = load_parquet_data(partition_id=partition_id, dataset='wavefake')    
    partition_df = generate_dataframe(iterable_ds)
    print(f'Partition: {partition_id}')
    
    low_level_gen        = audio_processor.low_level_feature_generator(partition_df.sample(2))
    high_level_features  = list(feature_computer.high_level_feature_generator(low_level_gen))
    high_level_features_df = pd.DataFrame(high_level_features)
    features_df_list.append(high_level_features_df)
    


features_df = pd.concat(features_df_list, ignore_index=True)
features_df

Partition: 0


Processing Audios: 100%|██████████| 2/2 [00:02<00:00,  1.33s/it]


Partition: 1


Processing Audios: 100%|██████████| 2/2 [00:03<00:00,  1.65s/it]


Partition: 2


Processing Audios: 100%|██████████| 2/2 [00:02<00:00,  1.14s/it]


Partition: 3


Processing Audios: 100%|██████████| 2/2 [00:01<00:00,  1.30it/s]


Partition: 4


Processing Audios: 100%|██████████| 2/2 [00:01<00:00,  1.01it/s]


Partition: 5


Processing Audios: 100%|██████████| 2/2 [00:02<00:00,  1.11s/it]


Partition: 6


Processing Audios: 100%|██████████| 2/2 [00:03<00:00,  1.75s/it]


Partition: 7


Processing Audios: 100%|██████████| 2/2 [00:02<00:00,  1.18s/it]


Partition: 8


Processing Audios: 100%|██████████| 2/2 [00:02<00:00,  1.36s/it]


Partition: 9


Processing Audios: 100%|██████████| 2/2 [00:02<00:00,  1.14s/it]


Partition: 10


Processing Audios: 100%|██████████| 2/2 [00:02<00:00,  1.15s/it]


Partition: 11


Processing Audios: 100%|██████████| 2/2 [00:02<00:00,  1.02s/it]


Partition: 12


Processing Audios: 100%|██████████| 2/2 [00:03<00:00,  1.55s/it]


Partition: 13


Processing Audios: 100%|██████████| 2/2 [00:03<00:00,  1.65s/it]


Partition: 14


Processing Audios: 100%|██████████| 2/2 [00:03<00:00,  1.51s/it]


Partition: 15


Processing Audios: 100%|██████████| 2/2 [00:02<00:00,  1.31s/it]


Partition: 16


Processing Audios: 100%|██████████| 2/2 [00:02<00:00,  1.42s/it]


Partition: 17


Processing Audios: 100%|██████████| 2/2 [00:02<00:00,  1.09s/it]


Partition: 18


Processing Audios: 100%|██████████| 2/2 [00:02<00:00,  1.17s/it]


Partition: 19


Processing Audios: 100%|██████████| 2/2 [00:02<00:00,  1.47s/it]


Unnamed: 0,audio_id,real_or_fake,spectral_centroid_mean,spectral_centroid_std,spectral_centroid_var,spectral_centroid_min,spectral_centroid_max,spectral_centroid_range,spectral_centroid_25th_percentile,spectral_centroid_50th_percentile,...,shimmer_dda,hnr,voicedcount,npause,originaldur,intensity_duration,speakingrate,articulationrate,asd,totalpauseduration
0,LJ001-0006,WF4,1946.391859,1139.367355,1298158.0,561.096768,6578.741955,6017.645187,1248.482975,1594.724058,...,0.082406,12.016871,26,1,5.688889,5.688889,4.570312,4.871752,0.205265,0.352
1,LJ001-0012,WF7,2088.423236,1439.355598,2071745.0,575.664603,6852.698189,6277.033586,1193.755381,1570.352317,...,0.112374,11.696977,37,3,8.243084,8.243084,4.488611,5.125304,0.19511,1.024
2,LJ001-0153,WF1,2187.480479,1500.637887,2251914.0,615.251427,6484.635297,5869.38387,1199.811105,1632.419174,...,0.093102,11.764792,23,1,6.478367,6.478367,3.550277,3.744485,0.267059,0.336
3,LJ001-0163,WF4,2443.919047,1575.008305,2480651.0,463.493521,6452.387084,5988.893563,1340.673034,1768.278672,...,0.072869,10.979552,29,1,6.397098,6.397098,4.533306,4.810007,0.2079,0.368
4,LJ002-0043,WF4,2321.674742,1554.305726,2415866.0,421.661604,6524.94548,6103.283876,1191.582993,1755.835326,...,0.088402,11.650579,25,4,7.697415,7.697415,3.247844,4.14359,0.241337,1.664
5,LJ002-0075,WF3,1604.35571,1083.202579,1173328.0,583.827589,5641.969232,5058.141644,986.23702,1170.88345,...,0.109224,11.591573,16,0,4.643991,4.643991,3.445312,3.445312,0.290249,0.0
6,LJ002-0141,WF4,1998.182433,1300.282045,1690733.0,545.688698,6422.750731,5877.062033,1167.934211,1545.814359,...,0.091929,10.939232,21,0,4.284082,4.284082,4.901867,4.901867,0.204004,0.0
7,LJ002-0208,WF7,2034.759465,1675.674671,2807886.0,600.352094,5923.256896,5322.904803,792.043408,1137.262772,...,0.111847,11.0927,8,0,2.229116,2.229116,3.588867,3.588867,0.278639,0.0
8,LJ002-0265,R,1828.954486,1276.307062,1628960.0,417.049065,6364.786649,5947.737584,1059.552321,1351.469408,...,0.122464,12.670282,27,1,7.205306,7.205306,3.747238,4.005159,0.249678,0.464
9,LJ002-0216,WF1,2339.256945,1473.12877,2170108.0,786.541493,6510.86526,5724.323766,1216.841967,1765.221056,...,0.098896,10.049797,14,0,2.879274,2.879274,4.862336,4.862336,0.205662,0.0


### Using Parallel Processing to extract features for each partitions and save

In [31]:
def extract_features(row, audio_processor, feature_computer):
    low_level_features = audio_processor.extract_features(row)
    high_level_features = feature_computer.compute_high_level_features(low_level_features)
    return high_level_features

audio_processor  = LowLevelFeatureExtractor(target_sr=16000, include_only=['spectral', 'prosodic', 'voice_quality'])
feature_computer = HighLevelFeatureExtractor()

In [32]:
for partition_id in tqdm(partitions, total=len(partitions), desc="Processing Partitions"):
    csv_file_name = os.path.join(features_dir, f'wavefake_features_partition_{partition_id}.csv')
    iterable_ds = load_parquet_data(partition_id=partition_id, dataset='wavefake')    
    partition_df = generate_dataframe(iterable_ds)
    
    high_level_features   = Parallel(n_jobs=5)(
        delayed(extract_features)(row, audio_processor, feature_computer) 
        for _, row in partition_df.iloc[:10].iterrows() # REMOVE .iloc[:10]
    )
    high_level_feature_df = pd.DataFrame(high_level_features)
    high_level_feature_df.to_csv(csv_file_name, index=False)

Processing Partitions: 100%|██████████| 35/35 [07:36<00:00, 13.04s/it]
