In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os

import numpy as np
import pandas as pd

from tqdm import tqdm
from joblib import Parallel, delayed

In [3]:
from utils import load_config, perform_statistical_tests
from data.load.data_loader import get_wavefake_audio_id_list, load_parquet_data
from features import (
    LowLevelFeatureExtractor, 
    HighLevelFeatureExtractor, 
    plot_low_level_feature_dist, 
    plot_high_level_feature_dist, 
    perform_pca_and_plot
)

In [4]:
# Project configuration as returned by load_config(); indexed like a nested mapping below.
config    = load_config()
# Entry config['data_paths']['wavefake']['cache_files'] — presumably the local cache location
# for the WaveFake files (TODO confirm; not used in the cells shown here).
cache_dir = config['data_paths']['wavefake']['cache_files']
# Directory the per-partition feature CSVs are written to further down.
features_dir = config['data_paths']['features']
# Audio ids returned by get_wavefake_audio_id_list().
audio_ids = get_wavefake_audio_id_list()

In [5]:
# Sanity check: number of audio ids returned by get_wavefake_audio_id_list().
len(audio_ids)

13100

#### WaveFake - Partitions: 0, 1, ..., 130

Ajay: np.arange(0, 35) --> 0, 1, ..., 34

Keerthana: np.arange(35, 67) --> 35, 36, ..., 66

Ruohe: np.arange(67, 99) --> 67, 68, ..., 98

Prudhvi: np.arange(99, 131) --> 99, 100, ..., 130

In [6]:
# Partition ids processed by this notebook: 67, 68, ..., 98 (the range listed for Ruohe above).
partitions = np.arange(67, 99)
# Display how many partitions that range covers.
len(partitions)

32

In [7]:
def generate_dataframe(iterable_ds):
    """Flatten an iterable of audio records into a pandas DataFrame.

    Parameters
    ----------
    iterable_ds : iterable of mapping
        Each record must provide the keys ``'audio_id'`` and
        ``'real_or_fake'`` plus an ``'audio'`` mapping that holds
        ``'array'`` and ``'sampling_rate'``.

    Returns
    -------
    pandas.DataFrame
        One row per record with the columns ``audio_id``, ``audio_arr``,
        ``srate`` and ``real_or_fake``. The columns are present even when
        ``iterable_ds`` yields no records, so downstream code can rely on
        the schema.

    Raises
    ------
    KeyError
        If a record is missing one of the expected keys.
    """
    columns = ['audio_id', 'audio_arr', 'srate', 'real_or_fake']
    rows = [
        {
            'audio_id': audio['audio_id'],
            'audio_arr': audio['audio']['array'],
            'srate': audio['audio']['sampling_rate'],
            'real_or_fake': audio['real_or_fake'],
        }
        for audio in iterable_ds
    ]
    # Passing the column list keeps the schema stable for an empty input,
    # where pandas would otherwise return a frame with no columns at all.
    return pd.DataFrame(rows, columns=columns)

In [8]:
# Load only the first assigned partition to inspect what a partition looks like.
partition_id = partitions[0]
iterable_ds = load_parquet_data(partition_id=partition_id, dataset='wavefake')
# Materialise the records into a DataFrame (audio_id, audio_arr, srate, real_or_fake).
partition_df = generate_dataframe(iterable_ds)
# Last expression of the cell: display the frame.
partition_df

Unnamed: 0,audio_id,audio_arr,srate,real_or_fake
0,LJ024-0106,"[0.000244140625, -0.000213623046875, -0.000793...",22050,WF1
1,LJ024-0106,"[0.00030517578125, -0.000152587890625, -0.0005...",22050,WF2
2,LJ024-0106,"[0.000579833984375, 0.000701904296875, 6.10351...",22050,WF3
3,LJ024-0106,"[-0.001983642578125, -0.00189208984375, -0.001...",22050,WF4
4,LJ024-0106,"[6.103515625e-05, -0.00042724609375, 0.0003051...",22050,WF5
...,...,...,...,...
795,LJ025-0062,"[-0.001312255859375, -0.001251220703125, -0.00...",22050,WF4
796,LJ025-0062,"[0.0, 9.1552734375e-05, 0.00042724609375, 0.00...",22050,WF5
797,LJ025-0062,"[-0.003021240234375, -0.00238037109375, -0.003...",22050,WF6
798,LJ025-0062,"[-0.0003662109375, -0.0003662109375, -0.000579...",22050,WF7


#### Extract Features - Sample 2 rows from each partition

This is a quick smoke test to make sure everything works before running the full extraction.

In [9]:
# Low-level extractor restricted to three feature groups.
# NOTE(review): target_sr presumably is the rate audio gets resampled to
# (the partitions report srate 22050) — confirm in LowLevelFeatureExtractor.
audio_processor = LowLevelFeatureExtractor(
    target_sr=16000,
    include_only=['spectral', 'prosodic', 'voice_quality'],
)
# High-level extractor with its default settings.
feature_computer = HighLevelFeatureExtractor()

In [None]:
# Smoke test: push a few randomly chosen rows from the first partitions
# through the low-level -> high-level feature pipeline.
N_SMOKE_PARTITIONS = 20   # how many of the assigned partitions to probe
ROWS_PER_PARTITION = 2    # rows sampled from each probed partition
SAMPLE_SEED = 42          # fixed seed so a re-run samples the same rows

features_df_list = []

for partition_id in partitions[:N_SMOKE_PARTITIONS]:
    iterable_ds = load_parquet_data(partition_id=partition_id, dataset='wavefake')
    partition_df = generate_dataframe(iterable_ds)
    print(f'Partition: {partition_id}')

    # DataFrame.sample raises when asked for more rows than exist, so cap
    # the request at the partition size.
    sample_df = partition_df.sample(
        n=min(ROWS_PER_PARTITION, len(partition_df)),
        random_state=SAMPLE_SEED,
    )
    low_level_gen = audio_processor.low_level_feature_generator(sample_df)
    high_level_features = list(feature_computer.high_level_feature_generator(low_level_gen))
    high_level_features_df = pd.DataFrame(high_level_features)
    features_df_list.append(high_level_features_df)

# One frame with the features of every sampled row, re-indexed from 0.
features_df = pd.concat(features_df_list, ignore_index=True)
features_df

Exception ignored from cffi callback <function SoundFile._init_virtual_io.<locals>.vio_read at 0xffff67583550>:
Traceback (most recent call last):
  File "/usr/local/lib/python3.9/site-packages/soundfile.py", line 1244, in vio_read
    try:
KeyboardInterrupt: 


Partition: 0


Processing Audios:   0%|          | 0/2 [00:00<?, ?it/s]

### Using parallel processing to extract features for each partition and save them to CSV

In [9]:
def extract_features(row, audio_processor, feature_computer):
    """Compute the high-level features for a single row.

    Parameters
    ----------
    row
        The item handed to ``audio_processor.extract_features``.
    audio_processor
        Object exposing ``extract_features(row)``; produces the low-level
        features.
    feature_computer
        Object exposing ``compute_high_level_features(low_level_features)``.

    Returns
    -------
    Whatever ``feature_computer.compute_high_level_features`` returns for
    the low-level features of ``row``.
    """
    return feature_computer.compute_high_level_features(
        audio_processor.extract_features(row)
    )

# Extractor instances handed to every extract_features call in the parallel run.
# NOTE(review): target_sr presumably is the resampling rate — confirm in
# LowLevelFeatureExtractor.
audio_processor = LowLevelFeatureExtractor(
    target_sr=16000,
    include_only=['spectral', 'prosodic', 'voice_quality'],
)
feature_computer = HighLevelFeatureExtractor()

In [10]:
# Extract high-level features for every assigned partition and store one CSV
# per partition in features_dir.
N_JOBS = 5  # parallel joblib workers used within a partition

# DataFrame.to_csv does not create missing directories.
os.makedirs(features_dir, exist_ok=True)

for partition_id in tqdm(partitions, total=len(partitions), desc="Processing Partitions"):
    csv_file_name = os.path.join(features_dir, f'wavefake_features_partition_{partition_id}.csv')

    # A full pass takes hours and the remote download can time out, so the
    # loop is resumable: partitions that already have a CSV are skipped.
    # Delete the CSV to force a partition to be recomputed.
    if os.path.exists(csv_file_name):
        continue

    iterable_ds = load_parquet_data(partition_id=partition_id, dataset='wavefake')
    partition_df = generate_dataframe(iterable_ds)

    high_level_features = Parallel(n_jobs=N_JOBS)(
        delayed(extract_features)(row, audio_processor, feature_computer)
        for _, row in partition_df.iterrows()
    )
    high_level_feature_df = pd.DataFrame(high_level_features)

    # Write under a temporary name and rename afterwards, so an interrupted
    # run never leaves a truncated CSV that the skip check above would
    # mistake for a finished partition.
    tmp_file_name = csv_file_name + '.tmp'
    high_level_feature_df.to_csv(tmp_file_name, index=False)
    os.replace(tmp_file_name, csv_file_name)

Processing Partitions:   6%|▋         | 2/32 [09:23<2:21:20, 282.68s/it]'(ReadTimeoutError("HTTPSConnectionPool(host='cdn-lfs-us-1.huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: 80900a24-e26d-43c9-aae8-d4aabc181ee3)')' thrown while requesting GET https://huggingface.co/datasets/ajaykarthick/wavefake-audio/resolve/main/data/partition69-00000-of-00001.parquet
Retrying in 1s [Retry 1/5].
'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: 5f4874ba-0864-487c-8e46-750dfbcd0590)')' thrown while requesting GET https://huggingface.co/datasets/ajaykarthick/wavefake-audio/resolve/main/data/partition69-00000-of-00001.parquet
Retrying in 2s [Retry 2/5].
  stats['skew'] = skew(feature_array)
  stats['kurtosis'] = kurtosis(feature_array)
  stats['skew'] = skew(feature_array)
  stats['kurtosis'] = kurtosis(feature_array)
  stats['skew'] = skew(feature_array)
  stats['kurtosis'] = kurtosis(feature_