In [1]:
# Enable IPython's autoreload extension in mode 2, which re-imports modules
# before each cell executes so edits to project .py files are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os

import numpy as np
import pandas as pd

from tqdm import tqdm
from joblib import Parallel, delayed

In [3]:
from utils import load_config, perform_statistical_tests
from data.load.data_loader import get_codecfake_audio_id_list, load_parquet_data
from features import (
    LowLevelFeatureExtractor, 
    HighLevelFeatureExtractor, 
    plot_low_level_feature_dist, 
    plot_high_level_feature_dist, 
    perform_pca_and_plot
)

In [4]:
# Load the project configuration and pull out the paths this notebook uses.
config    = load_config()
# Codecfake cache location from the config (not referenced again in the cells shown here).
cache_dir = config['data_paths']['codecfake']['cache_files']
# Directory that the per-partition feature CSVs are written into further down.
features_dir = config['data_paths']['features']
# Presumably the list of Codecfake audio ids (inferred from the helper's name) — TODO confirm.
audio_ids = get_codecfake_audio_id_list()

#### Codecfake - Partitions: 0, 1, ..., 379

Ajay: np.arange(0, 95) --> 0, 1, ..., 94

Keerthana: np.arange(95, 190) --> 95, 96, ..., 189

Ruohe: np.arange(190, 285) --> 190, 191, ..., 284

Prudhvi: np.arange(285, 380) --> 285, 286, ..., 379

In [5]:
# Every partition index, 0 through 379 (np.arange excludes the stop value).
partitions = np.arange(0, 380)
partitions

array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
        13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,
        26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,
        39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,
        52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,
        65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,
        78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,
        91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103,
       104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116,
       117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
       130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142,
       143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155,
       156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168,
       169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 18

In [6]:
def generate_dataframe(iterable_ds):
    """Flatten an iterable of audio records into a pandas DataFrame.

    Each record must provide the keys ``'audio_id'`` and ``'real_or_fake'``
    plus a nested ``'audio'`` mapping holding ``'array'`` and
    ``'sampling_rate'``.

    Returns a DataFrame with one row per record and the columns
    ``audio_id``, ``audio_arr``, ``srate`` and ``real_or_fake``.
    """
    records = [
        {
            'audio_id': sample['audio_id'],
            'audio_arr': sample['audio']['array'],
            'srate': sample['audio']['sampling_rate'],
            'real_or_fake': sample['real_or_fake'],
        }
        for sample in iterable_ds
    ]
    return pd.DataFrame(records)

In [7]:
# Sanity check: load the first partition, convert it with generate_dataframe,
# and display the resulting frame.
partition_id = partitions[0]
iterable_ds = load_parquet_data(partition_id=partition_id)    
partition_df = generate_dataframe(iterable_ds)
partition_df

Unnamed: 0,audio_id,audio_arr,srate,real_or_fake
0,p225_002,"[-0.0045166015625, -0.00665283203125, -0.00607...",48000,R
1,p225_002,"[0.001953125, 0.001556396484375, 0.00164794921...",16000,F03
2,p225_002,"[-0.001220703125, -0.001129150390625, -0.00103...",24000,F04
3,p225_002,"[0.001861572265625, 0.001922607421875, 0.00195...",16000,F01
4,p225_002,"[-0.00335693359375, -0.0032958984375, -0.00320...",48000,F05
...,...,...,...,...
646,p225_191,"[0.00762939453125, 0.007568359375, 0.007446289...",48000,F05
647,p225_191,"[0.00787353515625, 0.00799560546875, 0.0081176...",16000,F01
648,p225_191,"[0.002777099609375, 0.001678466796875, 0.00183...",24000,F04
649,p225_191,"[0.00311279296875, 0.003204345703125, 0.002960...",16000,F03


#### Extract Features - Sample 2 audios from each of the first 20 partitions

This is a quick smoke test, just to make sure everything works fine before running the full extraction.

In [8]:
# Build the two feature extractors used by the sampling loop in the next cell.
# NOTE(review): target_sr=16000 presumably is the rate audio is resampled to
# (the partitions mix 16/24/48 kHz clips), and include_only presumably limits
# extraction to the three listed feature groups — confirm in LowLevelFeatureExtractor.
audio_processor = LowLevelFeatureExtractor(target_sr=16000, include_only=['spectral', 'prosodic', 'voice_quality'])
feature_computer = HighLevelFeatureExtractor()

In [9]:
# Smoke test: extract high-level features for a few randomly chosen audios
# from each of the leading partitions and stack them into one DataFrame.
N_SMOKE_PARTITIONS = 20      # number of leading partitions to probe
SAMPLES_PER_PARTITION = 2    # audios drawn from each partition
SAMPLE_SEED = 42             # base seed so the drawn audios are reproducible

features_df_list = []

for partition_id in partitions[:N_SMOKE_PARTITIONS]:
    iterable_ds = load_parquet_data(partition_id=partition_id)
    partition_df = generate_dataframe(iterable_ds)
    print(f'Partition: {partition_id}')

    # DataFrame.sample raises ValueError when asked for more rows than exist,
    # so cap the request at the partition size. The seed is offset by the
    # partition id so each partition gets a different but repeatable draw.
    sample_df = partition_df.sample(
        n=min(SAMPLES_PER_PARTITION, len(partition_df)),
        random_state=SAMPLE_SEED + int(partition_id),
    )

    low_level_gen = audio_processor.low_level_feature_generator(sample_df)
    high_level_features = list(feature_computer.high_level_feature_generator(low_level_gen))
    high_level_features_df = pd.DataFrame(high_level_features)
    features_df_list.append(high_level_features_df)

# One row per sampled audio across all probed partitions.
features_df = pd.concat(features_df_list, ignore_index=True)
features_df

Partition: 0


Processing Audios: 100%|██████████| 2/2 [00:02<00:00,  1.36s/it]


Partition: 1


Processing Audios: 100%|██████████| 2/2 [00:01<00:00,  1.31it/s]


Partition: 2


Processing Audios: 100%|██████████| 2/2 [00:01<00:00,  1.53it/s]


Partition: 3


Processing Audios: 100%|██████████| 2/2 [00:01<00:00,  1.23it/s]


Partition: 4


Processing Audios: 100%|██████████| 2/2 [00:01<00:00,  1.22it/s]


Partition: 5


Processing Audios: 100%|██████████| 2/2 [00:01<00:00,  1.06it/s]


Partition: 6


Processing Audios: 100%|██████████| 2/2 [00:01<00:00,  1.11it/s]


Partition: 7


Processing Audios: 100%|██████████| 2/2 [00:01<00:00,  1.52it/s]


Partition: 8


Processing Audios: 100%|██████████| 2/2 [00:01<00:00,  1.04it/s]


Partition: 9


Processing Audios: 100%|██████████| 2/2 [00:01<00:00,  1.12it/s]


Partition: 10


Processing Audios: 100%|██████████| 2/2 [00:02<00:00,  1.14s/it]


Partition: 11


Processing Audios: 100%|██████████| 2/2 [00:01<00:00,  1.27it/s]


Partition: 12


Processing Audios: 100%|██████████| 2/2 [00:00<00:00,  2.26it/s]


Partition: 13


Processing Audios: 100%|██████████| 2/2 [00:00<00:00,  2.08it/s]


Partition: 14


Processing Audios: 100%|██████████| 2/2 [00:01<00:00,  1.64it/s]


Partition: 15


Processing Audios: 100%|██████████| 2/2 [00:01<00:00,  1.65it/s]


Partition: 16


Processing Audios: 100%|██████████| 2/2 [00:01<00:00,  1.02it/s]


Partition: 17


Processing Audios: 100%|██████████| 2/2 [00:01<00:00,  1.39it/s]


Partition: 18


Processing Audios: 100%|██████████| 2/2 [00:01<00:00,  1.28it/s]


Partition: 19


Processing Audios: 100%|██████████| 2/2 [00:00<00:00,  2.06it/s]


Unnamed: 0,audio_id,real_or_fake,spectral_centroid_mean,spectral_centroid_std,spectral_centroid_var,spectral_centroid_min,spectral_centroid_max,spectral_centroid_range,spectral_centroid_25th_percentile,spectral_centroid_50th_percentile,...,shimmer_dda,hnr,voicedcount,npause,originaldur,intensity_duration,speakingrate,articulationrate,asd,totalpauseduration
0,p225_175,F05,1236.966535,1187.713811,1410664.0,233.202605,5066.872682,4833.670077,381.041369,863.791052,...,0.123237,10.474553,10,1,5.25,5.25,1.904762,4.164931,0.2401,2.849
1,p225_159,F05,964.970317,1010.546998,1021205.0,262.470468,5561.836452,5299.365985,401.386971,571.320703,...,0.072911,13.659824,5,0,3.71875,3.71875,1.344538,3.149854,0.317475,2.131375
2,p225_280,F03,1123.309759,1065.098364,1134435.0,275.499557,5114.883094,4839.383538,455.878277,816.913812,...,0.072421,15.086141,12,1,4.96,4.96,2.419355,5.0,0.2,2.56
3,p225_200,F05,1002.674268,600.053906,360064.7,201.528061,2960.134335,2758.606273,361.151247,1119.505211,...,0.093707,11.575317,5,0,1.86875,1.86875,2.675585,4.882812,0.2048,0.84475
4,p226_039,F05,1115.349031,956.951665,915756.5,299.835299,4425.381884,4125.546586,433.152356,901.639495,...,0.087269,11.790821,2,0,2.4375,2.4375,0.820513,2.45098,0.408,1.6215
5,p226_079,F04,775.612573,945.919565,894763.8,194.338628,4305.978849,4111.640221,259.872071,365.496821,...,0.136845,13.875627,3,0,2.826667,2.826667,1.061321,2.556818,0.391111,1.653333
6,p226_199,F02,715.868707,451.718799,204049.9,250.932094,2584.60354,2333.671446,448.589117,591.13104,...,0.080928,15.812191,5,0,3.0,3.0,1.666667,2.886836,0.3464,1.268
7,p226_230,F06,939.15039,829.346646,687815.9,256.543247,4417.321955,4160.778708,419.071088,594.48634,...,0.089036,12.673257,11,0,6.72,6.72,1.636905,3.696237,0.270545,3.744
8,p226_322,F02,1180.653029,1068.766171,1142261.0,208.545348,4121.87463,3913.329282,346.130363,908.033001,...,0.090745,10.4539,7,1,3.08,3.08,2.272727,3.113879,0.321143,0.832
9,p226_256,F03,936.69977,917.875898,842496.2,192.346979,4032.491059,3840.14408,320.824147,562.546959,...,0.10389,12.595169,8,0,4.72,4.72,1.694915,3.448276,0.29,2.4


### Using parallel processing to extract features for each partition and save them as CSV

In [7]:
def extract_features(row, audio_processor, feature_computer):
    """Run the two-stage feature pipeline on a single row.

    ``row`` is handed to ``audio_processor.extract_features``; that result is
    then passed to ``feature_computer.compute_high_level_features``, whose
    return value is returned unchanged.
    """
    return feature_computer.compute_high_level_features(
        audio_processor.extract_features(row)
    )

# Extractor instances passed to extract_features by the parallel loop below.
# They are built with the same arguments as the instances created for the
# sampling section earlier in the notebook.
audio_processor  = LowLevelFeatureExtractor(target_sr=16000, include_only=['spectral', 'prosodic', 'voice_quality'])
feature_computer = HighLevelFeatureExtractor()

In [9]:
# Extract high-level features for every audio in each partition (rows are
# processed in parallel) and write one CSV per partition.

# Resume point: partitions below this index are skipped outright, without
# loading their parquet data. Set to 0 to consider every partition.
START_PARTITION = 364

# DataFrame.to_csv does not create missing parent directories.
os.makedirs(features_dir or '.', exist_ok=True)

for partition_id in tqdm(partitions, total=len(partitions), desc="Processing Partitions"):
    if partition_id < START_PARTITION:
        continue
    csv_file_name = os.path.join(features_dir, f'features_partition_{partition_id}.csv')

    iterable_ds = load_parquet_data(partition_id=partition_id)
    partition_df = generate_dataframe(iterable_ds)

    # A CSV with as many rows as the partition is treated as complete and is
    # left alone; a missing or shorter file is recomputed and overwritten.
    if os.path.exists(csv_file_name):
        existing_df = pd.read_csv(csv_file_name)
        if existing_df.shape[0] == partition_df.shape[0]:
            continue

    high_level_features = Parallel(n_jobs=5)(
        delayed(extract_features)(row, audio_processor, feature_computer)
        for _, row in partition_df.iterrows()
    )
    high_level_feature_df = pd.DataFrame(high_level_features)
    high_level_feature_df.to_csv(csv_file_name, index=False)
    

  stats['skew'] = skew(feature_array)
  stats['kurtosis'] = kurtosis(feature_array)
  stats['skew'] = skew(feature_array)
  stats['kurtosis'] = kurtosis(feature_array)
  stats['skew'] = skew(feature_array)
  stats['kurtosis'] = kurtosis(feature_array)
  stats['skew'] = skew(feature_array)
  stats['kurtosis'] = kurtosis(feature_array)
  stats['skew'] = skew(feature_array)
  stats['kurtosis'] = kurtosis(feature_array)
  stats['skew'] = skew(feature_array)
  stats['kurtosis'] = kurtosis(feature_array)
  stats['skew'] = skew(feature_array)
  stats['kurtosis'] = kurtosis(feature_array)
  stats['skew'] = skew(feature_array)
  stats['kurtosis'] = kurtosis(feature_array)
Processing Partitions: 100%|██████████| 380/380 [38:43<00:00,  6.12s/it] 


In [16]:
from datasets import load_dataset

In [17]:
# Open a single partition file (partition350) directly from its Hugging Face
# URL; streaming=True is passed so records are read while iterating.
parquet_file = "https://huggingface.co/datasets/ajaykarthick/codecfake-audio/resolve/main/data/partition350-00000-of-00001.parquet"
dataset = load_dataset("parquet", data_files={'train': parquet_file}, split="train", streaming=True)

In [18]:
# Iterate over the streamed partition and print every record.
# NOTE(review): the saved output of this cell stops partway through with
# "LibsndfileError: Internal psf_fseek() failed.", so the loop did not finish.
for ex in dataset:
    print(ex)

{'audio': {'path': 'F06_p345_316.flac', 'array': array([ 0.00262451,  0.0020752 ,  0.00253296, ..., -0.00457764,
       -0.00430298, -0.00442505]), 'sampling_rate': 16000}, 'audio_id': 'p345_316', 'real_or_fake': 'F06'}
{'audio': {'path': 'F02_p345_316.flac', 'array': array([ 0.00189209,  0.00192261,  0.00170898, ..., -0.00134277,
       -0.00131226, -0.00119019]), 'sampling_rate': 16000}, 'audio_id': 'p345_316', 'real_or_fake': 'F02'}
{'audio': {'path': 'F05_p345_316.flac', 'array': array([-0.0015564 , -0.0017395 , -0.00143433, ...,  0.00015259,
       -0.00021362,  0.00012207]), 'sampling_rate': 48000}, 'audio_id': 'p345_316', 'real_or_fake': 'F05'}
{'audio': {'path': 'F01_p345_316.flac', 'array': array([-0.00384521, -0.00375366, -0.00357056, ..., -0.00344849,
       -0.00341797, -0.00335693]), 'sampling_rate': 16000}, 'audio_id': 'p345_316', 'real_or_fake': 'F01'}
{'audio': {'path': 'F04_p345_316.flac', 'array': array([-0.00057983, -0.00064087, -0.00064087, ..., -0.0027771 ,
       

LibsndfileError: Internal psf_fseek() failed.

In [12]:
# Remove the `dataset` name from the notebook namespace.
del dataset