In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os

import numpy as np
import pandas as pd

from tqdm import tqdm
from joblib import Parallel, delayed

In [3]:
from utils import load_config, perform_statistical_tests
from data.load.data_loader import get_codecfake_audio_id_list, load_parquet_data
from features import (
    LowLevelFeatureExtractor, 
    HighLevelFeatureExtractor, 
    plot_low_level_feature_dist, 
    plot_high_level_feature_dist, 
    perform_pca_and_plot
)

In [4]:
config    = load_config()
cache_dir = config['data_paths']['codecfake']['cache_files']
features_dir = config['data_paths']['features']
audio_ids = get_codecfake_audio_id_list()

#### Codecfake - Partitions: 0, 1, ... 379

ajay: np.arange(0, 95) --> 0, 1, ..., 94

keerthana: np.arange(95, 190) --> 95, 96, ..., 189

Ruohe: np.arange(190, 285) --> 190, 191, ..., 284

Prudhvi: np.arange(285, 380) --> 285, 286, ..., 379

In [5]:
partitions = np.arange(0, 95)
partitions

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
       51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67,
       68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84,
       85, 86, 87, 88, 89, 90, 91, 92, 93, 94])

In [6]:
def generate_dataframe(iterable_ds):
    rows = []
    for audio in iterable_ds:
        audio_id  = audio['audio_id']
        audio_arr = audio['audio']['array']
        srate     = audio['audio']['sampling_rate']
        real_fake = audio['real_or_fake']
    
        rows.append({
            'audio_id': audio_id,
            'audio_arr': audio_arr,
            'srate': srate,
            'real_or_fake': real_fake
        })
    df = pd.DataFrame(rows)
    return df

In [7]:
partition_id = partitions[0]
iterable_ds = load_parquet_data(partition_id=partition_id)    
partition_df = generate_dataframe(iterable_ds)
partition_df

Unnamed: 0,audio_id,audio_arr,srate,real_or_fake
0,p225_002,"[-0.0045166015625, -0.00665283203125, -0.00607...",48000,R
1,p225_002,"[0.001953125, 0.001556396484375, 0.00164794921...",16000,F03
2,p225_002,"[-0.001220703125, -0.001129150390625, -0.00103...",24000,F04
3,p225_002,"[0.001861572265625, 0.001922607421875, 0.00195...",16000,F01
4,p225_002,"[-0.00335693359375, -0.0032958984375, -0.00320...",48000,F05
...,...,...,...,...
646,p225_191,"[0.00762939453125, 0.007568359375, 0.007446289...",48000,F05
647,p225_191,"[0.00787353515625, 0.00799560546875, 0.0081176...",16000,F01
648,p225_191,"[0.002777099609375, 0.001678466796875, 0.00183...",24000,F04
649,p225_191,"[0.00311279296875, 0.003204345703125, 0.002960...",16000,F03


#### Extract Features - Sample 2 from each of 95 partitions

just to make sure everything works fine

In [9]:
audio_processor = LowLevelFeatureExtractor(target_sr=16000, include_only=['spectral', 'prosodic', 'voice_quality'])
feature_computer = HighLevelFeatureExtractor()

In [16]:
features_df_list = []

for partition_id in partitions[:20]:
    iterable_ds = load_parquet_data(partition_id=partition_id)    
    partition_df = generate_dataframe(iterable_ds)
    print(f'Partition: {partition_id}')
    
    low_level_gen        = audio_processor.low_level_feature_generator(partition_df.sample(2))
    high_level_features  = list(feature_computer.high_level_feature_generator(low_level_gen))
    high_level_features_df = pd.DataFrame(high_level_features)
    features_df_list.append(high_level_features_df)
    


features_df = pd.concat(features_df_list, ignore_index=True)
features_df

Partition: 0


Processing Audios: 100%|██████████| 2/2 [00:03<00:00,  1.74s/it]


Partition: 1


Processing Audios: 100%|██████████| 2/2 [00:02<00:00,  1.29s/it]


Partition: 2


Processing Audios: 100%|██████████| 2/2 [00:02<00:00,  1.21s/it]


Partition: 3


Processing Audios: 100%|██████████| 2/2 [00:02<00:00,  1.04s/it]


Partition: 4


Processing Audios: 100%|██████████| 2/2 [00:02<00:00,  1.00s/it]


Partition: 5


Processing Audios: 100%|██████████| 2/2 [00:01<00:00,  1.36it/s]


Partition: 6


Processing Audios: 100%|██████████| 2/2 [00:02<00:00,  1.32s/it]


Partition: 7


Processing Audios: 100%|██████████| 2/2 [00:01<00:00,  1.25it/s]


Partition: 8


Processing Audios: 100%|██████████| 2/2 [00:03<00:00,  1.54s/it]


Partition: 9


Processing Audios: 100%|██████████| 2/2 [00:02<00:00,  1.08s/it]


Partition: 10


Processing Audios: 100%|██████████| 2/2 [00:01<00:00,  1.03it/s]


Partition: 11


Processing Audios: 100%|██████████| 2/2 [00:01<00:00,  1.14it/s]


Partition: 12


Processing Audios: 100%|██████████| 2/2 [00:01<00:00,  1.33it/s]


Partition: 13


Processing Audios: 100%|██████████| 2/2 [00:01<00:00,  1.27it/s]


Partition: 14


Processing Audios: 100%|██████████| 2/2 [00:03<00:00,  1.66s/it]


Partition: 15


Processing Audios: 100%|██████████| 2/2 [00:01<00:00,  1.34it/s]


Partition: 16


Processing Audios: 100%|██████████| 2/2 [00:02<00:00,  1.16s/it]


Partition: 17


Processing Audios: 100%|██████████| 2/2 [00:01<00:00,  1.22it/s]


Partition: 18


Processing Audios: 100%|██████████| 2/2 [00:01<00:00,  1.19it/s]


Partition: 19


Processing Audios: 100%|██████████| 2/2 [00:01<00:00,  1.46it/s]


Unnamed: 0,audio_id,real_or_fake,spectral_centroid_mean,spectral_centroid_std,spectral_centroid_var,spectral_centroid_min,spectral_centroid_max,spectral_centroid_range,spectral_centroid_25th_percentile,spectral_centroid_50th_percentile,...,shimmer_dda,hnr,voicedcount,npause,originaldur,intensity_duration,speakingrate,articulationrate,asd,totalpauseduration
0,p225_141,F01,1760.885028,1260.712116,1589395.0,324.123887,5411.096884,5086.972997,1011.13756,1250.518331,...,0.079115,12.578021,6,0,2.214125,2.214125,2.709874,3.599145,0.277844,0.547063
1,p225_011,R,1232.131901,969.204525,939357.4,217.710361,5285.070014,5067.359652,558.599979,1031.901608,...,0.089444,13.588441,27,2,9.346583,9.346583,2.888756,5.175099,0.193233,4.129292
2,p225_266,R,952.156172,730.026998,532939.4,246.691421,3690.207003,3443.515581,422.594279,762.842969,...,0.076208,15.58281,10,0,4.995,4.995,2.002002,4.92126,0.2032,2.963
3,p225_285,R,818.101182,723.269819,523119.2,246.802688,4825.926011,4579.123323,342.743295,490.593613,...,0.130197,8.813912,6,0,3.202646,3.202646,1.873451,4.518072,0.221333,1.874646
4,p226_037,F04,1255.187373,1087.743718,1183186.0,268.382004,4804.149267,4535.767263,414.227471,957.466576,...,0.149483,9.066281,14,0,4.013333,4.013333,3.488372,4.959849,0.201619,1.190667
5,p226_005,F02,1628.603585,1059.546861,1122640.0,255.655158,4905.029676,4649.374519,937.917931,1317.533306,...,0.06698,12.022321,21,1,8.3,8.3,2.53012,3.586066,0.278857,2.444
6,p226_226,F04,1019.151953,881.79126,777555.8,178.978026,4253.805128,4074.827102,376.075692,794.518327,...,0.141603,9.07909,13,0,5.346667,5.346667,2.431421,4.48895,0.222769,2.450667
7,p226_161,F01,738.922318,682.115931,465282.1,228.152214,4669.957391,4441.805177,351.309092,472.060825,...,0.088126,13.593539,5,0,3.394125,3.394125,1.473134,3.90625,0.256,2.114125
8,p226_358,F06,1085.024772,1086.552258,1180596.0,216.651769,4472.004884,4255.353115,432.085595,614.544676,...,0.128125,10.711315,11,0,4.04,4.04,2.722772,4.841549,0.206545,1.768
9,p226_351,F01,1190.719311,1017.849179,1036017.0,220.781299,5128.152079,4907.37078,419.263093,884.581989,...,0.138231,9.864282,17,2,6.074125,6.074125,2.798757,4.331141,0.230886,2.149063


### Using Parallel Processing to extract features for each partitions and save

In [18]:
def extract_features(row, audio_processor, feature_computer):
    low_level_features = audio_processor.extract_features(row)
    high_level_features = feature_computer.compute_high_level_features(low_level_features)
    return high_level_features

audio_processor  = LowLevelFeatureExtractor(target_sr=16000, include_only=['spectral', 'prosodic', 'voice_quality'])
feature_computer = HighLevelFeatureExtractor()

In [24]:
for partition_id in tqdm(partitions, total=len(partitions), desc="Processing Partitions"):

    iterable_ds = load_parquet_data(partition_id=partition_id)    
    partition_df = generate_dataframe(iterable_ds)
    
    high_level_features   = Parallel(n_jobs=5)(
        delayed(extract_features)(row, audio_processor, feature_computer) 
        for _, row in partition_df.iloc[:10].iterrows() # REMOVE .iloc[:10]
    )
    high_level_feature_df = pd.DataFrame(high_level_features)
    high_level_feature_df.to_csv(os.path.join(features_dir, f'features_partition_{partition_id}.csv'), index=False)

Processing Partitions: 100%|██████████| 95/95 [10:25<00:00,  6.59s/it]
