In [1]:
# Enable IPython's autoreload extension; mode 2 re-imports modules before each
# cell executes, so edits to the imported project modules take effect without a kernel restart.
%load_ext autoreload
%autoreload 2

In [33]:
import os

import numpy as np
import pandas as pd

from tqdm import tqdm
from joblib import Parallel, delayed

In [3]:
from utils import load_config, perform_statistical_tests
from data.load.data_loader import get_codecfake_audio_id_list, load_parquet_data
from features import (
    LowLevelFeatureExtractor, 
    HighLevelFeatureExtractor, 
    plot_low_level_feature_dist, 
    plot_high_level_feature_dist, 
    perform_pca_and_plot
)

In [4]:
# Resolve the project configuration once and pull out the paths used below.
config = load_config()
data_paths = config['data_paths']

cache_dir = data_paths['codecfake']['cache_files']  # Codecfake cache location
features_dir = data_paths['features']               # where extracted features are written
audio_ids = get_codecfake_audio_id_list()

#### Codecfake - Partitions: 0, 1, ..., 379

In [17]:
# Quick arithmetic check: partition ids 0..379 amount to 380 partitions in total.
285 + 95

380

Ajay: np.arange(0, 95) --> 0, 1, ..., 94

Keerthana: np.arange(95, 190) --> 95, 96, ..., 189

Ruohe: np.arange(190, 285) --> 190, 191, ..., 284

Prudhvi: np.arange(285, 380) --> 285, 286, ..., 379

In [19]:
# Partition ids processed in this run: 0 through 94 (the stop value 95 is exclusive).
partitions = np.arange(95)
partitions

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
       51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67,
       68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84,
       85, 86, 87, 88, 89, 90, 91, 92, 93, 94])

In [20]:
def generate_dataframe(iterable_ds):
    """Flatten an iterable of audio records into a pandas DataFrame.

    Parameters
    ----------
    iterable_ds : iterable of mapping
        Each record must expose ``'audio_id'``, ``'real_or_fake'`` and an
        ``'audio'`` mapping holding ``'array'`` and ``'sampling_rate'``.

    Returns
    -------
    pandas.DataFrame
        One row per record with the columns ``audio_id``, ``audio_arr``,
        ``srate`` and ``real_or_fake``. An empty iterable yields an empty
        frame that still carries those four columns.

    Raises
    ------
    KeyError
        If a dict-like record is missing one of the keys listed above.
    """
    columns = ['audio_id', 'audio_arr', 'srate', 'real_or_fake']

    rows = [
        {
            'audio_id': audio['audio_id'],
            'audio_arr': audio['audio']['array'],
            'srate': audio['audio']['sampling_rate'],
            'real_or_fake': audio['real_or_fake'],
        }
        for audio in iterable_ds
    ]

    # Build the frame from the rows collected here. The earlier version ended
    # with `return df`, which is not defined in this function: it raised
    # NameError on a fresh kernel, or returned a stale notebook-global frame.
    return pd.DataFrame(rows, columns=columns)

In [24]:
# Load every selected partition, turn each one into a DataFrame, and stack
# the per-partition frames into a single frame with a fresh 0..n-1 index.
df_list = [
    generate_dataframe(load_parquet_data(partition_id=partition_id))
    for partition_id in tqdm(partitions, total=len(partitions), desc="Processing partitions")
]

df = pd.concat(df_list, ignore_index=True)

df

Processing partitions: 100%|██████████| 95/95 [07:13<00:00,  4.56s/it]


Unnamed: 0,audio_id,audio_arr,srate,real_or_fake
0,p225_002,"[-0.0045166015625, -0.00665283203125, -0.00607...",48000,R
1,p225_002,"[0.001953125, 0.001556396484375, 0.00164794921...",16000,F03
2,p225_002,"[-0.001220703125, -0.001129150390625, -0.00103...",24000,F04
3,p225_002,"[0.001861572265625, 0.001922607421875, 0.00195...",16000,F01
4,p225_002,"[-0.00335693359375, -0.0032958984375, -0.00320...",48000,F05
...,...,...,...,...
61840,p225_191,"[0.00762939453125, 0.007568359375, 0.007446289...",48000,F05
61841,p225_191,"[0.00787353515625, 0.00799560546875, 0.0081176...",16000,F01
61842,p225_191,"[0.002777099609375, 0.001678466796875, 0.00183...",24000,F04
61843,p225_191,"[0.00311279296875, 0.003204345703125, 0.002960...",16000,F03


#### Extract Features

In [27]:
# Low-level extractor restricted to three feature groups at a 16 kHz target
# rate, plus the high-level extractor that consumes its output.
audio_processor = LowLevelFeatureExtractor(
    target_sr=16000,
    include_only=['spectral', 'prosodic', 'voice_quality'],
)
feature_computer = HighLevelFeatureExtractor()

In [28]:
# Try the two-stage pipeline on the first 20 rows before running it at scale.
preview_rows = df.head(20)
low_level_gen = audio_processor.low_level_feature_generator(preview_rows)
high_level_iter = feature_computer.high_level_feature_generator(low_level_gen)

# Materialise the results so they can be tabulated.
high_level_features = list(high_level_iter)
high_level_features_df = pd.DataFrame(high_level_features)
high_level_features_df

Processing Audios: 100%|██████████| 20/20 [00:23<00:00,  1.19s/it]


Unnamed: 0,audio_id,real_or_fake,spectral_centroid_mean,spectral_centroid_std,spectral_centroid_var,spectral_centroid_min,spectral_centroid_max,spectral_centroid_range,spectral_centroid_25th_percentile,spectral_centroid_50th_percentile,...,shimmer_dda,hnr,voicedcount,npause,originaldur,intensity_duration,speakingrate,articulationrate,asd,totalpauseduration
0,p225_002,R,1407.277479,1139.511158,1298486.0,273.979621,4860.736866,4586.757245,538.847218,1075.098178,...,0.060432,13.445119,9,0,4.058667,4.058667,2.217477,3.87931,0.257778,1.738667
1,p225_002,F03,1269.150323,1085.214354,1177690.0,319.092868,4752.988182,4433.895314,501.144619,872.714022,...,0.062136,16.470543,8,0,4.08,4.08,1.960784,3.424658,0.292,1.744
2,p225_002,F04,1336.049769,1171.33156,1372018.0,247.256673,4922.406802,4675.150129,433.114366,997.420786,...,0.123465,12.076429,9,0,4.066667,4.066667,2.213115,3.87931,0.257778,1.746667
3,p225_002,F01,1540.345739,1212.507952,1470176.0,257.468837,5074.111834,4816.642997,688.193002,1217.335638,...,0.069833,15.256534,9,0,3.394125,3.394125,2.651641,3.90625,0.256,1.090125
4,p225_002,F05,1414.71784,1149.669533,1321740.0,295.103707,4982.683428,4687.579722,604.047516,1091.148701,...,0.097402,12.662761,9,0,4.0625,4.0625,2.215385,3.85274,0.259556,1.7265
5,p225_002,F02,1514.202295,1179.175846,1390456.0,275.770102,5091.169775,4815.399673,634.001211,1128.008383,...,0.091923,15.653361,8,0,4.06,4.06,1.970443,3.424658,0.292,1.724
6,p225_002,F06,1269.150323,1085.214354,1177690.0,319.092868,4752.988182,4433.895314,501.144619,872.714022,...,0.062136,16.470543,8,0,4.08,4.08,1.960784,3.424658,0.292,1.744
7,p225_003,R,1724.88205,1358.306823,1844997.0,240.330124,5445.572581,5205.242457,856.539372,1284.577629,...,0.095516,11.531919,20,2,7.809479,7.809479,2.56099,3.918495,0.2552,2.705479
8,p225_003,F04,1686.359745,1392.656929,1939493.0,168.94052,5577.680477,5408.739957,814.810942,1251.441898,...,0.144102,10.556453,20,1,7.813333,7.813333,2.559727,3.753754,0.2664,2.485333
9,p225_003,F03,1700.782291,1308.502013,1712178.0,289.027907,5388.677656,5099.649749,845.365038,1235.607536,...,0.075705,14.383896,20,2,7.84,7.84,2.55102,4.125413,0.2424,2.992


### Using Parallel Processing

In [34]:
def extract_features(row, audio_processor, feature_computer):
    """Compute the high-level features for a single row.

    The row is handed to ``audio_processor.extract_features`` and the result
    is passed straight to ``feature_computer.compute_high_level_features``,
    whose return value is returned unchanged.
    """
    return feature_computer.compute_high_level_features(
        audio_processor.extract_features(row)
    )

# Fresh extractor instances for the parallel run.
audio_processor = LowLevelFeatureExtractor(
    target_sr=16000,
    include_only=['spectral', 'prosodic', 'voice_quality'],
)
feature_computer = HighLevelFeatureExtractor()

# One delayed task per row for the first 1000 rows, run with n_jobs=5.
sample_tasks = (
    delayed(extract_features)(row, audio_processor, feature_computer)
    for _, row in df.iloc[:1000].iterrows()
)
high_level_features = Parallel(n_jobs=5)(sample_tasks)

# Tabulate the per-row results and save them under the features directory.
high_level_feature_df = pd.DataFrame(high_level_features)
sample_csv_path = os.path.join(features_dir, 'sample_features_1000.csv')
high_level_feature_df.to_csv(sample_csv_path, index=False)