In [6]:
import os
import librosa
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import librosa.display
import scipy.stats as stats
import re
import soundfile as sf
from tensorflow.keras.layers import Input, Conv2D, MaxPooling2D, Flatten, Dense, Subtract, concatenate
from tensorflow.keras.models import Model
from sklearn.utils import resample
import ast
import warnings
import time
warnings.filterwarnings('ignore')

In [7]:
# Directories for audio files
FAKE_DIR = '/kaggle/input/deep-voice-deepfake-voice-recognition/KAGGLE/AUDIO/FAKE/'
REAL_DIR = '/kaggle/input/deep-voice-deepfake-voice-recognition/KAGGLE/AUDIO/REAL/'

In [None]:
# Function to extract audio features
def extract_features(file_path,fake=False):
    y, sr = librosa.load(file_path, sr=None)
    
    # Spectral features
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13, hop_length=512, n_fft=2048)
    spectral_centroid = librosa.feature.spectral_centroid(y=y, sr=sr).mean()
    formants = librosa.feature.melspectrogram(y=y, sr=sr).mean(axis=1)

    # Compute delta and delta-delta features
    delta_mfcc = librosa.feature.delta(mfcc).mean(axis=1)
    delta2_mfcc = librosa.feature.delta(mfcc, order=2).mean(axis=1)
    
        
 
    
    # Temporal features
    zero_crossing_rate = librosa.feature.zero_crossing_rate(y).mean()
    short_time_energy = np.sum(librosa.feature.rms(y=y)**2)
    
    # Phase-based features
    phase_coherence = librosa.feature.spectral_contrast(y=y, sr=sr).mean(axis=1)
    
    # Prosodic features
    pitch_contour = librosa.yin(y, fmin=50, fmax=500).mean()
    speech_rate = np.mean(librosa.beat.tempo(y=y, sr=sr))
    
    
    if fake:
        transformed_filename = file_path.split('/')[-1]
        original_speaker, transformed_speaker = transformed_filename.split('-to-')
        transformed_speaker = transformed_speaker[:-4]
       
    else:
        original_speaker = file_path.split('/')[-1][:-13]
        transformed_speaker = 'NA'
        
    
    # Combine all features into a dictionary
    features = {
        'mfcc': mfcc,
        'spectral_centroid': spectral_centroid,
        'formants': formants,
        'zero_crossing_rate': zero_crossing_rate,
        'short_time_energy': short_time_energy,
        'phase_coherence': phase_coherence,
        'pitch_contour': pitch_contour,
        'speech_rate': speech_rate,
        'delta_mfcc': delta_mfcc,
        'delta2_mfcc': delta2_mfcc,
        'Speaker': original_speaker,
        'Transformed': transformed_speaker,
    }
    #print(features)
    return features

# Extract features from both fake and real audio files
def process_audio_files(directory,fake=False):
    features_list = []
    
    for filename in os.listdir(directory):
        if filename.endswith(".wav"):  # Assuming .wav files
            file_path = os.path.join(directory, filename)
            features = extract_features(file_path,fake)
            features_list.append(features)
        
    return features_list



# Process FAKE and REAL directories
fake_features = process_audio_files(FAKE_DIR,fake=True)
real_features = process_audio_files(REAL_DIR)

# Convert to DataFrame for analysis
fake_df = pd.DataFrame(fake_features)
real_df = pd.DataFrame(real_features)

# Add labels to differentiate fake and real audio
fake_df['File Type'] = 0
real_df['File Type'] = 1

final_df = pd.concat([fake_df, real_df], ignore_index=True)


In [11]:
final_df = pd.read_pickle('/kaggle/input/voice-features-1/features.pkl')

In [12]:
final_df.tail(10)

Unnamed: 0,mfcc,spectral_centroid,formants,zero_crossing_rate,short_time_energy,phase_coherence,pitch_contour,speech_rate,delta_mfcc,delta2_mfcc,Speaker,Transformed,File Type
54,"[[-325.60632, -285.53616, -268.17792, -256.396...",2682.568391,"[0.19327721, 0.7587629, 4.010507, 6.967394, 3....",0.069659,217.964142,"[15.493123930761197, 10.177339482137931, 13.76...",109.752943,129.199219,"[-0.0051869457, -0.0016825453, 0.0009549859, -...","[-0.0009379449, -0.0011522848, 5.4225373e-05, ...",musk,biden,0
55,"[[-299.77115, -280.1516, -290.53467, -287.9077...",2829.887145,"[0.0040864944, 1.1975402, 8.480825, 12.862288,...",0.07898,42.385757,"[19.110314978556588, 11.150577074092977, 14.53...",112.429616,139.674831,"[-0.032758456, -0.0067037935, 0.009932622, 0.0...","[-0.00025885523, 0.0008174253, 0.0012068135, 0...",margot,linus,0
56,"[[-345.81134, -361.78928, -369.84454, -368.699...",2064.594996,"[0.000600889, 0.005932241, 0.041626982, 0.0812...",0.05062,148.47406,"[17.57745221388292, 12.923688288831638, 16.612...",134.449903,122.282609,"[0.0014896352, -0.0007954228, -7.0518356e-05, ...","[4.134787e-06, -0.0008268045, -0.00019090888, ...",biden,,1
57,"[[-305.52423, -295.35477, -303.89584, -306.010...",2809.718302,"[0.001967559, 0.008416761, 0.105628245, 0.4988...",0.066211,82.574066,"[16.901627702937457, 12.847337980819148, 15.60...",125.766398,119.680851,"[0.0008868064, 0.00017322741, 0.00033618812, -...","[0.00012537696, 5.1550483e-05, 0.00034175135, ...",trump,,1
58,"[[-623.2371, -623.2371, -623.2371, -623.2371, ...",4660.40537,"[7.56502e-05, 0.0037083623, 0.017493322, 0.063...",0.141098,31.187822,"[21.83159081943465, 12.579377100789214, 15.184...",123.901752,123.046875,"[0.0040568262, 0.0016565877, 0.00042399368, 0....","[0.00043695388, 0.00028975506, 0.00024173131, ...",taylor,,1
59,"[[-549.2322, -524.7715, -523.1522, -521.7329, ...",2918.046569,"[0.0003582233, 0.0003538899, 0.00056687865, 0....",0.057659,5.406374,"[11.478167568602625, 12.083124176013559, 15.31...",163.34436,122.282609,"[0.0012006939, 0.0001258718, -0.0005294417, 3....","[-0.00016629063, -8.627721e-05, 8.430587e-05, ...",obama,,1
60,"[[-305.3076, -281.06903, -291.5267, -287.96936...",2735.72139,"[0.00062550517, 0.0018442041, 0.009221696, 0.0...",0.080685,11.702671,"[16.757350010586322, 13.695456816950893, 16.20...",128.075542,126.048018,"[-0.017495604, -0.001683808, 0.0058646877, 0.0...","[-0.0013958344, -0.00024074037, 0.0012876464, ...",margot,,1
61,"[[-551.39777, -551.39777, -551.39777, -551.397...",2395.570599,"[0.5070978, 1.2628412, 3.1402974, 7.271094, 7....",0.05435,291.351837,"[16.076778705976206, 12.63596291148848, 15.804...",128.650664,126.048018,"[-0.00013645974, 0.0004904017, 0.00047600793, ...","[0.0004617763, 0.0003584289, 5.4583543e-05, -7...",linus,,1
62,"[[-316.84277, -292.96906, -268.8252, -220.5768...",1093.442004,"[1.1014591, 1.0045148, 3.491313, 5.329056, 4.2...",0.030983,703.664612,"[12.985760113903552, 10.33519302356895, 16.077...",111.414047,120.18532,"[0.0010610336, 0.0009856296, 0.0008631868, -0....","[-0.0012509387, -0.0015557735, 0.0004258442, 0...",musk,,1
63,"[[-346.21942, -325.0981, -335.47552, -343.6375...",2928.592368,"[0.00050830166, 0.0066673914, 0.08975407, 0.28...",0.069887,4.642261,"[17.937697451238872, 11.735120762494633, 14.82...",94.393995,120.18532,"[-0.018156027, -0.007693501, -0.0005485818, -9...","[-0.016434552, -0.0050773215, 0.003649609, 0.0...",ryan,,1


In [13]:
final_df.columns

Index(['mfcc', 'spectral_centroid', 'formants', 'zero_crossing_rate',
       'short_time_energy', 'phase_coherence', 'pitch_contour', 'speech_rate',
       'delta_mfcc', 'delta2_mfcc', 'Speaker', 'Transformed', 'File Type'],
      dtype='object')

In [5]:
final_df['Speaker'] = final_df['Speaker'].str.lower()
final_df['Transformed'] = final_df['Transformed'].str.lower()

In [6]:
balanced_df = final_df.copy()

# Loop through each speaker-file type combination to balance the samples
for speaker in balanced_df['Speaker'].unique():
    # Filter for this speaker and check real/fake counts
    speaker_data = balanced_df[balanced_df['Speaker'] == speaker]
    real_count = speaker_data[speaker_data['File Type'] == 1].shape[0]
    fake_count = speaker_data[speaker_data['File Type'] == 0].shape[0]
    print(real_count ,fake_count,speaker)
    
    if real_count < fake_count:
        # Calculate the number of samples to generate
        samples_needed = fake_count - real_count
        #print(samples_needed)
        
        # Extract real samples for this speaker
        real_samples = speaker_data[speaker_data['File Type'] == 1]
        
        # Repeat the DataFrame `n` times
        expanded_real_samples = pd.concat([real_samples] * samples_needed, ignore_index=True)

        # Append the repeated samples to `balanced_df`
        balanced_df = pd.concat([balanced_df, expanded_real_samples], ignore_index=True)


1 7 trump
1 7 margot
1 7 linus
1 7 taylor
1 7 ryan
1 7 obama
1 7 musk
1 7 biden


In [7]:
final_df =  balanced_df.copy()

In [8]:
final_df.dtypes

mfcc                   object
spectral_centroid     float64
formants               object
zero_crossing_rate    float64
short_time_energy     float32
phase_coherence        object
pitch_contour         float64
speech_rate           float64
delta_mfcc             object
delta2_mfcc            object
Speaker                object
Transformed            object
File Type               int64
dtype: object

In [9]:
final_df.head()

Unnamed: 0,mfcc,spectral_centroid,formants,zero_crossing_rate,short_time_energy,phase_coherence,pitch_contour,speech_rate,delta_mfcc,delta2_mfcc,Speaker,Transformed,File Type
0,"[[-384.82144, -358.65683, -357.0009, -362.3613...",2552.698456,"[0.0044450313, 0.024562363, 0.10701934, 0.7947...",0.063105,195.509247,"[13.032592499853708, 12.104388008481168, 15.46...",130.722464,120.18532,"[-0.0004238483, 0.00037077555, 0.0009836447, -...","[0.0001629564, 4.8460606e-05, -0.00010257666, ...",trump,obama,0
1,"[[-330.45853, -308.2083, -319.194, -317.49783,...",2355.852265,"[0.052845053, 0.5077932, 3.2678761, 5.2546535,...",0.06288,45.058159,"[15.093474230932996, 11.673808085587346, 15.80...",113.527159,135.999178,"[-0.01536896, -0.0030288177, 0.011273681, 0.00...","[-0.0032674116, -0.00077354716, 0.0019706897, ...",margot,musk,0
2,"[[-527.92804, -528.36084, -528.8087, -528.4010...",2292.593244,"[0.37216926, 0.7387148, 0.6268684, 1.9482391, ...",0.054307,311.087982,"[15.303887753668844, 11.98906517755704, 15.189...",135.268156,129.199219,"[-0.00026833254, 0.00021786334, 0.00043867543,...","[0.00029822649, 0.00026397666, 9.9872006e-05, ...",linus,obama,0
3,"[[-563.4095, -564.8963, -565.24786, -565.2134,...",3448.174463,"[0.0016742082, 0.061168447, 1.0767565, 3.89086...",0.096787,154.858521,"[16.611698451205804, 11.505106174540993, 15.27...",97.917998,132.512019,"[-0.0026676354, -0.0006228533, 0.00045752467, ...","[0.00062540744, 0.0001761887, -1.784851e-05, 2...",taylor,trump,0
4,"[[-530.4259, -531.13324, -531.2714, -531.27716...",3359.212189,"[0.0018148813, 0.13198222, 2.8754227, 10.44912...",0.103719,312.088379,"[17.21981784809321, 11.110472768045783, 14.210...",111.513169,129.199219,"[-0.0021237333, -0.00079722033, 0.00026833097,...","[0.0004095786, 5.092597e-05, -4.1922845e-05, 3...",taylor,biden,0


In [10]:
final_df.columns

Index(['mfcc', 'spectral_centroid', 'formants', 'zero_crossing_rate',
       'short_time_energy', 'phase_coherence', 'pitch_contour', 'speech_rate',
       'delta_mfcc', 'delta2_mfcc', 'Speaker', 'Transformed', 'File Type'],
      dtype='object')

In [11]:
# Initialize list to store processed data
processed_data = []

# Loop through each row in final_df
for index, row in final_df.iterrows():
    # Extract features from the current row
    formants = row['formants']  # Shape (128,)
    phase_coherence = row['phase_coherence']  # Shape (7,)
    delta_mfcc = row['delta_mfcc']  # Shape (13,)
    delta2_mfcc = row['delta2_mfcc']  # Shape (13,)

    
    # Generate unique column names for each feature type
    if index == 0:  # Create column names once
        print(type(formants))
        formant_columns = [f'formant_{i}' for i in range(formants.size)]
        phase_coherence_columns = [f'phase_coherence_{i}' for i in range(phase_coherence.size)]
        delta_mfcc_columns = [f'delta_mfcc_{i}' for i in range(delta_mfcc.size)]
        delta2_mfcc_columns = [f'delta2_mfcc_{i}' for i in range(delta2_mfcc.size)]

        # Concatenate all column names
        feature_columns = formant_columns + phase_coherence_columns + delta_mfcc_columns + delta2_mfcc_columns

    # Concatenate all feature arrays into a 1D array (row)
    combined_features = np.concatenate([formants, phase_coherence, delta_mfcc, delta2_mfcc])

    # Get other single-value columns (e.g., 'Speaker', 'File Type', 'Transformed Speaker')
    other_cols = ['mfcc','spectral_centroid', 'zero_crossing_rate',
       'short_time_energy', 'pitch_contour', 'speech_rate', 'Speaker', 'Transformed', 'File Type']
    
    other_cols_values = [row[col_name] for col_name in other_cols]

    # Append the combined features with other columns
    processed_data.append(other_cols_values + combined_features.tolist())



# Combine single-value columns and feature columns
all_column_names = other_cols + feature_columns

# Convert the processed data into a DataFrame
processed_df = pd.DataFrame(processed_data, columns=all_column_names)

# Check the resulting DataFrame
print(processed_df.head())


<class 'numpy.ndarray'>
                                                mfcc  spectral_centroid  \
0  [[-384.82144, -358.65683, -357.0009, -362.3613...        2552.698456   
1  [[-330.45853, -308.2083, -319.194, -317.49783,...        2355.852265   
2  [[-527.92804, -528.36084, -528.8087, -528.4010...        2292.593244   
3  [[-563.4095, -564.8963, -565.24786, -565.2134,...        3448.174463   
4  [[-530.4259, -531.13324, -531.2714, -531.27716...        3359.212189   

   zero_crossing_rate  short_time_energy  pitch_contour  speech_rate Speaker  \
0            0.063105         195.509247     130.722464   120.185320   trump   
1            0.062880          45.058159     113.527159   135.999178  margot   
2            0.054307         311.087982     135.268156   129.199219   linus   
3            0.096787         154.858521      97.917998   132.512019  taylor   
4            0.103719         312.088379     111.513169   129.199219  taylor   

  Transformed  File Type  formant_0  ...  de

In [12]:
processed_df.head()

Unnamed: 0,mfcc,spectral_centroid,zero_crossing_rate,short_time_energy,pitch_contour,speech_rate,Speaker,Transformed,File Type,formant_0,...,delta2_mfcc_3,delta2_mfcc_4,delta2_mfcc_5,delta2_mfcc_6,delta2_mfcc_7,delta2_mfcc_8,delta2_mfcc_9,delta2_mfcc_10,delta2_mfcc_11,delta2_mfcc_12
0,"[[-384.82144, -358.65683, -357.0009, -362.3613...",2552.698456,0.063105,195.509247,130.722464,120.18532,trump,obama,0,0.004445,...,-1.9e-05,0.000112,7.5e-05,-2.5e-05,5.2e-05,-7.7e-05,-0.000138,7.3e-05,5.920192e-05,-1.1e-05
1,"[[-330.45853, -308.2083, -319.194, -317.49783,...",2355.852265,0.06288,45.058159,113.527159,135.999178,margot,musk,0,0.052845,...,0.000939,-1.9e-05,-0.000116,0.000463,-0.00099,-0.001,8.3e-05,-0.000287,-0.0003859053,0.000381
2,"[[-527.92804, -528.36084, -528.8087, -528.4010...",2292.593244,0.054307,311.087982,135.268156,129.199219,linus,obama,0,0.372169,...,1.4e-05,-6.1e-05,-6.2e-05,-0.000116,-0.000124,-9.8e-05,-7.8e-05,-6.1e-05,-8.609637e-05,-4.1e-05
3,"[[-563.4095, -564.8963, -565.24786, -565.2134,...",3448.174463,0.096787,154.858521,97.917998,132.512019,taylor,trump,0,0.001674,...,2.5e-05,-7.5e-05,1.4e-05,-7.8e-05,8.8e-05,1.6e-05,-4e-05,-5.8e-05,-2.163621e-07,-7.8e-05
4,"[[-530.4259, -531.13324, -531.2714, -531.27716...",3359.212189,0.103719,312.088379,111.513169,129.199219,taylor,biden,0,0.001815,...,3.2e-05,-9.1e-05,0.000102,-0.000118,1.1e-05,-1.7e-05,-8e-05,1.5e-05,-3.594763e-05,4.5e-05


In [13]:
print(processed_df.columns)

Index(['mfcc', 'spectral_centroid', 'zero_crossing_rate', 'short_time_energy',
       'pitch_contour', 'speech_rate', 'Speaker', 'Transformed', 'File Type',
       'formant_0',
       ...
       'delta2_mfcc_3', 'delta2_mfcc_4', 'delta2_mfcc_5', 'delta2_mfcc_6',
       'delta2_mfcc_7', 'delta2_mfcc_8', 'delta2_mfcc_9', 'delta2_mfcc_10',
       'delta2_mfcc_11', 'delta2_mfcc_12'],
      dtype='object', length=170)


In [14]:
#set(list(processed_df['Speaker'].values))
mapping = {'obama': 0,'biden': 1,'linus':2,'margot':3,'musk':4,'ryan': 5,'taylor': 6,'trump': 7}

In [15]:
processed_df['Speaker'] = processed_df['Speaker'].str.lower()
processed_df['Transformed'] = processed_df['Transformed'].str.lower()

In [16]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

# Assuming 'processed_df' is your DataFrame and it contains a 'Speaker' column
# One-hot encode the speakers
encoder = OneHotEncoder(sparse=False, drop='first')  # drop='first' can be used to avoid dummy variable trap
speaker_labels = encoder.fit_transform(processed_df[['Speaker']])

# Create a DataFrame from the one-hot encoded array
speaker_labels_df = pd.DataFrame(speaker_labels, columns=encoder.get_feature_names_out(['Speaker']))

# Reset index if necessary (optional)
speaker_labels_df.reset_index(drop=True, inplace=True)

# Merge the one-hot encoded speaker labels with the original DataFrame
processed_df = pd.concat([processed_df.reset_index(drop=True), speaker_labels_df], axis=1)

# Check the updated DataFrame
print(processed_df.head())


                                                mfcc  spectral_centroid  \
0  [[-384.82144, -358.65683, -357.0009, -362.3613...        2552.698456   
1  [[-330.45853, -308.2083, -319.194, -317.49783,...        2355.852265   
2  [[-527.92804, -528.36084, -528.8087, -528.4010...        2292.593244   
3  [[-563.4095, -564.8963, -565.24786, -565.2134,...        3448.174463   
4  [[-530.4259, -531.13324, -531.2714, -531.27716...        3359.212189   

   zero_crossing_rate  short_time_energy  pitch_contour  speech_rate Speaker  \
0            0.063105         195.509247     130.722464   120.185320   trump   
1            0.062880          45.058159     113.527159   135.999178  margot   
2            0.054307         311.087982     135.268156   129.199219   linus   
3            0.096787         154.858521      97.917998   132.512019  taylor   
4            0.103719         312.088379     111.513169   129.199219  taylor   

  Transformed  File Type  formant_0  ...  delta2_mfcc_10  delta2_mfc

In [17]:
#dropping transformed feature
processed_df.drop(columns=['Transformed'],inplace=True)


In [18]:
processed_df.columns

Index(['mfcc', 'spectral_centroid', 'zero_crossing_rate', 'short_time_energy',
       'pitch_contour', 'speech_rate', 'Speaker', 'File Type', 'formant_0',
       'formant_1',
       ...
       'delta2_mfcc_10', 'delta2_mfcc_11', 'delta2_mfcc_12', 'Speaker_linus',
       'Speaker_margot', 'Speaker_musk', 'Speaker_obama', 'Speaker_ryan',
       'Speaker_taylor', 'Speaker_trump'],
      dtype='object', length=176)

In [19]:
other_scalar_columns = [i for i in processed_df.columns if i != 'mfcc']

In [20]:
def pad_mfcc(array, target_shape):
    # Pad only along the time dimension (axis 1)
    if array.shape[1] < target_shape[1]:
        padding = target_shape[1] - array.shape[1]
        # Pad with zeros on the right side
        return np.pad(array, ((0, 0), (0, padding)), mode='constant')
    # If the array is longer, truncate it
    return array[:, :target_shape[1]]

# Assuming processed_df is your DataFrame with all necessary columns
# Define the target variable
target_column = 'File Type'  # 1 for real, 0 for fake

# Target shape for padding
target_shape = (13, 56293)

# Define MFCC and scalar feature columns
mfcc_columns = [f'mfcc_feature_{i}' for i in range(13 * 51680)]  # Adjust if MFCC features are structured differently
scalar_columns = other_scalar_columns  # List all scalar feature columns

# Split the DataFrame into real and fake samples
real_df = processed_df[processed_df[target_column] == 1]
fake_df = processed_df[processed_df[target_column] == 0]

# Extract MFCC, scalar features, and speaker information
for arr in real_df['mfcc'].values:
    print(arr.shape)
mfcc_real_train = np.array([pad_mfcc(arr, target_shape) for arr in real_df['mfcc'].values])
mfcc_fake_train = np.array([pad_mfcc(arr, target_shape) for arr in fake_df['mfcc'].values])
scalar_real_train = real_df[scalar_columns].values
scalar_fake_train = fake_df[scalar_columns].values
speaker_real_train = real_df[encoder.get_feature_names_out(['Speaker'])].values  # Adjust if using a different encoder
speaker_fake_train = fake_df[encoder.get_feature_names_out(['Speaker'])].values  # Adjust if using a different encoder

# Check the shapes of arrays to ensure correctness
print(f'MFCC Real Train Shape: {mfcc_real_train.shape}')
print(f'MFCC Fake Train Shape: {mfcc_fake_train.shape}')
print(f'Scalar Real Train Shape: {scalar_real_train.shape}')
print(f'Scalar Fake Train Shape: {scalar_fake_train.shape}')
print(f'Speaker Real Train Shape: {speaker_real_train.shape}')
print(f'Speaker Fake Train Shape: {speaker_fake_train.shape}')

(13, 56251)
(13, 56291)
(13, 51695)
(13, 56293)
(13, 6847)
(13, 49127)
(13, 51692)
(13, 8070)
(13, 56291)
(13, 56291)
(13, 56291)
(13, 56291)
(13, 56291)
(13, 56291)
(13, 6847)
(13, 6847)
(13, 6847)
(13, 6847)
(13, 6847)
(13, 6847)
(13, 49127)
(13, 49127)
(13, 49127)
(13, 49127)
(13, 49127)
(13, 49127)
(13, 51695)
(13, 51695)
(13, 51695)
(13, 51695)
(13, 51695)
(13, 51695)
(13, 8070)
(13, 8070)
(13, 8070)
(13, 8070)
(13, 8070)
(13, 8070)
(13, 56293)
(13, 56293)
(13, 56293)
(13, 56293)
(13, 56293)
(13, 56293)
(13, 51692)
(13, 51692)
(13, 51692)
(13, 51692)
(13, 51692)
(13, 51692)
(13, 56251)
(13, 56251)
(13, 56251)
(13, 56251)
(13, 56251)
(13, 56251)
MFCC Real Train Shape: (56, 13, 56293)
MFCC Fake Train Shape: (56, 13, 56293)
Scalar Real Train Shape: (56, 175)
Scalar Fake Train Shape: (56, 175)
Speaker Real Train Shape: (56, 7)
Speaker Fake Train Shape: (56, 7)


In [21]:
real_df.columns

Index(['mfcc', 'spectral_centroid', 'zero_crossing_rate', 'short_time_energy',
       'pitch_contour', 'speech_rate', 'Speaker', 'File Type', 'formant_0',
       'formant_1',
       ...
       'delta2_mfcc_10', 'delta2_mfcc_11', 'delta2_mfcc_12', 'Speaker_linus',
       'Speaker_margot', 'Speaker_musk', 'Speaker_obama', 'Speaker_ryan',
       'Speaker_taylor', 'Speaker_trump'],
      dtype='object', length=176)

In [22]:
import numpy as np
from keras.preprocessing.sequence import pad_sequences

def create_speaker_pairs(real_df, fake_df, other_scalar_columns):
    pairs_mfcc, pairs_scalar, labels = [], [], []
    
    # Group by speaker to ensure speaker-specific comparisons
    speakers = real_df['Speaker'].unique()
    
    for speaker in speakers:
        real_speaker_data = real_df[real_df['Speaker'] == speaker]
        fake_speaker_data = fake_df[fake_df['Speaker'] == speaker]
        
        min_samples = min(len(real_speaker_data), len(fake_speaker_data))
        
        # Pair real-real (similar), fake-fake (similar), and real-fake (dissimilar)
        for i in range(min_samples):
            # Real-real pair (label 1)
            pairs_mfcc.append([real_speaker_data['mfcc'].iloc[i], real_speaker_data['mfcc'].iloc[(i + 1) % min_samples]])
            pairs_scalar.append([real_speaker_data[other_scalar_columns].iloc[i], real_speaker_data[other_scalar_columns].iloc[(i + 1) % min_samples]])
            labels.append(1)
            
            # Fake-fake pair (label 1)
            pairs_mfcc.append([fake_speaker_data['mfcc'].iloc[i], fake_speaker_data['mfcc'].iloc[(i + 1) % min_samples]])
            pairs_scalar.append([fake_speaker_data[other_scalar_columns].iloc[i], fake_speaker_data[other_scalar_columns].iloc[(i + 1) % min_samples]])
            labels.append(1)
            
            # Real-fake pair (label 0)
            pairs_mfcc.append([real_speaker_data['mfcc'].iloc[i], fake_speaker_data['mfcc'].iloc[i]])
            pairs_scalar.append([real_speaker_data[other_scalar_columns].iloc[i], fake_speaker_data[other_scalar_columns].iloc[i]])
            labels.append(0)

    # Calculate max_length based on the second dimension of both MFCCs in each pair
    max_length = max(max(pair[0].shape[1], pair[1].shape[1]) for pair in pairs_mfcc)
    print("Maximum length for padding:", max_length)

    # Convert pairs_mfcc to a consistent shape with padding
    pairs_mfcc_padded = np.array([
        np.array([
            np.pad(mfcc, ((0, 0), (0, max_length - mfcc.shape[1])), mode='constant') 
                   for mfcc in pair])  # Apply padding to the second dimension only
        for pair in pairs_mfcc
    ])

    # Convert pairs_scalar and labels to NumPy arrays
    pairs_scalar = np.array(pairs_scalar)
    labels = np.array(labels)

    # Check the shapes of scalar features and labels
    print("Scalar pairs shape:", pairs_scalar.shape)
    print("Labels shape:", labels.shape)

    return pairs_mfcc_padded, pairs_scalar, labels


In [23]:
other_scalar_columns.remove('Speaker')
pairs_mfcc, pairs_scalar, labels = create_speaker_pairs(real_df, fake_df,other_scalar_columns)

Maximum length for padding: 56293
Scalar pairs shape: (168, 2, 174)
Labels shape: (168,)


In [24]:
# Check for NaN or Inf in pairs_mfcc and pairs_scalar
print("NaN in pairs_mfcc:", np.isnan(pairs_mfcc).any())
print("Inf in pairs_mfcc:", np.isinf(pairs_mfcc).any())
print("NaN in pairs_scalar:", np.isnan(pairs_scalar).any())
print("Inf in pairs_scalar:", np.isinf(pairs_scalar).any())


NaN in pairs_mfcc: False
Inf in pairs_mfcc: False
NaN in pairs_scalar: False
Inf in pairs_scalar: False


In [25]:
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.optimizers import Adam

# Create an optimizer with a smaller learning rate
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001, clipnorm=1.0)

# CNN for MFCC features
def create_cnn_model(input_shape):
    model = models.Sequential()
    model.add(layers.Conv2D(32, (3, 3), activation='relu', input_shape=input_shape))
    model.add(layers.MaxPooling2D((2, 2)))

    model.add(layers.Flatten())
    model.add(layers.Dense(128, activation='relu'))
    model.add(layers.BatchNormalization())  # Batch normalization before output
    model.add(layers.Dropout(0.1))  # Dropout for regularization
    return model

# DNN for scalar and speaker data
def create_dnn_model(input_shape):
    model = models.Sequential()
    model.add(layers.Dense(128, activation='relu', input_shape=input_shape))
    model.add(layers.Dropout(0.1))  # Dropout for regularization
    model.add(layers.Dense(64, activation='relu'))
    return model

# Euclidean distance layer
def euclidean_distance(vects):
    x, y = vects
    return tf.sqrt(tf.reduce_sum(tf.square(x - y), axis=1, keepdims=True))

# Modify the loss function to print values
def contrastive_loss(y_true, y_pred):
    margin = 1.0
    # Ensure that y_pred is not too close to zero to avoid NaNs
    y_pred = tf.clip_by_value(y_pred, 1e-10, tf.reduce_max(y_pred))
    
    loss = tf.reduce_mean((1 - y_true) * tf.square(y_pred) + 
                          y_true * tf.square(tf.maximum(margin - y_pred, 0)))
    return loss

# Siamese model definition
def create_siamese_network(input_shape_mfcc, input_shape_scalar):
    # Inputs for the two branches
    input_real_mfcc = layers.Input(shape=input_shape_mfcc)
    input_fake_mfcc = layers.Input(shape=input_shape_mfcc)
    input_real_scalar = layers.Input(shape=input_shape_scalar)
    input_fake_scalar = layers.Input(shape=input_shape_scalar)
    
    # CNN model for MFCC features
    cnn_model = create_cnn_model(input_shape_mfcc)
    encoded_real_mfcc = cnn_model(input_real_mfcc)
    encoded_fake_mfcc = cnn_model(input_fake_mfcc)
    
    # DNN model for scalar features
    dnn_model = create_dnn_model(input_shape_scalar)
    encoded_real_scalar = dnn_model(input_real_scalar)
    encoded_fake_scalar = dnn_model(input_fake_scalar)
    
    # Combine MFCC and scalar encodings
    combined_real = layers.concatenate([encoded_real_mfcc, encoded_real_scalar])
    combined_fake = layers.concatenate([encoded_fake_mfcc, encoded_fake_scalar])
    
    # Euclidean distance calculation
    distance = layers.Lambda(euclidean_distance)([combined_real, combined_fake])
    
    # Output layer for similarity score
    output = layers.Dense(1, activation='sigmoid')(distance)
    
    # Siamese model
    siamese_model = models.Model(inputs=[input_real_mfcc, input_fake_mfcc, input_real_scalar, input_fake_scalar], outputs=output)
    siamese_model.compile(optimizer=optimizer, loss=contrastive_loss, metrics=['accuracy'])
    
    return siamese_model

# Define input shapes
input_shape_mfcc = (13, 56293, 1)  # Adjusted shape for MFCC features (13 coefficients, 56293 time frames, 1 channel)
input_shape_scalar = (174,)         # Shape for scalar features (174 scalars)

# Create model
siamese_model = create_siamese_network(input_shape_mfcc, input_shape_scalar)
siamese_model.summary()


In [None]:
class NaNStopping(tf.keras.callbacks.Callback):
    def on_epoch_end(self, epoch, logs=None):
        if logs.get('loss') is None or np.isnan(logs.get('loss')):
            print("Stopping training, loss is NaN")
            self.model.stop_training = True

# Fit the model with the callback
siamese_model.fit(
    [pairs_mfcc[:, 0], pairs_mfcc[:, 1], pairs_scalar[:, 0], pairs_scalar[:, 1]], 
    labels, 
    epochs=10,
    batch_size=64,
    callbacks=[NaNStopping()]
)


Epoch 1/10


In [28]:
import numpy as np

def infer(model, pairs_mfcc, pairs_scalar):
    # Ensure input shapes are correct for inference
    pairs_mfcc = pairs_mfcc.astype(np.float32)  # Ensure correct dtype
    pairs_scalar = pairs_scalar.astype(np.float32)

    # Prepare the input data
    X_mfcc = [pairs_mfcc[:, 0], pairs_mfcc[:, 1]]
    X_scalar = [pairs_scalar[:, 0], pairs_scalar[:, 1]]

    # Get predictions
    predictions = model.predict(X_mfcc + X_scalar)

    # Convert predictions to binary (0 or 1) based on a threshold
    predicted_labels = (predictions > 0.5).astype(int)

    return predicted_labels.flatten()  # Return as 1D array

In [29]:
predicted_labels = infer(siamese_model, pairs_mfcc, pairs_scalar)

[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 4s/step


In [32]:
print(predicted_labels)
print(labels)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
[1 1 0 1 1 0 1 1 0 1 1 0 1 1 0 1 1 0 1 1 0 1 1 0 1 1 0 1 1 0 1 1 0 1 1 0 1
 1 0 1 1 0 1 1 0 1 1 0 1 1 0 1 1 0 1 1 0 1 1 0 1 1 0 1 1 0 1 1 0 1 1 0 1 1
 0 1 1 0 1 1 0 1 1 0 1 1 0 1 1 0 1 1 0 1 1 0 1 1 0 1 1 0 1 1 0 1 1 0 1 1 0
 1 1 0 1 1 0 1 1 0 1 1 0 1 1 0 1 1 0 1 1 0 1 1 0 1 1 0 1 1 0 1 1 0 1 1 0 1
 1 0 1 1 0 1 1 0 1 1 0 1 1 0 1 1 0 1 1 0]


In [33]:
from sklearn.metrics import classification_report, confusion_matrix

# Assuming true_labels contains the actual labels for your test set
true_labels = labels # your actual labels here

# Calculate confusion matrix
cm = confusion_matrix(true_labels, predicted_labels)

# Generate classification report
report = classification_report(true_labels, predicted_labels, target_names=[str(0), str(1)])

print(report)  # Print out the classification report


              precision    recall  f1-score   support

           0       0.33      1.00      0.50        56
           1       0.00      0.00      0.00       112

    accuracy                           0.33       168
   macro avg       0.17      0.50      0.25       168
weighted avg       0.11      0.33      0.17       168

