In [None]:
import os
import random

import IPython.display as ipd
import keras
import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import tensorflow as tf

from keras.models import Sequential
from keras.layers import BatchNormalization, Conv2D, Dense, Dropout, Flatten, MaxPool2D, Reshape
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix

In [None]:
# Root directory of the "for-2seconds" dataset.
# Set the FOR_2SECONDS_DIR environment variable to point the notebook at a
# different copy of the data; the default is the original author's location.
DATA_DIR = os.environ.get(
    'FOR_2SECONDS_DIR',
    '/Users/joshwinnes/Library/Mobile Documents/com~apple~CloudDocs/life things/data science/data/for-2seconds',
).rstrip('/')

# File path to each directory (the trailing slash is kept so the values match
# the strings that were previously hard-coded here)
train_path_real = f'{DATA_DIR}/training/real/'
train_path_fake = f'{DATA_DIR}/training/fake/'

test_path_real = f'{DATA_DIR}/testing/real/'
test_path_fake = f'{DATA_DIR}/testing/fake/'

validation_path_real = f'{DATA_DIR}/validation/real/'
validation_path_fake = f'{DATA_DIR}/validation/fake/'

In [None]:
# Make lists of filepaths for each audio file in each directory
def list_wav_files(directory):
    """Return the full paths of the '.wav' files directly inside `directory`.

    The result is sorted because os.listdir() returns entries in arbitrary
    order; sorting keeps the lists identical from run to run.
    """
    return sorted(
        os.path.join(directory, file)
        for file in os.listdir(directory)
        if file.endswith('.wav')
    )


train_real_audio = list_wav_files(train_path_real)
train_fake_audio = list_wav_files(train_path_fake)

validation_real_audio = list_wav_files(validation_path_real)
validation_fake_audio = list_wav_files(validation_path_fake)

test_real_audio = list_wav_files(test_path_real)
test_fake_audio = list_wav_files(test_path_fake)

In [None]:
# Making sure the lists work: play a random real training clip.
# random.choice is used because random.randint(0, len(...)) includes its upper
# bound and could therefore index one past the end of the list.
random_audio_file = random.choice(train_real_audio)
ipd.Audio(random_audio_file)

In [None]:
# Play a random fake training clip (random.choice cannot index past the end,
# unlike random.randint(0, len(...)), whose upper bound is inclusive).
random_audio_file = random.choice(train_fake_audio)
ipd.Audio(random_audio_file)

In [None]:
# Play a random real test clip (random.choice cannot index past the end,
# unlike random.randint(0, len(...)), whose upper bound is inclusive).
random_audio_file = random.choice(test_real_audio)
ipd.Audio(random_audio_file)

In [None]:
# Play a random fake test clip (random.choice cannot index past the end,
# unlike random.randint(0, len(...)), whose upper bound is inclusive).
random_audio_file = random.choice(test_fake_audio)
ipd.Audio(random_audio_file)

In [None]:
# Play a random real validation clip (random.choice cannot index past the end,
# unlike random.randint(0, len(...)), whose upper bound is inclusive).
random_audio_file = random.choice(validation_real_audio)
ipd.Audio(random_audio_file)

In [None]:
# Play a random fake validation clip (random.choice cannot index past the end,
# unlike random.randint(0, len(...)), whose upper bound is inclusive).
random_audio_file = random.choice(validation_fake_audio)
ipd.Audio(random_audio_file)

In [None]:
# Visualize the waveform of the most recently selected clip using Librosa
x, sr = librosa.load(random_audio_file)

# Explicit figure/axes so the plot can be titled and labelled
fig, ax = plt.subplots(figsize=(20, 20))
librosa.display.waveshow(x, sr=sr, ax=ax)
ax.set_title(f'Waveform: {os.path.basename(random_audio_file)}')
ax.set_xlabel('Time (s)')
ax.set_ylabel('Amplitude')

# plt.show() keeps the object repr out of the cell output
plt.show()

In [None]:
# Convert audio file to mel-scale spectrogram

def convert_to_melscale_spectrogram(file_path):
    """Load an audio file and return its mel spectrogram in decibels.

    Parameters
    ----------
    file_path
        Path of the audio file, passed straight to ``librosa.load`` (librosa's
        default loading options are used).

    Returns
    -------
    The output of ``librosa.power_to_db`` applied to the mel power
    spectrogram, with ``ref=np.max`` so values are relative to the
    spectrogram's own maximum.
    """
    samples, sample_rate = librosa.load(file_path)
    power_spectrogram = librosa.feature.melspectrogram(y=samples, sr=sample_rate)
    return librosa.power_to_db(power_spectrogram, ref=np.max)

In [None]:
# Testing convert_to_melscale_spectrogram() on four random AI-generated validation clips

fig, axs = plt.subplots(2, 2, figsize=(15, 15))
# Loop over the axes and plot a random spectrogram on each
for i, ax in enumerate(axs.flat):
    # random.choice avoids the off-by-one of random.randint(0, len(...)),
    # whose inclusive upper bound can index one past the end of the list
    random_audio_file = random.choice(validation_fake_audio)
    test_spec = convert_to_melscale_spectrogram(random_audio_file)  # Get the spectrogram for the current file

    # Display the mel spectrogram on the current axis.
    # y_axis='mel' because the rows are mel bands; 'hz' would label them as
    # linearly spaced frequency bins.
    librosa.display.specshow(test_spec, x_axis='time', y_axis='mel', ax=ax)

    # Set the title and labels for the current subplot
    ax.set_title(f'Mel-Scale Spectrogram {i + 1}: AI Generated')
    ax.set_xlabel('Time')
    ax.set_ylabel('Frequency (Hz)')

# Automatically adjust the layout to avoid overlap
plt.tight_layout()

# Show the plots
plt.show()

In [None]:
# Comparing AI generated mel-scale spectrograms to real ones: four random real validation clips
fig, axs = plt.subplots(2, 2, figsize=(15, 15))
# Loop over the axes and plot a random spectrogram on each
for i, ax in enumerate(axs.flat):
    # random.choice avoids the off-by-one of random.randint(0, len(...)),
    # whose inclusive upper bound can index one past the end of the list
    random_audio_file = random.choice(validation_real_audio)
    test_spec = convert_to_melscale_spectrogram(random_audio_file)  # Get the spectrogram for the current file

    # Display the mel spectrogram on the current axis.
    # y_axis='mel' because the rows are mel bands; 'hz' would label them as
    # linearly spaced frequency bins.
    librosa.display.specshow(test_spec, x_axis='time', y_axis='mel', ax=ax)

    # Set the title and labels for the current subplot
    ax.set_title(f'Mel-Scale Spectrogram {i + 1}: Real Voice')
    ax.set_xlabel('Time')
    ax.set_ylabel('Frequency (Hz)')

# Automatically adjust the layout to avoid overlap
plt.tight_layout()

# Show the plots
plt.show()

In [None]:
# Use spectrograms as features to train the model
def get_features_and_labels(real_audio_files, fake_audio_files):
    """Build the feature and label arrays for one dataset split.

    Every path is converted with convert_to_melscale_spectrogram(). Real
    clips come first and are labelled 0; fake clips follow and are labelled 1.

    Returns
    -------
    (features, labels)
        Two NumPy arrays built from the collected spectrograms and labels,
        in the same order.
    """
    labelled_groups = ((real_audio_files, 0), (fake_audio_files, 1))

    spectrograms = []
    targets = []
    for paths, target in labelled_groups:
        for path in paths:
            spectrograms.append(convert_to_melscale_spectrogram(path))
            targets.append(target)

    return np.array(spectrograms), np.array(targets)

# Build the (features, labels) arrays for the training, validation and test splits
train_features, train_labels = get_features_and_labels(
    real_audio_files=train_real_audio,
    fake_audio_files=train_fake_audio,
)
validation_features, validation_labels = get_features_and_labels(
    real_audio_files=validation_real_audio,
    fake_audio_files=validation_fake_audio,
)
test_features, test_labels = get_features_and_labels(
    real_audio_files=test_real_audio,
    fake_audio_files=test_fake_audio,
)

In [None]:
# Report the array shapes of the training and test features
print(f"train features shape: {train_features.shape}")
print(f"test features shape: {test_features.shape}")

In [None]:
# Significantly trimmed VGG model to optimize results

# Shape of a single spectrogram, taken from the data instead of being
# hard-coded, so the network follows whatever the feature extraction produced
spectrogram_shape = train_features.shape[1:]

trimmed_vgg = Sequential([
    # Input layer: append a trailing channel axis of size 1 for Conv2D
    Reshape((*spectrogram_shape, 1), input_shape=spectrogram_shape),

    # Block 1: two 64-filter convolutions, then 2x2 pooling to shrink the feature maps
    Conv2D(filters=64, kernel_size=(3, 3), padding="same", activation='relu'),
    Conv2D(filters=64, kernel_size=(3, 3), padding="same", activation='relu'),
    MaxPool2D(pool_size=(2, 2), strides=(2, 2)),

    # Block 2: two 128-filter convolutions, then 2x2 pooling
    Conv2D(filters=128, kernel_size=(3, 3), padding="same", activation='relu'),
    Conv2D(filters=128, kernel_size=(3, 3), padding="same", activation='relu'),
    MaxPool2D(pool_size=(2, 2), strides=(2, 2)),

    # Block 3: two 256-filter convolutions, then 2x2 pooling
    Conv2D(filters=256, kernel_size=(3, 3), padding='same', activation='relu'),
    Conv2D(filters=256, kernel_size=(3, 3), padding='same', activation='relu'),
    MaxPool2D(pool_size=(2, 2), strides=(2, 2)),

    # Flatten the feature maps to link the convolutional layers to the fully connected layers
    Flatten(),
    Dense(units=256, activation="relu"),  # fully connected layer
    Dense(units=256, activation="relu"),  # fully connected layer
    # Single sigmoid unit: one probability per clip for the binary real/fake label
    Dense(1, activation="sigmoid"),
])

trimmed_vgg.summary()

In [None]:
# Adam with its default settings, binary cross-entropy loss for the single
# sigmoid output, and accuracy tracked as the reported metric
compile_options = dict(
    optimizer=keras.optimizers.Adam(),
    loss=keras.losses.binary_crossentropy,
    metrics=["accuracy"],
)
trimmed_vgg.compile(**compile_options)

In [None]:
# Training hyperparameters
BATCH_SIZE = 32
EPOCHS = 10
# Each epoch ends after this many batches.
# NOTE(review): 300 batches of 32 is 9,600 samples per epoch - confirm this is
# intended relative to the size of the training set, or drop the argument so
# every epoch covers the full set.
STEPS_PER_EPOCH = 300

trimmed_vgg_history = trimmed_vgg.fit(
    train_features,
    train_labels,
    # Keras documents validation_data as a tuple (x_val, y_val)
    validation_data=(validation_features, validation_labels),
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    steps_per_epoch=STEPS_PER_EPOCH,
)

In [None]:
# Training vs. validation curves: accuracy on the left, loss on the right
history = trimmed_vgg_history.history
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 7))

# (axis, history key, display name) for each panel
panels = ((ax1, "accuracy", "Accuracy"), (ax2, "loss", "Loss"))
for axis, key, label in panels:
    axis.plot(history[key])
    axis.plot(history[f"val_{key}"])
    axis.set_title(label)
    axis.set_xlabel("Epoch")
    axis.legend([label, f"Validation {label}"])

plt.show()

In [None]:
# Score the trained model on the test split; the two returned values are
# unpacked as the loss and the accuracy metric the model was compiled with
trimmed_vgg_loss, trimmed_vgg_accuracy = trimmed_vgg.evaluate(x=test_features, y=test_labels)

In [None]:
# Sigmoid outputs of the model for every test spectrogram
y_pred = trimmed_vgg.predict(x=test_features)

In [None]:
# Distribution of the model's sigmoid outputs on the test set.
# The predictions are flattened to 1-D so seaborn plots a single histogram
# instead of treating the array's one column as a separate hue group.
prediction_ax = sns.histplot(data=y_pred.ravel())
prediction_ax.set(
    title="Predicted probabilities on the test set",
    xlabel="Model output (label 1 = fake)",
    ylabel="Count",
);

In [None]:
# Cut-off applied to the sigmoid outputs: anything above it is labelled 1 (fake).
# NOTE(review): 0.005 is far from the 0.5 cut-off behind the accuracy metric -
# confirm it was chosen deliberately (e.g. from the histogram above) and is not a typo.
DECISION_THRESHOLD = 0.005
y_pred_binary = (y_pred > DECISION_THRESHOLD).astype(int)

In [None]:
# Confusion matrix of the thresholded test predictions.
# confusion_matrix / ConfusionMatrixDisplay are imported in the notebook's import cell.
# The result gets its own name so it does not shadow the imported
# confusion_matrix function (nor IPython's display helper).
# Label order follows get_features_and_labels: 0 = real, 1 = fake.
test_confusion_matrix = confusion_matrix(
    y_true=test_labels,
    y_pred=y_pred_binary.ravel(),  # flatten the (n, 1) predictions to 1-D
    labels=[0, 1],
)
confusion_display = ConfusionMatrixDisplay(
    test_confusion_matrix,
    display_labels=["real", "fake"],
)
confusion_display.plot();