# Kyle Calabro
# DATA 750 - Thesis in Data Science
# 16 September 2021
---

# Notebook Five:
---
## Generate log-mel spectrogram values for original training data, augmented training data and testing data

In [None]:
import librosa
import librosa.display

%matplotlib inline
import matplotlib.pyplot as plt 
from matplotlib.pyplot import specgram

import seaborn as sns

import IPython.display as ipd
from IPython.display import Audio

import seaborn as sns

import numpy as np
import pandas as pd

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

import os
import sys
import warnings

import time

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn import preprocessing
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix

from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Conv1D, MaxPooling1D, AveragePooling1D
from tensorflow.keras.layers import Input, Flatten, Dropout, Activation, BatchNormalization, Dense
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from tensorflow.keras.optimizers import SGD
from tensorflow.keras.regularizers import l2
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.applications import vgg16

from google.colab import drive
drive.mount("/content/drive")

# Silence warnings so the notebook output stays readable.
# NOTE: when Python was started without any -W options, simplefilter("ignore")
# suppresses every warning category, not only deprecation warnings.
if not sys.warnoptions:
    warnings.simplefilter("ignore")
# Installed regardless of -W options: always ignore DeprecationWarning
warnings.filterwarnings("ignore", category = DeprecationWarning)

# Seed NumPy's global random generator (used by the np.random calls in this notebook)
np.random.seed(42)
# NOTE(review): the TensorFlow seed call below is disabled, so only NumPy is seeded here
#tf.random.set_random_seed(42)

Mounted at /content/drive


# Utility Functions
---

In [None]:
# To compute the log-mel spectrogram values of a given audio signal
# Params:
    # audio_data: audio time series
    # sr: sampling rate handed to librosa together with audio_data
# Returns:
    # the mel spectrogram after conversion to decibels

def generate_log_spectrogram(audio_data, sr):
    # Mel-scaled spectrogram with 128 mel bands and an upper frequency limit of 8000 Hz
    mel_values = librosa.feature.melspectrogram(y = audio_data, sr = sr, n_mels = 128, fmax = 8000)

    # Convert to decibels and hand the result straight back
    return librosa.power_to_db(mel_values)

In [None]:
# To save the log-mel spectrogram image of a given audio file
# Params:
    # spectrogram: nparray of log-mel spectrogram values
    # path: path to save the image file to

def save_spectrogram(spectrogram, path):
    fig = plt.figure()
    try:
        # Strip the figure of the axes and labels so it represents only the audio content
        ax = fig.add_subplot()
        ax.get_xaxis().set_visible(False)
        ax.get_yaxis().set_visible(False)
        ax.set_frame_on(False)

        # Draw onto this figure's axes explicitly instead of relying on pyplot's "current axes"
        librosa.display.specshow(spectrogram, fmax = 8000, ax = ax)

        # No padding and a tight bounding box so the image contains only the spectrogram
        fig.savefig(path, bbox_inches = "tight", pad_inches = 0)
    finally:
        # Always release the figure, even when drawing or saving fails; this function is
        # called once per audio clip, so leaked figures would pile up in memory
        plt.close(fig)

# Data Augmentation Functions
---

In [None]:
# To add random Gaussian noise to an audio signal
# Params:
    # data: The data to augment (audio time series)
# Returns:
    # a float64 array holding data plus the noise (data itself is left untouched)

def add_noise(data):
    # Noise amplitude: a random fraction (below 5%) of the largest sample value
    amplitude = .05 * np.random.uniform() * np.amax(data)

    # One standard-normal draw per sample, scaled by that amplitude
    jitter = amplitude * np.random.normal(size = data.shape[0])

    return data.astype("float64") + jitter

In [None]:
# To shift (circularly rotate) the audio in a given audio file by a fixed number of samples
# Params:
    # data: The data to augment (audio time series)
    # shift: Number of positions to roll the samples by (positive = towards the end,
    #        negative = towards the start); samples pushed past one end wrap around
    #        to the other end. Defaults to 1600.

def shift_audio(data, shift = 1600):
    return np.roll(data, shift)

In [None]:
# To stretch a given audio file by a fixed rate (change the speed)
# Params:
    # data: The data to augment (audio time series)
    # rate: The stretch factor (> 1 = signal sped up, < 1 = signal slowed down)

def stretch_audio(data, rate = .8):
    # rate is passed by keyword: newer librosa releases only accept it as a keyword
    # argument, while older releases accept the keyword form as well
    return librosa.effects.time_stretch(data, rate = rate)

In [None]:
# To shift the pitch of a waveform up by a major third (4 semitone steps)
# Params:
    # data: The data to augment (audio time series)
    # sr: The audio sampling rate of the data

def pitch_majorThird(data, sr):
    # sr is passed by keyword: newer librosa releases only accept it as a keyword
    # argument, while older releases accept the keyword form as well
    return librosa.effects.pitch_shift(data, sr = sr, n_steps = 4)

In [None]:
# To shift the pitch of a waveform down by a tritone (6 semitone steps)
# Params:
    # data: The data to augment (audio time series)
    # sr: The audio sampling rate of the data

def pitch_tritone(data, sr):
    # sr is passed by keyword: newer librosa releases only accept it as a keyword
    # argument, while older releases accept the keyword form as well
    return librosa.effects.pitch_shift(data, sr = sr, n_steps = -6)

In [None]:
# To shift the pitch of a waveform up by three quarter-tones
# Params:
    # data: The data to augment (audio time series)
    # sr: The audio sampling rate of the data

def pitch_quarter_tone(data, sr):
    # bins_per_octave -> number of steps per octave; 24 steps per octave makes each
    # step a quarter-tone, so n_steps = 3 is three quarter-tones.
    # sr is passed by keyword: newer librosa releases only accept it as a keyword
    # argument, while older releases accept the keyword form as well
    return librosa.effects.pitch_shift(data, sr = sr, n_steps = 3, bins_per_octave = 24)

# Bringing in the Data
----

In [None]:
# Load the per-file metadata, keeping only the label, location and speaker columns
audio_df = pd.read_csv(
    "/content/drive/My Drive/Thesis/audio_df.csv",
    usecols = ["Emotion", "path", "Gender", "Actor"],
)

In [None]:
# Re-root every stored path under the mounted Google Drive folder.
# The first character of each stored path is dropped before prefixing
# (presumably the leading "." of a relative path -- confirm against audio_df.csv).
audio_df["path"] = [
    "/content/drive/My Drive/Thesis" + stored_path[1:]
    for stored_path in audio_df["path"]
]

In [None]:
# Preview the first rows of the metadata table to check the rewritten paths
audio_df.head()

Unnamed: 0,Gender,Emotion,Actor,path
0,male,Surprise,1,/content/drive/My Drive/Thesis/data/audio_file...
1,male,Surprise,1,/content/drive/My Drive/Thesis/data/audio_file...
2,male,Angry,1,/content/drive/My Drive/Thesis/data/audio_file...
3,male,Fear,1,/content/drive/My Drive/Thesis/data/audio_file...
4,male,Fear,1,/content/drive/My Drive/Thesis/data/audio_file...


## Splitting the data into training and test sets, 80% train, 20% test
---

In [None]:
# Hold out 20% of the files for testing; stratifying on the Emotion/Gender/Actor
# combination keeps each combination represented in both subsets
train_data, test_data = train_test_split(
    audio_df,
    test_size = .2,
    random_state = 42,
    stratify = audio_df[["Emotion", "Gender", "Actor"]],
)

In [None]:
# Renumber the training rows 0..n-1 (the old index is discarded) so they line up
# with the positional row labels used when features are collected later
train_data = train_data.reset_index(drop = True)
train_data.head()

Unnamed: 0,Gender,Emotion,Actor,path
0,male,Disgust,13,/content/drive/My Drive/Thesis/data/audio_file...
1,female,Disgust,4,/content/drive/My Drive/Thesis/data/audio_file...
2,male,Angry,3,/content/drive/My Drive/Thesis/data/audio_file...
3,male,Angry,5,/content/drive/My Drive/Thesis/data/audio_file...
4,male,Fear,15,/content/drive/My Drive/Thesis/data/audio_file...


In [None]:
# Renumber the testing rows 0..n-1 (the old index is discarded) so they line up
# with the positional row labels used when features are collected later
test_data = test_data.reset_index(drop = True)
test_data.head()

Unnamed: 0,Gender,Emotion,Actor,path
0,female,Disgust,22,/content/drive/My Drive/Thesis/data/audio_file...
1,female,Disgust,12,/content/drive/My Drive/Thesis/data/audio_file...
2,female,Neutral,20,/content/drive/My Drive/Thesis/data/audio_file...
3,male,Happy,13,/content/drive/My Drive/Thesis/data/audio_file...
4,female,Sad,12,/content/drive/My Drive/Thesis/data/audio_file...


# Feature Extraction
---
### Iterating over all the audio files, generate the values for the log-mel spectrograms

## Training Data
---

In [None]:
# One row per training clip, holding that clip's log-mel spectrogram values
train_df = pd.DataFrame(columns = ["mel_spectrogram"])

for row, wav_path in enumerate(train_data.path):
    # Read at most 3 seconds of audio, starting .5 seconds into the file, at 44100 Hz
    samples, rate = librosa.load(wav_path, res_type = "kaiser_fast", duration = 3, sr = 44100, offset = .5)

    # Spectrogram of the original, unaugmented clip, stored under the clip's position
    train_df.loc[row] = [generate_log_spectrogram(samples, rate)]

In [None]:
# Put the clip metadata (without the file path) side by side with its spectrogram column
train_df = pd.concat(
    [
        train_data.drop(columns = "path"),
        train_df[["mel_spectrogram"]],
    ],
    axis = 1,
)

In [None]:
# Preview the training metadata together with the spectrogram column
train_df.head()

Unnamed: 0,Gender,Emotion,Actor,mel_spectrogram
0,male,Disgust,13,"[[-42.014637, -41.229786, -43.67878, -45.01442..."
1,female,Disgust,4,"[[-68.426315, -68.426315, -68.426315, -68.4263..."
2,male,Angry,3,"[[-67.182045, -67.14102, -64.01513, -66.515755..."
3,male,Angry,5,"[[-54.92186, -55.697502, -57.20272, -52.536354..."
4,male,Fear,15,"[[-41.92892, -44.034023, -47.75731, -43.322655..."


## Testing Data
---

In [None]:
# One row per testing clip, holding that clip's log-mel spectrogram values
test_df = pd.DataFrame(columns = ["mel_spectrogram"])

# The testing clips are never augmented; only the original audio is processed
for row, wav_path in enumerate(test_data.path):
    # Read at most 3 seconds of audio, starting .5 seconds into the file, at 44100 Hz
    samples, rate = librosa.load(wav_path, res_type = "kaiser_fast", duration = 3, sr = 44100, offset = .5)

    # Spectrogram of the original clip, stored under the clip's position
    test_df.loc[row] = [generate_log_spectrogram(samples, rate)]

In [None]:
# Put the clip metadata (without the file path) side by side with its spectrogram column
test_df = pd.concat(
    [
        test_data.drop(columns = "path"),
        test_df[["mel_spectrogram"]],
    ],
    axis = 1,
)

In [None]:
# Preview the testing metadata together with the spectrogram column
test_df.head()

Unnamed: 0,Gender,Emotion,Actor,mel_spectrogram
0,female,Disgust,22,"[[-50.69049, -54.978943, -64.96961, -63.848305..."
1,female,Disgust,12,"[[-52.43086, -49.4553, -53.096275, -59.1045, -..."
2,female,Neutral,20,"[[-78.72657, -78.72657, -78.72657, -78.72657, ..."
3,male,Happy,13,"[[-52.144207, -46.53559, -44.30996, -48.127254..."
4,female,Sad,12,"[[-59.734955, -59.734955, -59.734955, -59.7349..."


---

In [None]:
# Write the original (unaugmented) training and testing tables to Google Drive.
# NOTE(review): the mel_spectrogram cells appear to hold NumPy arrays, which a CSV stores
# as their printed text; large arrays are presumably abbreviated with "..." there, so the
# full values may not be recoverable from these files -- confirm before relying on them.
for table, csv_path in (
    (train_df, "/content/drive/My Drive/Thesis/Orig_Train/orig_train_data.csv"),
    (test_df, "/content/drive/My Drive/Thesis/Orig_Test/orig_test_data.csv"),
):
    table.to_csv(csv_path)

---

# Feature Extraction for Data Augmentation
---
### Iterating over all the training data files, generate the values for the log-mel spectrograms for the various augmentations

In [None]:
# One single-column dataframe per augmentation (plus one for the untouched audio).
# stretch_half_df / stretch_double_df hold the rate .8 and rate 1.2 stretches respectively.
(
    noise_df,
    shift_df,
    stretch_half_df,
    stretch_double_df,
    pitch_majorThird_df,
    pitch_tritone_df,
    pitch_quarter_tone_df,
    aug_train_df,
) = (pd.DataFrame(columns = ["mel_spectrogram"]) for _ in range(8))

for row, wav_path in enumerate(train_data.path):
    # Read at most 3 seconds of audio, starting .5 seconds into the file, at 44100 Hz
    samples, rate = librosa.load(wav_path, res_type = "kaiser_fast", duration = 3, sr = 44100, offset = .5)

    # Each line below augments the clip, computes the log-mel spectrogram of the result
    # and stores it under the clip's position in the matching dataframe

    # Random noise
    noise_df.loc[row] = [generate_log_spectrogram(add_noise(samples), rate)]

    # Shift
    shift_df.loc[row] = [generate_log_spectrogram(shift_audio(samples), rate)]

    # Stretch by .8
    stretch_half_df.loc[row] = [generate_log_spectrogram(stretch_audio(samples, .8), rate)]

    # Stretch by factor of 1.2
    stretch_double_df.loc[row] = [generate_log_spectrogram(stretch_audio(samples, 1.2), rate)]

    # Pitch (Major Third)
    pitch_majorThird_df.loc[row] = [generate_log_spectrogram(pitch_majorThird(samples, rate), rate)]

    # Pitch (Tri-tone)
    pitch_tritone_df.loc[row] = [generate_log_spectrogram(pitch_tritone(samples, rate), rate)]

    # Pitch (Quarter Tone)
    pitch_quarter_tone_df.loc[row] = [generate_log_spectrogram(pitch_quarter_tone(samples, rate), rate)]

    # Original, unaugmented audio
    aug_train_df.loc[row] = [generate_log_spectrogram(samples, rate)]

In [None]:
# Tag every row with the augmentation that produced it, so specific augmentations
# can be selected by name later
for variant_df, label in (
    (noise_df, "Noise"),
    (shift_df, "Shift"),
    (stretch_half_df, "Stretch .8"),
    (stretch_double_df, "Stretch 1.2"),
    (pitch_majorThird_df, "Pitch Major Third"),
    (pitch_tritone_df, "Pitch Tritone"),
    (pitch_quarter_tone_df, "Pitch Quarter tone"),
    (aug_train_df, "None"),
):
    variant_df["Augmentation"] = label

In [None]:
# Build out the master dataframe of augmented data, bringing back in the metadata of the audio file as well.
# For every variant, its Augmentation label, the clip metadata and its spectrogram column are
# placed side by side; the per-variant tables are then stacked with a single pd.concat
# (DataFrame.append is no longer available in current pandas releases).
# Row labels are not renumbered, so they repeat once per variant, and the order of the
# tuple below fixes the row order of aug_df.
aug_df = pd.concat(
    [
        pd.concat([variant_df.Augmentation, train_data, variant_df[["mel_spectrogram"]]], axis = 1)
        for variant_df in (
            noise_df,
            shift_df,
            pitch_majorThird_df,
            pitch_tritone_df,
            pitch_quarter_tone_df,
            stretch_half_df,
            stretch_double_df,
            aug_train_df,
        )
    ]
)

# The file path is not part of the feature table
aug_df.drop(columns = "path", inplace = True)

In [None]:
# Preview the combined table of augmented and original training rows
aug_df.head()

Unnamed: 0,Augmentation,Gender,Emotion,Actor,mel_spectrogram
0,Noise,male,Disgust,13,"[[-55.6770425595965, -42.43659733561067, -42.8..."
1,Noise,female,Disgust,4,"[[-41.64856659323732, -39.50736436345467, -37...."
2,Noise,male,Angry,3,"[[-34.20474933265983, -37.84025022266927, -38...."
3,Noise,male,Angry,5,"[[-52.49080599303481, -49.924579023838014, -44..."
4,Noise,male,Fear,15,"[[-38.079510217392055, -31.096275481650686, -3..."


In [None]:
# Row count check: eight variants (seven augmentations plus the original) per training clip
aug_df.shape

(9216, 5)

In [None]:
# Write the combined augmented training table to Google Drive.
# NOTE(review): the mel_spectrogram cells appear to hold NumPy arrays, which a CSV stores
# as their printed text; large arrays are presumably abbreviated with "..." there, so the
# full values may not be recoverable from this file -- confirm before relying on it.
aug_df.to_csv("/content/drive/My Drive/Thesis/Aug_Train/aug_train.csv")

# Convert Testing Data to Images
----

In [None]:
# Number of testing rows, i.e. how many images the next cell writes
test_df.shape

(288, 4)

In [None]:
# Render every testing spectrogram to an image file in the Drive folder,
# named after the row's position in test_df
test_image_template = "/content/drive/My Drive/Thesis/Orig_Test/{0}.jpeg"

for position, spectrogram_values in enumerate(test_df.mel_spectrogram):
    save_spectrogram(spectrogram_values, test_image_template.format(position))

# Convert Original Training Data to Images
----

In [None]:
# Number of original training rows, i.e. how many images the next cell writes
train_df.shape

(1152, 4)

In [None]:
# Render every original training spectrogram to an image file in the Drive folder,
# named after the row's position in train_df
train_image_template = "/content/drive/My Drive/Thesis/Orig_Train/{0}.jpeg"

for position, spectrogram_values in enumerate(train_df.mel_spectrogram):
    save_spectrogram(spectrogram_values, train_image_template.format(position))

# Convert Augmented Training Data to Images
----

In [None]:
# Number of augmented training rows, i.e. how many images the next cell writes
aug_df.shape

(9216, 5)

In [None]:
# Render every augmented training spectrogram to an image file in the Drive folder.
# Files are named after the row's position in aug_df (not its repeated index label),
# so every row gets its own file
aug_image_template = "/content/drive/My Drive/Thesis/Aug_Train/{0}.jpeg"

for position, spectrogram_values in enumerate(aug_df.mel_spectrogram):
    save_spectrogram(spectrogram_values, aug_image_template.format(position))