In [None]:
import numpy as np
import pandas as pd
import librosa
from os import listdir
from librosa import display
import matplotlib.pyplot as plt
import pickle
from scipy import signal
from scipy import stats
from scipy import fftpack
import timeit

In [None]:
#Read in the paths to all the audio files. audio_paths is a dictionary keyed by
#VoxCeleb2 ID; each value is an indexable collection of audio file paths.
# NOTE(review): pickle.load can execute arbitrary code -- only load a file you produced yourself.
with open("all_audio_paths", "rb") as paths_file:  # context manager closes the handle
    audio_paths = pickle.load(paths_file)

In [None]:
# Per-speaker metadata table; the cells below read its 'VoxCeleb2 ID' and 'Gender' columns.
df = pd.read_csv(
    "clean_meta_data",
)

In [None]:
# Preview the first five metadata rows.
df.head(n=5)

In [None]:
# Find the metadata row(s) for speaker id04789. Despite its name, `index`
# holds the matching rows themselves, not an index object.
index = df.iloc[
    df[df['VoxCeleb2 ID'].eq('id04789')].index
]
# Show those rows together with the ID stored under label 3204.
(index, df['VoxCeleb2 ID'][3204])

In [None]:
# Two clips used for ad-hoc testing: at most 4 seconds each, loaded at 16384 Hz.
y, sr = librosa.load(
    audio_paths['id04239'][0], sr=16384, duration=4.0
)
y_f, sr = librosa.load(
    audio_paths['id04175'][4], sr=16384, duration=4.0
)

In [None]:
# Every speaker ID that has audio paths, as a plain list.
keys_list = [speaker_id for speaker_id in audio_paths]

In [None]:
#Takes an audio path, a clip length in seconds, and a sampling rate, and returns the absolute value of the Fourier transform of the middle second of the clip. This gives the magnitude of each frequency bin from 70 up to (but not including) 260.
import librosa
from scipy import fftpack

def fft(path, length, sr):
    """Return FFT magnitudes for bins 70-259 of the middle second of a clip.

    The clip is loaded with librosa, a window of ``2 * (sr // 2)`` samples
    centred on the clip's midpoint is cut out, and its FFT is taken. With a
    window of about one second, bin ``k`` is approximately ``k`` Hz, so the
    returned band is roughly 70-259 Hz (the range the notebook targets for
    human speech).

    Parameters
    ----------
    path : str
        Audio file to load.
    length : float
        Passed to ``librosa.load`` as ``duration`` (seconds to read).
    sr : int
        Passed to ``librosa.load`` as the sampling rate.

    Returns
    -------
    numpy.ndarray
        190 magnitudes, one per FFT bin from 70 up to (not including) 260.

    Raises
    ------
    IndexError
        If the clip is shorter than the analysis window, or the window has
        fewer than 260 samples. The original code failed with ``IndexError``
        in most of these cases, so the type is kept; the difference is that
        a negative start no longer wraps around silently.
    """
    y, sr = librosa.load(path, duration=length, sr=sr)

    midpoint = len(y) // 2          # centre sample of the clip
    half_second = sr // 2
    start = midpoint - half_second
    stop = midpoint + half_second

    # Guard explicitly: a negative start would otherwise index from the end
    # of the array and return a scrambled window without any error.
    if start < 0 or stop > len(y):
        raise IndexError(
            "clip has %d samples, shorter than the %d-sample analysis window"
            % (len(y), stop - start)
        )
    # Slicing past the end would silently return fewer than 190 bins.
    if stop - start < 260:
        raise IndexError(
            "analysis window of %d samples has no FFT bins up to 259"
            % (stop - start)
        )

    spectrum = fftpack.fft(y[start:stop])
    return abs(spectrum[70:260])

In [None]:
#Goes through each audio file path and creates a dataframe with the fft and gender data.
# One pickle per speaker is written; each row holds the FFT magnitudes of one
# clip plus the speaker's gender and ID.
# NOTE(review): the original slice was left empty (`[]`, a SyntaxError); this
# iterates over every ID, as the comment above describes -- confirm the intended range.
pickle_dir = "/home/ubuntu/Pickles/"
count = 0
for key in df['VoxCeleb2 ID']:
    if key not in audio_paths:
        continue
    count += 1
    gender = df.loc[df['VoxCeleb2 ID'] == key, 'Gender'].values[0]

    # Collect the per-clip rows and concatenate once; DataFrame.append is gone
    # in pandas 2.0 and re-copied the whole frame on every iteration.
    clip_rows = []
    for clip_path in audio_paths[key]:
        clip_row = pd.DataFrame(fft(clip_path, 4, 16384)).T
        clip_row['Gender'] = gender
        clip_row['ID'] = key
        clip_rows.append(clip_row)
    pickle_df = pd.concat(clip_rows) if clip_rows else pd.DataFrame()

    with open(pickle_dir + str(key), "wb") as pickle_file:
        pickle.dump(pickle_df, pickle_file)
    print(key, count)


In [None]:
#Same as above but I was running short on time and wanted to gather more audio from female speakers so I altered
#the for loop to capture all female and only some male files starting at the 3204 index.
pickle_dir = "/home/ubuntu/Pickles/"
start_row = 3204       # first metadata row processed by this pass
max_male_clips = 10    # speakers not labelled 'f' are capped at their first 10 clips
count = 0
for key in df['VoxCeleb2 ID'][start_row:]:
    if key not in audio_paths:
        continue
    count += 1
    gender = df.loc[df['VoxCeleb2 ID'] == key, 'Gender'].values[0]

    # The two original branches differed only in how many clips were used.
    if gender == 'f':
        clip_paths = audio_paths[key]
    else:
        clip_paths = audio_paths[key][:max_male_clips]

    # Collect the per-clip rows and concatenate once; DataFrame.append is gone
    # in pandas 2.0 and re-copied the whole frame on every iteration.
    clip_rows = []
    for clip_path in clip_paths:
        clip_row = pd.DataFrame(fft(clip_path, 4, 16384)).T
        clip_row['Gender'] = gender
        clip_row['ID'] = key
        clip_rows.append(clip_row)
    pickle_df = pd.concat(clip_rows) if clip_rows else pd.DataFrame()

    with open(pickle_dir + str(key), "wb") as pickle_file:
        pickle.dump(pickle_df, pickle_file)
    print(key, count)

In [None]:
# Compute the FFT features for a classmate's recording (Chris) and save them
# as a one-row CSV so the model can be tried on it.
(
    pd.DataFrame(fft('/home/ubuntu/Chris.m4a', 4, 16384))
    .transpose()
    .to_csv('Chris_fft')
)

In [None]:
# Sanity check: display the FFT features of one known clip as a single row.
pd.DataFrame(
    fft(audio_paths['id08737'][1], 4, 16384)
).transpose()

In [None]:
#Extracting additional summary features (spectral, statistical, tempo and MFCC) from the waveform and its STFT.
def extract_features(path):
    """Compute summary audio features for up to the first four seconds of a clip.

    The clip is loaded at its native sampling rate (``sr=None``). That rate is
    passed to every librosa call that takes ``sr``. The original code
    overwrote ``sr`` with the mean spectral rolloff and then handed that
    value to the tempo and MFCC calls, and it computed the rolloff itself
    without passing the sampling rate.

    Parameters
    ----------
    path : str
        Audio file to load.

    Returns
    -------
    dict
        Maps feature name to a one-element list, in this order:
        Spectral_Centroid, Spectral_Centroid_std, Skew, Kurtosis, RMSE,
        RMSE_std, Spectral_rolloff, Tempo, mfcc_1, mfcc_2, mfcc_3,
        Zero_Crossing_Rate, Zero_Crossing_std.
    """
    y, sr = librosa.load(path, sr=None, duration=4.0)
    fourier = librosa.core.stft(y)
    magnitude, _ = librosa.magphase(fourier)  # phase is not used

    # Spectral centroid: computed once, summarised by mean and std.
    centroid = librosa.feature.spectral_centroid(y=y, sr=sr)

    # Skew and kurtosis of the raw waveform samples.
    skew = stats.skew(y)
    kurtosis = stats.kurtosis(y)

    # RMS energy from the STFT magnitudes.
    # NOTE(review): newer librosa releases name this function `rms`; `rmse`
    # is kept to match the version this notebook was written against.
    rms_energy = librosa.feature.rmse(S=magnitude)

    # Mean spectral rolloff. Kept in its own name so `sr` stays the sampling
    # rate, and given `sr` so it is computed at the clip's actual rate.
    rolloff = librosa.feature.spectral_rolloff(y=y, sr=sr).mean()

    # Zero-crossing rate: computed once, summarised by mean and std.
    zero_crossings = librosa.feature.zero_crossing_rate(y=y)

    # Tempo estimate (first element of the returned array).
    tempo = librosa.beat.tempo(y=y, sr=sr)[0]

    # Means of the first 3 mel-frequency cepstral coefficients.
    coefficients = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=3)

    # Key order is preserved from the original.
    features = {
        'Spectral_Centroid': [centroid.mean()],
        'Spectral_Centroid_std': [centroid.std()],
        'Skew': [skew],
        'Kurtosis': [kurtosis],
        'RMSE': [rms_energy.mean()],
        'RMSE_std': [rms_energy.std()],
        'Spectral_rolloff': [rolloff],
        'Tempo': [tempo],
        'mfcc_1': [coefficients[0].mean()],
        'mfcc_2': [coefficients[1].mean()],
        'mfcc_3': [coefficients[2].mean()],
        'Zero_Crossing_Rate': [zero_crossings.mean()],
        'Zero_Crossing_std': [zero_crossings.std()],
    }

    return features
      