In [8]:
import numpy as np
import joblib as joblib
# librosa is a Python library for audio and music analysis; it is used below to load the audio files and extract features from them.
import librosa
import os
import sys

from sklearn.preprocessing import StandardScaler, OneHotEncoder
import tensorflow
from tensorflow import keras

import warnings
# Silence every warning category unless warning options were supplied to the
# interpreter (sys.warnoptions is empty when none were given).
if not sys.warnoptions:
    warnings.simplefilter("ignore")
# DeprecationWarning is ignored unconditionally, i.e. even when warning
# options were supplied and the blanket filter above was skipped.
warnings.filterwarnings("ignore", category=DeprecationWarning) 

# Load the trained artifacts once, at module level; predict() reads all three
# as globals.
# NOTE(review): every path below is absolute and machine-specific — adjust
# them before running this script anywhere else.

# Keras model whose predict() output is decoded into a label in predict().
model = keras.models.load_model("C:/Users/Jonathan Munoz/Projects/ToneTendency/AudioAnalysis/ModelData/mymodel")
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code — only load artifacts that come from a trusted source.
# Used via inverse_transform() to map model output back to a label
# (presumably the fitted OneHotEncoder from training — confirm).
encoder = joblib.load("C:/Users/Jonathan Munoz/Projects/ToneTendency/AudioAnalysis/ModelData/encoder.save") 
# Used via transform() on the feature vector before it reaches the model
# (presumably the fitted StandardScaler from training — confirm).
scaler = joblib.load("C:/Users/Jonathan Munoz/Projects/ToneTendency/AudioAnalysis/ModelData/scaler.save") 
# Directory containing the recordings that are passed to predict().
actorpath = "C:/Users/Jonathan Munoz/Projects/ToneData/Audio_Speech_Actors_01-24/Actor_01/"

def extract_features(data, sample_rate):
    """Build one flat feature vector describing an audio signal.

    Each librosa feature is averaged over its frame axis and the averages are
    concatenated in this fixed order: zero-crossing rate, chroma (computed
    from the STFT magnitude), MFCC, RMS energy, mel spectrogram.

    Args:
        data: audio samples, passed to librosa as ``y``.
        sample_rate: sampling rate of ``data``, passed to librosa as ``sr``.

    Returns:
        1-D numpy array holding the concatenated per-feature means.
        NOTE(review): with librosa's default settings this should be
        1 + 12 + 20 + 1 + 128 = 162 values — confirm against the installed
        librosa version and the shape the scaler/model were trained on.
    """

    def _frame_mean(feature_matrix):
        # Transpose so frames run along axis 0, then average them away.
        return np.mean(feature_matrix.T, axis=0)

    # Chroma is derived from the magnitude spectrogram, not the raw signal.
    magnitude = np.abs(librosa.stft(data))

    pieces = (
        # Empty seed array (float64 by NumPy default): it takes part in dtype
        # promotion, so lower-precision features are upcast on concatenation.
        np.array([]),
        _frame_mean(librosa.feature.zero_crossing_rate(y=data)),
        _frame_mean(librosa.feature.chroma_stft(S=magnitude, sr=sample_rate)),
        _frame_mean(librosa.feature.mfcc(y=data, sr=sample_rate)),
        _frame_mean(librosa.feature.rms(y=data)),
        _frame_mean(librosa.feature.melspectrogram(y=data, sr=sample_rate)),
    )
    return np.hstack(pieces)

def predict(path):
    """Classify a single audio file and return its predicted label.

    The file is loaded with librosa, trimmed, converted to a feature vector
    by extract_features(), scaled with the module-level ``scaler``, and fed
    to the module-level ``model``. The model output (multiplied by 100) is
    printed as a side effect before being decoded by ``encoder``.

    Args:
        path: location of the audio file, handed to ``librosa.load``.

    Returns:
        The element at row 0, column 0 of ``encoder.inverse_transform`` —
        the decoded label for this file (e.g. ``"sad"``).
    """
    signal, rate = librosa.load(path)
    # Trim leading/trailing silence; the returned index interval is not needed.
    trimmed, _ = librosa.effects.trim(signal, top_db=20, frame_length=256, hop_length=64)

    # The scaler works on a batch, so wrap the single vector in a list.
    batch = [np.array(extract_features(trimmed, rate))]
    scaled = scaler.transform(batch)

    # Append a trailing axis before handing the batch to the model.
    scores = model.predict(np.expand_dims(scaled, axis=2))
    print(scores*100)

    labels = encoder.inverse_transform(scores)
    return labels[0,0]

# Classify every recording in the actor directory and print each predicted
# label. The directory comes from `actorpath` alone, so the folder that is
# listed and the folder used to build each file path cannot drift apart.
for file in os.listdir(actorpath):
    file_path = os.path.join(actorpath, file)
    # Only regular files can be decoded; skip sub-directories and the like
    # instead of letting librosa.load fail on them.
    if not os.path.isfile(file_path):
        continue
    print(predict(file_path))

[[0.08202264 0.00596875 0.92052335 0.6863171  1.489085   1.6170586
  4.9544334  0.24459139]]
sad
[[2.7396500e-02 1.7067316e-05 1.4680328e+00 7.9801548e-01 7.0944226e-01
  7.0644444e-01 6.0720963e+00 2.1855505e-01]]
sad
[[2.6881031e-03 1.2985578e-04 9.3038365e-02 1.2090658e+00 7.0367986e-01
  2.0947771e-01 7.3443537e+00 4.3756700e-01]]
sad
[[1.1091964e-02 8.1417353e-09 1.3044189e-01 1.5036411e+00 8.3403879e-01
  2.4985380e-02 5.3453331e+00 2.1504683e+00]]
sad
[[1.5117420e-01 2.9966808e-03 4.1292444e-01 9.1462785e-01 3.3049226e+00
  8.1383938e-01 4.0513148e+00 3.4819990e-01]]
sad
[[2.7950618e-02 6.1917189e-04 7.4204957e-01 4.9909753e-01 1.0098782e+00
  1.7808611e+00 5.5624628e+00 3.7708077e-01]]
sad
[[0.30143514 0.7421655  1.0782198  0.6106361  2.2274926  3.0399556
  1.8668385  0.13325772]]
neutral
[[0.3071924  0.22652134 0.93942374 0.9146035  2.507498   1.863129
  2.878601   0.36303067]]
sad
[[0.01695549 0.20735386 1.4272587  0.16586807 0.25779927 3.9664674
  3.9354208  0.02287569]]
neu