In [9]:
import numpy as np
import joblib as joblib
# librosa is a Python library for analyzing audio and music; it is used below to load audio files and extract features from them.
import librosa
import os
import sys

from sklearn.preprocessing import StandardScaler, OneHotEncoder
import tensorflow
from tensorflow import keras

import warnings
# Suppress all warnings unless warning options were supplied to the interpreter
# (sys.warnoptions is non-empty in that case).
if not sys.warnoptions:
    warnings.simplefilter(action="ignore")
# DeprecationWarning is ignored regardless of the check above.
warnings.filterwarnings(action="ignore", category=DeprecationWarning)

# Directory whose files are classified by the loop at the bottom of this cell.
actorpath = "C:/Users/Jonathan Munoz/Projects/ToneData/Audio_Speech_Actors_01-24/Actor_01/"

# Persisted preprocessing objects, loaded with joblib. predict() uses `scaler`
# to transform the feature vector and `encoder` to map the model output back
# to a label.
scaler = joblib.load("C:/Users/Jonathan Munoz/Projects/ToneTendency/AudioAnalysis/ModelData/scaler.save")
encoder = joblib.load("C:/Users/Jonathan Munoz/Projects/ToneTendency/AudioAnalysis/ModelData/encoder.save")

# Saved Keras model used by predict().
model = keras.models.load_model("C:/Users/Jonathan Munoz/Projects/ToneTendency/AudioAnalysis/ModelData/mymodel")

def extract_features(data, sample_rate):
    """Summarize an audio signal as a single flat feature vector.

    Five librosa features are computed (zero-crossing rate, chroma STFT, MFCC,
    RMS and mel spectrogram). Each feature matrix is transposed and averaged
    along axis 0, and the resulting vectors are concatenated in that order.

    Args:
        data: Audio samples, passed to librosa as ``y`` / the STFT input.
        sample_rate: Sampling rate forwarded to the ``sr``-aware features.

    Returns:
        A 1-D numpy array holding the concatenated per-feature means.
    """
    def _collapse(feature_matrix):
        # Mean over axis 0 of the transposed matrix; presumably this averages
        # across frames, given librosa's (features, frames) layout.
        return np.mean(feature_matrix.T, axis=0)

    # Zero-crossing rate
    zero_crossing = _collapse(librosa.feature.zero_crossing_rate(y=data))

    # Chroma, computed from the magnitude of the short-time Fourier transform
    magnitude = np.abs(librosa.stft(data))
    chroma = _collapse(librosa.feature.chroma_stft(S=magnitude, sr=sample_rate))

    # Mel-frequency cepstral coefficients
    mfcc_mean = _collapse(librosa.feature.mfcc(y=data, sr=sample_rate))

    # Root-mean-square value
    rms_mean = _collapse(librosa.feature.rms(y=data))

    # Mel spectrogram
    mel_mean = _collapse(librosa.feature.melspectrogram(y=data, sr=sample_rate))

    # Seeding the stack with an empty default-dtype array keeps the output
    # dtype the same as when the pieces are appended to it one at a time.
    pieces = (np.array([]), zero_crossing, chroma, mfcc_mean, rms_mean, mel_mean)
    return np.hstack(pieces)

def predict(path) :
    """Classify one audio file and return its predicted label.

    The file is loaded with librosa, trimmed with ``librosa.effects.trim``,
    converted to a feature vector by ``extract_features``, transformed by the
    module-level ``scaler`` and fed to the module-level ``model``. The raw
    model output is printed, then mapped back to a label with ``encoder``.

    Args:
        path: Path of the audio file to classify.

    Returns:
        The element at position ``[0, 0]`` of the encoder's inverse-transformed
        output (the label of the single sample in the batch).
    """
    waveform, sample_rate = librosa.load(path)
    trimmed, _ = librosa.effects.trim(waveform, top_db=20, frame_length=256, hop_length=64)

    # A one-element batch: the scaler is given a list holding the single vector.
    batch = [np.array(extract_features(trimmed, sample_rate))]
    scaled = scaler.transform(batch)

    # A new axis is added at position 2 before the batch goes to the model.
    model_input = np.expand_dims(scaled, axis=2)
    scores = model.predict(model_input)
    print(scores)

    labels = encoder.inverse_transform(scores)
    return labels[0,0]

# Classify every file in the actor directory and print the predicted label.
# The directory comes from `actorpath` rather than a second copy of the same
# literal, so the listing and the paths handed to predict() cannot drift apart.
for file in os.listdir(actorpath):
    audio_path = os.path.join(actorpath, file)
    # Skip anything that is not a regular file (e.g. a subdirectory), which
    # cannot be loaded as audio.
    if not os.path.isfile(audio_path):
        continue
    print(predict(audio_path))

[[ 0.82022643  0.05968746  9.205234    6.8631706  14.89085    16.170588
  49.544334    2.4459138 ]]
sad
[[2.7396500e-01 1.7067316e-04 1.4680329e+01 7.9801545e+00 7.0944228e+00
  7.0644445e+00 6.0720963e+01 2.1855505e+00]]
sad
[[2.6881032e-02 1.2985579e-03 9.3038362e-01 1.2090658e+01 7.0367985e+00
  2.0947771e+00 7.3443535e+01 4.3756700e+00]]
sad
[[1.1091965e-01 8.1417348e-08 1.3044189e+00 1.5036411e+01 8.3403873e+00
  2.4985380e-01 5.3453331e+01 2.1504684e+01]]
sad
[[1.5117420e+00 2.9966809e-02 4.1292443e+00 9.1462784e+00 3.3049225e+01
  8.1383934e+00 4.0513149e+01 3.4819989e+00]]
sad
[[2.7950618e-01 6.1917193e-03 7.4204960e+00 4.9909754e+00 1.0098783e+01
  1.7808611e+01 5.5624626e+01 3.7708077e+00]]
sad
[[ 3.0143514  7.421655  10.782198   6.106361  22.274925  30.399555
  18.668385   1.3325772]]
neutral
[[ 3.0719237  2.2652135  9.3942375  9.146034  25.07498   18.63129
  28.786009   3.6303065]]
sad
[[ 0.16955489  2.0735388  14.272587    1.6586807   2.5779927  39.664673
  39.35421     0.