<a href="https://colab.research.google.com/github/hakseong1231/DACD-AudioSignalAnalysis/blob/main/Comparison_of_performance_of_age%26gender_predicting_models_in_voice_b%26a_combining_stat_method.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Modeling with the mean MFCC vector of each .wav file

In [None]:
# Mount Google Drive at /content/drive; every data path configured below lives under this mount point (Colab only)
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install praat-parselmouth



In [None]:
import os
import random
from math import ceil

import librosa
import numpy as np
import pandas as pd
import parselmouth  # Extract f0, jitter, shimmer, NHR from .wav file
import scipy.io.wavfile as wavf
from parselmouth.praat import call
from scipy.stats import norm  # Calculate value of probability density function of normal distribution
from sklearn import linear_model  # Logistic Regression module
from sklearn.ensemble import RandomForestClassifier  # Random Forest module
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC  # SVM module

try:
  import joblib  # Use to save models
except ImportError:  # fall back to the copy bundled with old scikit-learn releases
  from sklearn.externals import joblib

import warnings  # Remove warnings
# Silences every warning for the rest of the session, including any raised by the libraries imported above
warnings.filterwarnings(action='ignore')

sr = 16000  # Sampling rate (Hz) passed to librosa.load and librosa.feature.mfcc
categories = ['20F', '20M', '30M', '40F', '50F', '50M']  # Class labels (presumably age decade + gender); listed in sorted order
wav_unprocessed = "/content/drive/My Drive/DACD/DACD_unprocessed/"  # num of files [18590, 16433, 17661, 18589, 7698, 8077]
wav_processed = "/content/drive/My Drive/DACD/DACD_processed/"
# NOTE(review): this path uses "MyDrive" while the two above use "My Drive" - confirm both resolve to the same Drive folder
wav_pickle = "/content/drive/MyDrive/DACD/DACD_pickle/"

# Part 1. Generate Machine Learning Models

Preprocessing

In [None]:
def remove_silence(wav_dir, ratio=300):
  """Load a .wav file and drop every sample whose magnitude is too small.

  Args:
    wav_dir: Path of the .wav file; it is loaded with librosa at the global sampling rate `sr`.
    ratio: A sample is kept only when abs(sample) > max(signal) / ratio. The default of 300
      is the value this notebook has always used.

  Returns:
    1-D numpy array holding the surviving samples in their original order.

  Raises:
    ValueError: if the file contains no samples (there is no maximum to derive a threshold from).
  """

  # y: Sound Pressure, sr: Number of datas per 1 second
  y, _ = librosa.load(wav_dir, sr=sr)

  # cut: Reference that determines whether to remove data.
  # It is derived from the largest (signed) sample, exactly as before; ndarray.max() replaces the
  # builtin max(), which walked the array element by element in Python.
  cut = y.max() / ratio

  # A boolean mask does the filtering directly, without a round-trip through a pandas DataFrame
  return y[np.abs(y) > cut]

In [None]:
"""Run remove_silence() for every .wav file in wav_unprocessed directory and save results"""

# Walks wav_unprocessed/<category>/<speaker folder>/*.wav and builds two aligned tables:
#   X_mfcc  - one row per file: mean of each MFCC coefficient over time, coefficient 0 dropped
#   y_label - one row per file: the category the file was found under
# Rows are collected in plain lists and turned into DataFrames once at the end; growing a
# DataFrame with pd.concat per file copies the whole table on every iteration.

run = input("Run? [Y/N] ")
if run in ['Y', 'y']:
  mfcc_rows = []
  labels = []

  for category in os.listdir(wav_unprocessed):
    if category not in categories:
      continue

    print("[Now Processing]", category)

    # Output folder for this category (created for compatibility; nothing is written into it here)
    save_dir = wav_processed + "processed_" + category + "/"
    os.makedirs(save_dir, exist_ok=True)

    folder_dir = wav_unprocessed + category + "/"
    for folder in os.listdir(folder_dir):
      for file in os.listdir(folder_dir + folder):
        if not file.endswith(".wav"):
          continue

        wav = folder_dir + folder + "/" + file  # Path of unprocessed sound files
        try:
          # Mean of MFCC vectors of the silence-trimmed sound file
          y = remove_silence(wav)
          mean_mfcc = pd.DataFrame(librosa.feature.mfcc(y=y, sr=sr)).mean(axis=1)[1:]  # [1:]: Remove data on [0] with poor information
        except Exception as exc:
          # Best effort: skip unreadable/empty files, but say which one and why
          print("[Error] {} ({})".format(wav, exc))
          continue

        # Append to both lists only after feature extraction succeeded, so they stay aligned
        mfcc_rows.append(mean_mfcc)
        labels.append(category[-3:])

  X_mfcc = pd.DataFrame(mfcc_rows).reset_index(drop=True)
  y_label = pd.DataFrame(labels, columns=["y_label"])

  print("[Finished]")

Run? [Y/N] Y
[Now Processing] 20F
[Now Processing] 20M
[Now Processing] 30M
[Now Processing] 50F
[Now Processing] 40F
[Now Processing] 50M
[Finished]


In [None]:
"""Save mean value of MFCC vectors of .wav files"""

# Persist the feature table and its labels so later sessions can skip preprocessing.
# The feature table built by the preprocessing cell is named X_mfcc.
X_mfcc.to_pickle(wav_pickle + "X_mfcc.pkl")
y_label.to_pickle(wav_pickle + "y_label.pkl")

Load MFCC vectors

In [None]:
# Reload the cached features and labels so the notebook can resume here without re-running preprocessing
X_mfcc, y_label = (
  pd.read_pickle(os.path.join(wav_pickle, pickle_name))
  for pickle_name in ("X_mfcc.pkl", "y_label.pkl")
)

In [None]:
# Hold out 30% of the files for testing; the fixed random_state keeps the split identical across runs
X_train, X_test, y_train, y_test = train_test_split(X_mfcc, y_label, test_size=0.3, random_state=1)

Generate & Save Models

In [None]:
"""1. Logistic Regression"""

logreg = linear_model.LogisticRegression().fit(X_train, y_train)

# Write the fitted model to disk only when no earlier run has saved one
lr_model_file = wav_pickle + "model_LogisticRegression.pkl"
if not os.path.isfile(lr_model_file):
  joblib.dump(logreg, lr_model_file)

# Test-set predictions; likewise written only if the file does not exist yet
y_pred = pd.DataFrame({"y_label": logreg.predict(X_test)})
lr_pred_file = wav_pickle + "y_pred_LR.pkl"
if not os.path.isfile(lr_pred_file):
  y_pred.to_pickle(lr_pred_file)

In [None]:
"""2. Random Forest"""

# random_state pins the bootstrap samples and feature choices, so re-running this cell rebuilds
# the same forest instead of a slightly different one each time (the cached model/prediction
# files below are only written once and would otherwise drift from the in-memory model).
forest = RandomForestClassifier(n_estimators=50, max_depth=15, random_state=1)
# y_train is a one-column DataFrame; pass the labels as a 1-D array, which is what fit() expects
forest.fit(X_train, y_train.values.ravel())

# Save models (an existing file is never overwritten)
if not os.path.isfile(wav_pickle + "model_RandomForest.pkl"):
  joblib.dump(forest, wav_pickle + "model_RandomForest.pkl")

# Save prediction results (an existing file is never overwritten)
y_pred = pd.DataFrame(forest.predict(X_test), columns=["y_label"])
if not os.path.isfile(wav_pickle + "y_pred_RF.pkl"):
  y_pred.to_pickle(wav_pickle + "y_pred_RF.pkl")

In [None]:
"""3. Support Vector Machine"""

svm = SVC().fit(X_train, y_train)

# Write the fitted model to disk only when no earlier run has saved one
svm_model_file = wav_pickle + "model_SVM.pkl"
if not os.path.isfile(svm_model_file):
  joblib.dump(svm, svm_model_file)

# Test-set predictions; likewise written only if the file does not exist yet
y_pred = pd.DataFrame({"y_label": svm.predict(X_test)})
svm_pred_file = wav_pickle + "y_pred_SVM.pkl"
if not os.path.isfile(svm_pred_file):
  y_pred.to_pickle(svm_pred_file)

Validate Classification Models

In [None]:
"""1. Logistic Regression"""

# Scores the predictions stored on disk, not the `y_pred` left in memory by the training cell.
# The training cell writes y_pred_LR.pkl only when it does not exist yet, so after a retrain
# this file may still hold the predictions of an earlier run.
y_pred = pd.read_pickle(wav_pickle + "y_pred_LR.pkl")

# Validate model performance
print(confusion_matrix(y_test, y_pred), "\n")
print(classification_report(y_test, y_pred, digits=4))

[[5002   55   14  406  124   28]
 [  78 3970  630   27   18  247]
 [  10  577 4180    4   11  417]
 [ 478   61   31 4456  447   73]
 [ 306   18   79 1087  758   70]
 [  53  417  455  121   46 1360]] 

              precision    recall  f1-score   support

         20F     0.8439    0.8886    0.8657      5629
         20M     0.7787    0.7988    0.7886      4970
         30M     0.7757    0.8040    0.7896      5199
         40F     0.7304    0.8035    0.7652      5546
         50F     0.5399    0.3270    0.4073      2318
         50M     0.6196    0.5546    0.5853      2452

    accuracy                         0.7554     26114
   macro avg     0.7147    0.6961    0.7003     26114
weighted avg     0.7458    0.7554    0.7475     26114



In [None]:
"""2. Random Forest"""

# Scores the predictions stored on disk, not the `y_pred` left in memory by the training cell.
# The training cell writes y_pred_RF.pkl only when it does not exist yet, so after a retrain
# this file may still hold the predictions of an earlier run.
y_pred = pd.read_pickle(wav_pickle + "y_pred_RF.pkl")

# Validate model performance
print(confusion_matrix(y_test, y_pred), "\n")
print(classification_report(y_test, y_pred, digits=4))

[[5497   34    8   77   10    3]
 [  11 4866   58    5    2   28]
 [   1   67 5104    2    1   24]
 [ 136   17    9 5356   23    5]
 [ 118   19    3  229 1944    5]
 [  16   58   70    8   11 2289]] 

              precision    recall  f1-score   support

         20F     0.9512    0.9766    0.9637      5629
         20M     0.9615    0.9791    0.9702      4970
         30M     0.9718    0.9817    0.9767      5199
         40F     0.9435    0.9657    0.9545      5546
         50F     0.9764    0.8387    0.9023      2318
         50M     0.9724    0.9335    0.9526      2452

    accuracy                         0.9595     26114
   macro avg     0.9628    0.9459    0.9533     26114
weighted avg     0.9598    0.9595    0.9591     26114



In [None]:
"""3. Support Vector Machine"""

# Scores the predictions stored on disk, not the `y_pred` left in memory by the training cell.
# The training cell writes y_pred_SVM.pkl only when it does not exist yet, so after a retrain
# this file may still hold the predictions of an earlier run.
y_pred = pd.read_pickle(wav_pickle + "y_pred_SVM.pkl")

# Validate model performance
print(confusion_matrix(y_test, y_pred), "\n")
print(classification_report(y_test, y_pred, digits=4))

[[5327   14    3  215   64    6]
 [  18 4711  155    2    6   78]
 [   1  183 4935    4    2   74]
 [ 326   33    9 5094   77    7]
 [ 294   18    4  728 1256   18]
 [  34  127  243   42   27 1979]] 

              precision    recall  f1-score   support

         20F     0.8878    0.9463    0.9162      5629
         20M     0.9263    0.9479    0.9370      4970
         30M     0.9226    0.9492    0.9357      5199
         40F     0.8371    0.9185    0.8759      5546
         50F     0.8771    0.5418    0.6699      2318
         50M     0.9154    0.8071    0.8578      2452

    accuracy                         0.8923     26114
   macro avg     0.8944    0.8518    0.8654     26114
weighted avg     0.8929    0.8923    0.8881     26114



# Part 2. Generate Classifiers

Load Sample File

In [None]:
# One recording from the 20F group, used below to try out each *_probability function
sample_dir = wav_unprocessed + "20F/fv01/fv01_t01_s01.wav"
sample = remove_silence(sample_dir)  # samples that survive the silence filter
print(sample.shape)

(32665,)


Mean Value and Standard Error extracted from wav file

In [None]:
"""mean f0, jitter, shimmer, NHR by age and gender group"""

# Reference statistics per group, consumed by statistical_probability():
#   m = E(X): Mean value
#   s = s.e.(X): standard error (used as the scale of the normal distribution)
#   y = P(X) = 1/(s*(2*np.pi)**.5) * np.exp(-(X-m)**2/(2*s**2)): Value of probability density function

cat_num = len(categories)  # = 6, categories = ['20F', '20M', '30M', '40F', '50F', '50M']
attrs = ["f0", "jitter", "shimmer", "NHR"]
attr_num = len(attrs)  # = 4


# One list per attribute; entries are ordered 20F, 20M, 30M, 40F, 50F, 50M

m_f0 = [206.26, 111.75, 116.53, 198.81, 199.38, 126.24]
s_f0 = [20.04, 15.47, 19.34, 25.47, 30.15, 25.73]

m_jitter = [0.14, 0.23, 0.22, 0.14, 0.15, 0.27]
s_jitter = [0.12, 0.11, 0.10, 0.12, 0.12, 0.22]

m_shimmer = [5.67, 5.72, 6.12, 5.53, 7.20, 7.82]
s_shimmer = [4.20, 4.41, 5.10, 4.30, 5.87, 7.21]

m_nhr = [0.014, 0.017, 0.018, 0.011, 0.017, 0.032]
s_nhr = [0.02, 0.02, 0.02, 0.02, 0.03, 0.06]


# The same numbers regrouped as one [f0, jitter, shimmer, NHR] row per group.
# The rows are derived from the per-attribute lists above instead of being typed a second time.

m_list = [list(group_row) for group_row in zip(m_f0, m_jitter, m_shimmer, m_nhr)]
s_list = [list(group_row) for group_row in zip(s_f0, s_jitter, s_shimmer, s_nhr)]

# Per-group names kept for direct access; each one is the very row stored in m_list / s_list
m_20f, m_20m, m_30m, m_40f, m_50f, m_50m = m_list
s_20f, s_20m, s_30m, s_40f, s_50f, s_50m = s_list

Find f0, Jitter, Shimmer, NHR with ParselMouth

In [None]:
def get_features(wav_dir):
  """Returns mean f0, localJitter, localShimmer, nhr (in this order) measured with Praat.

  The order matches `attrs` = ["f0", "jitter", "shimmer", "NHR"], which is how
  statistical_probability() indexes the tuple.

  NOTE(review): jitter and shimmer are returned exactly as Praat reports them; confirm they
  are on the same scale as the reference means/spreads they are compared against.
  """

  # Pitch search range handed to every Praat command below
  f0_floor, f0_ceiling = 75, 500

  snd = parselmouth.Sound(wav_dir)

  # Mean fundamental frequency in Hertz
  pitch_track = call(snd, "To Pitch", 0.0, f0_floor, f0_ceiling)
  mean_f0 = call(pitch_track, "Get mean", 0, 0, "Hertz")

  # Mean harmonicity, converted from a harmonics-to-noise value to a noise-to-harmonics ratio
  harmonicity_track = call(snd, "To Harmonicity (cc)", 0.01, f0_floor, 0.1, 1.0)
  mean_hnr = call(harmonicity_track, "Get mean", 0, 0)
  noise_to_harmonics = 1/(10**(mean_hnr / 10) + 1)  # HNR->NHR transmutation

  # Glottal pulses, from which the local jitter and shimmer are measured
  pulses = call(snd, "To PointProcess (periodic, cc)", f0_floor, f0_ceiling)
  local_jitter = call(pulses, "Get jitter (local)", 0, 0, 0.0001, 0.02, 1.3)
  local_shimmer = call([snd, pulses], "Get shimmer (local)", 0, 0, 0.0001, 0.02, 1.3, 1.6)

  return mean_f0, local_jitter, local_shimmer, noise_to_harmonics

Classifiers

In [None]:
def statistical_probability(x, m, s):
  """Calculates the probability that an object belongs to each category

  Args:
    x: Measured attribute values of one object, indexed like `attrs`.
    m: Per-category lists of attribute means (m[i][j]: category i, attribute j).
    s: Per-category lists of attribute spreads, laid out like `m`.

  Returns:
    pandas Series indexed by `categories`; the scores are scaled to sum to 1.
  """

  # Normal density of every attribute value under every category's distribution
  densities = [
    [norm.pdf(x[j], m[i][j], s[i][j]) for j in range(attr_num)]
    for i in range(cat_num)
  ]
  table = pd.DataFrame(densities, index=categories, columns=attrs)

  # Scale each attribute column so that it sums to 1 over the categories
  table = table / table.sum(axis=0)

  # Add up the scaled attribute scores of each category ...
  scores = table.sum(axis=1)

  # ... and scale those totals so that they sum to 1 as well
  return scores / scores.sum(axis=0)

In [None]:
# Reload the fitted classifiers written by the training cells in Part 1.
# `rand` is the Random Forest model (the training cell names the same model `forest`).
# NOTE(review): joblib.load unpickles arbitrary objects - only load model files you created yourself.
logreg = joblib.load(wav_pickle + "model_LogisticRegression.pkl")
rand = joblib.load(wav_pickle + "model_RandomForest.pkl")
svm = joblib.load(wav_pickle + "model_SVM.pkl")

In [None]:
def logreg_probability(wav_dir):
  """Returns the probability that an object belongs to each category using Logistic Regression

  The silence-trimmed file is turned into per-frame MFCC vectors (coefficient 0 dropped).
  Ten times, a random tenth of the frames is averaged into one feature vector and classified
  with the global `logreg` model; the share of votes each label received is returned.

  Two alternatives were tried earlier and dropped in favour of this sampling scheme:
  classifying every frame separately, and classifying the mean of 7 consecutive slices.

  Returns:
    pandas Series mapping each label that received a vote to votes / 10.
  """

  n = 10  # number of random draws; each draw averages 1/n of the frames
  signal = remove_silence(wav_dir)
  frames = pd.DataFrame(librosa.feature.mfcc(y=signal, sr=sr))[1:].T

  # One prediction per draw, made on the mean of the sampled frames
  votes = [
    logreg.predict(pd.DataFrame(frames.sample(frac=1/n).mean(axis=0)).T)
    for _ in range(n)
  ]

  shares = pd.DataFrame(votes).value_counts() / n
  # Modify index from ('20F', ) to '20F'
  shares.index = [key[0] for key in shares.index]

  return shares


logreg_probability(sample_dir)

50F    0.4
20M    0.3
20F    0.2
40F    0.1
dtype: float64

In [None]:
def rand_probability(wav_dir):
  """Returns the probability that an object belongs to each category using Random Forest

  The silence-trimmed file is turned into per-frame MFCC vectors (coefficient 0 dropped).
  Ten times, a random tenth of the frames is averaged into one feature vector and classified
  with the global `rand` model; the share of votes each label received is returned.

  Two alternatives were tried earlier and dropped in favour of this sampling scheme:
  classifying every frame separately, and classifying the mean of 7 consecutive slices.

  Returns:
    pandas Series mapping each label that received a vote to votes / 10.
  """

  n = 10  # number of random draws; each draw averages 1/n of the frames
  signal = remove_silence(wav_dir)
  frames = pd.DataFrame(librosa.feature.mfcc(y=signal, sr=sr))[1:].T

  # One prediction per draw, made on the mean of the sampled frames
  votes = [
    rand.predict(pd.DataFrame(frames.sample(frac=1/n).mean(axis=0)).T)
    for _ in range(n)
  ]

  shares = pd.DataFrame(votes).value_counts() / n
  # Modify index from ('20F', ) to '20F'
  shares.index = [key[0] for key in shares.index]

  return shares


rand_probability(sample_dir)

20F    0.6
50F    0.3
40F    0.1
dtype: float64

In [None]:
def svm_probability(wav_dir):
  """Returns the probability that an object belongs to each category using SVM

  The silence-trimmed file is turned into per-frame MFCC vectors (coefficient 0 dropped).
  Ten times, a random tenth of the frames is averaged into one feature vector and classified
  with the global `svm` model; the share of votes each label received is returned.

  Two alternatives were tried earlier and dropped in favour of this sampling scheme:
  classifying every frame separately, and classifying the mean of 7 consecutive slices.

  Returns:
    pandas Series mapping each label that received a vote to votes / 10.
  """

  # 3. Extract MFCC vectors randomly from wav file at a constant rate
  y = remove_silence(wav_dir)
  mfcc = pd.DataFrame(librosa.feature.mfcc(y=y, sr=sr))[1:].T
  n = 10
  result = []
  for i in range(n):
    sample = pd.DataFrame(mfcc.sample(frac=1/n).mean(axis=0)).T
    # Vote with the SVM model; this line used to call rand.predict, so the function
    # silently returned Random Forest votes under the SVM name.
    result.append(svm.predict(sample))

  result = pd.DataFrame(result).value_counts() / n
  # Modify index from ('20F', ) to '20F'
  result.index = [idx[0] for idx in result.index]

  return result


svm_probability(sample_dir)

20F    0.4
40F    0.3
50M    0.1
50F    0.1
20M    0.1
dtype: float64

In [None]:
def table_to_category(df):
  """Picks the winning category out of a table of per-category scores.

  Args:
    df: pandas Series whose index holds category labels and whose values are their scores.

  Returns:
    Two-element list: [label with the highest score, that highest score].
  """

  best_label = df.idxmax()
  best_score = df.max()
  return [best_label, best_score]

# Part 3. Validation

7-1. Classify and Save Results

In [None]:
"""Predict age and gender of a speech before processing"""

# For every .wav file, four classifiers each produce a table of per-category scores
# (statistical method, Logistic Regression, Random Forest, SVM). Every combination listed in
# `ensembles` averages the tables of its members and keeps the best category. The results are
# written per category as predict_<tag>_<category>.pkl, next to predict_actual_<category>.pkl.

pred_dir = wav_pickle + "predict_results_20div/"  # Directory to save prediction results
os.makedirs(pred_dir, exist_ok=True)  # fail now rather than after the whole category was processed

# Result-file tag -> classifiers whose score tables are averaged.
# The tags are part of the pickle file names, so they must stay exactly as they are.
ensembles = {
  "stat": ("stat",),
  "LR": ("LR",),
  "RF": ("RF",),
  "SVM": ("SVM",),
  "LR_stat": ("LR", "stat"),
  "RF_stat": ("RF", "stat"),
  "SVM_stat": ("SVM", "stat"),
  "LR_RF": ("LR", "RF"),
  "LR_SVM": ("LR", "SVM"),
  "RF_SVM": ("RF", "SVM"),
  "LR_RF_stat": ("LR", "RF", "stat"),
  "LR_SVM_stat": ("LR", "SVM", "stat"),
  "RF_SVM_stat": ("RF", "SVM", "stat"),
  "LR_RF_SVM": ("LR", "RF", "SVM"),
  "LR_RF_SVM_stat": ("LR", "RF", "SVM", "stat"),
}

for category in os.listdir(wav_unprocessed):
  if category not in categories:
    continue

  folder_dir = wav_unprocessed + category + "/"  # Directory of a parent folder of .wav files

  # One list of [predicted category, score] per ensemble, plus the true category of each file
  predictions = {tag: [] for tag in ensembles}
  actual_list = []

  for folder in os.listdir(folder_dir):
    file_dir = folder_dir + folder + "/"
    print("[Now Processing] /" + category + "/" + folder)

    for file in os.listdir(file_dir):
      if not file.endswith(".wav"):
        continue

      # Directory of a .wav file
      wav = file_dir + file

      try:
        # Score table of each single classifier
        probs = {
          "stat": statistical_probability(get_features(wav), m_list, s_list),
          "LR": logreg_probability(wav),
          "RF": rand_probability(wav),
          "SVM": svm_probability(wav),
        }

        file_results = {}
        for tag, members in ensembles.items():
          if len(members) == 1:
            combined = probs[members[0]]
          else:
            # The vote tables only list categories that received a vote. Give the others an
            # explicit 0 before adding; a plain `+` turns every category missing from either
            # table into NaN and thereby throws away the score the other classifier gave it.
            combined = sum(probs[name].reindex(categories, fill_value=0) for name in members)
            combined = combined.fillna(0) / len(members)
          file_results[tag] = table_to_category(combined)

      except Exception as exc:
        # Best effort: skip the file, but report which one failed and why
        print("[Error] {} ({})".format(wav, exc))
        continue

      # Store the file only after every ensemble succeeded, so all lists stay row-aligned
      for tag, result in file_results.items():
        predictions[tag].append(result)
      actual_list.append(category)

  for tag, results in predictions.items():
    pd.DataFrame(results).to_pickle(pred_dir + "predict_" + tag + "_" + category + ".pkl")
  pd.DataFrame(actual_list).to_pickle(pred_dir + "predict_actual_" + category + ".pkl")

print("[Finished]")

[Now Processing] /20F/fv01
[Now Processing] /20F/fv02
[Now Processing] /20F/fv07
[Now Processing] /20F/fv04
[Now Processing] /20F/fv03
[Now Processing] /20F/fv10
[Now Processing] /20F/fv05
[Now Processing] /20F/fv08
[Now Processing] /20F/fv11
[Now Processing] /20F/fv12
[Now Processing] /20F/fv06
[Now Processing] /20F/fv09
[Now Processing] /20F/fv20
[Now Processing] /20F/fv15
[Now Processing] /20F/fv14
[Now Processing] /20F/fv18
[Now Processing] /20F/fv17
[Now Processing] /20F/fv19
[Now Processing] /20F/fv13
[Now Processing] /20F/fv16
[Now Processing] /20M/mv01
[Now Processing] /20M/mv02
[Now Processing] /20M/mv07
[Now Processing] /20M/mv11
[Now Processing] /20M/mv03
[Now Processing] /20M/mv08
[Now Processing] /20M/mv10
[Now Processing] /20M/mv04
[Now Processing] /20M/mv12
[Now Processing] /20M/mv06
[Now Processing] /20M/mv05
[Now Processing] /20M/mv09
[Now Processing] /20M/mv19
[Now Processing] /20M/mv14
[Now Processing] /20M/mv15
[Now Processing] /20M/mv13
[Now Processing] /20M/mv16
[

Load prediction data

In [None]:
pred_dir = wav_pickle + "predict_results_20div/"


def _load_predictions(tag):
  """Stack the predicted labels of one ensemble over all categories.

  Reads predict_<tag>_<category>.pkl for each entry of `categories`, in that fixed order, and
  keeps the first column (the label). Because every tag, including "actual", is read in the
  same category order, row i of any two results refers to the same file. Iterating over
  os.listdir() does not give that guarantee, since its ordering is arbitrary.

  Args:
    tag: Ensemble name as used in the file names, e.g. "LR_stat" or "actual".

  Returns:
    One-column DataFrame of labels; an empty DataFrame when no file for `tag` exists.
  """
  parts = []
  for category in categories:
    path = pred_dir + "predict_" + tag + "_" + category + ".pkl"
    if not os.path.isfile(path):
      continue
    frame = pd.read_pickle(path)
    if frame.shape[1] == 0:  # category for which no file could be processed
      continue
    # Select by position: works whether the label column is named 0 or '0'
    parts.append(frame.iloc[:, 0])

  if not parts:
    return pd.DataFrame([])
  return pd.concat(parts).to_frame()


stat_predict = _load_predictions("stat")
lr_predict = _load_predictions("LR")
rf_predict = _load_predictions("RF")
svm_predict = _load_predictions("SVM")
lr_stat_predict = _load_predictions("LR_stat")
rf_stat_predict = _load_predictions("RF_stat")
svm_stat_predict = _load_predictions("SVM_stat")
lr_rf_predict = _load_predictions("LR_RF")
lr_svm_predict = _load_predictions("LR_SVM")
rf_svm_predict = _load_predictions("RF_SVM")
lr_rf_stat_predict = _load_predictions("LR_RF_stat")
lr_svm_stat_predict = _load_predictions("LR_SVM_stat")
rf_svm_stat_predict = _load_predictions("RF_SVM_stat")
lr_rf_svm_predict = _load_predictions("LR_RF_SVM")
lr_rf_svm_stat_predict = _load_predictions("LR_RF_SVM_stat")
actual_predict = _load_predictions("actual")

Classification Performance of Statistical Method

In [None]:
# Statistical method on its own. Rows of the matrix are true categories, columns are predictions;
# target_names labels the report rows with `categories`, which is listed in sorted label order.
print(confusion_matrix(actual_predict, stat_predict), "\n")
print(classification_report(actual_predict, stat_predict, target_names = categories, digits=4))

[[82668     0    18 14988 12894   972]
 [   42 42636 26022  2418   348 27132]
 [    6 44454 23490  1596   228 36168]
 [79812     0     0 15792 15360   570]
 [28860     6   270 12270  3096  1686]
 [  180 19254  8340  6564   870 13254]] 

              precision    recall  f1-score   support

         20F     0.4315    0.7412    0.5455    111540
         20M     0.4009    0.4324    0.4161     98598
         30M     0.4040    0.2217    0.2863    105942
         40F     0.2945    0.1416    0.1912    111534
         50F     0.0944    0.0670    0.0784     46188
         50M     0.1661    0.2735    0.2067     48462

    accuracy                         0.3464    522264
   macro avg     0.2986    0.3129    0.2874    522264
weighted avg     0.3365    0.3464    0.3201    522264



Comparison of each model's performance before and after combining it with the statistical method


Logistic Regression

In [None]:
# Logistic Regression alone: the baseline for the LR + statistical method comparison in the next cell
print(confusion_matrix(actual_predict, lr_predict), "\n")
print(classification_report(actual_predict, lr_predict, target_names = categories, digits=4))

[[99150  1434   666  7284  2250   756]
 [ 1902 75240 14688   600   264  5904]
 [  270 11256 84948   108   324  9036]
 [11958  1842   936 85770  9234  1794]
 [ 7230   624  1596 20514 14604  1620]
 [ 1284  8190  9300  1896   798 26994]] 

              precision    recall  f1-score   support

         20F     0.8141    0.8889    0.8499    111540
         20M     0.7632    0.7631    0.7631     98598
         30M     0.7576    0.8018    0.7791    105942
         40F     0.7383    0.7690    0.7533    111534
         50F     0.5316    0.3162    0.3965     46188
         50M     0.5855    0.5570    0.5709     48462

    accuracy                         0.7404    522264
   macro avg     0.6984    0.6827    0.6855    522264
weighted avg     0.7306    0.7404    0.7325    522264



In [None]:
# Logistic Regression averaged with the statistical method
print(confusion_matrix(actual_predict, lr_stat_predict), "\n")
print(classification_report(actual_predict, lr_stat_predict, target_names = categories, digits=4))

[[101622    786    282   6660   1872    318]
 [  1782  76476  13782    558    240   5760]
 [   252  12174  84450     96    252   8718]
 [ 14310   1302    558  86082   8364    918]
 [  8502    456   1296  21114  13950    870]
 [  1368   8652   9384   2070    870  26118]] 

              precision    recall  f1-score   support

         20F     0.7949    0.9111    0.8491    111540
         20M     0.7659    0.7756    0.7708     98598
         30M     0.7695    0.7971    0.7831    105942
         40F     0.7384    0.7718    0.7547    111534
         50F     0.5460    0.3020    0.3889     46188
         50M     0.6116    0.5389    0.5730     48462

    accuracy                         0.7443    522264
   macro avg     0.7044    0.6828    0.6866    522264
weighted avg     0.7332    0.7443    0.7344    522264



Random Forest

In [None]:
# Random Forest alone: the baseline for the RF + statistical method comparison in the next cell
print(confusion_matrix(actual_predict, rf_predict), "\n")
print(classification_report(actual_predict, rf_predict, target_names = categories, digits=4))

[[104778    978    654   4518    438    174]
 [   342  92388   3786    186     42   1854]
 [    60   1206 102252    120     24   2280]
 [  3432    606    882 105756    552    306]
 [  3642    354    342  11208  30246    396]
 [   498    810   2124    480    210  44340]] 

              precision    recall  f1-score   support

         20F     0.9293    0.9394    0.9343    111540
         20M     0.9590    0.9370    0.9479     98598
         30M     0.9292    0.9652    0.9469    105942
         40F     0.8650    0.9482    0.9047    111534
         50F     0.9598    0.6548    0.7785     46188
         50M     0.8985    0.9149    0.9066     48462

    accuracy                         0.9186    522264
   macro avg     0.9235    0.8933    0.9031    522264
weighted avg     0.9210    0.9186    0.9167    522264



In [None]:
# Random Forest averaged with the statistical method
print(confusion_matrix(actual_predict, rf_stat_predict), "\n")
print(classification_report(actual_predict, rf_stat_predict, target_names = categories, digits=4))

[[106386    432    282   4068    306     66]
 [   318  92886   3384    150     54   1806]
 [    54   1428 102204    126     12   2118]
 [  4296    360    540 105804    432    102]
 [  4680    180    228  11766  29100    234]
 [   558    798   1980    660    294  44172]] 

              precision    recall  f1-score   support

         20F     0.9148    0.9538    0.9339    111540
         20M     0.9667    0.9421    0.9542     98598
         30M     0.9409    0.9647    0.9527    105942
         40F     0.8632    0.9486    0.9039    111534
         50F     0.9636    0.6300    0.7619     46188
         50M     0.9108    0.9115    0.9111     48462

    accuracy                         0.9201    522264
   macro avg     0.9267    0.8918    0.9030    522264
weighted avg     0.9228    0.9201    0.9178    522264



SVM

In [None]:
# SVM alone: the baseline for the SVM + statistical method comparison in the next cell.
# NOTE(review): the stored report is almost identical to the Random Forest one above - verify that
# the predictions behind svm_predict were really produced by the SVM model.
print(confusion_matrix(actual_predict, svm_predict), "\n")
print(classification_report(actual_predict, svm_predict, target_names = categories, digits=4))

[[104736    942    564   4716    390    192]
 [   366  92526   3612    204     60   1830]
 [    90   1248 102096     90     60   2358]
 [  3450    582    918 105606    654    324]
 [  3726    336    300  11076  30276    474]
 [   528    666   2070    468    168  44562]] 

              precision    recall  f1-score   support

         20F     0.9277    0.9390    0.9333    111540
         20M     0.9608    0.9384    0.9495     98598
         30M     0.9319    0.9637    0.9475    105942
         40F     0.8645    0.9469    0.9038    111534
         50F     0.9579    0.6555    0.7783     46188
         50M     0.8959    0.9195    0.9076     48462

    accuracy                         0.9187    522264
   macro avg     0.9231    0.8938    0.9033    522264
weighted avg     0.9210    0.9187    0.9169    522264



In [None]:
# SVM averaged with the statistical method.
# NOTE(review): the stored output shows 0.43 accuracy with large off-diagonal counts, far below every
# other combination - check that the rows of svm_stat_predict and actual_predict refer to the same files.
print(confusion_matrix(actual_predict, svm_stat_predict), "\n")
print(classification_report(actual_predict, svm_stat_predict, target_names = categories, digits=4))

[[60599  1127  2121  3105   410 44178]
 [46414 47524  1779  1797   148   936]
 [  224 46420 57069   123    51  2055]
 [ 2791   826 47059 59479   302  1077]
 [ 1716   109   152 43952   201    58]
 [ 4602   210   174 14028 29242   206]] 

              precision    recall  f1-score   support

         20F     0.5209    0.5433    0.5318    111540
         20M     0.4939    0.4820    0.4879     98598
         30M     0.5267    0.5387    0.5326    105942
         40F     0.4856    0.5333    0.5083    111534
         50F     0.0066    0.0044    0.0053     46188
         50M     0.0042    0.0043    0.0042     48462

    accuracy                         0.4310    522264
   macro avg     0.3397    0.3510    0.3450    522264
weighted avg     0.4160    0.4310    0.4232    522264



Logistic Regression + Random Forest

In [None]:
# Logistic Regression + Random Forest, without the statistical method
print(confusion_matrix(actual_predict, lr_rf_predict), "\n")
print(classification_report(actual_predict, lr_rf_predict, target_names = categories, digits=4))

[[106182    870    420   3462    378    228]
 [   990  90828   4518    312    126   1824]
 [   180   4188  98742     48     48   2736]
 [  7212   1194    960 100002   1614    552]
 [  5928    420    762  15834  22710    534]
 [   918   3252   5298    948    252  37794]] 

              precision    recall  f1-score   support

         20F     0.8746    0.9520    0.9116    111540
         20M     0.9015    0.9212    0.9112     98598
         30M     0.8920    0.9320    0.9116    105942
         40F     0.8292    0.8966    0.8616    111534
         50F     0.9038    0.4917    0.6369     46188
         50M     0.8655    0.7799    0.8204     48462

    accuracy                         0.8736    522264
   macro avg     0.8777    0.8289    0.8422    522264
weighted avg     0.8752    0.8736    0.8681    522264



In [None]:
# Logistic Regression + Random Forest + Stat:
# confusion matrix first, then precision/recall/F1 per category to 4 digits.
print(
    confusion_matrix(actual_predict, lr_rf_stat_predict),
    "\n",
)
print(
    classification_report(
        actual_predict,
        lr_rf_stat_predict,
        target_names=categories,
        digits=4,
    )
)

[[106494    666    300   3546    402    132]
 [  1110  90168   4836    306    144   2034]
 [   144   3990  98688     54     48   3018]
 [  7284    948    732 100416   1758    396]
 [  6288    276    528  15870  22812    414]
 [   948   3012   4950   1074    258  38220]] 

              precision    recall  f1-score   support

         20F     0.8710    0.9548    0.9110    111540
         20M     0.9102    0.9145    0.9124     98598
         30M     0.8969    0.9315    0.9139    105942
         40F     0.8281    0.9003    0.8627    111534
         50F     0.8973    0.4939    0.6371     46188
         50M     0.8644    0.7887    0.8248     48462

    accuracy                         0.8746    522264
   macro avg     0.8780    0.8306    0.8436    522264
weighted avg     0.8762    0.8746    0.8693    522264



Logistic Regression + SVM

In [None]:
# Logistic Regression + SVM ensemble (without Stat):
# confusion matrix first, then precision/recall/F1 per category to 4 digits.
print(
    confusion_matrix(actual_predict, lr_svm_predict),
    "\n",
)
print(
    classification_report(
        actual_predict,
        lr_svm_predict,
        target_names=categories,
        digits=4,
    )
)

[[106212    894    390   3450    414    180]
 [   924  91164   4296    360    102   1752]
 [   114   4146  98880     72     72   2658]
 [  7296   1236    966  99972   1578    486]
 [  5934    384    714  16128  22518    510]
 [   954   3150   5214    942    270  37932]] 

              precision    recall  f1-score   support

         20F     0.8746    0.9522    0.9118    111540
         20M     0.9028    0.9246    0.9136     98598
         30M     0.8952    0.9333    0.9139    105942
         40F     0.8267    0.8963    0.8601    111534
         50F     0.9024    0.4875    0.6330     46188
         50M     0.8716    0.7827    0.8248     48462

    accuracy                         0.8744    522264
   macro avg     0.8789    0.8295    0.8429    522264
weighted avg     0.8761    0.8744    0.8688    522264



In [None]:
# Logistic Regression + SVM + Stat:
# confusion matrix first, then precision/recall/F1 per category to 4 digits.
print(
    confusion_matrix(actual_predict, lr_svm_stat_predict),
    "\n",
)
print(
    classification_report(
        actual_predict,
        lr_svm_stat_predict,
        target_names=categories,
        digits=4,
    )
)

[[106512    666    258   3588    402    114]
 [   996  90570   4524    384    108   2016]
 [   102   4014  98796     66     72   2892]
 [  7500    942    630 100392   1656    414]
 [  6324    276    522  16182  22470    414]
 [  1014   3012   4938   1008    294  38196]] 

              precision    recall  f1-score   support

         20F     0.8699    0.9549    0.9104    111540
         20M     0.9104    0.9186    0.9145     98598
         30M     0.9009    0.9325    0.9164    105942
         40F     0.8255    0.9001    0.8612    111534
         50F     0.8987    0.4865    0.6313     46188
         50M     0.8672    0.7882    0.8258     48462

    accuracy                         0.8749    522264
   macro avg     0.8788    0.8301    0.8433    522264
weighted avg     0.8766    0.8749    0.8693    522264



Random Forest + SVM

In [None]:
# Random Forest + SVM ensemble (without Stat):
# confusion matrix first, then precision/recall/F1 per category to 4 digits.
print(
    confusion_matrix(actual_predict, rf_svm_predict),
    "\n",
)
print(
    classification_report(
        actual_predict,
        rf_svm_predict,
        target_names=categories,
        digits=4,
    )
)

[[106482    714    414   3636    210     84]
 [   390  93924   2748    114     36   1386]
 [    54   1140 102960     78     18   1692]
 [  3726    612    834 105912    252    198]
 [  4002    330    318  11532  29730    276]
 [   552    684   2112    504    150  44460]] 

              precision    recall  f1-score   support

         20F     0.9243    0.9547    0.9392    111540
         20M     0.9643    0.9526    0.9584     98598
         30M     0.9413    0.9719    0.9563    105942
         40F     0.8697    0.9496    0.9079    111534
         50F     0.9781    0.6437    0.7764     46188
         50M     0.9244    0.9174    0.9209     48462

    accuracy                         0.9257    522264
   macro avg     0.9337    0.8983    0.9099    522264
weighted avg     0.9284    0.9257    0.9235    522264



In [None]:
# Random Forest + SVM + Stat:
# confusion matrix first, then precision/recall/F1 per category to 4 digits.
print(
    confusion_matrix(actual_predict, rf_svm_stat_predict),
    "\n",
)
print(
    classification_report(
        actual_predict,
        rf_svm_stat_predict,
        target_names=categories,
        digits=4,
    )
)

[[106758    504    294   3726    216     42]
 [   270  93738   2880    126     42   1542]
 [    36   1068 102972     72     18   1776]
 [  3756    438    606 106320    270    144]
 [  4206    216    234  11586  29730    216]
 [   558    570   1926    522    180  44706]] 

              precision    recall  f1-score   support

         20F     0.9236    0.9571    0.9401    111540
         20M     0.9710    0.9507    0.9608     98598
         30M     0.9455    0.9720    0.9585    105942
         40F     0.8690    0.9533    0.9092    111534
         50F     0.9762    0.6437    0.7758     46188
         50M     0.9232    0.9225    0.9228     48462

    accuracy                         0.9272    522264
   macro avg     0.9347    0.8999    0.9112    522264
weighted avg     0.9299    0.9272    0.9250    522264



Logistic Regression + Random Forest + SVM

In [None]:
# Logistic Regression + Random Forest + SVM ensemble (without Stat):
# confusion matrix first, then precision/recall/F1 per category to 4 digits.
# NOTE(review): the output recorded for this cell reports a total support of 87044,
# whereas the earlier evaluation cells report 522264; actual_predict appears to
# have had a different length when this cell ran — re-run from a fresh kernel to confirm.
print(
    confusion_matrix(actual_predict, lr_rf_svm_predict),
    "\n",
)
print(
    classification_report(
        actual_predict,
        lr_rf_svm_predict,
        target_names=categories,
        digits=4,
    )
)

[[17832   111    49   542    39    17]
 [  167 15398   554    40    17   257]
 [   17   321 16956    10     6   347]
 [  942   145   129 17175   140    58]
 [  931    43    67  2390  4212    55]
 [  148   289   601   144    34  6861]] 

              precision    recall  f1-score   support

         20F     0.8900    0.9592    0.9233     18590
         20M     0.9443    0.9370    0.9406     16433
         30M     0.9237    0.9603    0.9417     17657
         40F     0.8460    0.9239    0.8833     18589
         50F     0.9469    0.5472    0.6936      7698
         50M     0.9034    0.8494    0.8756      8077

    accuracy                         0.9011     87044
   macro avg     0.9090    0.8628    0.8763     87044
weighted avg     0.9040    0.9011    0.8970     87044



In [None]:
# Logistic Regression + Random Forest + SVM + Stat:
# confusion matrix first, then precision/recall/F1 per category to 4 digits.
# NOTE(review): the output recorded for this cell is identical, digit for digit, to
# the output recorded for the LR+RF+SVM cell without Stat; confirm that
# lr_rf_svm_stat_predict is not simply the same predictions as lr_rf_svm_predict.
print(
    confusion_matrix(actual_predict, lr_rf_svm_stat_predict),
    "\n",
)
print(
    classification_report(
        actual_predict,
        lr_rf_svm_stat_predict,
        target_names=categories,
        digits=4,
    )
)

[[17832   111    49   542    39    17]
 [  167 15398   554    40    17   257]
 [   17   321 16956    10     6   347]
 [  942   145   129 17175   140    58]
 [  931    43    67  2390  4212    55]
 [  148   289   601   144    34  6861]] 

              precision    recall  f1-score   support

         20F     0.8900    0.9592    0.9233     18590
         20M     0.9443    0.9370    0.9406     16433
         30M     0.9237    0.9603    0.9417     17657
         40F     0.8460    0.9239    0.8833     18589
         50F     0.9469    0.5472    0.6936      7698
         50M     0.9034    0.8494    0.8756      8077

    accuracy                         0.9011     87044
   macro avg     0.9090    0.8628    0.8763     87044
weighted avg     0.9040    0.9011    0.8970     87044



Classification Performance Comparison (Accuracy)  
*   Stat(0.3464)  
*   LR(0.7404) < LR+Stat(0.7443) (+0.0039)  
*   RF(0.9186) < RF+Stat(0.9201) (+0.0015)  
*   SVM(0.9187) > SVM+Stat(0.4310) (-0.4877)  
*   LR+RF(0.8736) < LR+RF+Stat(0.8746) (+0.0010)  
*   LR+SVM(0.8744) < LR+SVM+Stat(0.8749) (+0.0005)  
*   RF+SVM(0.9257) < RF+SVM+Stat(0.9272) (+0.0015)  
*   LR+RF+SVM(0.9011) = LR+RF+SVM+Stat(0.9011) (+-0.0000)  

SVM+Stat : 0.9187 -> 0.4310 (-0.4877)  
LR+RF+SVM+Stat : 0.9011 -> 0.9011 (+-0.0000)  
The accuracy of all other models improved slightly after combining them with the statistical method.

# Hypothesis Test

H0: mu_bef >= mu_aft  
HA: mu_bef < mu_aft  
a = 0.05  

i) Paired t-test for all models  
*   n = 7
*   m1 = 0.8789, m2 = 0.8105
*   Var1 = 0.004179, Var2 = 0.03175
*   P(T<=t) = 0.1825 > a  
Fail to reject H0: there is no statistically significant evidence that combining the statistical method improves classification accuracy.

ii) Paired t-test for all models except outliers(SVM, SVM+Stat)
*   n = 6
*   m1 = 0.8723, m2 = 0.8737
*   Var1 = 0.004646, Var2 = 0.004503
*   P(T<=t) = 0.02633 < a  
Reject H0: with the SVM pair excluded, combining the statistical method yields a statistically significant improvement in classification accuracy.  