### Install dependencies

In [None]:
# NOTE(review): the recorded output of this cell ends with
# "ERROR: Cannot uninstall 'PyYAML'", so opensmile may not actually have been
# installed in this environment - confirm before running feature extraction.
!pip install opensmile

Collecting opensmile
  Downloading opensmile-2.4.1-py3-none-any.whl (4.5 MB)
Collecting audinterface>=0.7.0
  Downloading audinterface-0.8.1-py3-none-any.whl (21 kB)
Collecting audobject>=0.6.1
  Downloading audobject-0.6.2-py3-none-any.whl (22 kB)
Collecting audformat<2.0.0,>=0.10.1
  Downloading audformat-0.14.1-py3-none-any.whl (47 kB)
Collecting audresample<2.0.0,>=1.1.0
  Downloading audresample-1.1.0-py3-none-any.whl (635 kB)
Collecting oyaml
  Downloading oyaml-1.0-py2.py3-none-any.whl (3.0 kB)
Collecting audeer>=1.7.0
  Downloading audeer-1.18.0-py3-none-any.whl (20 kB)
Collecting iso-639
  Downloading iso-639-0.4.5.tar.gz (167 kB)
Collecting pyyaml>=5.4.1
  Downloading PyYAML-6.0-cp38-cp38-win_amd64.whl (155 kB)
Collecting audiofile>=0.4.0
  Downloading audiofile-1.0.3-py3-none-any.whl (10 kB)
Collecting pandas!=1.3.0,!=1.3.1,!=1.3.2,!=1.3.3,<1.4.0,>=1.1.5
  Downloading pandas-1.3.5-cp38-cp38-win_amd64.whl (10.2 MB)
Collecting sox
  Downloading sox-1.4.1-py2.py3-none-any.whl (

ERROR: Cannot uninstall 'PyYAML'. It is a distutils installed project and thus we cannot accurately determine which files belong to it which would lead to only a partial uninstall.


### File preprocessing functions

In [None]:
import numpy as np 
import pandas as pd 
# import opensmile
import IPython
import os

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import StandardScaler


In [None]:
# Load data

def preprocess(csv_path='C:/Users/junxi/Desktop/ADReSSo21/diagnosis/train/adresso-train-mmse-scores.csv',
               audio_dir='/content/drive/MyDrive/ADReSSo21/diagnosis/train/audio/'):
  """Build the table of audio file paths and binary diagnosis labels.

  Parameters
  ----------
  csv_path : str
      CSV file that holds at least the columns ``adressfname`` and ``dx``.
      Defaults to the location that was previously hard-coded.
  audio_dir : str
      Directory containing one sub-folder per diagnosis string (``ad``/``cn``)
      with the ``.wav`` files inside. Defaults to the previously hard-coded
      prefix. A trailing slash is optional.

  Returns
  -------
  pandas.DataFrame
      Two columns: ``path`` (``<audio_dir>/<dx>/<adressfname>.wav``) and
      ``dx`` with ``'ad'`` replaced by 1 and ``'cn'`` replaced by 0. Any other
      diagnosis value is left untouched.
  """
  df = pd.read_csv(csv_path)

  # NOTE(review): the two defaults point at different machines (a Windows
  # desktop for the CSV, a Colab drive for the audio); pass both explicitly
  # when running anywhere else.
  prefix = audio_dir.rstrip('/') + '/'

  # The path is built from the *string* diagnosis, so it has to be computed
  # before the labels are turned into integers.
  paths = prefix + df['dx'].astype(str) + '/' + df['adressfname'].astype(str) + '.wav'

  # AD = 1 (Have alzheimer's), CN = 0 (No alzheimer's)
  labels = df['dx'].replace({'ad': 1, 'cn': 0})

  return pd.DataFrame({'path': paths, 'dx': labels})


### Functions to extract features

In [None]:
# Function to extract features

def feature_extraction(df):
  """Extract eGeMAPSv02 functionals for every audio file listed in ``df``.

  Parameters
  ----------
  df : pandas.DataFrame
      Must contain a ``path`` column (audio file to process) and a ``dx``
      column (label for that file).

  Returns
  -------
  pandas.DataFrame
      One row per input row, in input order, holding the openSMILE feature
      columns followed by ``file`` (the audio path) and ``label`` (the ``dx``
      value of the same row). An empty input yields an empty frame with the
      same columns.
  """
  # Imported locally because the notebook-level ``import opensmile`` is
  # commented out; without this the function fails with a NameError.
  import opensmile

  # Loading opensmile and using eGeMAPS feature set
  smile = opensmile.Smile(
      feature_set=opensmile.FeatureSet.eGeMAPSv02,
      feature_level=opensmile.FeatureLevel.Functionals,
  )

  output_columns = list(smile.feature_names) + ['file', 'label']

  # Collect one single-row frame per file and concatenate once at the end:
  # DataFrame.append no longer exists in pandas >= 2.0 and re-copied the whole
  # frame on every iteration.
  per_file = []
  for i, audio_path in enumerate(df['path']):
    # Keep only the first row, as a frame, so dtypes survive the concat
    per_file.append(smile.process_file(audio_path).iloc[[0]])
    print(f'Row {i} done.' )

  if not per_file:
    return pd.DataFrame(columns=output_columns)

  features_df = pd.concat(per_file, ignore_index=True)

  # Append filename and labels positionally, so the result does not depend on
  # df having a 0..n-1 index or on the paths being unique.
  features_df['file'] = df['path'].to_numpy()
  features_df['label'] = df['dx'].to_numpy()

  return features_df


In [None]:
def get_features_df():
  """Run the whole extraction pipeline and return the labelled feature table.

  Calls ``preprocess()`` for the path/label table, passes it to
  ``feature_extraction`` and casts the resulting ``label`` column to ``int``
  for AD classification.
  """
  labelled_paths = preprocess()
  extracted = feature_extraction(labelled_paths)

  # Labels must be integers for the classifiers
  extracted['label'] = extracted['label'].astype(int)
  return extracted

### Extracting features

In [None]:
# Extract features
# (runs the full pipeline: build the path/label table, then run openSMILE on
# every listed audio file)
features_df = get_features_df()

# Save features csv file
# NOTE(review): this writes to '/content/features_df.csv', but the loading cell
# further down reads 'C:/Users/j/Desktop/ADReSSo21/Audio_Classification/features_df.csv'
# - confirm how the file gets from one location to the other.
features_df.to_csv('/content/features_df.csv', index=False)

### Load features and data preprocessing

In [None]:
# Since we saved the features to a csv file, we can load it

def split_data(features_df):
  """Split the feature table into a standardised matrix and its labels.

  Parameters
  ----------
  features_df : pandas.DataFrame
      Feature table containing a ``file`` column, a ``label`` column and the
      numeric feature columns. The input frame is not modified.

  Returns
  -------
  X2 : numpy.ndarray
      Feature columns after ``StandardScaler().fit_transform``.
  Y : pandas.Series
      The ``label`` column.
  """
  # Split to features and labels
  Y = features_df['label']

  # Use the ``columns=`` keyword: passing the axis positionally
  # (``drop('file', 1)``) was deprecated in pandas 1.3 and removed in 2.0.
  X = features_df.drop(columns=['file', 'label'])

  # Normalize data
  # NOTE(review): the scaler is fitted on every row before cross-validation,
  # so each validation fold contributes to the scaling statistics. Consider
  # moving the scaler into a Pipeline that is fitted per fold.
  scaler = StandardScaler()
  X2 = scaler.fit_transform(X)

  return X2, Y

# Load features
# NOTE(review): absolute, user-specific Windows path, and not the location the
# extraction cell writes to ('/content/features_df.csv') - confirm which file
# is intended.
features_df = pd.read_csv('C:/Users/j/Desktop/ADReSSo21/Audio_Classification/features_df.csv')

# Split data into features / labels
# X_train is the standardised feature matrix, y_train the 'label' column.
X_train, y_train = split_data(features_df)

  features_df = features_df.drop('file', 1)
  X = features_df.drop('label', 1)


### Machine Learning Classifiers

#### Import libraries

In [None]:
# Sklearn for machine learning classifiers
import sklearn
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from sklearn.svm import SVC 
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

# For multi processing
import joblib
import sys
sys.modules['sklearn.externals.joblib'] = joblib
from sklearn.externals.joblib import parallel_backend

#### Decision Tree

Functions that perform a grid search with cross-validation for each classifier


In [None]:
def decision_tree(X_train, y_train, random_state=42):
  """Grid-search the depth of a decision tree with 10-fold cross-validation.

  Parameters
  ----------
  X_train : array-like
      Feature matrix.
  y_train : array-like
      Class labels.
  random_state : int or None, default 42
      Seed passed to ``DecisionTreeClassifier`` so that repeated runs print
      the same scores. Pass ``None`` for the previous unseeded behaviour.

  Prints the best parameters together with the mean and standard deviation
  of the test score of the best candidate. Returns nothing.
  """
  param_grid = {
      'max_depth': np.arange(1,11),
  }

  tree = DecisionTreeClassifier(random_state=random_state)

  # Grid Search for 10 fold cv
  grid = GridSearchCV(tree, param_grid, refit=True, n_jobs=2, cv=10)
  with parallel_backend('threading'):
    # Only the fit needs the joblib backend; reporting happens afterwards
    grid.fit(X_train, y_train)

  # Report which depth won, in the same format log_reg uses
  print('Best Params:', grid.best_params_)
  MS = grid.cv_results_['mean_test_score'][grid.best_index_]
  STD = grid.cv_results_['std_test_score'][grid.best_index_]
  print('Mean Test Score:', MS)
  print('Std Test Score:', STD)

#### Decision Tree Bagger

In [None]:
def decision_tree_bagger(X_train, y_train, random_state=42):
  """Grid-search the tree depth of a bagged decision tree with 10-fold CV.

  Parameters
  ----------
  X_train : array-like
      Feature matrix.
  y_train : array-like
      Class labels.
  random_state : int or None, default 42
      Seed passed to ``BaggingClassifier`` so that repeated runs print the
      same scores. Pass ``None`` for the previous unseeded behaviour.

  Prints the best parameters together with the mean and standard deviation
  of the test score of the best candidate. Returns nothing.
  """
  tree = DecisionTreeClassifier()
  bagger = BaggingClassifier(tree, random_state=random_state)

  # scikit-learn 1.2 renamed BaggingClassifier's ``base_estimator`` to
  # ``estimator`` and 1.4 removed the old name, so derive the nested-parameter
  # prefix from whatever this installation exposes.
  own_params = bagger.get_params(deep=False)
  prefix = 'estimator' if 'estimator' in own_params else 'base_estimator'

  param_grid = {
      f'{prefix}__max_depth' : np.arange(1,11),
  }

  # Grid Search
  grid = GridSearchCV(bagger, param_grid, refit=True, n_jobs=2, cv=10)
  with parallel_backend('threading'):
    grid.fit(X_train, y_train)

  # Report which depth won, in the same format log_reg uses
  print('Best Params:', grid.best_params_)
  MS = grid.cv_results_['mean_test_score'][grid.best_index_]
  STD = grid.cv_results_['std_test_score'][grid.best_index_]
  print('Mean Test Score:', MS)
  print('Std Test Score:', STD)

#### Random Forest

In [None]:
def random_forest(X_train, y_train, random_state=42):
  """Grid-search the depth of a 50-tree random forest with 10-fold CV.

  Parameters
  ----------
  X_train : array-like
      Feature matrix.
  y_train : array-like
      Class labels.
  random_state : int or None, default 42
      Seed passed to ``RandomForestClassifier`` so that repeated runs print
      the same scores. Pass ``None`` for the previous unseeded behaviour.

  Prints the best parameters together with the mean and standard deviation
  of the test score of the best candidate. Returns nothing.
  """
  param_grid = {
      'n_estimators': [50],
      'max_depth' : np.arange(1,11),
  }

  rf = RandomForestClassifier(random_state=random_state)

  # Grid Search
  grid = GridSearchCV(rf, param_grid, refit=True, n_jobs=2, cv=10)
  with parallel_backend('threading'):
    grid.fit(X_train, y_train)

  # Report which depth won, in the same format log_reg uses
  print('Best Params:', grid.best_params_)
  MS = grid.cv_results_['mean_test_score'][grid.best_index_]
  STD = grid.cv_results_['std_test_score'][grid.best_index_]
  print('Mean Test Score:', MS)
  print('Std Test Score:', STD)

#### Random Forest Bagger

In [None]:
def random_forest_bagger(X_train, y_train, random_state=42):
  """Grid-search the depth of a bagged 50-tree random forest with 10-fold CV.

  Parameters
  ----------
  X_train : array-like
      Feature matrix.
  y_train : array-like
      Class labels.
  random_state : int or None, default 42
      Seed passed to ``BaggingClassifier`` so that repeated runs print the
      same scores. Pass ``None`` for the previous unseeded behaviour.

  Prints the best parameters together with the mean and standard deviation
  of the test score of the best candidate. Returns nothing.
  """
  rf = RandomForestClassifier()
  bagger = BaggingClassifier(rf, random_state=random_state)

  # scikit-learn 1.2 renamed BaggingClassifier's ``base_estimator`` to
  # ``estimator`` and 1.4 removed the old name, so derive the nested-parameter
  # prefix from whatever this installation exposes.
  own_params = bagger.get_params(deep=False)
  prefix = 'estimator' if 'estimator' in own_params else 'base_estimator'

  param_grid = {
      f'{prefix}__max_depth' : np.arange(1,11),
      f'{prefix}__n_estimators' : [50] }

  # Grid Search
  grid = GridSearchCV(bagger, param_grid, refit=True, n_jobs=2, cv=10)
  with parallel_backend('threading'):
    grid.fit(X_train, y_train)

  # Report which depth won, in the same format log_reg uses
  print('Best Params:', grid.best_params_)
  MS = grid.cv_results_['mean_test_score'][grid.best_index_]
  STD = grid.cv_results_['std_test_score'][grid.best_index_]
  print('Mean Test Score:', MS)
  print('Std Test Score:', STD)

#### Support Vector 

In [None]:
def support_vector(X_train, y_train):
  """Grid-search C and the kernel of an SVC with 10-fold cross-validation.

  Parameters
  ----------
  X_train : array-like
      Feature matrix.
  y_train : array-like
      Class labels.

  Prints the best parameters together with the mean and standard deviation
  of the test score of the best candidate. Returns nothing.
  """
  param_grid = {
      # 0.5, 0.6, ..., 1.5. linspace + round instead of a float-step arange,
      # which accumulates error and yields values like 0.9999999999999999.
      'C': np.round(np.linspace(0.5, 1.5, 11), 1),
      'kernel': ['linear', 'rbf']
  }

  svc = SVC(max_iter=30000)

  # Grid Search
  grid = GridSearchCV(svc, param_grid, refit=True, n_jobs=2, cv=10)
  with parallel_backend('threading'):
    grid.fit(X_train, y_train)

  # Report which C/kernel pair won, in the same format log_reg uses
  print('Best Params:', grid.best_params_)
  MS = grid.cv_results_['mean_test_score'][grid.best_index_]
  STD = grid.cv_results_['std_test_score'][grid.best_index_]
  print('Mean Test Score:', MS)
  print('Std Test Score:', STD)

#### Logistic Regression

In [None]:
def log_reg(X_train, y_train):
  """Grid-search C of a logistic regression with 10-fold cross-validation.

  Parameters
  ----------
  X_train : array-like
      Feature matrix.
  y_train : array-like
      Class labels.

  Prints the best parameters together with the mean and standard deviation
  of the test score of the best candidate. Returns nothing.
  """
  param_grid = {
      # 0.5, 0.6, ..., 1.5. linspace + round instead of a float-step arange,
      # which accumulates error and reported C as 0.9999999999999999.
      'C': np.round(np.linspace(0.5, 1.5, 11), 1),
  }

  logreg = LogisticRegression(max_iter=10000)

  # Grid Search
  grid = GridSearchCV(logreg, param_grid, refit=True, n_jobs=2, cv=10)
  with parallel_backend('threading'):
    grid.fit(X_train, y_train)

  print('Best Params:', grid.best_params_)
  MS = grid.cv_results_['mean_test_score'][grid.best_index_]
  STD = grid.cv_results_['std_test_score'][grid.best_index_]
  print('Mean Test Score:', MS)
  print('Std Test Score:', STD)

#### Training with 10-fold cross-validation

In [None]:
# Run every classifier search in turn, announcing each one by name first
classifier_runs = [
    ('Decision Tree', decision_tree),
    ('Decision Tree Bagger', decision_tree_bagger),
    ('Random Forest', random_forest),
    ('Random Forest Bagger', random_forest_bagger),
    ('SVM', support_vector),
    ('Logistic Regression', log_reg),
]

for title, run_search in classifier_runs:
  print(title)
  run_search(X_train, y_train)


Decision Tree
Mean Test Score: 0.6323529411764707
Std Test Score: 0.10727485072753966
Decision Tree Bagger
Mean Test Score: 0.68125
Std Test Score: 0.08990530509421034
Random Forest
Mean Test Score: 0.6871323529411765
Std Test Score: 0.08291969501752466
Random Forest Bagger
Mean Test Score: 0.6566176470588235
Std Test Score: 0.1347535556534434
SVM
Mean Test Score: 0.7110294117647059
Std Test Score: 0.08779300915646644
Logistic Regression
Best Params: {'C': 0.9999999999999999}
Mean Test Score: 0.7058823529411764
Std Test Score: 0.09756164318754212


### Neural Network

#### Import libraries

In [None]:
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.metrics import classification_report, confusion_matrix 

from keras.layers import Dense, Dropout, LeakyReLU
from keras.models import Sequential
from keras.wrappers.scikit_learn import KerasClassifier
from tensorflow.keras.optimizers import Adam

import tensorflow as tf
# Seed TensorFlow's and NumPy's global random generators with the same fixed value
tf.random.set_seed(42)
np.random.seed(42)


#### Function to create model

In [None]:
# Define neural network model

def create_model(neuron, leaky_alpha, dropout_rate, learning_rate, input_dim=88):
  """Build and compile the binary AD / non-AD classification network.

  Architecture: three ``Dense(neuron)`` layers, each followed by a
  ``LeakyReLU``; the first two are also followed by ``Dropout``. A single
  sigmoid unit produces the output.

  Parameters
  ----------
  neuron : int
      Number of units in each of the three hidden layers.
  leaky_alpha : float
      Slope parameter passed to every ``LeakyReLU``.
  dropout_rate : float
      Rate passed to both ``Dropout`` layers.
  learning_rate : float
      Learning rate of the Adam optimizer.
  input_dim : int, default 88
      Number of input features. The default is the eGeMAPS feature count
      that was previously hard-coded.

  Returns
  -------
  keras.models.Sequential
      Model compiled with binary cross-entropy loss and an accuracy metric.
  """
  print ('Creating model...')

  model = Sequential()

  # Total of 88 features from eGeMAPS (the default value of input_dim)
  model.add(Dense(neuron, input_dim=input_dim))
  model.add(LeakyReLU(alpha=leaky_alpha))
  model.add(Dropout(dropout_rate))

  model.add(Dense(neuron))
  model.add(LeakyReLU(alpha=leaky_alpha))
  model.add(Dropout(dropout_rate))

  # The last hidden block has no dropout
  model.add(Dense(neuron))
  model.add(LeakyReLU(alpha=leaky_alpha))

  model.add(Dense(1, activation='sigmoid'))

  print ('Compiling...')
  optimizer = Adam(learning_rate=learning_rate)

  # For binary classification AD or non AD
  model.compile(loss='binary_crossentropy',
                optimizer=optimizer,
                metrics=['accuracy'])

  return model

#### Training and tuning model

In [None]:
# 10 fold cross validation
# NOTE(review): shuffle=True without random_state, so the fold assignment
# depends on NumPy's global random state at the time this cell runs. This is
# also a plain KFold, whereas the scikit-learn classifiers above pass cv=10
# (stratified folds for classifiers) - confirm the difference is intended.
cvkf = KFold(n_splits=10, shuffle=True)

# Tune hyper-parameters
# 3 batch sizes x 2 dropout rates x 2 layer widths = 12 candidates; the other
# entries are single values. batch_size/epochs go to fit(), the remaining keys
# are create_model() arguments.
params = {
'batch_size':[1,2,4], 
'epochs':[50],
'leaky_alpha':[0.2],
'dropout_rate':[0.3,0.5],
'learning_rate':[1e-3],
'neuron':[16, 32],
}

# Wrap the Keras builder so GridSearchCV can treat it as a scikit-learn estimator
model = KerasClassifier(build_fn=create_model)

# Grid Search
grid = GridSearchCV(model, 
                param_grid=params,
                n_jobs=2,
                cv=cvkf,
                refit=True)  

# Two fits run concurrently in threads, which is why the training log below
# shows every epoch line twice.
with parallel_backend('threading'):
  grid = grid.fit(X_train, y_train)

# Score of the best candidate, averaged over the 10 folds
print('Mean Test Score:', grid.cv_results_['mean_test_score'][grid.best_index_])
print('Std Test Score:', grid.cv_results_['std_test_score'][grid.best_index_])

# Keep the best model (refit=True retrains it on all of X_train)
best_model = grid.best_estimator_

  model = KerasClassifier(build_fn=create_model)


Creating model...
Creating model...
Compiling...
Compiling...
Epoch 1/50
Epoch 1/50
Epoch 2/50
Epoch 2/50
Epoch 3/50
Epoch 3/50
Epoch 4/50
Epoch 4/50
Epoch 5/50
Epoch 5/50
Epoch 6/50
Epoch 6/50
Epoch 7/50
Epoch 7/50
Epoch 8/50
Epoch 8/50
Epoch 9/50
Epoch 9/50
Epoch 10/50
Epoch 10/50
Epoch 11/50
Epoch 11/50
Epoch 12/50
Epoch 12/50
Epoch 13/50
Epoch 13/50
Epoch 14/50
Epoch 14/50
Epoch 15/50
Epoch 15/50
Epoch 16/50
Epoch 16/50
Epoch 17/50
Epoch 17/50
Epoch 18/50
Epoch 18/50
Epoch 19/50
Epoch 19/50
Epoch 20/50
Epoch 20/50
Epoch 21/50
Epoch 21/50
Epoch 22/50
Epoch 22/50
Epoch 23/50
Epoch 23/50
Epoch 24/50
Epoch 24/50
Epoch 25/50
Epoch 25/50
Epoch 26/50
Epoch 26/50
Epoch 27/50
Epoch 27/50
Epoch 28/50
Epoch 28/50
Epoch 29/50
Epoch 29/50
Epoch 30/50
Epoch 30/50
Epoch 31/50
Epoch 31/50
Epoch 32/50
Epoch 32/50
Epoch 33/50
Epoch 33/50
Epoch 34/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 36/50
Epoch 37/50
Epoch 37/50
Epoch 38/50
Epoch 38/50
Epoch 39/50
Epoch 39/50
Epoch 40/50
Epoch 40/50
Epoch 41

In [None]:
# Show the hyper-parameter combination selected by the grid search
print(grid.best_params_)

In [None]:
# Train on best parameters
#
# Values recorded as best:
#   batch_size=4, epochs=50, leaky_alpha=0.2, dropout_rate=0.5,
#   learning_rate=1e-3, neuron=32
final_model = create_model(
    neuron=32,
    leaky_alpha=0.2,
    dropout_rate=0.5,
    learning_rate=1e-3,
)

# 20% of the rows are held out by Keras for validation during this fit
final_model.fit(
    X_train,
    y_train,
    batch_size=4,
    epochs=50,
    shuffle=True,
    validation_split=0.2,
)

Creating model...
Compiling...
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x25ebf7404c0>

### Save audio model

In [None]:
# Save the final model
# NOTE(review): absolute, user-specific Windows path - adjust before running
# on another machine.

final_model.save('C:/Users/j/Desktop/ADReSSo21/audio_model2')

INFO:tensorflow:Assets written to: C:/Users/j/Desktop/ADReSSo21/audio_model2\assets
