## Sample notebook for seminar project ML for SSP

Pay attention to name of variables in PATH

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from IPython.display import Markdown, display

# Scalers
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

# Classifiers
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline

# Metrics
from sklearn import metrics

# Cross validation
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import KFold

np.random.seed(123)

You will probably need to install aif360 package, so refer to this link: https://github.com/Trusted-AI/AIF360

Or this: https://jakevdp.github.io/blog/2017/12/05/installing-python-packages-from-jupyter/

Use this: https://anaconda.org/conda-forge/aif360 or https://github.com/conda-forge/aif360-feedstock if previous doesn't work



In [3]:
# Datapathes

TRAIN_DATAPATH = r'D:\Uni\SS 2022\Seminar ML for SSP\Debiasing_Clinical_Data\Data\train\functionals_train_all.csv'
TEST_DATAPATH = r'D:\Uni\SS 2022\Seminar ML for SSP\Debiasing_Clinical_Data\Data\test\functionals_test_all.csv'

In [4]:
# reading data

train = pd.read_csv(TRAIN_DATAPATH)
test = pd.read_csv(TEST_DATAPATH)

In [5]:
train.head()

Unnamed: 0.1,Unnamed: 0,id,age,gender,mmse,diagnosis,F0semitoneFrom27.5Hz_sma3nz_amean,F0semitoneFrom27.5Hz_sma3nz_stddevNorm,F0semitoneFrom27.5Hz_sma3nz_percentile20.0,F0semitoneFrom27.5Hz_sma3nz_percentile50.0,...,slopeUV0-500_sma3nz_amean,slopeUV500-1500_sma3nz_amean,spectralFluxUV_sma3nz_amean,loudnessPeaksPerSec,VoicedSegmentsPerSec,MeanVoicedSegmentLengthSec,StddevVoicedSegmentLengthSec,MeanUnvoicedSegmentLength,StddevUnvoicedSegmentLength,equivalentSoundLevel_dBp
0,0,S001,74,male,,0,26.158813,0.256494,23.50483,25.390327,...,0.033982,-0.007878,0.025659,2.082161,1.610542,0.398252,0.39551,0.250614,0.467176,-29.529222
1,1,S002,62,female,30.0,0,35.0381,0.222455,32.09415,33.33882,...,0.029004,-0.006766,0.018389,0.105116,0.666316,0.105,0.070366,1.412162,1.798986,-43.936977
2,2,S003,69,female,29.0,0,33.408146,0.167676,30.763927,33.17551,...,0.027397,-0.006148,0.020994,2.31448,1.375416,0.455579,0.40864,0.291481,0.738442,-25.290459
3,3,S004,71,female,30.0,0,31.709782,0.220399,29.273027,31.521568,...,0.027672,-0.003709,0.038795,2.695713,1.712295,0.367209,0.345697,0.235282,0.462315,-27.811728
4,4,S005,74,female,30.0,0,33.117634,0.210555,29.349773,31.528109,...,0.021428,-0.001211,0.005301,0.405904,1.046154,0.105647,0.085536,0.800909,1.251056,-49.826008


In [6]:
test.head()

#comment: 1 - female, 0 - male

Unnamed: 0.1,Unnamed: 0,F0semitoneFrom27.5Hz_sma3nz_amean,F0semitoneFrom27.5Hz_sma3nz_stddevNorm,F0semitoneFrom27.5Hz_sma3nz_percentile20.0,F0semitoneFrom27.5Hz_sma3nz_percentile50.0,F0semitoneFrom27.5Hz_sma3nz_percentile80.0,F0semitoneFrom27.5Hz_sma3nz_pctlrange0-2,F0semitoneFrom27.5Hz_sma3nz_meanRisingSlope,F0semitoneFrom27.5Hz_sma3nz_stddevRisingSlope,F0semitoneFrom27.5Hz_sma3nz_meanFallingSlope,...,MeanVoicedSegmentLengthSec,StddevVoicedSegmentLengthSec,MeanUnvoicedSegmentLength,StddevUnvoicedSegmentLength,equivalentSoundLevel_dBp,id,age,gender,diagnosis,mmse
0,0,31.396942,0.154061,30.213425,31.718218,33.79687,3.583446,177.98373,132.60632,216.58685,...,0.344048,0.34238,0.302286,0.547483,-22.97572,S160,63,1,0,28
1,1,29.956886,0.118317,28.500713,29.87461,31.048702,2.547989,163.17845,170.15639,146.55635,...,0.140119,0.154052,0.72494,1.355301,-57.48494,S161,55,1,0,29
2,2,19.63384,0.402905,13.825209,14.831374,31.307632,17.482424,209.11266,314.10315,78.85801,...,1.483673,1.388053,0.055946,0.046988,-25.28987,S162,67,1,1,24
3,3,28.083347,0.266278,24.390467,26.107916,29.03663,4.646164,416.172,769.89624,225.06831,...,0.368082,0.375981,0.1516,0.184023,-29.287306,S163,71,0,0,30
4,4,34.36999,0.07803,33.367874,34.61786,35.44266,2.074787,58.16399,31.168308,367.9199,...,0.074,0.034409,5.053333,6.706737,-66.156525,S164,73,1,1,21


In [7]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 108 entries, 0 to 107
Data columns (total 94 columns):
 #   Column                                          Non-Null Count  Dtype  
---  ------                                          --------------  -----  
 0   Unnamed: 0                                      108 non-null    int64  
 1   id                                              108 non-null    object 
 2   age                                             108 non-null    int64  
 3   gender                                          108 non-null    object 
 4   mmse                                            108 non-null    object 
 5   diagnosis                                       108 non-null    int64  
 6   F0semitoneFrom27.5Hz_sma3nz_amean               108 non-null    float64
 7   F0semitoneFrom27.5Hz_sma3nz_stddevNorm          108 non-null    float64
 8   F0semitoneFrom27.5Hz_sma3nz_percentile20.0      108 non-null    float64
 9   F0semitoneFrom27.5Hz_sma3nz_percentile50.0 

In [8]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48 entries, 0 to 47
Data columns (total 94 columns):
 #   Column                                          Non-Null Count  Dtype  
---  ------                                          --------------  -----  
 0   Unnamed: 0                                      48 non-null     int64  
 1   F0semitoneFrom27.5Hz_sma3nz_amean               48 non-null     float64
 2   F0semitoneFrom27.5Hz_sma3nz_stddevNorm          48 non-null     float64
 3   F0semitoneFrom27.5Hz_sma3nz_percentile20.0      48 non-null     float64
 4   F0semitoneFrom27.5Hz_sma3nz_percentile50.0      48 non-null     float64
 5   F0semitoneFrom27.5Hz_sma3nz_percentile80.0      48 non-null     float64
 6   F0semitoneFrom27.5Hz_sma3nz_pctlrange0-2        48 non-null     float64
 7   F0semitoneFrom27.5Hz_sma3nz_meanRisingSlope     48 non-null     float64
 8   F0semitoneFrom27.5Hz_sma3nz_stddevRisingSlope   48 non-null     float64
 9   F0semitoneFrom27.5Hz_sma3nz_meanFallingSlope 

In [9]:
# preprocessing
features = ["F2frequency_sma3nz_amean", 
             "HNRdBACF_sma3nz_amean", 
             "F0semitoneFrom27.5Hz_sma3nz_percentile80.0", 
             "F0semitoneFrom27.5Hz_sma3nz_amean",
             "F1frequency_sma3nz_amean",
             "gender", "diagnosis"]

train['gender'] = train['gender'].replace(['female', 'male'], [1, 0])

train_df = train[features]
test_df = test[features]

In [10]:
# function for calculating accuracy, sensitivity and specificity

def classification_results(y_true, y_pred):
    print(metrics.confusion_matrix(y_true, y_pred))
    tn, fp, fn, tp = metrics.confusion_matrix(y_true, y_pred).ravel()
    specificity = tn / (tn+fp)
    sensitivity = tp / (tp + fn)
    accuracy = metrics.accuracy_score(y_pred=y_pred, y_true=y_true)
    print('Accuracy: ', accuracy)
    print('Sensitivity: ', sensitivity)
    print('Specificity: ', specificity)

In [11]:
# function for evaluating during CV, full train and test datasets

def evaluate_model(model, data_x, data_y, X_test, y_test, k=5):
    '''returns trained classifier'''
    k_fold = KFold(k, shuffle=True, random_state=1)

    predicted_targets = np.array([])
    actual_targets = np.array([])
    
    display(Markdown("#### Results of k-fold CV on training set"))
    for train_ix, test_ix in k_fold.split(data_x):
        train_x, train_y, test_x, test_y = data_x.iloc[train_ix], data_y.iloc[train_ix], data_x.iloc[test_ix], data_y.iloc[test_ix]

        # Fit the classifier
        classifier = model.fit(train_x, train_y)

        # Predict the labels of the test set samples
        predicted_labels_fold = classifier.predict(test_x)

        predicted_targets = np.append(predicted_targets, predicted_labels_fold)
        actual_targets = np.append(actual_targets, test_y)
        
        classification_results(test_y, predicted_labels_fold)
        print('')
        
    display(Markdown("#### Results on the whole training set"))
    classification_results(actual_targets, predicted_targets)
    
    
    display(Markdown("#### Results on test set"))
    classification_results(y_test, classifier.predict(X_test))
    
    return classifier
        
    

### Logistic regression model

In [12]:
Y_train = train_df["gender"]
X_train = train_df[["F0semitoneFrom27.5Hz_sma3nz_percentile80.0", 
                 "F1frequency_sma3nz_amean", 
                 "HNRdBACF_sma3nz_amean", 
                 "F0semitoneFrom27.5Hz_sma3nz_amean"]]

Y_test = test_df["gender"]
X_test = test_df[["F0semitoneFrom27.5Hz_sma3nz_percentile80.0", 
                 "F1frequency_sma3nz_amean", 
                 "HNRdBACF_sma3nz_amean", 
                 "F0semitoneFrom27.5Hz_sma3nz_amean"]]


In [13]:
model_lr = make_pipeline(StandardScaler(), LogisticRegression(solver='liblinear'))

model_lr = evaluate_model(model_lr, X_train, Y_train, X_test, Y_test)

#### Results of k-fold CV on training set

[[ 5  4]
 [ 2 11]]
Accuracy:  0.7272727272727273
Sensitivity:  0.8461538461538461
Specificity:  0.5555555555555556

[[ 7  3]
 [ 1 11]]
Accuracy:  0.8181818181818182
Sensitivity:  0.9166666666666666
Specificity:  0.7

[[ 7  1]
 [ 3 11]]
Accuracy:  0.8181818181818182
Sensitivity:  0.7857142857142857
Specificity:  0.875

[[8 2]
 [3 8]]
Accuracy:  0.7619047619047619
Sensitivity:  0.7272727272727273
Specificity:  0.8

[[8 3]
 [1 9]]
Accuracy:  0.8095238095238095
Sensitivity:  0.9
Specificity:  0.7272727272727273



#### Results on the whole training set

[[35 13]
 [10 50]]
Accuracy:  0.7870370370370371
Sensitivity:  0.8333333333333334
Specificity:  0.7291666666666666


#### Results on test set

[[14  8]
 [ 6 20]]
Accuracy:  0.7083333333333334
Sensitivity:  0.7692307692307693
Specificity:  0.6363636363636364


### Random forest model

In [14]:
Y_train = train_df["gender"]
X_train = train_df[["F0semitoneFrom27.5Hz_sma3nz_percentile80.0", 
                 "F1frequency_sma3nz_amean", 
                 "HNRdBACF_sma3nz_amean", 
                 "F0semitoneFrom27.5Hz_sma3nz_amean"]]

Y_test = test_df["gender"]
X_test = test_df[["F0semitoneFrom27.5Hz_sma3nz_percentile80.0", 
                 "F1frequency_sma3nz_amean", 
                 "HNRdBACF_sma3nz_amean", 
                 "F0semitoneFrom27.5Hz_sma3nz_amean"]]

In [15]:
model_rf = make_pipeline(MinMaxScaler(), RandomForestClassifier(n_estimators=500, min_samples_leaf=25))

model_rf = evaluate_model(model_rf, X_train, Y_train, X_test, Y_test)

#### Results of k-fold CV on training set

[[ 5  4]
 [ 0 13]]
Accuracy:  0.8181818181818182
Sensitivity:  1.0
Specificity:  0.5555555555555556

[[ 9  1]
 [ 1 11]]
Accuracy:  0.9090909090909091
Sensitivity:  0.9166666666666666
Specificity:  0.9

[[ 7  1]
 [ 3 11]]
Accuracy:  0.8181818181818182
Sensitivity:  0.7857142857142857
Specificity:  0.875

[[8 2]
 [2 9]]
Accuracy:  0.8095238095238095
Sensitivity:  0.8181818181818182
Specificity:  0.8

[[6 5]
 [1 9]]
Accuracy:  0.7142857142857143
Sensitivity:  0.9
Specificity:  0.5454545454545454



#### Results on the whole training set

[[35 13]
 [ 7 53]]
Accuracy:  0.8148148148148148
Sensitivity:  0.8833333333333333
Specificity:  0.7291666666666666


#### Results on test set

[[15  7]
 [ 2 24]]
Accuracy:  0.8125
Sensitivity:  0.9230769230769231
Specificity:  0.6818181818181818


### kNN model

In [16]:
Y_train = train_df["gender"]
X_train = train_df[["F0semitoneFrom27.5Hz_sma3nz_percentile80.0", 
                 "HNRdBACF_sma3nz_amean", 
                 "F0semitoneFrom27.5Hz_sma3nz_amean"]]

Y_test = test_df["gender"]
X_test = test_df[["F0semitoneFrom27.5Hz_sma3nz_percentile80.0", 
                 "HNRdBACF_sma3nz_amean", 
                 "F0semitoneFrom27.5Hz_sma3nz_amean"]]

In [17]:
model_knn = make_pipeline(MinMaxScaler(),
                      KNeighborsClassifier(n_neighbors=3))

model_knn = evaluate_model(model_knn, X_train, Y_train, X_test, Y_test)

#### Results of k-fold CV on training set

[[ 5  4]
 [ 2 11]]
Accuracy:  0.7272727272727273
Sensitivity:  0.8461538461538461
Specificity:  0.5555555555555556

[[10  0]
 [ 4  8]]
Accuracy:  0.8181818181818182
Sensitivity:  0.6666666666666666
Specificity:  1.0

[[ 4  4]
 [ 3 11]]
Accuracy:  0.6818181818181818
Sensitivity:  0.7857142857142857
Specificity:  0.5

[[7 3]
 [2 9]]
Accuracy:  0.7619047619047619
Sensitivity:  0.8181818181818182
Specificity:  0.7

[[7 4]
 [1 9]]
Accuracy:  0.7619047619047619
Sensitivity:  0.9
Specificity:  0.6363636363636364



#### Results on the whole training set

[[33 15]
 [12 48]]
Accuracy:  0.75
Sensitivity:  0.8
Specificity:  0.6875


#### Results on test set

[[18  4]
 [ 7 19]]
Accuracy:  0.7708333333333334
Sensitivity:  0.7307692307692307
Specificity:  0.8181818181818182


### Radial based SVM

In [18]:
Y_train = train_df["gender"]
X_train = train_df[["F2frequency_sma3nz_amean", 
                    "F0semitoneFrom27.5Hz_sma3nz_percentile80.0", 
                    "F0semitoneFrom27.5Hz_sma3nz_amean"]]

Y_test = test_df["gender"]
X_test = test_df[["F2frequency_sma3nz_amean", 
                    "F0semitoneFrom27.5Hz_sma3nz_percentile80.0", 
                    "F0semitoneFrom27.5Hz_sma3nz_amean"]]

In [19]:
model_svm = make_pipeline(StandardScaler(), SVC(kernel = 'rbf', gamma = 2.0, C = 0.5))

model_svm = evaluate_model(model_svm, X_train, Y_train, X_test, Y_test)

#### Results of k-fold CV on training set

[[ 6  3]
 [ 0 13]]
Accuracy:  0.8636363636363636
Sensitivity:  1.0
Specificity:  0.6666666666666666

[[ 9  1]
 [ 1 11]]
Accuracy:  0.9090909090909091
Sensitivity:  0.9166666666666666
Specificity:  0.9

[[ 5  3]
 [ 2 12]]
Accuracy:  0.7727272727272727
Sensitivity:  0.8571428571428571
Specificity:  0.625

[[9 1]
 [2 9]]
Accuracy:  0.8571428571428571
Sensitivity:  0.8181818181818182
Specificity:  0.9

[[4 7]
 [1 9]]
Accuracy:  0.6190476190476191
Sensitivity:  0.9
Specificity:  0.36363636363636365



#### Results on the whole training set

[[33 15]
 [ 6 54]]
Accuracy:  0.8055555555555556
Sensitivity:  0.9
Specificity:  0.6875


#### Results on test set

[[18  4]
 [ 3 23]]
Accuracy:  0.8541666666666666
Sensitivity:  0.8846153846153846
Specificity:  0.8181818181818182


## Applying de-biasing techniques

### in-processing algorithms

In [24]:
# AIF360 Metrics

from aif360.metrics import *

# AIF360 in-processing algorithms

from aif360.

ImportError: cannot import name 'GridSearchReduction' from 'aif360.algorithms.inprocessing' (C:\Users\ihetman\anaconda3\lib\site-packages\aif360\algorithms\inprocessing\__init__.py)