## Sample notebook for seminar project ML for SSP

Pay attention to name of variables in PATH

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from IPython.display import Markdown, display

# Scalers
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

# Classifiers
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline

# Metrics
from sklearn import metrics

# Cross validation
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import KFold

np.random.seed(123)

You will probably need to install aif360 package, so refer to this link: https://github.com/Trusted-AI/AIF360

Or this: https://jakevdp.github.io/blog/2017/12/05/installing-python-packages-from-jupyter/

Use this: https://anaconda.org/conda-forge/aif360 or https://github.com/conda-forge/aif360-feedstock if previous doesn't work


UPD: it worked using only `pip`

UPD: you will probably need to install fairlearn package `pip install fairlearn`

In [2]:
# Datapathes

TRAIN_DATAPATH = r'D:\Uni\SS 2022\Seminar ML for SSP\Debiasing_Clinical_Data\Data\train\functionals_train_all.csv'
TEST_DATAPATH = r'D:\Uni\SS 2022\Seminar ML for SSP\Debiasing_Clinical_Data\Data\test\functionals_test_all.csv'

In [3]:
# reading data

train = pd.read_csv(TRAIN_DATAPATH)
test = pd.read_csv(TEST_DATAPATH)

In [4]:
train.head()

Unnamed: 0.1,Unnamed: 0,id,age,gender,mmse,diagnosis,F0semitoneFrom27.5Hz_sma3nz_amean,F0semitoneFrom27.5Hz_sma3nz_stddevNorm,F0semitoneFrom27.5Hz_sma3nz_percentile20.0,F0semitoneFrom27.5Hz_sma3nz_percentile50.0,...,slopeUV0-500_sma3nz_amean,slopeUV500-1500_sma3nz_amean,spectralFluxUV_sma3nz_amean,loudnessPeaksPerSec,VoicedSegmentsPerSec,MeanVoicedSegmentLengthSec,StddevVoicedSegmentLengthSec,MeanUnvoicedSegmentLength,StddevUnvoicedSegmentLength,equivalentSoundLevel_dBp
0,0,S001,74,male,,0,26.158813,0.256494,23.50483,25.390327,...,0.033982,-0.007878,0.025659,2.082161,1.610542,0.398252,0.39551,0.250614,0.467176,-29.529222
1,1,S002,62,female,30.0,0,35.0381,0.222455,32.09415,33.33882,...,0.029004,-0.006766,0.018389,0.105116,0.666316,0.105,0.070366,1.412162,1.798986,-43.936977
2,2,S003,69,female,29.0,0,33.408146,0.167676,30.763927,33.17551,...,0.027397,-0.006148,0.020994,2.31448,1.375416,0.455579,0.40864,0.291481,0.738442,-25.290459
3,3,S004,71,female,30.0,0,31.709782,0.220399,29.273027,31.521568,...,0.027672,-0.003709,0.038795,2.695713,1.712295,0.367209,0.345697,0.235282,0.462315,-27.811728
4,4,S005,74,female,30.0,0,33.117634,0.210555,29.349773,31.528109,...,0.021428,-0.001211,0.005301,0.405904,1.046154,0.105647,0.085536,0.800909,1.251056,-49.826008


In [5]:
test.head()

#comment: 1 - female, 0 - male

Unnamed: 0.1,Unnamed: 0,F0semitoneFrom27.5Hz_sma3nz_amean,F0semitoneFrom27.5Hz_sma3nz_stddevNorm,F0semitoneFrom27.5Hz_sma3nz_percentile20.0,F0semitoneFrom27.5Hz_sma3nz_percentile50.0,F0semitoneFrom27.5Hz_sma3nz_percentile80.0,F0semitoneFrom27.5Hz_sma3nz_pctlrange0-2,F0semitoneFrom27.5Hz_sma3nz_meanRisingSlope,F0semitoneFrom27.5Hz_sma3nz_stddevRisingSlope,F0semitoneFrom27.5Hz_sma3nz_meanFallingSlope,...,MeanVoicedSegmentLengthSec,StddevVoicedSegmentLengthSec,MeanUnvoicedSegmentLength,StddevUnvoicedSegmentLength,equivalentSoundLevel_dBp,id,age,gender,diagnosis,mmse
0,0,31.396942,0.154061,30.213425,31.718218,33.79687,3.583446,177.98373,132.60632,216.58685,...,0.344048,0.34238,0.302286,0.547483,-22.97572,S160,63,1,0,28
1,1,29.956886,0.118317,28.500713,29.87461,31.048702,2.547989,163.17845,170.15639,146.55635,...,0.140119,0.154052,0.72494,1.355301,-57.48494,S161,55,1,0,29
2,2,19.63384,0.402905,13.825209,14.831374,31.307632,17.482424,209.11266,314.10315,78.85801,...,1.483673,1.388053,0.055946,0.046988,-25.28987,S162,67,1,1,24
3,3,28.083347,0.266278,24.390467,26.107916,29.03663,4.646164,416.172,769.89624,225.06831,...,0.368082,0.375981,0.1516,0.184023,-29.287306,S163,71,0,0,30
4,4,34.36999,0.07803,33.367874,34.61786,35.44266,2.074787,58.16399,31.168308,367.9199,...,0.074,0.034409,5.053333,6.706737,-66.156525,S164,73,1,1,21


In [6]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 108 entries, 0 to 107
Data columns (total 94 columns):
 #   Column                                          Non-Null Count  Dtype  
---  ------                                          --------------  -----  
 0   Unnamed: 0                                      108 non-null    int64  
 1   id                                              108 non-null    object 
 2   age                                             108 non-null    int64  
 3   gender                                          108 non-null    object 
 4   mmse                                            108 non-null    object 
 5   diagnosis                                       108 non-null    int64  
 6   F0semitoneFrom27.5Hz_sma3nz_amean               108 non-null    float64
 7   F0semitoneFrom27.5Hz_sma3nz_stddevNorm          108 non-null    float64
 8   F0semitoneFrom27.5Hz_sma3nz_percentile20.0      108 non-null    float64
 9   F0semitoneFrom27.5Hz_sma3nz_percentile50.0 

In [7]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48 entries, 0 to 47
Data columns (total 94 columns):
 #   Column                                          Non-Null Count  Dtype  
---  ------                                          --------------  -----  
 0   Unnamed: 0                                      48 non-null     int64  
 1   F0semitoneFrom27.5Hz_sma3nz_amean               48 non-null     float64
 2   F0semitoneFrom27.5Hz_sma3nz_stddevNorm          48 non-null     float64
 3   F0semitoneFrom27.5Hz_sma3nz_percentile20.0      48 non-null     float64
 4   F0semitoneFrom27.5Hz_sma3nz_percentile50.0      48 non-null     float64
 5   F0semitoneFrom27.5Hz_sma3nz_percentile80.0      48 non-null     float64
 6   F0semitoneFrom27.5Hz_sma3nz_pctlrange0-2        48 non-null     float64
 7   F0semitoneFrom27.5Hz_sma3nz_meanRisingSlope     48 non-null     float64
 8   F0semitoneFrom27.5Hz_sma3nz_stddevRisingSlope   48 non-null     float64
 9   F0semitoneFrom27.5Hz_sma3nz_meanFallingSlope 

In [8]:
# preprocessing
features = ["F2frequency_sma3nz_amean", 
             "HNRdBACF_sma3nz_amean", 
             "F0semitoneFrom27.5Hz_sma3nz_percentile80.0", 
             "F0semitoneFrom27.5Hz_sma3nz_amean",
             "F1frequency_sma3nz_amean",
             "gender", "diagnosis"]

train['gender'] = train['gender'].replace(['female', 'male'], [1, 0])

train_df = train[features]
test_df = test[features]

In [9]:
# function for calculating accuracy, sensitivity and specificity

def classification_results(y_true, y_pred):
    print(metrics.confusion_matrix(y_true, y_pred))
    tn, fp, fn, tp = metrics.confusion_matrix(y_true, y_pred).ravel()
    specificity = tn / (tn + fp)
    sensitivity = tp / (tp + fn)
    accuracy = metrics.accuracy_score(y_pred=y_pred, y_true=y_true)
    print('Accuracy: ', accuracy)
    print('Sensitivity: ', sensitivity)
    print('Specificity: ', specificity)

In [10]:
# function for evaluating during CV, full train and test datasets

def evaluate_model(model, data_x, data_y, X_test, y_test, k=5):
    '''returns trained classifier'''
    k_fold = KFold(k, shuffle=True, random_state=1)

    predicted_targets = np.array([])
    actual_targets = np.array([])
    
    display(Markdown("#### Results of k-fold CV on training set"))
    for train_ix, test_ix in k_fold.split(data_x):
        train_x, train_y, test_x, test_y = data_x.iloc[train_ix], data_y.iloc[train_ix], data_x.iloc[test_ix], data_y.iloc[test_ix]

        # Fit the classifier
        classifier = model.fit(train_x, train_y)

        # Predict the labels of the test set samples
        predicted_labels_fold = classifier.predict(test_x)

        predicted_targets = np.append(predicted_targets, predicted_labels_fold)
        actual_targets = np.append(actual_targets, test_y)
        
        classification_results(test_y, predicted_labels_fold)
        print('')
        
    display(Markdown("#### Results on the whole training set"))
    classification_results(actual_targets, predicted_targets)
    
    
    display(Markdown("#### Results on test set"))
    classification_results(y_test, classifier.predict(X_test))
    
    return classifier
        
    

### Logistic regression model

In [11]:
features_lr = ["F0semitoneFrom27.5Hz_sma3nz_percentile80.0", 
                 "F1frequency_sma3nz_amean", 
                 "HNRdBACF_sma3nz_amean", 
                 "F0semitoneFrom27.5Hz_sma3nz_amean",
                 "gender"]

Y_train = train_df["diagnosis"]
X_train = train_df[features_lr]

Y_test = test_df["diagnosis"]
X_test = test_df[features_lr]


In [12]:
model_lr = make_pipeline(LogisticRegression(solver='liblinear', penalty = 'l2', class_weight = 'balanced'))

model_lr = evaluate_model(model_lr, X_train, Y_train, X_test, Y_test)

#### Results of k-fold CV on training set

[[ 2  2]
 [ 5 13]]
Accuracy:  0.6818181818181818
Sensitivity:  0.7222222222222222
Specificity:  0.5

[[13  1]
 [ 5  3]]
Accuracy:  0.7272727272727273
Sensitivity:  0.375
Specificity:  0.9285714285714286

[[7 4]
 [2 9]]
Accuracy:  0.7272727272727273
Sensitivity:  0.8181818181818182
Specificity:  0.6363636363636364

[[7 4]
 [4 6]]
Accuracy:  0.6190476190476191
Sensitivity:  0.6
Specificity:  0.6363636363636364

[[8 6]
 [3 4]]
Accuracy:  0.5714285714285714
Sensitivity:  0.5714285714285714
Specificity:  0.5714285714285714



#### Results on the whole training set

[[37 17]
 [19 35]]
Accuracy:  0.6666666666666666
Sensitivity:  0.6481481481481481
Specificity:  0.6851851851851852


#### Results on test set

[[16  8]
 [ 7 17]]
Accuracy:  0.6875
Sensitivity:  0.7083333333333334
Specificity:  0.6666666666666666


### Random forest model

In [13]:
features_rf = ["F0semitoneFrom27.5Hz_sma3nz_percentile80.0", 
                 "F1frequency_sma3nz_amean", 
                 "HNRdBACF_sma3nz_amean", 
                 "F0semitoneFrom27.5Hz_sma3nz_amean",
                 "gender"]

Y_train = train_df["diagnosis"]
X_train = train_df[features_rf]

Y_test = test_df["diagnosis"]
X_test = test_df[features_rf]

In [14]:
model_rf = make_pipeline(MinMaxScaler(), RandomForestClassifier(n_estimators=200, min_samples_leaf=20,
                                                               class_weight='balanced'))

model_rf = evaluate_model(model_rf, X_train, Y_train, X_test, Y_test)

#### Results of k-fold CV on training set

[[ 1  3]
 [ 6 12]]
Accuracy:  0.5909090909090909
Sensitivity:  0.6666666666666666
Specificity:  0.25

[[11  3]
 [ 4  4]]
Accuracy:  0.6818181818181818
Sensitivity:  0.5
Specificity:  0.7857142857142857

[[6 5]
 [7 4]]
Accuracy:  0.45454545454545453
Sensitivity:  0.36363636363636365
Specificity:  0.5454545454545454

[[9 2]
 [4 6]]
Accuracy:  0.7142857142857143
Sensitivity:  0.6
Specificity:  0.8181818181818182

[[8 6]
 [2 5]]
Accuracy:  0.6190476190476191
Sensitivity:  0.7142857142857143
Specificity:  0.5714285714285714



#### Results on the whole training set

[[35 19]
 [23 31]]
Accuracy:  0.6111111111111112
Sensitivity:  0.5740740740740741
Specificity:  0.6481481481481481


#### Results on test set

[[16  8]
 [ 8 16]]
Accuracy:  0.6666666666666666
Sensitivity:  0.6666666666666666
Specificity:  0.6666666666666666


### kNN model

In [15]:
features_knn = ["F0semitoneFrom27.5Hz_sma3nz_percentile80.0", 
                 "HNRdBACF_sma3nz_amean", 
                 "F0semitoneFrom27.5Hz_sma3nz_amean",
                 "gender"]

Y_train = train_df["diagnosis"]
X_train = train_df[features_knn]

Y_test = test_df["diagnosis"]
X_test = test_df[features_knn]

In [16]:
model_knn = make_pipeline(MinMaxScaler(),
                      KNeighborsClassifier(n_neighbors=3))

model_knn = evaluate_model(model_knn, X_train, Y_train, X_test, Y_test)

#### Results of k-fold CV on training set

[[ 4  0]
 [ 8 10]]
Accuracy:  0.6363636363636364
Sensitivity:  0.5555555555555556
Specificity:  1.0

[[5 9]
 [3 5]]
Accuracy:  0.45454545454545453
Sensitivity:  0.625
Specificity:  0.35714285714285715

[[7 4]
 [6 5]]
Accuracy:  0.5454545454545454
Sensitivity:  0.45454545454545453
Specificity:  0.6363636363636364

[[3 8]
 [4 6]]
Accuracy:  0.42857142857142855
Sensitivity:  0.6
Specificity:  0.2727272727272727

[[5 9]
 [1 6]]
Accuracy:  0.5238095238095238
Sensitivity:  0.8571428571428571
Specificity:  0.35714285714285715



#### Results on the whole training set

[[24 30]
 [22 32]]
Accuracy:  0.5185185185185185
Sensitivity:  0.5925925925925926
Specificity:  0.4444444444444444


#### Results on test set

[[14 10]
 [ 5 19]]
Accuracy:  0.6875
Sensitivity:  0.7916666666666666
Specificity:  0.5833333333333334


### Radial based SVM

In [17]:
features_svm = ["F2frequency_sma3nz_amean", 
                    "F0semitoneFrom27.5Hz_sma3nz_percentile80.0", 
                    "F0semitoneFrom27.5Hz_sma3nz_amean",
                     "gender"]

Y_train = train_df["diagnosis"]
X_train = train_df[features_svm]

Y_test = test_df["diagnosis"]
X_test = test_df[features_svm]

In [18]:
model_svm = make_pipeline(MinMaxScaler(), SVC(kernel = 'rbf', gamma = 4.0, C = 3.0))

model_svm = evaluate_model(model_svm, X_train, Y_train, X_test, Y_test)

#### Results of k-fold CV on training set

[[ 4  0]
 [ 8 10]]
Accuracy:  0.6363636363636364
Sensitivity:  0.5555555555555556
Specificity:  1.0

[[9 5]
 [1 7]]
Accuracy:  0.7272727272727273
Sensitivity:  0.875
Specificity:  0.6428571428571429

[[5 6]
 [5 6]]
Accuracy:  0.5
Sensitivity:  0.5454545454545454
Specificity:  0.45454545454545453

[[6 5]
 [1 9]]
Accuracy:  0.7142857142857143
Sensitivity:  0.9
Specificity:  0.5454545454545454

[[9 5]
 [2 5]]
Accuracy:  0.6666666666666666
Sensitivity:  0.7142857142857143
Specificity:  0.6428571428571429



#### Results on the whole training set

[[33 21]
 [17 37]]
Accuracy:  0.6481481481481481
Sensitivity:  0.6851851851851852
Specificity:  0.6111111111111112


#### Results on test set

[[18  6]
 [ 8 16]]
Accuracy:  0.7083333333333334
Sensitivity:  0.6666666666666666
Specificity:  0.75


#### Results on the test set without debiasing

| Model       | Logistic regression | Random forest | kNN    |  SVM   |  
|------       |------               | ------        |------  | ------ |
| Accuracy    |  0.6875             |  0.666        | 0.6875 | 0.7083 | 
| Sensitivity |  0.7083             |  0.666        | 0.7916 | 0.6666 | 
| Specificity |  0.6666             |  0.666        | 0.5833 | 0.75   | 

## Applying debiasing techniques

### in-processing algorithms

In [19]:
# AIF360 Metrics
from aif360.sklearn.metrics import statistical_parity_difference, equal_opportunity_difference, average_odds_difference
from aif360.sklearn.metrics import disparate_impact_ratio, theil_index

# AIF360 in-processing algorithms

from aif360.sklearn.inprocessing import ExponentiatedGradientReduction
from aif360.sklearn.inprocessing import GridSearchReduction

# AIF360 dataset utils
from aif360.sklearn.datasets import standardize_dataset

# import of constraint moments
import fairlearn.reductions as red

In [20]:
def get_values_of_aif_metrics(y_test, y_pred):
    print('Statistical parity difference: ', statistical_parity_difference(y_test, y_pred))
    print('Equal opportunity difference: ', equal_opportunity_difference(y_test, y_pred))
    print('Average odds difference: ', average_odds_difference(y_test, y_pred))
    print('Disparate impact ratio: ', disparate_impact_ratio(y_test, y_pred))
    print('Theil index: ', theil_index(y_pred))

In [21]:
prot_attr_cols = [colname for colname in train_df if "gender" in colname]
prot_attr_cols

['gender']

### Logistic regression

In [22]:
# creating aif360 dataset

X_train_aif, y_train_aif = standardize_dataset(df = train_df, prot_attr=prot_attr_cols, 
                                               target = 'diagnosis', usecols=features_lr)
X_test_aif, y_test_aif = standardize_dataset(df = test_df, prot_attr=prot_attr_cols, target = 'diagnosis',
                                            usecols=features_lr)

In [23]:
get_values_of_aif_metrics(y_test_aif, model_lr.predict(X_test_aif))

Statistical parity difference:  0.12937062937062938
Equal opportunity difference:  0.2027972027972028
Average odds difference:  0.12937062937062938
Disparate impact ratio:  1.2803030303030303
Theil index:  0.6523251860396903


In [32]:
model_lr_aif = LogisticRegression(solver='liblinear', 
                                            penalty = 'l2', 
                                            class_weight = 'balanced')

grid_search_red = GridSearchReduction(prot_attr=prot_attr_cols, 
                                      estimator=model_lr_aif, 
                                      constraints="EqualizedOdds",
                                      grid_size=20,
                                      drop_prot_attr=False)
grid_search_red.fit(X_train_aif, y_train_aif)

Using the level keyword in DataFrame and Series aggregations is deprecated and will be removed in a future version. Use groupby instead. df.sum(level=1) should use df.groupby(level=1).sum().
Using the level keyword in DataFrame and Series aggregations is deprecated and will be removed in a future version. Use groupby instead. df.sum(level=1) should use df.groupby(level=1).sum().
Using the level keyword in DataFrame and Series aggregations is deprecated and will be removed in a future version. Use groupby instead. df.sum(level=1) should use df.groupby(level=1).sum().
Using the level keyword in DataFrame and Series aggregations is deprecated and will be removed in a future version. Use groupby instead. df.sum(level=1) should use df.groupby(level=1).sum().
Using the level keyword in DataFrame and Series aggregations is deprecated and will be removed in a future version. Use groupby instead. df.sum(level=1) should use df.groupby(level=1).sum().
Using the level keyword in DataFrame and Seri

AttributeError: 'GridSearchReduction' object has no attribute 'constraints'

AttributeError: 'GridSearchReduction' object has no attribute 'constraints'

In [31]:
exp_grad_red = ExponentiatedGradientReduction(prot_attr=prot_attr_cols, 
                                              estimator=model_lr_aif, 
                                              constraints="EqualizedOdds",
                                              drop_prot_attr=False)
exp_grad_red.fit(X_train_aif, y_train_aif)

Using the level keyword in DataFrame and Series aggregations is deprecated and will be removed in a future version. Use groupby instead. df.sum(level=1) should use df.groupby(level=1).sum().
Using the level keyword in DataFrame and Series aggregations is deprecated and will be removed in a future version. Use groupby instead. df.sum(level=1) should use df.groupby(level=1).sum().
Using the level keyword in DataFrame and Series aggregations is deprecated and will be removed in a future version. Use groupby instead. df.sum(level=1) should use df.groupby(level=1).sum().
Using the level keyword in DataFrame and Series aggregations is deprecated and will be removed in a future version. Use groupby instead. df.sum(level=1) should use df.groupby(level=1).sum().
Using the level keyword in DataFrame and Series aggregations is deprecated and will be removed in a future version. Use groupby instead. df.sum(level=1) should use df.groupby(level=1).sum().
Using the level keyword in DataFrame and Seri

AttributeError: 'ExponentiatedGradientReduction' object has no attribute 'constraints'

AttributeError: 'ExponentiatedGradientReduction' object has no attribute 'constraints'