# Model Evaluation of Performance on Individual Outcomes
This notebook compares and evaluates the performance of previously fitted models in predicting individual outcomes. Unlike the previous model evaluations, which used the entire sample, these evaluations use reduced datasets comprising only individuals with the outcome of interest and a control group with no suicide-related outcomes at all. These reduced datasets circumvent problems with calculating metrics such as specificity and precision for individual outcomes in the full data.

In [2]:
from IPython.core.interactiveshell import InteractiveShell
from matplotlib import pyplot 
from numpy import mean
from numpy import std
from sklearn.calibration import calibration_curve
from sklearn.metrics import confusion_matrix as confusion
from sklearn.metrics import f1_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import precision_score as ppv
from sklearn.metrics import recall_score as recall
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold
import joblib
import numpy as np
import pandas as pd
InteractiveShell.ast_node_interactivity = "all"

Suicidal ideation model evaluation

In [4]:
#Rebuilding the complete hold-out set: predictors and outcome columns side by side
xi_hold_csv=r'C:\Users\z5291979\OneDrive - UNSW\Documents\lsac-data\Xi_hold.csv'
y_hold_csv=r'C:\Users\z5291979\OneDrive - UNSW\Documents\lsac-data\processed_data\y_hold.csv'
Xi_hold=pd.read_csv(xi_hold_csv)
y_hold=pd.read_csv(y_hold_csv)
#Showing (rows, columns) of each piece before they are combined
Xi_hold.shape
y_hold.shape
#join() aligns the two frames on their row index
testdata=Xi_hold.join(y_hold)
testdata

(729, 1011)

(729, 4)

Unnamed: 0,fd20a1,fd20c4,pe05c,pe06c2,fd24c1,f13ip1,f17ip1,f17zip1,f17aip1,f17bip1,...,y9test,y9gram,y9num,y9read,y9spel,y9write,si,nssi,att,sitbs
0,-9.0,-9.0,-9.0,-9.0,-9.0,1.0,2.0,2.0,0.0,0.0,...,0.277500,0.742737,0.542185,0.482281,0.769311,0.371608,0.0,0,0.0,0
1,-9.0,-9.0,-9.0,-9.0,-9.0,1.0,2.0,2.0,0.0,0.0,...,0.275579,0.600097,0.520104,0.453224,0.392045,0.371608,1.0,0,0.0,1
2,-9.0,-9.0,-9.0,-9.0,-9.0,1.0,2.0,2.0,0.0,0.0,...,0.277500,0.541766,0.995751,1.102658,0.628022,0.657920,0.0,0,0.0,0
3,-9.0,-9.0,-9.0,-9.0,-9.0,1.0,2.0,2.0,0.0,0.0,...,0.277500,0.109922,-0.049383,-0.033489,-0.384795,-0.459493,0.0,0,0.0,0
4,-9.0,-9.0,-9.0,-9.0,-9.0,1.0,2.0,2.0,0.0,0.0,...,0.277500,0.742737,0.698127,0.716194,0.507059,1.106774,0.0,0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
724,-9.0,-9.0,-9.0,-9.0,-9.0,2.0,1.0,1.0,0.0,0.0,...,0.275579,0.146195,-2.510878,-1.214192,0.138221,0.310965,0.0,0,0.0,0
725,-9.0,-9.0,-9.0,-9.0,-9.0,1.0,2.0,2.0,0.0,0.0,...,0.277500,-0.080756,-0.000623,0.031890,0.345940,-0.035989,1.0,0,0.0,1
726,-9.0,-9.0,-9.0,-9.0,-9.0,1.0,2.0,2.0,0.0,0.0,...,0.277500,0.742737,0.942850,0.999020,0.628022,1.033705,0.0,0,0.0,0
727,-9.0,-9.0,-9.0,-9.0,-9.0,1.0,2.0,2.0,0.0,0.0,...,0.279420,0.651075,0.530685,0.419808,0.420799,0.371608,0.0,0,0.0,0


In [5]:
#Examining frequencies of the different combinations of outcomes
testdata[['si', 'nssi', 'att']].value_counts().reset_index(name='count')

#Creating the different subsets of data for evaluating performance on individual outcomes
#Each subset keeps every row positive for the target outcome, plus the rows where the other two outcomes are both 0
si_test=testdata[(testdata['si']==1) | ((testdata['att']==0) & (testdata['nssi']==0))]
nssi_test=testdata[(testdata['nssi']==1) | ((testdata['att']==0) & (testdata['si']==0))]
att_test=testdata[(testdata['att']==1) | ((testdata['nssi']==0) & (testdata['si']==0))]

#Checking to see if the datasets were created properly
sre=['si', 'att', 'nssi']
#Explicit lookup table, so the subsets are not fetched from locals() through constructed variable names
subsets={'si': si_test, 'att': att_test, 'nssi': nssi_test}
for i in sre:
    print(f'{i}_test frequencies by outcome')
    #Bare expression inside the loop: it is displayed only because ast_node_interactivity is set to "all" in the imports cell
    subsets[i][['si', 'nssi', 'att']].value_counts().reset_index(name='count')

#Splitting the reduced datasets back into x and y 
#y_holds_si and so on are created just for these evaluations, as opposed to y_hold_si, which contains the outcome of si for the full sample 
#s standing for small, indicating a reduced dataset
#Names are assigned explicitly (rather than written into locals()) so each variable has a visible definition
outcome_cols=['si', 'att', 'nssi', 'sitbs']
Xi_holds_si=si_test.drop(columns=outcome_cols)
y_holds_si=si_test['si']
Xi_holds_att=att_test.drop(columns=outcome_cols)
y_holds_att=att_test['att']
Xi_holds_nssi=nssi_test.drop(columns=outcome_cols)
y_holds_nssi=nssi_test['nssi']

Unnamed: 0,si,nssi,att,count
0,0.0,0,0.0,608
1,1.0,0,0.0,48
2,1.0,1,0.0,23
3,0.0,1,0.0,18
4,1.0,1,1.0,14
5,1.0,0,1.0,11
6,0.0,0,1.0,6
7,0.0,1,1.0,1


si_test frequencies by outcome


Unnamed: 0,si,nssi,att,count
0,0.0,0,0.0,608
1,1.0,0,0.0,48
2,1.0,1,0.0,23
3,1.0,1,1.0,14
4,1.0,0,1.0,11


att_test frequencies by outcome


Unnamed: 0,si,nssi,att,count
0,0.0,0,0.0,608
1,1.0,1,1.0,14
2,1.0,0,1.0,11
3,0.0,0,1.0,6
4,0.0,1,1.0,1


nssi_test frequencies by outcome


Unnamed: 0,si,nssi,att,count
0,0.0,0,0.0,608
1,1.0,1,0.0,23
2,0.0,1,0.0,18
3,1.0,1,1.0,14
4,0.0,1,1.0,1


In [6]:
#Writing each reduced dataset to <outcome>_test.csv in the working directory, without the row index
for outcome in sre:
    subset=locals()[f'{outcome}_test']
    subset.to_csv(f'{outcome}_test.csv', index=False)

In [7]:
#Displaying (rows, columns) of each reduced dataset, in the order given by sre
for outcome in sre:
    subset=locals()[f'{outcome}_test']
    subset.shape

(704, 1015)

(640, 1015)

(664, 1015)

Evaluating the models' ability to predict suicidal ideation

In [8]:
#Creating a function that predicts probabilities and evaluates the model
def eval(model, Xi_hold, y_hold, thresh):
    """Print and return classification metrics for a fitted model on a hold-out set.

    NOTE: this name shadows Python's built-in ``eval``; it is kept because
    other cells in the notebook call the function by this name.

    Parameters
    ----------
    model : object exposing ``predict_proba``
        Previously fitted classifier.
    Xi_hold : features passed straight to ``model.predict_proba``.
    y_hold : true labels, assumed to be coded 0/1 (the predictions are).
    thresh : float
        A case is classed as positive when its predicted probability is
        strictly greater than this value.

    Returns
    -------
    tuple
        ``(f1, sens, spec, auc, prec)``. ``spec`` is NaN when ``y_hold``
        contains no negative cases.
    """
    #Generating probability predictions (second column of predict_proba's output)
    proba=model.predict_proba(Xi_hold)[:, 1]
    print('Probs: %.3f (%.3f)' % (mean(proba), std(proba)))
    #Evaluating probability predictions
    ypred=np.where(proba>thresh, 1, 0)
    f1= f1_score(y_hold, ypred)
    print(f'F1= {f1:.3f}')
    sens= recall(y_hold, ypred)
    print(f'Sensitivity= {sens:.3f}')
    #labels=[0, 1] forces a 2x2 matrix, so the four-way unpacking cannot fail
    #when the labels and predictions happen to contain a single class
    tn, fp, fn, tp= confusion(y_hold, ypred, labels=[0, 1]).ravel()
    negatives=tn+fp
    spec=tn/negatives if negatives else float('nan')
    print(f'Specificity= {spec:.3f}')
    #AUROC is computed from the raw probabilities, so it does not depend on thresh
    auc= roc_auc_score(y_hold, proba)
    print(f'AUROC= {auc:.3f}')
    prec=ppv(y_hold, ypred)
    print(f'Precision= {prec:.3f}')

    return f1, sens, spec, auc, prec

In [9]:
#Creating dictionary of component and composite models
#algos and threshs are parallel lists: threshs[k] is the threshold used for algos[k]
algos=['LR', 'RF', 'XGB']
threshs=[0.188868, 0.175579, 0.119263]
#Component models: one saved SI model per algorithm, keyed by algorithm name
models1=dict((a, joblib.load(f'{a}_si.sav')) for a in algos)
#Composite models: one saved SITB model per algorithm, keyed by algorithm name
models2=dict((a, joblib.load(f'{a}_sitbs.sav')) for a in algos)


#Iterating through tuples to evaluate LR, RF, and XGB Models 
#Value of the tuple algo is paired to the other corresponding value in threshs, i.e. 'LR' is paired with threshold of 0.188868
def runevals(models1, Xi_hold, y_hold, thresholds=None, algorithms=None):
    """Run ``eval`` for every algorithm/threshold pair and print the metrics.

    Parameters
    ----------
    models1 : dict
        Fitted models keyed by algorithm name.
    Xi_hold, y_hold : features and labels forwarded unchanged to ``eval``.
    thresholds : sequence of float, optional
        One classification threshold per algorithm, in the same order.
        Defaults to the notebook-level ``threshs`` list.
    algorithms : sequence of str, optional
        Keys of ``models1`` to evaluate. Defaults to the notebook-level
        ``algos`` list.

    Raises
    ------
    ValueError
        If the number of algorithms and thresholds differ.
    """
    #Falling back on the notebook-level lists keeps existing calls working unchanged
    if algorithms is None:
        algorithms=algos
    if thresholds is None:
        thresholds=threshs
    #zip() would silently drop the unmatched entries, so a mismatch is reported instead
    if len(algorithms)!=len(thresholds):
        raise ValueError(
            f'Got {len(algorithms)} algorithms but {len(thresholds)} thresholds'
        )
    for a, t in zip(algorithms, thresholds):
        clf=models1[a]
        print(f'Metrics for {a} using threshold at {t}')
        eval(clf, Xi_hold, y_hold, t)
        print('\n')

In [10]:
#Component (SI) models scored on the reduced SI dataset; runevals pairs them with the thresholds currently held in threshs
runevals(models1, Xi_holds_si, y_holds_si)

Metrics for LR using threshold at 0.188868
Probs: 0.122 (0.106)
F1= 0.393
Sensitivity= 0.479
Specificity= 0.849
AUROC= 0.747
Precision= 0.333


Metrics for RF using threshold at 0.175579
Probs: 0.139 (0.155)
F1= 0.476
Sensitivity= 0.719
Specificity= 0.794
AUROC= 0.833
Precision= 0.356


Metrics for XGB using threshold at 0.119263
Probs: 0.122 (0.146)
F1= 0.444
Sensitivity= 0.833
Specificity= 0.697
AUROC= 0.839
Precision= 0.303




In [11]:
#Composite (SITB) models scored on the reduced SI dataset
#NOTE(review): runevals reads the global threshs, which at this point still holds the values set alongside the SI models,
#not the thresholds paired with the SITB models in the full-sample cell further down - confirm this is intended
runevals(models2, Xi_holds_si, y_holds_si)

Metrics for LR using threshold at 0.188868
Probs: 0.159 (0.122)
F1= 0.399
Sensitivity= 0.646
Specificity= 0.748
AUROC= 0.742
Precision= 0.288


Metrics for RF using threshold at 0.175579
Probs: 0.173 (0.170)
F1= 0.449
Sensitivity= 0.781
Specificity= 0.732
AUROC= 0.835
Precision= 0.315


Metrics for XGB using threshold at 0.119263
Probs: 0.157 (0.170)
F1= 0.447
Sensitivity= 0.865
Specificity= 0.684
AUROC= 0.845
Precision= 0.302




Evaluating the models' ability to predict NSSI

In [15]:
#Thresholds for the NSSI models, listed in the same order as algos
threshs=[0.138621, 0.160344, 0.172887]
#Replacing models1 with the saved NSSI model of each algorithm
models1=dict((a, joblib.load(f'{a}_nssi.sav')) for a in algos)

#Scoring the NSSI models on the reduced NSSI dataset
runevals(models1, Xi_holds_nssi, y_holds_nssi)

Metrics for LR using threshold at 0.138621
Probs: 0.071 (0.077)
F1= 0.350
Sensitivity= 0.446
Specificity= 0.898
AUROC= 0.743
Precision= 0.287


Metrics for RF using threshold at 0.160344
Probs: 0.075 (0.091)
F1= 0.440
Sensitivity= 0.661
Specificity= 0.877
AUROC= 0.852
Precision= 0.330


Metrics for XGB using threshold at 0.172887
Probs: 0.067 (0.096)
F1= 0.472
Sensitivity= 0.536
Specificity= 0.933
AUROC= 0.848
Precision= 0.423




In [17]:
#Composite (SITB) models scored on the reduced NSSI dataset
#NOTE(review): runevals reads the global threshs, last assigned together with the NSSI models,
#not the thresholds paired with the SITB models in the full-sample cell further down - confirm this is intended
runevals(models2, Xi_holds_nssi, y_holds_nssi)

Metrics for LR using threshold at 0.138621
Probs: 0.154 (0.118)
F1= 0.259
Sensitivity= 0.768
Specificity= 0.617
AUROC= 0.772
Precision= 0.156


Metrics for RF using threshold at 0.160344
Probs: 0.163 (0.163)
F1= 0.333
Sensitivity= 0.821
Specificity= 0.714
AUROC= 0.852
Precision= 0.209


Metrics for XGB using threshold at 0.172887
Probs: 0.149 (0.165)
F1= 0.361
Sensitivity= 0.786
Specificity= 0.763
AUROC= 0.856
Precision= 0.234




Evaluating the models' ability to predict attempts

In [18]:
#Thresholds for the attempt models, listed in the same order as algos
threshs=[0.074037, 0.073374, 0.087420]
#Replacing models1 with the saved attempt model of each algorithm
models1=dict((a, joblib.load(f'{a}_att.sav')) for a in algos)

#Scoring the attempt models on the reduced attempt dataset
runevals(models1, Xi_holds_att, y_holds_att)

Metrics for LR using threshold at 0.074037
Probs: 0.042 (0.043)
F1= 0.224
Sensitivity= 0.469
Specificity= 0.857
AUROC= 0.781
Precision= 0.147


Metrics for RF using threshold at 0.073374
Probs: 0.047 (0.074)
F1= 0.297
Sensitivity= 0.719
Specificity= 0.836
AUROC= 0.882
Precision= 0.187


Metrics for XGB using threshold at 0.08742
Probs: 0.038 (0.062)
F1= 0.392
Sensitivity= 0.625
Specificity= 0.918
AUROC= 0.883
Precision= 0.286




In [20]:
#Composite (SITB) models scored on the reduced attempt dataset
#NOTE(review): runevals reads the global threshs, last assigned together with the attempt models,
#not the thresholds paired with the SITB models in the full-sample cell further down - confirm this is intended
runevals(models2, Xi_holds_att, y_holds_att)

Metrics for LR using threshold at 0.074037
Probs: 0.149 (0.114)
F1= 0.123
Sensitivity= 0.906
Specificity= 0.322
AUROC= 0.751
Precision= 0.066


Metrics for RF using threshold at 0.073374
Probs: 0.155 (0.152)
F1= 0.162
Sensitivity= 1.000
Specificity= 0.456
AUROC= 0.874
Precision= 0.088


Metrics for XGB using threshold at 0.08742
Probs: 0.138 (0.152)
F1= 0.187
Sensitivity= 0.875
Specificity= 0.605
AUROC= 0.854
Precision= 0.104




Predicting attempts with the NSSI models

In [21]:
#Attempt labels; presumably for the full hold-out sample, since they are scored against Xi_hold below
y_hold_att_csv=r'C:\Users\z5291979\OneDrive - UNSW\Documents\lsac-data\processed_data\y_hold_att.csv'
y_hold_att=pd.read_csv(y_hold_att_csv)

#Same threshold values as used with the attempt models, here applied to the NSSI models
#NOTE(review): confirm that pairing the NSSI models with these thresholds is intended
threshs=[0.074037, 0.073374, 0.087420]
models1=dict((a, joblib.load(f'{a}_nssi.sav')) for a in algos)

#Scoring the NSSI models against the attempt outcome
runevals(models1, Xi_hold, y_hold_att)

Metrics for LR using threshold at 0.074037
Probs: 0.075 (0.082)
F1= 0.151
Sensitivity= 0.625
Specificity= 0.694
AUROC= 0.698
Precision= 0.086


Metrics for RF using threshold at 0.073374
Probs: 0.083 (0.099)
F1= 0.190
Sensitivity= 0.812
Specificity= 0.690
AUROC= 0.839
Precision= 0.107


Metrics for XGB using threshold at 0.08742
Probs: 0.074 (0.104)
F1= 0.200
Sensitivity= 0.594
Specificity= 0.801
AUROC= 0.823
Precision= 0.120




Deriving performance metrics for previously fitted models in full test dataset

In [22]:
#Loading datasets
#One label file per outcome (SI, NSSI and the composite SITB outcome)
y_hold_si_csv=r'C:\Users\z5291979\OneDrive - UNSW\Documents\lsac-data\processed_data\y_hold_si.csv'
y_hold_nssi_csv=r'C:\Users\z5291979\OneDrive - UNSW\Documents\lsac-data\processed_data\y_hold_nssi.csv'
y_hold_sitbs_csv=r'C:\Users\z5291979\OneDrive - UNSW\Documents\lsac-data\processed_data\y_hold_sitbs.csv'
y_hold_si=pd.read_csv(y_hold_si_csv)
y_hold_nssi=pd.read_csv(y_hold_nssi_csv)
y_hold_sitbs=pd.read_csv(y_hold_sitbs_csv)

In [23]:
#SI Models
#algos and threshs are parallel lists: threshs[k] is the threshold used for algos[k]
algos=['LR', 'RF', 'XGB']
threshs=[0.188868, 0.175579, 0.119263]
models=dict((a, joblib.load(f'{a}_si.sav')) for a in algos)
#Scoring on the full hold-out features against the SI labels
runevals(models, Xi_hold, y_hold_si)

Metrics for LR using threshold at 0.188868
Probs: 0.123 (0.107)
F1= 0.380
Sensitivity= 0.479
Specificity= 0.842
AUROC= 0.742
Precision= 0.315


Metrics for RF using threshold at 0.175579
Probs: 0.142 (0.158)
F1= 0.460
Sensitivity= 0.719
Specificity= 0.787
AUROC= 0.826
Precision= 0.338


Metrics for XGB using threshold at 0.119263
Probs: 0.125 (0.152)
F1= 0.429
Sensitivity= 0.833
Specificity= 0.689
AUROC= 0.831
Precision= 0.289




In [24]:
#NSSI Models
#Thresholds listed in the same order as algos
threshs=[0.138621, 0.160344, 0.172887]
models=dict((a, joblib.load(f'{a}_nssi.sav')) for a in algos)
#Scoring on the full hold-out features against the NSSI labels
runevals(models, Xi_hold, y_hold_nssi)

Metrics for LR using threshold at 0.138621
Probs: 0.075 (0.082)
F1= 0.309
Sensitivity= 0.446
Specificity= 0.880
AUROC= 0.728
Precision= 0.236


Metrics for RF using threshold at 0.160344
Probs: 0.083 (0.099)
F1= 0.376
Sensitivity= 0.661
Specificity= 0.845
AUROC= 0.830
Precision= 0.262


Metrics for XGB using threshold at 0.172887
Probs: 0.074 (0.104)
F1= 0.414
Sensitivity= 0.536
Specificity= 0.912
AUROC= 0.827
Precision= 0.337




In [25]:
#Attempt models
#Thresholds listed in the same order as algos
threshs=[0.074037, 0.073374, 0.087420]
models=dict((a, joblib.load(f'{a}_att.sav')) for a in algos)

#Scoring on the full hold-out features against the attempt labels
runevals(models, Xi_hold, y_hold_att)

Metrics for LR using threshold at 0.074037
Probs: 0.045 (0.048)
F1= 0.191
Sensitivity= 0.469
Specificity= 0.842
AUROC= 0.767
Precision= 0.120


Metrics for RF using threshold at 0.073374
Probs: 0.054 (0.081)
F1= 0.231
Sensitivity= 0.719
Specificity= 0.793
AUROC= 0.857
Precision= 0.138


Metrics for XGB using threshold at 0.08742
Probs: 0.045 (0.074)
F1= 0.312
Sensitivity= 0.625
Specificity= 0.891
AUROC= 0.857
Precision= 0.208




In [26]:
#SITB Models
#Thresholds listed in the same order as algos
threshs=[0.208806, 0.152130, 0.176893]
models=dict((a, joblib.load(f'{a}_sitbs.sav')) for a in algos)
#Scoring on the full hold-out features against the composite SITB labels
runevals(models, Xi_hold, y_hold_sitbs)

Metrics for LR using threshold at 0.208806
Probs: 0.161 (0.125)
F1= 0.422
Sensitivity= 0.562
Specificity= 0.781
AUROC= 0.710
Precision= 0.338


Metrics for RF using threshold at 0.15213
Probs: 0.176 (0.173)
F1= 0.474
Sensitivity= 0.769
Specificity= 0.707
AUROC= 0.813
Precision= 0.343


Metrics for XGB using threshold at 0.176893
Probs: 0.161 (0.175)
F1= 0.499
Sensitivity= 0.702
Specificity= 0.778
AUROC= 0.816
Precision= 0.386


