# Model Evaluation of Performance on Individual Outcomes
This notebook compares and evaluates the performance of previously fitted models in predicting individual outcomes. Unlike the previous model evaluations, which used the entire sample, these evaluations use reduced datasets comprising only individuals with the outcome of interest and a control group with no suicide-related outcomes at all. These reduced datasets circumvent problems with calculating metrics such as specificity and precision for individual outcomes in the full data.

In [1]:
from IPython.core.interactiveshell import InteractiveShell
from matplotlib import pyplot 
from numpy import mean
from numpy import std
from sklearn.calibration import calibration_curve
from sklearn.metrics import confusion_matrix as confusion
from sklearn.metrics import f1_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import precision_score as ppv
from sklearn.metrics import recall_score as recall
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold
import joblib
import numpy as np
import pandas as pd
#Show the result of every top-level expression statement in a cell, not only the last one.
#Later cells rely on this to display several shapes/tables from a single cell.
InteractiveShell.ast_node_interactivity = "all"

Suicidal ideation model evaluation

In [7]:
#Recreating full test dataset
#Processed-data folder defined once so the location only has to be changed in one place
DATA_DIR=r'C:\Users\z5291979\OneDrive - UNSW\Documents\lsac-data\processed_data'
Xi_hold=pd.read_csv(rf'{DATA_DIR}\Xi_hold.csv')
y_hold=pd.read_csv(rf'{DATA_DIR}\y_hold.csv')
Xi_hold.shape
y_hold.shape
#Getting rid of unnamed column- this appears when the csv was saved together with its row index.
#errors='ignore' keeps the cell re-runnable if the file is ever saved without that index column.
Xi_hold=Xi_hold.drop(columns='Unnamed: 0', errors='ignore')
#The cleaned frame is intentionally NOT written back over Xi_hold.csv: to_csv writes the row index
#by default, so the rewritten file would get the same unnamed column again, and an evaluation
#notebook should not modify its input data.
#join() aligns on the row index, so make sure features and labels line up before combining them;
#otherwise mismatched rows would silently receive missing labels.
assert Xi_hold.index.equals(y_hold.index), 'Xi_hold and y_hold do not have matching row indices'
testdata=Xi_hold.join(y_hold)
testdata

(729, 1012)

(729, 4)

Unnamed: 0,fd20a1,fd20c4,pe05c,pe06c2,fd24c1,f13ip1,f17ip1,f17zip1,f17aip1,f17bip1,...,y9test,y9gram,y9num,y9read,y9spel,y9write,si,nssi,att,sitbs
0,-9.0,-9.0,-9.0,-9.0,-9.0,1.0,2.0,2.0,0.0,0.0,...,0.277500,0.742737,0.542185,0.482281,0.769311,0.371608,0.0,0,0.0,0
1,-9.0,-9.0,-9.0,-9.0,-9.0,1.0,2.0,2.0,0.0,0.0,...,0.275579,0.600097,0.520104,0.453224,0.392045,0.371608,1.0,0,0.0,1
2,-9.0,-9.0,-9.0,-9.0,-9.0,1.0,2.0,2.0,0.0,0.0,...,0.277500,0.541766,0.995751,1.102658,0.628022,0.657920,0.0,0,0.0,0
3,-9.0,-9.0,-9.0,-9.0,-9.0,1.0,2.0,2.0,0.0,0.0,...,0.277500,0.109922,-0.049383,-0.033489,-0.384795,-0.459493,0.0,0,0.0,0
4,-9.0,-9.0,-9.0,-9.0,-9.0,1.0,2.0,2.0,0.0,0.0,...,0.277500,0.742737,0.698127,0.716194,0.507059,1.106774,0.0,0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
724,-9.0,-9.0,-9.0,-9.0,-9.0,2.0,1.0,1.0,0.0,0.0,...,0.275579,0.146195,-2.510878,-1.214192,0.138221,0.310965,0.0,0,0.0,0
725,-9.0,-9.0,-9.0,-9.0,-9.0,1.0,2.0,2.0,0.0,0.0,...,0.277500,-0.080756,-0.000623,0.031890,0.345940,-0.035989,1.0,0,0.0,1
726,-9.0,-9.0,-9.0,-9.0,-9.0,1.0,2.0,2.0,0.0,0.0,...,0.277500,0.742737,0.942850,0.999020,0.628022,1.033705,0.0,0,0.0,0
727,-9.0,-9.0,-9.0,-9.0,-9.0,1.0,2.0,2.0,0.0,0.0,...,0.279420,0.651075,0.530685,0.419808,0.420799,0.371608,0.0,0,0.0,0


In [29]:
#Examining frequencies of the different combinations of outcomes
outcome_cols=['si', 'nssi', 'att']
testdata[outcome_cols].value_counts().reset_index(name='count')
#Creating the different subsets of data for evaluating performance on individual outcomes.
#Control group = rows where every outcome is explicitly 0. Requiring all three to equal 0 means a row
#with a missing value for the outcome of interest can never be counted as a control
#(the value_counts() check below drops rows with missing values, so it would not reveal them).
controls=(testdata['si']==0) & (testdata['nssi']==0) & (testdata['att']==0)
#Each subset = everyone with the outcome of interest + the shared control group
si_test=testdata[(testdata['si']==1) | controls]
nssi_test=testdata[(testdata['nssi']==1) | controls]
att_test=testdata[(testdata['att']==1) | controls]

#Checking to see if the datasets were created properly
sre=['si', 'att', 'nssi']
#Explicit name -> dataframe mapping instead of looking variables up through locals()
subsets={'si': si_test, 'att': att_test, 'nssi': nssi_test}
for outcome in sre:
    print(f'{outcome}_test frequencies by outcome')
    #Bare expression inside the loop is displayed because ast_node_interactivity is set to "all" in the imports cell
    subsets[outcome][outcome_cols].value_counts().reset_index(name='count')

Unnamed: 0,si,nssi,att,count
0,0.0,0,0.0,608
1,1.0,0,0.0,48
2,1.0,1,0.0,23
3,0.0,1,0.0,18
4,1.0,1,1.0,14
5,1.0,0,1.0,11
6,0.0,0,1.0,6
7,0.0,1,1.0,1


si_test frequencies by outcome


Unnamed: 0,si,nssi,att,count
0,0.0,0,0.0,608
1,1.0,0,0.0,48
2,1.0,1,0.0,23
3,1.0,1,1.0,14
4,1.0,0,1.0,11


att_test frequencies by outcome


Unnamed: 0,si,nssi,att,count
0,0.0,0,0.0,608
1,1.0,1,1.0,14
2,1.0,0,1.0,11
3,0.0,0,1.0,6
4,0.0,1,1.0,1


nssi_test frequencies by outcome


Unnamed: 0,si,nssi,att,count
0,0.0,0,0.0,608
1,1.0,1,0.0,23
2,0.0,1,0.0,18
3,1.0,1,1.0,14
4,0.0,1,1.0,1
