# Study 3 Data Preparation
### Predictors from Wave 7, self-harm, suicidal ideation and attempts from Wave 8

In [3]:
import pyreadstat
import pandas as pd
import numpy as np
from itertools import combinations
pd.options.display.max_rows = 20
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
from sklearn.model_selection import train_test_split
from pathlib import Path

In [2]:
#Importing the necessary data files
df8=pd.read_csv('~/OneDrive - UNSW/Documents/lsac-data/w78pmn.csv')
lsac8, meta=pyreadstat.read_sas7bdat('~/OneDrive - UNSW/Documents/lsac-data/lsacgrk18.sas7bdat')

In [3]:
#Extract SITB outcomes from Wave 8 to link to Wave 7 data
sitbs=lsac8[['hicid', 'jhs54b', 'jhs54c', 'jhs54d', 'jhs54g']]

sitbs=sitbs.rename(columns={'jhs54b':'sh_l', 'jhs54c':'sc', 'jhs54d':'sp', 'jhs54g':'nssi'})
for column in sitbs[1::]:
    sitbs[column].value_counts(dropna=False)
#Outcome was coded as 1 Yes 2 No; recode to 0 No 1 Yes
sitbs=sitbs.replace({2:0})
#Creating a dataframe with all SITBs 
df_all=pd.merge(df8, sitbs, how='inner', on='hicid')

hicid
51101040.0    1
73312029.0    1
73311903.0    1
73311936.0    1
73311937.0    1
             ..
62106607.0    1
62106608.0    1
62106619.0    1
62106624.0    1
88117433.0    1
Name: count, Length: 3037, dtype: int64

sh_l
 2.0    2417
-9.0     381
 1.0     230
-3.0       9
Name: count, dtype: int64

sc
 2.0    2358
-9.0     381
 1.0     291
-3.0       7
Name: count, dtype: int64

sp
 2.0    2422
-9.0     381
 1.0     228
-3.0       6
Name: count, dtype: int64

nssi
-9.0    2961
 1.0      54
 2.0      22
Name: count, dtype: int64

In [4]:
#Dropping any participant which refused to answer the self-harm or considering suicide questions
#Participants who did not answer the suicide attempts question were previously dropped in Study 2
df_all=df_all.drop(df_all[df_all.sh_l<0].index)
df_all=df_all.drop(df_all[df_all.sc<0].index)
for column in df_all[['att', 'sh_l', 'sc']]:
    df_all[column].value_counts()

att
0.0    2323
1.0     105
Name: count, dtype: int64

sh_l
0.0    2226
1.0     202
Name: count, dtype: int64

sc
0.0    2167
1.0     261
Name: count, dtype: int64

In [5]:
# Self-harm and suicidal ideation needs to be created by combining multiple variables
# Suicidal ideation is defined as anyone who reported either considering or planning suicide
df_all['si']=df_all['sc']
df_all['si']=np.where(df_all['sp']==1, 1, df_all['si'])
for column in df_all[['sc', 'sp', 'si']]:
    df_all[column].value_counts(dropna=False)
pd.crosstab(index=df_all['sc'], columns=df_all['sp'], dropna=False, margins=True)

sc
0.0    2167
1.0     261
Name: count, dtype: int64

sp
0.0    2223
1.0     205
Name: count, dtype: int64

si
0.0    2106
1.0     322
Name: count, dtype: int64

sp,0.0,1.0,All
sc,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0.0,2106,61,2167
1.0,117,144,261
All,2223,205,2428


In [6]:
pd.crosstab(index=df_all['sh_l'], columns=df_all['att'], dropna=False, margins=True)

att,0.0,1.0,All
sh_l,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0.0,2185,41,2226
1.0,138,64,202
All,2323,105,2428


In [7]:
df_all=df_all.rename(columns={'nssi':'shq'})

In [8]:
#NSSI will be defined as everyone who reported self-harm without reporting suicide attempt, with the addition of participants who reported self-harm without suicidal intent in jhs54g
#Creating a variable which captures the intersection of participants who reported attempted suicide and self-harm
df_all['shsa']=df_all['sh_l']
df_all['shsa'].value_counts()
df_all['shsa'] = np.where((df_all['att'] == 1) & (df_all['sh_l'] == 1), 1, 0)
df_all['shsa'].value_counts()
pd.crosstab(index=df_all['shsa'], columns=df_all['shq'], dropna=False, margins=True)

shsa
0.0    2226
1.0     202
Name: count, dtype: int64

shsa
0    2364
1      64
Name: count, dtype: int64

shq,-9.0,0.0,1.0,All
shsa,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,2364,0,0,2364
1,0,15,49,64
All,2364,15,49,2428


In [9]:
#Only participants who were eligible to answer the nssi question did so, thus it is okay to add the 'yes' participants to the self-harm cohort
#Defining the participants who self-harmed without attempting suicide- thus forming the basis of the NSSI cohort
df_all = df_all.assign(nssi=np.where((df_all['att'] == 0) & (df_all['sh_l'] == 1), 1, 0))
df_all['nssi'].value_counts()


nssi
0    2290
1     138
Name: count, dtype: int64

In [10]:
#Adding the participants who reported both attempts and self-harm AND answered yes to the specific nssi question jhs54g
df_all['nssi']=np.where((df_all['nssi'] == 0) & (df_all['shq'] == 1), 1, df_all['nssi'])
df_all['nssi'].value_counts()

nssi
0    2241
1     187
Name: count, dtype: int64

In [11]:
#Creating composite outcome sitbs
df_all['sitbs']=0
for column in df_all[['att', 'nssi', 'si']]:
    df_all['sitbs']=np.where(df_all[column]==1, 1, df_all['sitbs'])
df_all['sitbs'].value_counts()

sitbs
0    2025
1     403
Name: count, dtype: int64

In [12]:
columns_to_cross = ['si', 'att', 'nssi']

for column, other_column in combinations(columns_to_cross, 2):
    pd.crosstab(index=df_all[column], columns=df_all[other_column], dropna=False, margins=True)        

for column in columns_to_cross:
    pd.crosstab(index=df_all[column], columns=df_all['sitbs'], dropna=False, margins=True)  

att,0.0,1.0,All
si,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0.0,2085,21,2106
1.0,238,84,322
All,2323,105,2428


nssi,0,1,All
si,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0.0,2044,62,2106
1.0,197,125,322
All,2241,187,2428


nssi,0,1,All
att,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0.0,2185,138,2323
1.0,56,49,105
All,2241,187,2428


sitbs,0,1,All
si,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0.0,2025,81,2106
1.0,0,322,322
All,2025,403,2428


sitbs,0,1,All
att,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0.0,2025,298,2323
1.0,0,105,105
All,2025,403,2428


sitbs,0,1,All
nssi,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,2025,216,2241
1,0,187,187
All,2025,403,2428


In [13]:
df_all.shape
#Dropping features with less than 5% of positive responses to reduce the number of redundant features
pd.set_option('display.max_rows', 2000)
df_all2=df_all.select_dtypes(include=['int64'])
dropcols=list(df_all2.columns[df_all2.mean(axis=0)<0.05])

(2428, 1414)

In [14]:
df_all.columns.get_loc("A01")
df_all.columns.get_loc("hos")

1331

1399

In [15]:
df_all3=df_all.iloc[:, 1331:1400]
droppbs=df_all3.columns[df_all3.mean(axis=0)<0.05]

In [16]:
df_all3.shape
droppbs
droppbs.shape

(2428, 69)

Index(['A01', 'A02', 'A03', 'A04', 'A06', 'A07', 'A09', 'A10', 'A11', 'B01',
       'B02', 'B03', 'C01', 'C02', 'C03', 'C07', 'C08', 'C09', 'C10', 'D01',
       'D05', 'D06', 'D11', 'G02', 'G04', 'H01', 'H02', 'H03', 'H04', 'J02',
       'J04', 'J05', 'L01', 'L02', 'L04', 'M01', 'M03', 'M04', 'N03', 'N04B',
       'N05A', 'N05B', 'N05C', 'N07B', 'P02', 'P03', 'R01', 'R05', 'R07',
       'S01', 'S02', 'S03', 'V01', 'V06'],
      dtype='object')

(54,)

In [17]:
#Creating the final dataset with redundant features dropped
#s for small
df_alls=df_all.drop(dropcols, axis=1)
df_alls=df_alls.drop(droppbs, axis=1)
df_alls

Unnamed: 0,hicid,fd20a1,fd20c4,pe05c,pe06c2,fd24c1,f13ip1,f17ip1,f17zip1,f17aip1,...,y9spel,y9write,sh_l,sc,sp,shq,si,shsa,nssi,sitbs
0,51101040.0,-9.0,-9.0,-9.0,-9.0,-9.0,1.0,2.0,2.0,0.0,...,561.7,617.6,1.0,1.0,1.0,-9.0,1.0,0,1,1
1,51101041.0,-9.0,-9.0,-9.0,-9.0,-9.0,2.0,1.0,1.0,0.0,...,542.8,594.5,0.0,0.0,0.0,-9.0,0.0,0,0,0
2,51101046.0,-9.0,-9.0,-9.0,-9.0,-9.0,1.0,2.0,2.0,0.0,...,561.7,594.5,0.0,0.0,0.0,-9.0,0.0,0,0,0
3,51101053.0,-9.0,-9.0,-9.0,-9.0,-9.0,1.0,2.0,2.0,0.0,...,723.7,594.5,0.0,0.0,0.0,-9.0,0.0,0,0,0
4,51101060.0,-9.0,-9.0,-9.0,-9.0,-9.0,2.0,1.0,1.0,0.0,...,558.8,570.3,0.0,0.0,0.0,-9.0,0.0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2428,88116100.0,-9.0,-9.0,-9.0,-9.0,-9.0,1.0,2.0,2.0,0.0,...,-9.0,-9.0,0.0,1.0,1.0,-9.0,1.0,0,0,1
2429,88116142.0,-9.0,-9.0,-9.0,-9.0,-9.0,2.0,1.0,1.0,0.0,...,673.0,582.5,0.0,0.0,0.0,-9.0,0.0,0,0,0
2430,88116585.0,-9.0,-9.0,-9.0,-9.0,-9.0,1.0,2.0,2.0,0.0,...,552.3,676.0,0.0,0.0,0.0,-9.0,0.0,0,0,0
2431,88116874.0,-9.0,-9.0,-9.0,-9.0,-9.0,1.0,2.0,2.0,0.0,...,657.6,715.7,0.0,0.0,0.0,-9.0,0.0,0,0,0


In [23]:
list(df_alls.columns[-10:])
df_alls=df_alls.drop(['hicid'], axis=1)

['y9spel', 'y9write', 'sh_l', 'sc', 'sp', 'shq', 'si', 'shsa', 'nssi', 'sitbs']

In [24]:
dfallspath=Path('~/OneDrive - UNSW/Documents/lsac-data/df_alls.csv')
dfallspath.parent.mkdir(parents=True, exist_ok=True) 
df_alls.to_csv(dfallspath, index=False)
Xfull=df_alls.drop(columns=['att','sh_l', 'sc', 'sp', 'shq', 'si', 'shsa', 'nssi', 'sitbs'])
yfull_all=df_alls[['si', 'nssi', 'att','sitbs']] 

In [25]:
Xfull.shape
yfull_all.shape

(2428, 1011)

(2428, 4)

In [26]:
#To keep the feature matrix consistent, I will stratify the train and test datasets by outcomes of nssi, si, and att
#Split the data and only perform model development on the 70% training data from here onwards
X, X_hold, y, y_hold=train_test_split(Xfull, yfull_all, test_size=0.30, random_state=26, stratify=yfull_all)


In [28]:
X.shape
X_hold.shape
y.shape
y_hold.shape

(1699, 1011)

(729, 1011)

(1699, 4)

(729, 4)

In [29]:
X.to_csv('X.csv', index=False)
X_hold.to_csv('X_hold.csv', index=False)
y.to_csv('y.csv', index=False)
y_hold.to_csv('y_hold.csv', index=False)

In [30]:
#Checking the stratification worked for each outcome
sre=['si', 'att', 'nssi']
for i in sre:
        print(f'Proportion of {i} in y')
        y[i].value_counts(normalize=True)
        print(f'Proportion of {i} in y_hold')
        y_hold[i].value_counts(normalize=True)

Proportion of si in y


si
0.0    0.866981
1.0    0.133019
Name: proportion, dtype: float64

Proportion of si in y_hold


si
0.0    0.868313
1.0    0.131687
Name: proportion, dtype: float64

Proportion of att in y


att
0.0    0.957034
1.0    0.042966
Name: proportion, dtype: float64

Proportion of att in y_hold


att
0.0    0.956104
1.0    0.043896
Name: proportion, dtype: float64

Proportion of nssi in y


nssi
0    0.922896
1    0.077104
Name: proportion, dtype: float64

Proportion of nssi in y_hold


nssi
0    0.923182
1    0.076818
Name: proportion, dtype: float64

In [32]:
X=pd.read_csv('~/OneDrive - UNSW/Documents/lsac-data/X.csv')
#Imputing the training set
# explicitly require this experimental feature
from sklearn.experimental import enable_iterative_imputer  
# now you can import normally from sklearn.impute
from sklearn.impute import IterativeImputer
imputer=IterativeImputer(random_state=26)
imputer.fit(X)
Xi=X
Xi[:]=imputer.transform(Xi)

In [6]:
#Scaling the training dataset
from sklearn import preprocessing
scaler=preprocessing.StandardScaler()
#Storing the numerical columns in a list
nums=['perapp',	'peravoid',	'mastavoid',	'mastapp',	'peerpos',	'peermoral',	'peerpostot',	'hb16c12',	'hb16c12a',	'hb15c10',	'hb15c10a',	'hb26c2',	'hb26c2a',	'hb28c3',	'hb27c2',	'hb16c15b',	'hb31c2',	'hb16c15d',	'se27c1',	're23c1',	'hb16c10',	'calcharm',	'he17c2b2',	'he06c2b2',	'he06c3b2',	'tvweek',	'egweek',	'aanga',	'banga',	'aarga',	'barga',	'pw09a',	'pw09b',	'pw09c',	'pw09a',	'pw09b',	'aextra',	'aagree',	'aconsc',	'aneuro',	'aopen',	'cextra',	'cagree',	'cconsc',	'cneuro',	'copen',	'bextra',	'bagree',	'bconsc',	'bneuro',	'bopen',	'bodyfat',	'cbmi',	'chu9d',	'hb15c13',	'acons',	'bcons',	'bcopar',	'smfq',	'fp06c3',	'fp06c4',	'cfout',	'agambf',	'chshipc',	'ahendb',	'bhendb',	'fn13c10',	'airc',	'birc',	'bk6s',	'atotss',	'fd04c',	'pw04c2',	'noldsib',	'npeople',	'nsasib',	'nsib',	'nyngsib',	'oral',	'asupport',	'bsupport',	'beffic',	'sc11c3c',	'sc11b3c',	'pedsef',	'pedsphy',	'pedspse',	'pedspsd',	'pedspsc',	'pedste',	'pedstd',	'pedstc',	'pedssof',	'pedsscd',	'pedsscc',	'pedsphyb',	'cnfp16',	'fp02c2a',	'hs53a1a',	'apgsi',	'pssm',	'cresl',	'fn13p',	'acondb',	'ccondb',	'bcondb',	'bpsoc',	'aemot',	'cemot',	'bemot',	'ahypr',	'chypr',	'bhypr',	'apeer',	'cpeer',	'bpeer',	'apsoc',	'cpsoc',	'csdqtb',	'bsdqtb',	'cnfsad2',	'sle',	'hb14c2',	'hb13c2b',	'hb13c1b',	'ho06c5',	'numcond',	'hinci',	'hs23c3', 'A01',	'A02',	'A03',	'A04',	'A06',	'A07',	'A09',	'A10',	'A11',	'B01',	'B02',	'B03',	'C01',	'C02',	'C03',	'C07',	'C08',	'C09',	'C10',	'D01',	'D05',	'D06',	'D07',	'D10',	'D11',	'G02',	'G03',	'G04',	'H01',	'H02',	'H03',	'H04',	'J01',	'J02',	'J04',	'J05',	'L01',	'L02',	'L04',	'M01',	'M03',	'M04',	'N02',	'N03',	'N04B',	'N05A',	'N05B',	'N05C',	'N06A',	'N06B',	'N07B',	'P02',	'P03',	'R01',	'R03',	'R05',	'R07',	'S01',	'S02',	'S03',	'V01',	'V06',	'benefit',	'mhcp',	'psychol',	'psychia',	'gp',	'mbs',	'hos',	'y9test',	'y9gram',	'y9num',	'y9read',	'y9spel',	'y9write']
Xicols=Xi.columns.values.tolist()
nums2=list(set(Xicols) & set(nums))
numfs=Xi[nums2]
Xi[nums2]=scaler.fit_transform(numfs.values)
Xi.to_csv('Xi.csv')

NameError: name 'Xi' is not defined

In [4]:
X_hold=pd.read_csv('~/OneDrive - UNSW/Documents/lsac-data/X_hold.csv')
#Imputing the holdout set
from sklearn.experimental import enable_iterative_imputer  
# now you can import normally from sklearn.impute
from sklearn.impute import IterativeImputer
imputer=IterativeImputer(random_state=26)
imputer.fit(X_hold)
Xi_hold=X_hold
Xi_hold[:]=imputer.transform(Xi_hold)

In [9]:
pd.set_option('display.max_rows', 20)
#Scaling the holdout dataset
from sklearn import preprocessing
scaler=preprocessing.StandardScaler()
#Storing the numerical columns in a list
nums=['perapp',	'peravoid',	'mastavoid',	'mastapp',	'peerpos',	'peermoral',	'peerpostot',	'hb16c12',	'hb16c12a',	'hb15c10',	'hb15c10a',	'hb26c2',	'hb26c2a',	'hb28c3',	'hb27c2',	'hb16c15b',	'hb31c2',	'hb16c15d',	'se27c1',	're23c1',	'hb16c10',	'calcharm',	'he17c2b2',	'he06c2b2',	'he06c3b2',	'tvweek',	'egweek',	'aanga',	'banga',	'aarga',	'barga',	'pw09a',	'pw09b',	'pw09c',	'pw09a',	'pw09b',	'aextra',	'aagree',	'aconsc',	'aneuro',	'aopen',	'cextra',	'cagree',	'cconsc',	'cneuro',	'copen',	'bextra',	'bagree',	'bconsc',	'bneuro',	'bopen',	'bodyfat',	'cbmi',	'chu9d',	'hb15c13',	'acons',	'bcons',	'bcopar',	'smfq',	'fp06c3',	'fp06c4',	'cfout',	'agambf',	'chshipc',	'ahendb',	'bhendb',	'fn13c10',	'airc',	'birc',	'bk6s',	'atotss',	'fd04c',	'pw04c2',	'noldsib',	'npeople',	'nsasib',	'nsib',	'nyngsib',	'oral',	'asupport',	'bsupport',	'beffic',	'sc11c3c',	'sc11b3c',	'pedsef',	'pedsphy',	'pedspse',	'pedspsd',	'pedspsc',	'pedste',	'pedstd',	'pedstc',	'pedssof',	'pedsscd',	'pedsscc',	'pedsphyb',	'cnfp16',	'fp02c2a',	'hs53a1a',	'apgsi',	'pssm',	'cresl',	'fn13p',	'acondb',	'ccondb',	'bcondb',	'bpsoc',	'aemot',	'cemot',	'bemot',	'ahypr',	'chypr',	'bhypr',	'apeer',	'cpeer',	'bpeer',	'apsoc',	'cpsoc',	'csdqtb',	'bsdqtb',	'cnfsad2',	'sle',	'hb14c2',	'hb13c2b',	'hb13c1b',	'ho06c5',	'numcond',	'hinci',	'hs23c3', 'A01',	'A02',	'A03',	'A04',	'A06',	'A07',	'A09',	'A10',	'A11',	'B01',	'B02',	'B03',	'C01',	'C02',	'C03',	'C07',	'C08',	'C09',	'C10',	'D01',	'D05',	'D06',	'D07',	'D10',	'D11',	'G02',	'G03',	'G04',	'H01',	'H02',	'H03',	'H04',	'J01',	'J02',	'J04',	'J05',	'L01',	'L02',	'L04',	'M01',	'M03',	'M04',	'N02',	'N03',	'N04B',	'N05A',	'N05B',	'N05C',	'N06A',	'N06B',	'N07B',	'P02',	'P03',	'R01',	'R03',	'R05',	'R07',	'S01',	'S02',	'S03',	'V01',	'V06',	'benefit',	'mhcp',	'psychol',	'psychia',	'gp',	'mbs',	'hos',	'y9test',	'y9gram',	'y9num',	'y9read',	'y9spel',	'y9write']
Xi_hold_cols=Xi_hold.columns.values.tolist()
nums2=list(set(Xi_hold_cols) & set(nums))
numhs=Xi_hold[nums2]
Xi_hold[nums2]=scaler.fit_transform(numhs.values)
Xi_hold

Unnamed: 0,fd20a1,fd20c4,pe05c,pe06c2,fd24c1,f13ip1,f17ip1,f17zip1,f17aip1,f17bip1,...,psychia,gp,mbs,hos,y9test,y9gram,y9num,y9read,y9spel,y9write
0,-9.0,-9.0,-9.0,-9.0,-9.0,1.0,2.0,2.0,0.0,0.0,...,-0.120109,0.141955,-0.401386,-0.114709,0.277500,0.742737,0.542185,0.482281,0.769311,0.371608
1,-9.0,-9.0,-9.0,-9.0,-9.0,1.0,2.0,2.0,0.0,0.0,...,-0.120109,-0.474027,-0.499610,-0.114709,0.275579,0.600097,0.520104,0.453224,0.392045,0.371608
2,-9.0,-9.0,-9.0,-9.0,-9.0,1.0,2.0,2.0,0.0,0.0,...,-0.120109,-0.474027,-0.597834,-0.114709,0.277500,0.541766,0.995751,1.102658,0.628022,0.657920
3,-9.0,-9.0,-9.0,-9.0,-9.0,1.0,2.0,2.0,0.0,0.0,...,-0.120109,-0.474027,-0.204937,-0.114709,0.277500,0.109922,-0.049383,-0.033489,-0.384795,-0.459493
4,-9.0,-9.0,-9.0,-9.0,-9.0,1.0,2.0,2.0,0.0,0.0,...,-0.120109,-0.474027,-0.401386,-0.114709,0.277500,0.742737,0.698127,0.716194,0.507059,1.106774
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
724,-9.0,-9.0,-9.0,-9.0,-9.0,2.0,1.0,1.0,0.0,0.0,...,-0.120109,-0.782018,-0.696059,-0.114709,0.275579,0.146195,-2.510878,-1.214192,0.138221,0.310965
725,-9.0,-9.0,-9.0,-9.0,-9.0,1.0,2.0,2.0,0.0,0.0,...,-0.120109,-0.782018,-0.696059,-0.114709,0.277500,-0.080756,-0.000623,0.031890,0.345940,-0.035989
726,-9.0,-9.0,-9.0,-9.0,-9.0,1.0,2.0,2.0,0.0,0.0,...,-0.120109,0.141955,-0.401386,-0.114709,0.277500,0.742737,0.942850,0.999020,0.628022,1.033705
727,-9.0,-9.0,-9.0,-9.0,-9.0,1.0,2.0,2.0,0.0,0.0,...,-0.120109,-0.782018,-0.696059,-0.114709,0.279420,0.651075,0.530685,0.419808,0.420799,0.371608


In [10]:
Xi_hold.to_csv('Xi_hold.csv', index=False)

In [38]:
#Preparing the outcome datasets for all four models
outcome_datasets = {}

# List of outcomes
outcomes = ['si', 'att', 'nssi', 'sitbs']

# Creating separate datasets for each outcome- datasets saved as y_si, y_att, etc.
for outcome in outcomes:
    globals()[f"y_{outcome}"] = y[outcome].copy()

In [42]:
#Checking the resultant outcome datasets
for outcome in outcomes:
    globals()[f"y_{outcome}"]

6       1.0
993     0.0
556     1.0
1234    0.0
2059    0.0
       ... 
1555    0.0
139     0.0
1039    1.0
2058    0.0
433     0.0
Name: si, Length: 1699, dtype: float64

6       0.0
993     0.0
556     0.0
1234    0.0
2059    0.0
       ... 
1555    0.0
139     0.0
1039    1.0
2058    0.0
433     0.0
Name: att, Length: 1699, dtype: float64

6       0
993     0
556     1
1234    0
2059    0
       ..
1555    0
139     0
1039    1
2058    0
433     0
Name: nssi, Length: 1699, dtype: int32

6       1
993     0
556     1
1234    0
2059    0
       ..
1555    0
139     0
1039    1
2058    0
433     0
Name: sitbs, Length: 1699, dtype: int64

In [52]:
# Creating separate datasets for each outcome in the hold-out datasets
for outcome in outcomes:
    globals()[f"y_hold_{outcome}"] = y_hold[outcome].copy()

In [53]:
#Checking the resultant outcome datasets
for outcome in outcomes:
    globals()[f"y_hold_{outcome}"]

149     0.0
488     1.0
1527    0.0
1435    0.0
1062    0.0
       ... 
4       0.0
1489    1.0
1873    0.0
1280    0.0
242     0.0
Name: si, Length: 729, dtype: float64

149     0.0
488     0.0
1527    0.0
1435    0.0
1062    0.0
       ... 
4       0.0
1489    0.0
1873    0.0
1280    0.0
242     0.0
Name: att, Length: 729, dtype: float64

149     0
488     0
1527    0
1435    0
1062    0
       ..
4       0
1489    0
1873    0
1280    0
242     0
Name: nssi, Length: 729, dtype: int32

149     0
488     1
1527    0
1435    0
1062    0
       ..
4       0
1489    1
1873    0
1280    0
242     0
Name: sitbs, Length: 729, dtype: int64

In [55]:
#Saving the datasets
for outcome in outcomes:
    globals()[f"y_{outcome}"].to_csv(f"y_{outcome}.csv", index=False)

for outcome in outcomes:
    globals()[f"y_hold_{outcome}"].to_csv(f"y_hold_{outcome}.csv", index=False)