# SWAN data

The official website of this data:
https://www.icpsr.umich.edu/icpsrweb/ICPSR/series/00253

related paper: Quintana, F. A., Johnson, W. O., Waetjen, L. E., & B. Gold, E. (2016). Bayesian nonparametric longitudinal data analysis. Journal of the American Statistical Association, 111(515), 1168-1181.

Preprocessing:
This dataset comes as 11 separate files, which I combine into a single large file.

1. There are three types of features: numerical, categorical, and date. All date-related features are removed.
1. Categorical features are expanded using one-hot encoding. NaN values of a categorical feature are treated as a separate category, and the resulting NaN indicator columns are then removed.
1. NaN values of numerical features are imputed using regression.
1. The observation is defined as the age of the subject, reflecting the belief that a subject's age is another main factor in the outcome.

Statistics:

records: 28489

features: 139 (including subject id, observation id and outcome)

subjects: 3302

observations: 11 (ranging from 1994 - 2008) (age of subject: 42 - 63)

In [28]:
import pandas as pd
import numpy as np

# Root of the local OneDrive folder; roots used on other machines are kept for reference.
# driveLoc = "F:/onedrive"
# driveLoc = "/Users/jul672/Desktop/OneDrive"
driveLoc = "C:/Users/jokit/OneDrive"
# Directory prefix (trailing slash included) to which data file names are appended.
fileLoc = f"{driveLoc}/phd projects/Study of Women's Health Across the Nation (SWAN)/"

In [29]:
# heads = None
# for i in range(11):
#     file = fileLoc + 'wave' + str(i) + '.csv'
#     if heads is None:
#         heads = pd.read_csv(file,nrows=1).columns.values
#         heads[2:] = [x[:-1] for x in heads[2:]]
#     else:
#         tmp = pd.read_csv(file,nrows=1).columns.values
#         if i < 10:
#             tmp[2:] = [x[:-1] for x in tmp[2:]]
#         else:
#             tmp[2:] = [x[:-2] for x in tmp[2:]]
#         mask = [x in tmp for x in heads]
#         heads = heads[mask]
# legalHeads = heads

In [30]:
# legalHeads

In [31]:
# Columns requested from full.csv on the first load. Besides the identifiers
# and AGE this includes the 20 questionnaire items ('BOTHER' ... 'GETGOIN')
# that are summed into the CES-D score further down.
legalHeads = ['SWANID', 'VISIT', 'AGE', 'PREGNAN', 'ALCHL24','RACE',
       'EATDRIN', 'STRTPER', 'BLDRWAT', 'BLDDRAW', 'THYROID', 'STROKE',
       'HBCHOLE', 'MIGRAIN', 'OSTEOAR', 'ANEMIA', 'BOTHER', 'APPETIT',
       'BLUES', 'GOOD', 'KEEPMIN', 'DEPRESS', 'EFFORT', 'HOPEFUL',
       'FAILURE', 'FEARFUL', 'RESTLES', 'HAPPY', 'TALKLES', 'LONELY',
       'UNFRNDL', 'ENJOY', 'CRYING', 'SAD', 'DISLIKE', 'GETGOIN', 'JOB',
       'HOSPSTA', 'MDTALK', 'PAPSMEA', 'BRSTEXA', 'MAMOGRA',
       'SMOKERE', 'AVCIGDA', 'STIFF', 'COLDSWE', 'NITESWE', 'VAGINDR',
       'FEELBLU', 'DIZZY', 'IRRITAB', 'NRVOUS', 'FORGET', 'MOODCHG',
       'HARTRAC', 'FEARFULA', 'HDACHE', 'HOTFLAS', 'TRBLSLE', 'WAKEUP',
       'WAKEARL', 'DANDC', 'UTERPRO', 'INCOME', 'STARTNE', 'WORKTRB',
       'QUITJOB', 'WORKLOA', 'PRTUNEM', 'MONEYPR', 'WORSREL', 'RELATEN',
       'SERIPRO', 'CHILDMO', 'RESPCAR', 'LEGALPR', 'SELFVIO', 'MAJEVEN',
       'PULSE', 'SYSBP1', 'DIABP1', 'SYSBP2', 'DIABP2',
       'HEIGHT', 'HTMETHO', 'WEIGHT', 'SCALE', 'WAIST', 'WASTMEA', 'HIP',
       'HIPMEAS', 'BMI', 'DHAS', 'FSH', 'SHBG', 'T',
       'E2AVE', 'FLGCV', 'FLGDIF', 'SPSCTIM', 'SPSCMOD',
       'HPSCTIM', 'HPSCMOD', 'SPBMDT', 'HPBMDT', 'BMDFLG',
       'STATUS']

In [32]:
# Subset of the first-pass columns (original SWAN names).
# NOTE: within this file `needDummy` is reassigned (with the renamed columns)
# before it is read anywhere, so this first definition is effectively unused.
needDummy = ['PREGNAN', 'ALCHL24','RACE',
       'EATDRIN', 'STRTPER', 'BLDRWAT', 'BLDDRAW', 'THYROID', 'STROKE',
       'HBCHOLE', 'MIGRAIN', 'OSTEOAR', 'ANEMIA', 'BOTHER', 'APPETIT',
       'BLUES', 'GOOD', 'KEEPMIN', 'DEPRESS', 'EFFORT', 'HOPEFUL',
       'FAILURE', 'FEARFUL', 'RESTLES', 'HAPPY', 'TALKLES', 'LONELY',
       'UNFRNDL', 'ENJOY', 'CRYING', 'SAD', 'DISLIKE', 'GETGOIN', 'JOB',
       'PAPSMEA', 'BRSTEXA', 'MAMOGRA',
       'SMOKERE', 'AVCIGDA', 'STIFF', 'COLDSWE', 'NITESWE', 'VAGINDR',
       'FEELBLU', 'DIZZY', 'IRRITAB', 'NRVOUS', 'FORGET', 'MOODCHG',
       'HARTRAC', 'FEARFULA', 'HDACHE', 'HOTFLAS', 'TRBLSLE', 'WAKEUP',
       'WAKEARL', 'DANDC', 'UTERPRO', 'INCOME', 'STARTNE', 'WORKTRB',
       'QUITJOB', 'WORKLOA', 'PRTUNEM', 'MONEYPR', 'WORSREL', 'RELATEN',
       'SERIPRO', 'CHILDMO', 'RESPCAR', 'LEGALPR', 'SELFVIO', 'MAJEVEN',
       'HTMETHO', 'SCALE', 'WASTMEA',
       'HIPMEAS',
       'FLGCV', 'FLGDIF', 'SPSCMOD',
       'HPSCMOD','BMDFLG',
       'STATUS']

In [33]:
# First load of the merged, semicolon-separated file, restricted to the
# first-pass column list (which contains the CES-D item columns).
data = pd.read_csv(fileLoc + 'full.csv', sep=';', usecols=legalHeads)

In [34]:
# compute CES-D   https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3197240/
# Columns in `bad` are scored 0..3 as the reported frequency increases;
# columns in `good` are scored in reverse (3..0) by getCesdScoreForColumn.
# Both lists are declared again in the next cell before any column is scored.
# NOTE(review): the standard 20-item CES-D reverse-scores only its four
# positively worded items; EFFORT and KEEPMIN look like negatively worded
# items (CES-D items 7 and 5) -- confirm against the SWAN codebook.
bad = ['BOTHER','APPETIT','BLUES','DEPRESS','FAILURE','FEARFUL','RESTLES','TALKLES','LONELY','UNFRNDL','CRYING','SAD','DISLIKE','GETGOIN']
good = ['GOOD','EFFORT','HOPEFUL','HAPPY','ENJOY','KEEPMIN']

def formatEntry(x):
    """Normalise a raw cell value to a lower-case label.

    The value is converted to ``str`` and lower-cased.  If the result
    contains a colon, the stripped text of the second colon-separated field
    is returned (any later fields are ignored); otherwise the whole
    lower-cased string is returned unchanged.
    """
    pieces = str(x).lower().split(':')
    return pieces[1].strip() if len(pieces) > 1 else pieces[0]

def getCesdScoreForColumn(col):
    """Score one questionnaire column of the global ``data`` frame.

    Every entry is normalised with ``formatEntry`` and mapped to 0-3.
    Columns listed in the global ``bad`` score 0, 1, 2, 3 from the least to
    the most frequent answer; any other column is scored in reverse
    (3, 2, 1, 0).  An answer that matches none of the four known labels
    (for example a missing value) scores 0 in both cases.

    Returns a NumPy array holding one score per row of ``data``.
    """
    answers = (
        'rarely/none of the time (< 1 day)',
        'some/a little of the time (1-2 days)',
        'occasionally/mod amt of the time (3-4 days)',
        'most/all of the time (5-7 days)',
    )
    points = (0, 1, 2, 3) if col in bad else (3, 2, 1, 0)
    lookup = dict(zip(answers, points))
    return np.array([lookup.get(formatEntry(raw), 0) for raw in data[col].values])

In [35]:
# CES-D item columns.  The standard 20-item CES-D reverse-scores only its four
# positively worded items (felt as good as others, hopeful, happy, enjoyed
# life).  EFFORT and KEEPMIN correspond to the negatively worded items 7 and 5
# ("everything was an effort", "trouble keeping my mind on what I was doing"),
# so they are scored directly together with the other symptom items.
# NOTE(review): item wording taken from the standard scale; verify the SWAN
# variable meanings against the codebook.
bad = ['BOTHER','APPETIT','BLUES','DEPRESS','FAILURE','FEARFUL','RESTLES','TALKLES','LONELY','UNFRNDL','CRYING','SAD','DISLIKE','GETGOIN','EFFORT','KEEPMIN']
good = ['GOOD','HOPEFUL','HAPPY','ENJOY']

# Total CES-D score per record: the sum of all item scores.
# getCesdScoreForColumn reads the global `bad` to decide the scoring direction.
cesd = None
for col in bad + good:
    itemScore = getCesdScoreForColumn(col)
    cesd = itemScore if cesd is None else cesd + itemScore

In [36]:
# Columns kept for the final feature set (used for the second load of
# full.csv).  The order must line up one-to-one with legalHeadsRename, which
# supplies the new name for the entry at the same position.
legalHeads = ['SWANID', 'VISIT', 'AGE', 'PREGNAN', 'ALCHL24','RACE',
       'EATDRIN', 'STRTPER', 'THYROID', 'STROKE',
       'MIGRAIN', 'OSTEOAR', 'ANEMIA','JOB',
       'HOSPSTA', 'MDTALK', 
       'SMOKERE', 'STIFF', 'COLDSWE', 'NITESWE', 
       'DIZZY', 'IRRITAB', 'NRVOUS', 'FORGET', 'MOODCHG',
       'HARTRAC', 'HDACHE', 'HOTFLAS', 'TRBLSLE', 'WAKEUP',
       'WAKEARL', 'DANDC', 'UTERPRO', 'INCOME', 'STARTNE', 'WORKTRB',
       'QUITJOB', 'WORKLOA', 'PRTUNEM', 'MONEYPR', 'WORSREL', 'RELATEN',
       'SERIPRO', 'CHILDMO', 'RESPCAR', 'LEGALPR', 'SELFVIO', 'MAJEVEN',
       'HEIGHT','WEIGHT', 'WAIST', 'HIP',
       'BMI', 'DHAS', 'FSH', 'SHBG', 'T',
       'E2AVE','SPBMDT', 'HPBMDT','STATUS']

In [37]:
# Readable column names, positionally aligned with legalHeads; the loaded
# columns are renamed to these.
# NOTE(review): 'SplineBoneDensity' presumably means spine bone density
# (source column SPBMDT) -- confirm before renaming, because these names end
# up in the column-name array written to the output files.
legalHeadsRename = np.array(['SWANID', 'VISIT', 'AGE', 'Pregnant', 'Alcohol24Hrs','Race',
       'EatOrDrink12Hrs', 'PeriodPastWk', 'ThyroidMed', 'Stroke',
       'Migraines', 'Arthritis', 'Anemia','WorkPast2Wks',
       'HospitalStayPastYear', 'TalkLastYr', 
       'Smoke', 'StiffnessPast2Wks', 'ColdSweatPast2Wks', 'NightSweatPast2Wks', 
       'DizzyPast2Wks', 'IrritablePast2Wks', 'NervousPast2Wks', 'ForgetfulnessPast2Wks', 'MoodChangePast2Wks',
       'HeartRacingPast2Wks', 'HeadachePast2Wks', 'HotFlashesPast2Wks', 'TroubleSleepPast2Wks', 'WakeupSeveralTimesPast2Wks',
       'WakeupEarlyPast2Wks', 'HadD&C', 'HadUterineProcedures', 'Income', 'StartNewJobUpset', 'WorkProblemUpset',
       'QuitJobUpset', 'IncreaseWorkLoadUpset', 'PartnerUnempUpset', 'MoneyProblemUpset', 'WorsenRelationUpset', 'EndedRelationUpset',
       'FamilyProblemUpset', 'ChildMovedUpset', 'ResponsibilityForCareUpset', 'LegalProblemUpset', 'ViolentEventUpset', 'OtherEventUpset',
       'Height', 'Weight', 'Waist', 'Hip',
       'BMI', 'DHAS', 'FSH', 'SHBG', 'Testosterone',
       'Estradiol','SplineBoneDensity', 'HipBoneDensity','STATUS'])

In [38]:
# Renamed columns handled as categorical: they are normalised with
# formatEntry and then encoded, and they are dropped from the frame that
# feeds the standardised continuous block.
needDummy = [
    'Pregnant', 'Alcohol24Hrs','Race',
       'EatOrDrink12Hrs', 'PeriodPastWk', 'ThyroidMed', 'Stroke',
       'Migraines', 'Arthritis', 'Anemia','WorkPast2Wks',
       'HospitalStayPastYear', 'TalkLastYr', 
       'Smoke', 'StiffnessPast2Wks', 'ColdSweatPast2Wks', 'NightSweatPast2Wks', 
       'DizzyPast2Wks', 'IrritablePast2Wks', 'NervousPast2Wks', 'ForgetfulnessPast2Wks', 'MoodChangePast2Wks',
       'HeartRacingPast2Wks', 'HeadachePast2Wks', 'HotFlashesPast2Wks', 'TroubleSleepPast2Wks', 'WakeupSeveralTimesPast2Wks',
       'WakeupEarlyPast2Wks', 'HadD&C', 'HadUterineProcedures', 'Income', 'StartNewJobUpset', 'WorkProblemUpset',
       'QuitJobUpset', 'IncreaseWorkLoadUpset', 'PartnerUnempUpset', 'MoneyProblemUpset', 'WorsenRelationUpset', 'EndedRelationUpset',
       'FamilyProblemUpset', 'ChildMovedUpset', 'ResponsibilityForCareUpset', 'LegalProblemUpset', 'ViolentEventUpset', 'OtherEventUpset',
       'STATUS'
]

In [39]:
# Second load: only the retained columns, renamed position-by-position
# from legalHeads to legalHeadsRename.
data = pd.read_csv(fileLoc + 'full.csv', sep=';', usecols=legalHeads)
data = data.rename(columns=dict(zip(legalHeads, legalHeadsRename)))

In [40]:
# Turn some of the continuous factors into discrete, more meaningful values.
# Raw values of the 'TalkLastYr' column, binned by TalkMuch below.
talk = data['TalkLastYr'].values
def TalkMuch(x):
    """Bin a numeric value into a label.

    Returns 'talk a lot (>10)' above 10, 'few (1 to 10)' from 1 to 10
    inclusive, 'never (0)' for exactly 0, and 'nan' for everything else
    (NaN, negative values, values strictly between 0 and 1).
    """
    if x > 10:
        return 'talk a lot (>10)'
    if 1 <= x <= 10:
        return 'few (1 to 10)'
    return 'never (0)' if x == 0 else 'nan'
# Replace the raw 'TalkLastYr' values with their binned labels.
talkDis = list(map(TalkMuch, talk))
data['TalkLastYr'] = talkDis

# hospital stay: raw values, converted to labels by HospStay below
hpsStay = data['HospitalStayPastYear'].values
def HospStay(x):
    """Map a numeric value to 'no' (exactly 0), 'yes' (positive) or 'nan' (anything else)."""
    if x == 0:
        return 'no'
    return 'yes' if x > 0 else 'nan'
# Replace the raw 'HospitalStayPastYear' values with their labels.
hpsStayDis = list(map(HospStay, hpsStay))
data['HospitalStayPastYear'] = hpsStayDis

In [41]:
# reformat the income
def incomeReformat(x):
    """Collapse income labels into three bands.

    For every string in ``x`` only the text after the last underscore is
    looked at.  Labels that match none of the known income ranges become
    'nan'.  Returns a list with one band per input entry.
    """
    bands = {
        'less than $19,999': 'very low (<20k)',
        '$20,000 to $49,999': 'medium (20k to 100k)',
        '$50,000 to $99,999': 'medium (20k to 100k)',
        '$100,000 or more': 'high (>100k)',
    }
    return [bands.get(label.split('_')[-1], 'nan') for label in x]

In [42]:
# change the values of some discrete variables
def cutDownValues(x):
    """Collapse day-count frequency labels into three levels.

    For every string in ``x`` only the text after the last underscore is
    looked at.  Unrecognised labels become 'nan'.  Returns a list.
    """
    levels = {
        'every day': 'a lot (>8)',
        '9-13 days': 'a lot (>8)',
        'not at all': 'never',
        '1-5 days': 'some (1 to 8)',
        '6-8 days': 'some (1 to 8)',
    }
    return [levels.get(label.split('_')[-1], 'nan') for label in x]
            
def upsettingCutDown(x):
    """Collapse life-event answers into 'yes, upset', 'yes, but not upset' or 'no'.

    For every string in ``x`` only the text after the last underscore is
    looked at.  Unrecognised labels become 'nan'.  Returns a list.
    """
    levels = {
        'yes, very upsetting & still upsetting': 'yes, upset',
        'yes, very upsetting': 'yes, upset',
        'yes, somewhat upsetting': 'yes, upset',
        'yes, not at all upsetting': 'yes, but not upset',
        'no': 'no',
    }
    return [levels.get(label.split('_')[-1], 'nan') for label in x]
# Columns whose day-count answers are collapsed with cutDownValues.
featureNeedCutDown = ['StiffnessPast2Wks','ColdSweatPast2Wks', 'NightSweatPast2Wks', 
       'DizzyPast2Wks', 'IrritablePast2Wks', 'NervousPast2Wks', 'ForgetfulnessPast2Wks', 'MoodChangePast2Wks',
       'HeartRacingPast2Wks', 'HeadachePast2Wks', 'HotFlashesPast2Wks', 'TroubleSleepPast2Wks', 'WakeupSeveralTimesPast2Wks',
       'WakeupEarlyPast2Wks',]
# Columns whose life-event answers are collapsed with upsettingCutDown.
upsetFeatures = ['StartNewJobUpset', 'WorkProblemUpset',
       'QuitJobUpset', 'IncreaseWorkLoadUpset', 'PartnerUnempUpset', 'MoneyProblemUpset', 'WorsenRelationUpset', 'EndedRelationUpset',
       'FamilyProblemUpset', 'ChildMovedUpset', 'ResponsibilityForCareUpset', 'LegalProblemUpset', 'ViolentEventUpset', 'OtherEventUpset']

In [43]:
# Column whose value serves as the observation index of each record.
observationCol = 'AGE'

# Normalise every categorical column to lower-case labels.
tmpData = pd.DataFrame({
    name: [formatEntry(value) for value in data[name].values]
    for name in needDummy
})

# Coarsen the frequency answers, the life-event answers and the income bands.
for name in featureNeedCutDown:
    tmpData[name] = cutDownValues(tmpData[name].values)
for name in upsetFeatures:
    tmpData[name] = upsettingCutDown(tmpData[name].values)
tmpData['Income'] = incomeReformat(tmpData['Income'].values)

## format for LGPR

In [44]:
from sklearn.preprocessing import OneHotEncoder,StandardScaler

In [45]:
def categorizeFeature(x):
    """Integer-encode one categorical column.

    The distinct values of ``x`` are sorted and numbered 0..k-1, so the same
    input always produces the same codes.  (Numbering the levels in ``set``
    iteration order would make the codes of string labels differ between
    interpreter runs because of hash randomisation.)  The levels must be
    mutually orderable, e.g. all strings.

    Returns an ``(n, 1)`` integer array of codes, or ``None`` when ``x`` has
    fewer than two distinct values and therefore carries no information.
    """
    idmap = {level: code for code, level in enumerate(sorted(set(x)))}
    if len(idmap) > 1:
        return np.array([idmap[each] for each in x]).reshape(-1, 1)
    return None

In [46]:
# Integer-encode every categorical column; columns for which
# categorizeFeature returns None (fewer than two levels) are left out.
tmpData_np = np.array(tmpData.values)
encodedCols = []
colnames = []
for pos, name in enumerate(tmpData.columns.values):
    codes = categorizeFeature(tmpData_np[:, pos])
    if codes is not None:
        encodedCols.append(codes)
        colnames.append(name)
# Side-by-side matrix of the encoded columns (None when nothing was kept).
cateRes = np.concatenate(encodedCols, axis=1) if encodedCols else None
colnames = np.array(colnames)
# Positions of the categorical columns inside the final feature matrix.
cateCols = np.arange(len(colnames))

In [47]:
# Continuous block: every column that is neither categorical nor an identifier.
tmpContData = data.drop(columns=list(needDummy) + ['VISIT', 'SWANID'])
stdModel = StandardScaler()
stdRes = stdModel.fit_transform(tmpContData.to_numpy())
# impute the missing values in the continuous feature
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
imp = IterativeImputer(max_iter=50, random_state=0)
stdRes = imp.fit_transform(stdRes)
# Continuous column names follow the categorical ones, and so do their positions.
colnames = np.concatenate([colnames, tmpContData.columns.values])
contCols = np.arange(len(cateCols), len(cateCols) + tmpContData.shape[1])

In [48]:
# Subject id and observation value of every record.
idCol = data['SWANID'].values
oidCol = data[observationCol].values
# Feature matrix: integer-coded categorical columns followed by the continuous ones.
X = np.hstack([cateRes, stdRes])
# Outcome: CES-D total shifted by a constant.
# NOTE(review): the offset 15 is not explained in this file -- confirm its rationale.
y = cesd - 15

In [49]:
# Drop every record whose observation value is missing.
mask_remove = np.isnan(oidCol)
keepRows = ~mask_remove
idCol, oidCol, X, y = idCol[keepRows], oidCol[keepRows], X[keepRows], y[keepRows]

In [50]:
# format a small dataset containing only several individuals
from collections import Counter
ct = Counter(idCol)
top = 50
targetIds = set([x[0] for x in ct.most_common()[:top]])
mask = [x in targetIds for x in idCol]
idCol = idCol[mask]
oidCol = oidCol[mask]
X = X[mask]
y = y[mask]

## format for other algorithms

In [49]:
# One-hot encode all categorical columns; each indicator column is named
# '<column>_<category>'.
onehotModel = OneHotEncoder()
onehotRes = onehotModel.fit_transform(np.array(tmpData.values))
colnames = [
    f'{name}_{level}'
    for name, levels in zip(tmpData.columns.values, onehotModel.categories_)
    for level in levels
]

In [50]:
# Continuous block: every column that is neither categorical nor an identifier.
tmpContData = data.drop(columns=list(needDummy) + ['VISIT', 'SWANID'])
stdModel = StandardScaler()
stdRes = stdModel.fit_transform(tmpContData.to_numpy())
# impute the missing values in the continuous feature
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
imp = IterativeImputer(max_iter=50, random_state=0)
stdRes = imp.fit_transform(stdRes)
# Continuous column names follow the one-hot indicator names.
colnames = np.concatenate([colnames, tmpContData.columns.values])

In [51]:
# Subject id and observation value of every record.
idCol = data['SWANID'].values
oidCol = data[observationCol].values
# Feature matrix: dense one-hot indicators followed by the continuous columns.
X = np.hstack([onehotRes.toarray(), stdRes])
y = cesd - 15
# Discard the columns whose name ends in 'nan' (indicators of missing values).
v = [not name.endswith('nan') for name in colnames]
X = X[:, v]
colnames = colnames[v]

# Drop every record whose observation value is missing.
mask_remove = np.isnan(oidCol)
keepRows = ~mask_remove
idCol, oidCol, X, y = idCol[keepRows], oidCol[keepRows], X[keepRows], y[keepRows]

In [53]:
# Number of distinct subject ids left after the filtering above.
len(set(idCol))

3300

In [21]:
# format a small dataset containing only several individuals:
# keep the `top` subjects that have the most records
from collections import Counter
ct = Counter(idCol)
top = 50
targetIds = {sid for sid, _ in ct.most_common(top)}
mask = [sid in targetIds for sid in idCol]
idCol, oidCol, X, y = idCol[mask], oidCol[mask], X[mask], y[mask]

In [22]:
from collections import defaultdict

def splitTrainTest(ids, time, prop=.7, seed = 19):
    """Split record positions into train and test, keeping each subject's test records latest.

    Every record is first put into the training side with probability
    ``prop`` (NumPy's global random generator is seeded with ``seed``).
    Then, for each subject with records on both sides, the latest training
    record and the earliest test record are swapped repeatedly until no
    training time is greater than any test time of that subject.  Subjects
    whose records all fell on one side are left as drawn.

    Parameters
    ----------
    ids : sequence
        Subject id of each record (any indexable sequence of hashable ids).
    time : sequence
        Time value of each record, aligned with ``ids``.
    prop : float
        Probability of initially assigning a record to the training side.
    seed : int
        Seed passed to ``np.random.seed``.

    Returns
    -------
    (train, test) : tuple of lists
        Record positions of the training and of the test side, grouped by
        subject.
    """
    np.random.seed(seed)
    # One draw per record, in record order, so a given seed reproduces the split.
    inTrain = [np.random.uniform() < prop for _ in ids]

    trainDict = defaultdict(list)
    testDict = defaultdict(list)
    for i, m in enumerate(inTrain):
        side = trainDict if m else testDict
        side[ids[i]].append((time[i], i))

    for k, v in trainDict.items():
        tv = testDict.get(k)
        if tv is None:
            # Subject has no test records: nothing to reorder.
            continue
        trainV = [entry[0] for entry in v]
        trainVid = [entry[1] for entry in v]
        testV = [entry[0] for entry in tv]
        testVid = [entry[1] for entry in tv]
        maxId = np.argmax(trainV)
        minId = np.argmin(testV)
        # Swap one pair at a time (times and record positions together);
        # equal times are never swapped.
        while trainV[maxId] > testV[minId]:
            trainV[maxId], testV[minId] = testV[minId], trainV[maxId]
            trainVid[maxId], testVid[minId] = testVid[minId], trainVid[maxId]
            maxId = np.argmax(trainV)
            minId = np.argmin(testV)
        # Replacing the value of an existing key is safe while iterating.
        trainDict[k] = list(zip(trainV, trainVid))
        testDict[k] = list(zip(testV, testVid))

    train = [idx for entries in trainDict.values() for _, idx in entries]
    test = [idx for entries in testDict.values() for _, idx in entries]
    return train, test

def getIndvFixFeature(ids,x):
    """Flag the feature columns that stay constant within every subject.

    Parameters
    ----------
    ids : sequence
        Subject id of each row of ``x``.
    x : array-like, shape (n_records, n_features)
        Feature matrix.

    Returns
    -------
    numpy.ndarray of bool, shape (n_features,)
        ``True`` where the column has the same value in all rows of each
        subject, ``False`` as soon as one subject shows two different values.

    Values are compared element by element against the subject's first row.
    Summing the differences instead would let positive and negative changes
    cancel out (e.g. 1, 0, 2) and report a changing column as fixed.
    """
    x = np.asarray(x)
    rowsBySubject = defaultdict(list)
    for row, sid in enumerate(ids):
        rowsBySubject[sid].append(row)
    noChange = np.ones(x.shape[1], dtype=bool)
    for rows in rowsBySubject.values():
        block = x[rows]
        # A column changes for this subject if any row differs from the first one.
        noChange &= ~np.any(block != block[0], axis=0)
    return noChange

from scipy import io
def generate(seed = 19,density = 0.7,name='swan'):
    """Split the prepared data and write it to ``../{name}_{seed}`` with ``scipy.io.savemat``.

    Works on the module-level ``idCol``, ``oidCol``, ``X``, ``y`` and
    ``colnames``.  The split comes from ``splitTrainTest``, using the column
    of ``X`` named 'AGE' as the time axis, ``density`` as the training
    proportion and ``seed`` as the random seed.  When ``name`` is
    'swanLGPR' the module-level ``cateCols`` and ``contCols`` are stored too.
    """
    ageValues = X[:, np.where(colnames == 'AGE')[0]].reshape(-1)
    trainIdx, testIdx = splitTrainTest(idCol, ageValues, density, seed)
    payload = {
        'trainId': idCol[trainIdx],
        'trainOid': oidCol[trainIdx],
        'trainX': X[trainIdx],
        'trainY': y[trainIdx],
        'testId': idCol[testIdx],
        'testOid': oidCol[testIdx],
        'testX': X[testIdx],
        'testY': y[testIdx],
        'colname': colnames,
    }
    if name == 'swanLGPR':
        # Only this variant also stores the categorical/continuous column positions.
        payload['cateCols'] = cateCols
        payload['contCols'] = contCols
    io.savemat(f'../{name}_{seed}', payload)

In [27]:
# generate(seed = 2)
# Write the 'swan' variant of the dataset using seed 14.
generate(name='swan', seed=14)