# Clean Data For Q-Learning and GSMRL Model
Here we clean the data by forward-filling: each missing value is replaced with the most recent earlier value in the same column.

In [1]:
import os
import pickle

import numpy as np
import pandas

In [2]:
# GPU setup: stop early unless TensorFlow reports the first GPU device.
# NOTE(review): no later cell shown here calls TensorFlow -- confirm this hard
# requirement is still wanted for a data-cleaning notebook.
import tensorflow as tf

device_name = tf.test.gpu_device_name()
if device_name == '/device:GPU:0':
    print('Found GPU at: {}'.format(device_name))
else:
    raise SystemError('GPU device not found')

# Fixed-seed NumPy RandomState made available to later cells.
seed = 123
rng = np.random.RandomState(seed)

Found GPU at: /device:GPU:0Metal device set to: Apple M1



In [3]:
# Disabled random split: everything below sits inside a bare string literal, so
# none of it is executed. If enabled it would shuffle 10,000 row positions with
# `rng` and cut them 80% / 10% / 10% into train / validation / test indices.
"""
data_points = 10000
test_size  = int(data_points*0.1)
valid_size = test_size
train_size = data_points - 2 * test_size

indices = np.arange(data_points)
rng.shuffle(indices)
train_indices = indices[: train_size]
validation_indices = indices[train_size : -valid_size]
test_indices = indices[-test_size :]
"""

'\ndata_points = 10000\ntest_size  = int(data_points*0.1)\nvalid_size = test_size\ntrain_size = data_points - 2 * test_size\n\nindices = np.arange(data_points)\nrng.shuffle(indices)\ntrain_indices = indices[: train_size]\nvalidation_indices = indices[train_size : -valid_size]\ntest_indices = indices[-test_size :]\n'

In [4]:
# Load the daily-diary table, treat empty / whitespace-only cells as missing,
# then forward-fill every column from the row above.
# NOTE(review): the forward fill runs over the whole frame; if rows are grouped
# by participant, a gap at the start of one participant's rows is filled from
# the preceding participant -- confirm this is acceptable.
df = pandas.read_csv("./Wright/Daily Diary Long Form.csv")
df = df.replace(r'^\s*$', np.nan, regex=True)
# DataFrame.ffill() is the supported spelling; fillna(method='ffill') is deprecated.
df = df.ffill()

# Column blocks used by the rest of the notebook (label-based, inclusive slices).
features = df.loc[:, "Stress1":"LOV"]
time     = df.loc[:, "Time"]
target   = df.loc[:, "NegAffDay":]
ind_demo = df.loc[:, "Gender":"NEO_C"]
df

  df = pandas.read_csv("./Wright/Daily Diary Long Form.csv")


Unnamed: 0,ID,Datessmt,True_date,Time,Weekday,Weekend,True_date_b,TimeB,WeekdayB,WeekendB,...,LOV,NegAffDay,Urgency,Exhibitionism,DetatchDay,ImpulsivityDay,Compulsivity,PsychoDay,HostileDay,ManipDay
0,1.0,2/11/13,2/11/13,1,1,0,,,,,...,-7.95,3.25,0,0.33,6.33,0.67,0.33,3.17,0,0
1,1.0,2/11/13,2/12/13,2,2,0,2/11/13,1,1,0,...,-3.66,4.5,1.5,0.33,4.67,1,1.67,3.17,0.5,0
2,1.0,2/11/13,2/13/13,3,3,0,2/12/13,2,2,0,...,-5.95,3.75,1,0.33,6.33,1,1,4.67,0.25,1
3,1.0,2/11/13,2/14/13,4,4,0,2/13/13,3,3,0,...,-2.41,4.75,1,0.67,6,0.33,0.67,4.67,0.25,0.5
4,1.0,2/11/13,2/15/13,5,5,0,2/14/13,4,4,0,...,-1.83,4.5,1.5,0.67,6,0.33,1,3.67,0.25,0.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,116.0,3/19/13,6/13/13,86,4,0,6/12/13,85,3,0,...,-7.95,3.75,0,0,3,0,0.33,1.67,0,0
9996,116.0,3/19/13,6/14/13,87,5,0,6/13/13,86,4,0,...,-1.41,2.5,0,0,2,0,0,0.83,0,0
9997,116.0,3/19/13,6/15/13,88,6,1,6/14/13,87,5,0,...,-1.83,2.25,0,0.67,1.67,0,0,1.5,0,0
9998,116.0,3/19/13,6/15/13,88,6,1,6/15/13,88,6,1,...,-1.83,2.25,0,0.67,1.67,0,0,1.5,0,0


In [5]:
# Chronological split on the diary day number stored in the "Time" column:
#   train      -> days 1-80
#   validation -> days 81-90
#   test       -> days 91-102
# The three day ranges must stay disjoint: sharing a boundary day would place
# the same rows in two splits and leak evaluation data into training.
#
# "Time" may hold text or numbers depending on how the CSV was parsed, so it is
# compared numerically; values that cannot be parsed become NaN and match no day.
time_as_number = pandas.to_numeric(df['Time'], errors='coerce')


def _row_indices_for_days(days):
    """Return the df index labels whose "Time" is in `days`, grouped day by day."""
    collected = []
    for day in days:
        collected += df.index[time_as_number == day].tolist()
    return np.array(collected, dtype=np.int64)


train_indices      = _row_indices_for_days(range(1, 81))
validation_indices = _row_indices_for_days(range(81, 91))
test_indices       = _row_indices_for_days(range(91, 103))

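# Binarize Feature Data Based on the Median
Here we quantize the demographic and survey feature data based on the median: within each column, values at or below the median (within a 1e-3 tolerance) become 0 and all larger values become 1.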

For both Q-Learning and the GSMRL model we use only the features that are markers for the target outputs.
In this case, there are 30 survey DPDS features and 20 demographic features. We assume that the 20 demographic features are given.

In [6]:
# Convert the three column blocks to float dtype, then keep their column labels
# around for later look-ups by position.
features, ind_demo, target = (
    block.astype(float) for block in (features, ind_demo, target)
)
features_col, ind_demo_col, target_col = (
    features.columns,
    ind_demo.columns,
    target.columns,
)

In [7]:
# Column positions labelled "previous" survey / demographic features
# (presumably from an earlier selection run -- verify); printed for reference.
top_20 = [
    54, 19, 64, 37, 26, 3, 46, 44, 88, 42,
    57, 62, 60, 20, 36, 16, 63, 65, 38, 24,
]
top_demo = [
    48, 98, 4, 0, 91, 67, 16, 29, 74, 28,
    81, 63, 23, 95, 70, 50, 55, 58, 71, 54,
]
for heading, all_columns, positions in (
    ("Previous Survey Features:", features_col, top_20),
    ("Previous Demographic Features", ind_demo_col, top_demo),
):
    print(heading, all_columns[positions])

Previous Survey Features: Index(['DPDS19', 'Determined', 'DPDS29', 'DPDS02', 'Indifferent', 'Severe2',
       'DPDS11', 'DPDS09', 'SympatheticIP', 'DPDS07', 'DPDS22', 'DPDS27',
       'DPDS25', 'Hostile', 'DPDS01', 'Alert', 'DPDS28', 'DPDS30', 'DPDS03',
       'Assertive'],
      dtype='object')
Previous Demographic Features Index(['RigidPerfectionism', 'NEO', 'Race2', 'Gender', 'IIPSCMP', 'ISC_HI',
       'Relationships', 'Depressivity', 'IIPSC_DE', 'Submissiveness',
       'IIPSC_ZBC', 'ISC_PA', 'Substance', 'NEO_N', 'ISC_NO', 'Eccentricity',
       'PNI_SSSE', 'PNI_DEV', 'SWLmean', 'PNI_EXP'],
      dtype='object')


In [8]:
# The 20 demographic columns used as markers, pulled out of `ind_demo`.
marker_demo = [
    'EmotionalLability', 'Anxiousness', 'Depressivity', 'SeparationInsecurity',
    'Anhedonia', 'Withdrawal', 'RestrictedAffectivity', 'AttentionSeeking',
    'Grandiosity', 'IntimacyAvoidance', 'Hostility', 'Manipulativeness',
    'Deceitfulness', 'Impulsivity', 'RiskTaking', 'Irresponsibility',
    'RigidPerfectionism', 'PerceptualDysregulation', 'Eccentricity', 'Suspiciousness',
]
demo_marker = ind_demo.loc[:, marker_demo]
# float32 NumPy array holding the selected columns.
Q_demo = demo_marker.to_numpy().astype(np.float32)

In [9]:
# Binarize each demographic column around its own median: entries no more than
# 1e-3 above the column median become 0, every other entry becomes 1.
demo_medians = np.median(Q_demo, axis=0)
demo_low = (Q_demo - demo_medians) <= 1e-3
Q_demo[demo_low] = 0.
Q_demo[~demo_low] = 1.

In [10]:
# Select the 30 DPDS survey items used as markers: DPDS01 .. DPDS32 with
# items 14 and 18 left out.
marker_features = [f'DPDS{item:02d}' for item in range(1, 33) if item not in (14, 18)]
features_marker = features.loc[:, marker_features]
# float32 NumPy array holding the selected columns.
Q_features = features_marker.to_numpy().astype(np.float32)

In [11]:
# Binarize each DPDS survey column around its own median: entries no more than
# 1e-3 above the column median become 0, every other entry becomes 1.
feature_medians = np.median(Q_features, axis=0)
feature_low = (Q_features - feature_medians) <= 1e-3
Q_features[feature_low] = 0.
Q_features[~feature_low] = 1.

In [12]:
# Binarize each target column around its own median, working on a float32 copy:
# entries no more than 1e-3 above the column median become 0, the rest become 1.
Q_target = target.to_numpy().astype(np.float32)
target_medians = np.median(Q_target, axis=0)
target_low = (Q_target - target_medians) <= 1e-3
Q_target[target_low] = 0.
Q_target[~target_low] = 1.

In [13]:
# Save the binarized survey features together with the binarized targets:
# one pickle per target column, plus one pickle holding all target columns.
# Every pickle is a dict {'train' | 'valid' | 'test': (features, targets)}.
Q_TABLE_DIR = "./Q-table-data"
# Create the output folder on first use so a fresh checkout does not fail in open().
os.makedirs(Q_TABLE_DIR, exist_ok=True)


def _q_table_splits(targets):
    """Pair the binarized features with `targets` for the train/valid/test rows."""
    return {
        'train': (Q_features[train_indices], targets[train_indices]),
        'valid': (Q_features[validation_indices], targets[validation_indices]),
        'test': (Q_features[test_indices], targets[test_indices]),
    }


# One file per target output, named after the target column.
for i in range(len(target_col)):
    Q_table_data = _q_table_splits(Q_target[:, i])
    with open(f"{Q_TABLE_DIR}/psych_{target_col[i]}.pkl", "wb") as f:
        pickle.dump(Q_table_data, f)

# One file with every target output side by side.
Q_table_data = _q_table_splits(Q_target)
with open(f"{Q_TABLE_DIR}/psych_total.pkl", "wb") as f:
    pickle.dump(Q_table_data, f)

# Prepare Data for GSMRL Model
We group consecutive rows into fixed-size windows and add small Gaussian noise (standard deviation 0.02) to all survey features and target outputs.

In [14]:
# Number of consecutive rows that are flattened into one GSMRL sample.
window = 2

In [15]:
# Split by participant ID: IDs 1-92 train, 93-104 validation, 105-116 test.
# Each participant's rows are cut down to a whole number of windows *before*
# the participants are concatenated, so a window of `window` consecutive rows
# never mixes records from two different participants.


def _windowed_rows_for_ids(participant_ids, rows_per_window):
    """Collect df index labels for the given IDs, keeping only full windows per ID."""
    kept = []
    for participant_id in participant_ids:
        rows = df.index[df['ID'] == participant_id].to_numpy(dtype=np.int64)
        remainder = rows.shape[0] % rows_per_window
        if remainder:
            # Drop the trailing rows that cannot fill a complete window.
            rows = rows[:-remainder]
        kept.append(rows)
    if not kept:
        return np.array([], dtype=np.int64)
    return np.concatenate(kept)


train_indices      = _windowed_rows_for_ids(range(1, 93), window)
validation_indices = _windowed_rows_for_ids(range(93, 105), window)
test_indices       = _windowed_rows_for_ids(range(105, 117), window)

# Each split must hold a whole number of windows for the reshaping that follows.
assert train_indices.shape[0] % window == 0
assert validation_indices.shape[0] % window == 0
assert test_indices.shape[0] % window == 0

In [16]:
# float32 arrays of the un-binarized marker survey features and of the targets.
AFA_features = features_marker.to_numpy().astype(np.float32)
AFA_target = target.to_numpy().astype(np.float32)

In [17]:
# Pick out the rows of each split, first for the features and then for the targets.
AFA_feat_train, AFA_feat_valid, AFA_feat_test = (
    AFA_features[rows]
    for rows in (train_indices, validation_indices, test_indices)
)
AFA_target_train, AFA_target_valid, AFA_target_test = (
    AFA_target[rows]
    for rows in (train_indices, validation_indices, test_indices)
)

In [18]:
# Pre-allocate zero-filled arrays for the windowed data: one row per window,
# with `window` records laid side by side along the column axis.
feat_width = AFA_features.shape[1] * window
target_width = AFA_target.shape[1] * window

n_train_windows = train_indices.shape[0] // window
n_valid_windows = validation_indices.shape[0] // window
n_test_windows = test_indices.shape[0] // window

train_feat = np.zeros((n_train_windows, feat_width))
train_target = np.zeros((n_train_windows, target_width))
valid_feat = np.zeros((n_valid_windows, feat_width))
valid_target = np.zeros((n_valid_windows, target_width))
test_feat = np.zeros((n_test_windows, feat_width))
test_target = np.zeros((n_test_windows, target_width))

In [19]:
# Fill the pre-allocated arrays in place: every run of `window` consecutive
# rows is laid out end to end as a single row (row-major order).
for packed, per_record in (
    (train_feat, AFA_feat_train),
    (train_target, AFA_target_train),
    (valid_feat, AFA_feat_valid),
    (valid_target, AFA_target_valid),
    (test_feat, AFA_feat_test),
    (test_target, AFA_target_test),
):
    packed[...] = per_record.reshape(packed.shape)

In [20]:
# Jitter every windowed array in place with zero-mean Gaussian noise.
# The draws come from the seeded `rng` created in the setup cell rather than
# the unseeded global NumPy generator, so a fresh Restart-and-Run-All produces
# the same noisy data every time.
NOISE_STD = 0.02
for noisy in (
    train_feat, train_target,
    valid_feat, valid_target,
    test_feat, test_target,
):
    noisy += rng.normal(0, NOISE_STD, noisy.shape)

In [21]:
# Write all three splits into a single pickle: a dict mapping
# 'train' / 'valid' / 'test' to a (features, targets) pair of float32 arrays.
split_arrays = {
    'train': (train_feat, train_target),
    'valid': (valid_feat, valid_target),
    'test': (test_feat, test_target),
}
AFA_data = {
    split_name: (feat.astype(np.float32), tgt.astype(np.float32))
    for split_name, (feat, tgt) in split_arrays.items()
}
with open("./psych_GSMRL.pkl", "wb") as f:
    pickle.dump(AFA_data, f)