In [4]:
import numpy as np
import pandas as pd
from scipy.stats import norm
from sklearn.model_selection import train_test_split
# Seed NumPy's global random number generator so that anything in this
# session that draws from it is repeatable.
np.random.seed(92104)

combined_data = pd.read_csv('cleaned_data.csv')

# Add a numeric id per subject: the integer category code of 'subjectkey'.
combined_data['idnum'] = combined_data['subjectkey'].astype('category').cat.codes

print(combined_data.shape)
# dropna=False keeps the count identical to len(Series.unique()).
print('Number of subject ids: '
      + str(combined_data['subjectkey'].nunique(dropna=False)))

# One entry per distinct (subjectkey, has_activity) pair, so a subject is
# counted once as long as the flag does not vary across that subject's rows.
activity_flags = (
    combined_data[['subjectkey', 'has_activity']]
    .drop_duplicates()['has_activity']
)
print('Number of subjects with fitbit activity: '
      + str((activity_flags == 1).sum()))

# Same per-pair de-duplication for the 'clinical' label; built once and used
# for both the missing and the non-missing count.
clinical_labels = (
    combined_data[['subjectkey', 'clinical']]
    .drop_duplicates()['clinical']
)
print(clinical_labels.isna().sum())
print(clinical_labels.notna().sum())

# Keep rows that have activity or sleep data AND a non-missing clinical label
# with has_clinical == 1.
has_activity_or_sleep = (
    (combined_data['has_activity'] == 1) | (combined_data['has_sleep'] == 1)
)
has_clinical_label = (
    (combined_data['has_clinical'] == 1) & combined_data['clinical'].notna()
)
combined_data = combined_data[has_activity_or_sleep & has_clinical_label]

def _require_one_row_per_subject(ids, key='subjectkey'):
    """Raise ValueError if any value of *key* occupies more than one row.

    The split below samples rows of the id table. A subject that appears on
    several rows could therefore be assigned to both train and holdout.
    """
    repeated = ids.loc[ids[key].duplicated(keep=False), key].unique()
    if len(repeated) > 0:
        raise ValueError(
            f"{len(repeated)} subject(s) appear on more than one row of the "
            f"id table and could leak across the train/holdout split; "
            f"first few: {list(repeated[:5])}"
        )


def _report_balance(label, column, value, train, holdout):
    """Print count and proportion of rows where *column* == *value* per split."""
    train_n = (train[column] == value).sum()
    holdout_n = (holdout[column] == value).sum()
    print(f"Train {label}:", train_n)
    print(f"Holdout {label}:", holdout_n)
    print(f"Train {label} prop:", train_n / train.shape[0])
    print(f"Holdout {label} prop:", holdout_n / holdout.shape[0])


# Subject-level table used for the split: the subject key plus the columns
# whose joint distribution is stratified on.
strat_cols = ['clinical', 'sex', 'has_sleep']
unique_ids = combined_data[['subjectkey'] + strat_cols].drop_duplicates()

# drop_duplicates() above de-duplicates on the key AND the stratification
# columns, so a subject whose values vary across rows would survive as
# several rows. Stop here rather than write a leaky split.
_require_one_row_per_subject(unique_ids)

# 80/20 split of subjects (not rows), stratified and with a fixed seed.
train_ids, holdout_ids = train_test_split(
    unique_ids,
    test_size=.2,
    random_state=1876,
    shuffle=True,
    stratify=unique_ids[strat_cols],
)

# Check that each stratification variable is balanced across the two splits.
_report_balance("supra-clinical cases", "clinical", 1, train_ids, holdout_ids)
_report_balance("Female", "sex", 'F', train_ids, holdout_ids)
_report_balance("has sleep", "has_sleep", 1, train_ids, holdout_ids)

# Expand each subject-level split back to the matching rows of combined_data.
# The join uses the subject key together with the stratification columns, so
# those four columns come first in the result.
merge_keys = ['subjectkey'] + strat_cols
train_data = pd.merge(train_ids, combined_data, how='left', on=merge_keys)
holdout_data = pd.merge(holdout_ids, combined_data, how='left', on=merge_keys)

# Sanity output per split: row-level shape, number of distinct subjects in
# the row-level data, and shape of the id table it was built from.
for split_rows, split_ids in ((train_data, train_ids),
                              (holdout_data, holdout_ids)):
    print(split_rows.shape)
    print(split_rows['subjectkey'].nunique(dropna=False))
    print(split_ids.shape)

# NOTE(review): unlike the two writes below, this one does not pass
# index=False, so the file also contains the row labels that holdout_ids
# carries over from combined_data. Confirm downstream readers expect that.
holdout_ids["subjectkey"].to_csv('holdout_ids.csv')
train_data.to_csv('train_data.csv', index=False)
holdout_data.to_csv('holdout_data.csv', index=False)

(109285, 183)
Number of subject ids: 6571
Number of subjects with fitbit activity: 5680
19
6552
Train supra-clinical cases: 442
Holdout supra-clinical cases: 110
Train supra-clinical cases prop: 0.09710017574692442
Holdout supra-clinical cases prop: 0.09666080843585237
Train Female: 2184
Holdout Female: 547
Train Female prop: 0.4797891036906854
Holdout Female prop: 0.4806678383128295
Train has sleep: 3546
Holdout has sleep: 886
Train has sleep prop: 0.7789982425307557
Holdout has sleep prop: 0.7785588752196837
(86659, 183)
4552
(4552, 4)
(21720, 183)
1138
(1138, 4)
