In [36]:
%matplotlib inline

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import glob
import os

import torch
import torch.nn as nn
import torch.nn.functional as F

# settings
plt.rc('font', size = 9)
plt.rc('font', family='sans serif')
plt.rcParams['pdf.fonttype']=42
plt.rcParams['ps.fonttype']=42
plt.rcParams['text.usetex']=False
plt.rcParams['legend.frameon']=False
plt.rcParams['axes.grid']=False
plt.rcParams['legend.markerscale']=0.5
plt.rcParams['savefig.dpi']=600
sns.set_style("ticks")

In [37]:
df = pd.DataFrame()
for i, file in enumerate(glob.glob('/home/ngr/gdrive/wearables/data/MOD_1000_Woman_Activity_Data/*')):
    labels = os.path.split(file)[1].split(' ')
    if len(labels) == 4:
        pid, status, _, ga = labels # gestational age
    elif len(labels) == 2:
        pid, status = labels
        ga = np.nan
    elif len(labels) == 3:
        pid, status, ga = labels
        if ga=='Loss':
            ga = np.nan
        elif ga=='Withdrawn':
            status = 'Withdrawn'
            ga = np.nan
    elif len(labels) == 5:
        pid, _, status, _, ga = labels
    else: 
        print('  skipping\t', file, '\t', labels)
        continue
    dt = pd.DataFrame({'status':status, 'GA':ga, 'fpath':file}, index=[pid])
    df = df.append(dt)

  skipping	 /home/ngr/Downloads/MOD_1000_Woman_Activity_Data/1342_GA22.csv 	 ['1342_GA22.csv']
  skipping	 /home/ngr/Downloads/MOD_1000_Woman_Activity_Data/1342_GA22.mtn 	 ['1342_GA22.mtn']
  skipping	 /home/ngr/Downloads/MOD_1000_Woman_Activity_Data/1648 	 ['1648']
  skipping	 /home/ngr/Downloads/MOD_1000_Woman_Activity_Data/1831_GA24.csv 	 ['1831_GA24.csv']
  skipping	 /home/ngr/Downloads/MOD_1000_Woman_Activity_Data/2202_GA34.mtn 	 ['2202_GA34.mtn']


In [38]:
# drop some erroneous ones 
df = df.loc[(df['status']!='Uploaded') & (df['status']!='Withdrawn') & (df['status']!='to'), :]

In [39]:
df['status'].unique()

array(['Delivered', 'Miscarriage', 'Pregnant', 'Loss', 'Misscarriage',
       'Closed', 'Delivere', 'Prgnant', 'Deliverd', 'Term', 'Preterm',
       'IUFD', 'Delieverd', 'Delivery'], dtype=object)

In [40]:
df.loc[df['status']=='Closed', 'fpath'].values

array(['/home/ngr/Downloads/MOD_1000_Woman_Activity_Data/1265 Closed Loss',
       '/home/ngr/Downloads/MOD_1000_Woman_Activity_Data/1313 Closed Loss 26w6d',
       '/home/ngr/Downloads/MOD_1000_Woman_Activity_Data/1375 Closed Loss 22wks0days',
       '/home/ngr/Downloads/MOD_1000_Woman_Activity_Data/2405 Closed Loss 9w4d'],
      dtype=object)

In [41]:
df.loc[df['status']=='Term', 'fpath'].values

array(['/home/ngr/Downloads/MOD_1000_Woman_Activity_Data/1477 Term Delivered 39w2d',
       '/home/ngr/Downloads/MOD_1000_Woman_Activity_Data/2411 Term 40w4d'],
      dtype=object)

In [42]:
df.loc[df['status']=='Preterm', 'fpath'].values

array(['/home/ngr/Downloads/MOD_1000_Woman_Activity_Data/1482 Preterm 36w1d'],
      dtype=object)

In [43]:
# if GA is NaN, mask that by that label, otherwise multiply by 1
## give status label consistent names
status_dict = {
    'Delivered':'delivered', 
    'Miscarriage':'miscarriage', 
    'Pregnant':'pregnant', 
    'Loss':'miscarriage',
    'Misscarriage':'miscarriage', 
    'Closed':'miscarriage', 
    'Delivere':'delivered',
    'Prgnant':'pregnant', 
    'Deliverd':'delivered',
    'Term':'delivered', 
    'Preterm':'delivered', 
    'IUFD':'miscarriage', 
    'Delieverd':'delivered', 
    'Delivery':'delivered',
}

df['status'] = df['status'].map(status_dict)

In [44]:
df['status_label'] = df['status']
df['status_label'] = df['status_label'].map({'delivered': 2, 'pregnant': 0, 'miscarriage': 1})

In [45]:
df['pid'] = df.index
df.loc[df.duplicated(subset='pid'), :]
df.loc['1265', :]
# this is fine to have duplicates

Unnamed: 0,status,GA,fpath,status_label,pid
1265,miscarriage,,/home/ngr/Downloads/MOD_1000_Woman_Activity_Da...,1,1265
1265,pregnant,,/home/ngr/Downloads/MOD_1000_Woman_Activity_Da...,0,1265


In [73]:
# drop the one repeated one
df = df.drop_duplicates(subset=['GA', 'status_label'])

In [48]:
df.loc[df['GA']=='_', 'fpath'].values

array(['/home/ngr/Downloads/MOD_1000_Woman_Activity_Data/1014 Delivered _',
       '/home/ngr/Downloads/MOD_1000_Woman_Activity_Data/1162 Delivered _',
       '/home/ngr/Downloads/MOD_1000_Woman_Activity_Data/2006 Delivered _'],
      dtype=object)

In [49]:
df.loc[df['GA']=='GA', 'fpath'].values

array(['/home/ngr/Downloads/MOD_1000_Woman_Activity_Data/1645 Delivered Unknown GA'],
      dtype=object)

In [50]:
df.loc[df['GA']=='FT', 'fpath'].values

array(['/home/ngr/Downloads/MOD_1000_Woman_Activity_Data/1812 Delivered FT',
       '/home/ngr/Downloads/MOD_1000_Woman_Activity_Data/2586 Delivered FT'],
      dtype=object)

In [51]:
df.loc[(df['GA']=='GA') | (df['GA']=='_'), 'GA'] = np.nan
df.loc[(df['GA']=='FT'), 'GA'] = '39w0d'

for i in df.loc[[True if 'term' in str(i).lower() else False for i in df['GA']], :].index:
    df.loc[i, 'GA'] = df.loc[i, 'GA'].split('erm')[1]


In [54]:
np.isnan(np.nan)

True

In [74]:
df['GA_label'] = df['GA']
for i in df.index:
    if isinstance(df.loc[i, 'GA_label'], str):
        df.loc[i, 'GA_label'] = int(df.loc[i, 'GA_label'].lower().split('w')[0])
        

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['GA_label'] = df['GA']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


In [75]:
df['GA_label'].unique()

array([37, nan, 39, 36, 33, 40, 38, 28, 35, 34, 41, 31, 8, 26, 22, 20, 24,
       32, 29, 27, 23, 14, 17, 15, 25, 9, 13, 10, 30, 3, 11, 12],
      dtype=object)

In [77]:
df.to_csv('/home/ngr/gdrive/wearables/data/labels_per_file.csv')

In [80]:
# quick viz
pd.read_csv(df.loc['1001', 'fpath']

'/home/ngr/Downloads/MOD_1000_Woman_Activity_Data/1001 Delivered Term 37w1d'

In [84]:
for f in glob.glob(os.path.join(df.loc['1001', 'fpath'], '*.csv')):
    dt = pd.read_csv(f, header=None)

In [85]:
dt

Unnamed: 0,0,1,2,3
0,UserID,1001_GA32,,
1,UserName,5900-282,,
2,Sex,Female,,
3,DOB,6/21/2017,,
4,FirmwareVersion,20,,
...,...,...,...,...
50560,7/26/2017,12:56:00 PM,0.0,70.905
50561,7/26/2017,12:57:00 PM,0.0,70.905
50562,7/26/2017,12:58:00 PM,0.0,67.860
50563,7/26/2017,12:59:00 PM,0.0,69.165
