In [None]:
import pandas as pd
import json

In [None]:
# Directory holding the daily feature CSV files (one file per sensor and time slice).
path = '/Users/yasaman/UWEXP/data/cmu-phaseII-daily/'

# Sensor name -> acronym used when building file names and column prefixes.
# A sensor mapped to None is skipped by the loading loop.
feature_acronyms = dict(
    activity='act',
    applications=None,
    # audio='audio',     # not used for discrimination descriptive work
    # battery='batt',    # not used for discrimination descriptive work
    # bluetooth='blue',  # not used for discrimination descriptive work
    calls='call',
    locations='loc',
    screen='screen',
    sleep='slp',
    steps='steps',
    # wifi='wifi',       # not used for discrimination descriptive work
)

# Time-slice name -> marker used in file names and column suffixes
# ('allday' carries no marker).
slice_acronyms = dict(
    allday='',
    morning='_mo',
    afternoon='_af',
    evening='_ev',
    night='_ni',
)

# Date template: only its length is used, to slice the trailing date off column names.
date = 'yyyy-mm-dd'

In [None]:
# Reshape every (sensor, time slice) CSV from wide form -- one column per
# feature per date, one row per device -- into long form with one row per
# (device_id, date), then join all features side by side.
sensor_data = []
sensor_columns = ['device_id', 'date']
for feat, facr in feature_acronyms.items():
    # sensors without an acronym have no daily feature file to read
    if facr is None:
        continue
    for slc, sacr in slice_acronyms.items():
        # construct the file name
        file_name = 'f_'+facr+','+sacr+',,_day.csv'

        # read the file in data with device_id as index
        data = pd.read_csv(path+file_name, header=0, index_col=0)

        # column names look like <prefix><feature name><suffix><date>
        prefix = 'f_'+facr+'_'
        suffix = sacr+'_day_'
        tail_len = len(suffix)+len(date)

        # Bucket the columns by feature name in a single pass. A dict keeps the
        # first-seen (file) order, so the output column order is identical on
        # every run; iterating a set of strings is not stable across
        # interpreter sessions because of string hash randomization.
        cols_by_name = {}
        for col in data.columns:
            if col == 'device_id':
                continue
            cols_by_name.setdefault(col[len(prefix):-tail_len], []).append(col)

        for name, feat_cols in cols_by_name.items():
            out_col = feat+'_'+name+'_'+slc

            # stack the per-date columns of this feature into rows; the former
            # column names end up under 'level_1' and the values under 0
            features = data[feat_cols].stack().reset_index()

            # keep only the trailing date of each former column name
            features['level_1'] = features['level_1'].str[-len(date):]

            # rename level_1 to date and 0 (i.e. feature data) to the feature's name
            features = features.rename({'level_1':'date', 0:out_col}, axis='columns')

            # set device_id and date as index to facilitate concatenation
            features = features.set_index(['device_id', 'date'])

            sensor_data.append(features)
            sensor_columns.append(out_col)

# align all features on (device_id, date) and restore the collected column order
sensor_data = pd.concat(sensor_data, axis=1)
sensor_data = sensor_data.reset_index()
sensor_data = sensor_data[sensor_columns]

In [None]:
# JSON file whose parsed content is used to look up a PID from a device id.
device_id2pid_file = '/Users/yasaman/UWEXP/script-input/sensors/pid_device_cmu.json'
with open(device_id2pid_file, 'r') as mapping_file:
    device_id2pid_mapping = json.load(mapping_file)

In [None]:
def get_pid(device_id, device_id2pid_mapping):
    """Look up the PID registered for a device.

    The lookup key is the last 12 characters of ``device_id`` (the whole
    string when it is shorter than that).

    Returns the mapped PID, or None when the key is not in the mapping.
    """
    return device_id2pid_mapping.get(device_id[-12:])

In [None]:
# Resolve a PID for every row from its device_id.
pid_per_row = sensor_data.apply(
    lambda row: get_pid(row['device_id'], device_id2pid_mapping),
    axis=1,
)
sensor_data['PID'] = pid_per_row

# Rows whose device_id has no PID; the count below should be zero if there is a
# PID for every device_id.
#sensor_data[sensor_data['PID'].isnull()].shape[0]
pid_missing = sensor_data['PID'].isnull()
sensor_data[pid_missing]['device_id'].unique() # device_id's with no PID

In [None]:
# Move the PID column to the front, leaving the other columns in place.
columns = sensor_data.columns.tolist()
columns.insert(0, columns.pop(columns.index('PID')))
sensor_data = sensor_data[columns]

In [None]:
# PIDs present in the mapping that never appear in the sensor data.
set(device_id2pid_mapping.values()).difference(sensor_data['PID'])

In [None]:
# Write the combined table to disk without the row index.
sensors_csv = '/Users/yasaman/UWEXP/analysis-scripts/sensors/cmudata/results/sensors.csv'
sensor_data.to_csv(sensors_csv, index=False)

Test code follows from here.

In [None]:
# Toy wide-format frame: two features ('feat1', 'feat2') over two dates for
# three devices, named the same way as the all-day activity columns.
feat, slc = 'activity', 'allday'
toy_columns = [
    'device_id',
    'f_act_feat1_day_2018-01-01',
    'f_act_feat1_day_2018-01-02',
    'f_act_feat2_day_2018-01-01',
    'f_act_feat2_day_2018-01-02',
]
toy_rows = [
    [1, 11, -11, 101, -101],
    [2, 12, -12, 102, -102],
    [3, 13, -13, 103, -103],
]
df = pd.DataFrame(toy_rows, columns=toy_columns)
df

In [None]:
# Pieces of the column-name pattern <prefix><feature name><suffix><date>
# for the toy frame.
date = 'yyyy-mm-dd'  # template: only its length is used
prefix = ''.join(['f_', feature_acronyms[feat], '_'])
suffix = ''.join([slice_acronyms[slc], '_day_'])
cols = df.columns.tolist()

In [None]:
# Distinct feature names: each column name with the prefix cut from the front
# and the suffix plus date cut from the end.
tail_len = len(suffix)+len(date)
feat_names = {col[len(prefix):-tail_len] for col in cols if col != 'device_id'}
#name_mapping = {col: col[len(prefix):-(len(suffix)+len(date))]+col[-(len(date)+1):] for col in cols if col != 'device_id'}
#df = df.rename(name_mapping, axis='columns')

In [None]:
# Toy run of the wide -> long reshaping: one row per (device_id, date) and one
# column per feature.
df2 = df.set_index('device_id')
new_data = []
columns = ['device_id', 'date']

# Bucket the date-stamped columns by feature name in a single pass. A dict
# keeps the first-seen order, so the resulting column order is identical on
# every run; iterating a set of strings is not stable across interpreter
# sessions because of string hash randomization.
tail_len = len(suffix)+len(date)
cols_by_name = {}
for col in cols:
    if col == 'device_id':
        continue
    cols_by_name.setdefault(col[len(prefix):-tail_len], []).append(col)

for name, feat_cols in cols_by_name.items():
    out_col = feat+'_'+name+'_'+slc

    # stack the per-date columns of this feature into rows; the former column
    # names end up under 'level_1' and the values under 0
    features = df2[feat_cols].stack().reset_index()

    # keep only the trailing date of each former column name
    features['level_1'] = features['level_1'].str[-len(date):]

    # rename level_1 to date and 0 (i.e. feature data) to the feature's name
    features = features.rename({'level_1':'date', 0:out_col}, axis='columns')

    # set device_id and date as index to facilitate concatenation
    features = features.set_index(['device_id', 'date'])

    new_data.append(features)
    columns.append(out_col)

# align all features on (device_id, date) and show them in the collected order
new_data = pd.concat(new_data, axis=1)
new_data = new_data.reset_index()
new_data[columns]

In [None]:
# Index the toy frame by device_id so that only feature columns remain.
df2 = df.set_index(keys='device_id')
df2

In [None]:
# Stack the two feat1 columns into rows and keep only the date part of the
# former column names (found under 'level_1').
feat1_cols = ['f_act_feat1_day_2018-01-01', 'f_act_feat1_day_2018-01-02']
df2 = df2[feat1_cols].stack().reset_index()
df2['level_1'] = df2['level_1'].str[-len(date):]
# next step: rename level_1 to date and 0 to feature name
df2

In [None]:
# All columns whose feature-name part equals `name`.
name = 'feat1'
name_span = slice(len(prefix), -(len(suffix)+len(date)))
[col for col in cols if col != 'device_id' and col[name_span] == name]

In [None]:
# Read the PID/device sheet, forcing every listed column except the integer
# 'ID' to be parsed as a string.
pid_did_cmu_file = '/Users/yasaman/Downloads/PID-device.csv'
string_columns = [
    'Andrew ID',
    'AWARE Device ID',
    'New Device ID',
    'New Device ID #2',
    'New Device ID #3',
    'Phone #',
    'Cell Phone Provider',
]
column_dtypes = {'ID': 'int32', **dict.fromkeys(string_columns, 'str')}
pid_did_cmu = pd.read_csv(pid_did_cmu_file, dtype=column_dtypes)

In [None]:
def get_latest_id(row):
    """Return the first non-null device id of a row, checking newest first.

    The columns are checked in this order: 'New Device ID #3',
    'New Device ID #2', 'New Device ID', 'AWARE Device ID'.

    Returns None when all four are null.
    """
    is_missing = row.isnull()
    newest_first = (
        'New Device ID #3',
        'New Device ID #2',
        'New Device ID',
        'AWARE Device ID',
    )
    for column in newest_first:
        if not is_missing[column]:
            return row[column]
    return None

In [None]:
# Pick the most recent known device id for every row.
pid_did_cmu['device_id'] = pid_did_cmu.apply(get_latest_id, axis=1)

In [None]:
# Save the sheet, now including the device_id column, without the row index.
pid_device_csv = '/Users/yasaman/Downloads/pid_device_cmu.csv'
pid_did_cmu.to_csv(pid_device_csv, index=False)