In [1]:
# OPTIONAL: Load the "autoreload" extension so that code can change
%load_ext autoreload

# OPTIONAL: always reload modules so that as you change code in src, it gets loaded
%autoreload 2

In [2]:
import sys
sys.path.append('../')

# importing all source code
from src.features import build_features
from src.visualization import visualize
from src.data import make_dataset
from src.reports import make_report

import pandas as pd
import numpy as np

In [35]:
import os
from datetime import datetime,timedelta

import pandas as pd
import numpy as np
import scipy.stats as stats

class bpeace2:
    
    def __init__(self):
        self.study='bpeace2'
    
    def process_weekly_surveys(self, data_dir = '../data/raw/bpeace2/beiwe/'):
        '''
        Processes raw weekly survey answers and timing. The encoding is:
        - eQ2L3J08ChlsdSXXKOoOjyLJ: morning
        - 7TaT8zapOWO0xdtONnsY8CE0: evening
        
        Parameters:
        - 
        
        Returns 
        '''
        # defining some variables for ease of understanding
        parent_dir = '../data/raw/bpeace2/beiwe/survey_answers/'
        morning_survey_id = 'eQ2L3J08ChlsdSXXKOoOjyLJ'
        evening_survey_id = '7TaT8zapOWO0xdtONnsY8CE0'
        
        # defining the final dataframes to append to
        evening_survey_df = pd.DataFrame()
        morning_survey_df = pd.DataFrame()
        
        # Morning Survey Data
        # -------------------
        # looping through the participants and then all their data
        for participant in os.listdir(parent_dir):
            # making sure we don't read from any hidden directories/files
            if len(participant) == 8:
                pid = participant
                participant_df = pd.DataFrame(columns=['ID','Content','Stress','Lonely','Sad','Energy','TST','SOL','NAW','Restful'])
            
                for file in os.listdir(f'{parent_dir}{participant}/survey_answers/{morning_survey_id}/'):
                    # reading raw data
                    df = pd.read_csv(f'{parent_dir}{participant}/survey_answers/{morning_survey_id}/{file}')
                    # adding new row
                    try:
                        participant_df.loc[datetime.strptime(file[:-4],'%Y-%m-%d %H_%M_%S')] = [pid,df.loc[4,'answer'],df.loc[5,'answer'],df.loc[6,'answer'],df.loc[7,'answer'],df.loc[8,'answer'],
                                                                                               df.loc[0,'answer'],df.loc[1,'answer'],df.loc[2,'answer'],df.loc[3,'answer']]
                    except KeyError:
                        print(f'Problem with morning survey {file} for Participant {pid} - Participant most likely did not answer a question')
                        #self.move_to_purgatory(f'{parent_dir}{participant}/survey_answers/{morning_survey_id}/{file}',f'../../data/purgatory/{self.study}-{pid}-survey-morning-{file}')
            
                # appending participant df to overall df
                morning_survey_df = morning_survey_df.append(participant_df)
            else:
                print(f'Directory {participant} is not valid')
        
        # replacing string values with numeric
        morning_survey_df.replace({'Not at all':0,'A little bit':1,'Quite a bit':2,'Very Much':3},inplace=True)
        morning_survey_df.replace({'Low energy':0, 'Somewhat low energy':1,'Neutral':2,'Somewhat high energy':3,'High Energy':4},inplace=True)
        
        # Evening Survey Data
        # -------------------
        for participant in os.listdir(parent_dir):
            if len(participant) == 8:
                pid = participant
                # less columns
                participant_df = pd.DataFrame(columns=['ID','Content','Stress','Lonely','Sad','Energy'])
            
                for file in os.listdir(f'{parent_dir}{participant}/survey_answers/{evening_survey_id}/'):
                    df = pd.read_csv(f'{parent_dir}{participant}/survey_answers/{evening_survey_id}/{file}')
                    try:
                        participant_df.loc[datetime.strptime(file[:-4],'%Y-%m-%d %H_%M_%S')] = [pid,df.loc[0,'answer'],df.loc[1,'answer'],df.loc[2,'answer'],df.loc[3,'answer'],df.loc[4,'answer']]
                    except KeyError:
                        print(f'Problem with evening survey {file} for Participant {pid} - Participant most likely did not answer a question')
                        #self.move_to_purgatory(f'{parent_dir}{participant}/survey_answers/{evening_survey_id}/{file}',f'../../data/purgatory/{self.study}-{pid}-survey-evening-{file}')
            
                evening_survey_df = evening_survey_df.append(participant_df)
            else:
                print(f'Directory {participant} is not valid')
                
        evening_survey_df.replace({'Not at all':0,'A little bit':1,'Quite a bit':2,'Very Much':3},inplace=True)
        evening_survey_df.replace({'Low energy':0, 'Somewhat low energy':1,'Neutral':2,'Somewhat high energy':3,'High Energy':4},inplace=True)
        
        return morning_survey_df, evening_survey_df
    
    def process_gps(self, data_dir = '/Volumes/HEF_Dissertation_Research/utx000/extension/data/beiwe/gps/'):
        '''
        Processes the raw gps data into one csv file for each participant and saves into /data/processed/
        
        All GPS data are recorded at 1-second intervals and stored in separate data files for every hour. The
        data are combined into one dataframe per participant, downsampled to 5-minute intervals using the
        mode value for those 5-minutes (after rounding coordinates to five decimal places), and combined into
        a final dataframe that contains all participants' data. 

        Returns True is able to process the data, false otherwise.
        '''
        gps_df = pd.DataFrame()
        for participant in os.listdir(data_dir):
            if participant == '2xtqkfz1':
            #if len(participant) == 8: # checking to make sure we only look for participant directories
                pid = participant
                print(f'\tWorking for Participant: {pid}')
                participant_df = pd.DataFrame() # 
                for file in os.listdir(f'{data_dir}{pid}/gps/'):
                    if file[-1] == 'v': # so we only import cs[v] files
                        try:
                            hourly_df = pd.read_csv(f'{data_dir}{pid}/gps/{file}',usecols=[1,2,3,4,5]) # all columns but UTC
                        except KeyError:
                            print(f'Problem with evening survey {file} for Participant {pid}')
                            #self.move_to_purgatory(f'{parent_dir}{participant}/survey_answers/{evening_survey_id}/{file}',f'../../data/purgatory/{self.study}-{pid}-survey-evening-{file}')
                    
                        if len(hourly_df) > 0: # append to participant df if there were data for that hour
                            participant_df = participant_df.append(hourly_df,ignore_index=True)
                    
                # converting utc to cdt
                participant_df['Time'] = pd.to_datetime(participant_df['UTC time']) - timedelta(hours=5)
                participant_df.drop(['UTC time'],axis=1,inplace=True)
                participant_df.set_index('Time',inplace=True)
                # rounding gps and taking the mode for every 5-minutes
                participant_df = round(participant_df,5)
                participant_df = participant_df.resample('5T').apply({lambda x: stats.mode(x)[0]})
                participant_df['Beiwe'] = pid
                
                gps_df = gps_df.append(participant_df)
                
        gps_df.columns = ['Lat','Long','Alt','Accuracy','Beiwe']
        return gps_df
    
    def process_environment_survey(self, data_file = '../../data/raw/bpeace2/surveys/EESurvey_E1_raw.csv'):
        '''
        Processes raw environment survey (first instance) and combines relevant data into processed directory.
        
        Columns currently included:
        0 - Record ID
        2 - Survey Timestamp
        4-9 - Building Type (apartment, duplex, house, dorm, hotel, other)

        Returns True if processed, False otherwise
        '''
        print('\tProcessing first environment survey...')
        
        ee = pd.read_csv(data_file,usecols=[0,2,4,5,6,7,8,9],parse_dates=[1])
        ee.columns = ['REDCap','Timestamp','Apartment','Duplex','House','Dorm','Hotel','Other']
        ee.dropna(subset=['Timestamp'],inplace=True)
        ee.set_index('Timestamp',inplace=True)
        
        return ee
    
    

In [36]:
test = bpeace2()

In [37]:
df = test.process_gps()
print(df.head())

	Working for Participant: 2xtqkfz1
                        Lat     Long      Alt Accuracy     Beiwe
Time                                                            
2020-06-01 10:35:00  32.923 -96.9628  152.328       65  2xtqkfz1
2020-06-01 10:40:00      []       []       []       []  2xtqkfz1
2020-06-01 10:45:00      []       []       []       []  2xtqkfz1
2020-06-01 10:50:00      []       []       []       []  2xtqkfz1
2020-06-01 10:55:00      []       []       []       []  2xtqkfz1


In [7]:
df1['Time'] = pd.to_datetime(df1['UTC time']) - timedelta(hours=5)
df1.drop(['UTC time'],axis=1,inplace=True)
df1.set_index('Time',inplace=True)
df1.resample('5T').mode()

NameError: name 'df1' is not defined

In [None]:
result = frame.resample("M").apply({'a':'sum', 'b':lambda x: stats.mode(x)[0]})

In [None]:
df2 = df1.resample('1h').apply({lambda x: stats.mode(x)[0]})

In [None]:
df2

In [None]:
df2.plot()

In [17]:
round(df,5)

Unnamed: 0_level_0,Lat,Long,Alt,Accuracy
Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2020-06-01 10:35:00,32.92302,-96.96281,152.32799,65.00000
2020-06-01 10:40:00,,,,
2020-06-01 10:45:00,,,,
2020-06-01 10:50:00,,,,
2020-06-01 10:55:00,,,,
...,...,...,...,...
2020-07-08 16:10:00,,,,
2020-07-08 16:15:00,,,,
2020-07-08 16:20:00,30.29101,-97.74941,153.60001,18.77300
2020-07-08 16:25:00,,,,


In [18]:
df

Unnamed: 0_level_0,Lat,Long,Alt,Accuracy
Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2020-06-01 10:35:00,32.923019,-96.962811,152.327988,65.000000
2020-06-01 10:40:00,,,,
2020-06-01 10:45:00,,,,
2020-06-01 10:50:00,,,,
2020-06-01 10:55:00,,,,
...,...,...,...,...
2020-07-08 16:10:00,,,,
2020-07-08 16:15:00,,,,
2020-07-08 16:20:00,30.291008,-97.749410,153.600006,18.773000
2020-07-08 16:25:00,,,,
