In [3]:
# OPTIONAL: Load the "autoreload" extension so that code can change
%load_ext autoreload

# OPTIONAL: always reload modules so that as you change code in src, it gets loaded
%autoreload 2

In [4]:
import sys
sys.path.append('../')

# importing all source code
from src.features import build_features
from src.visualization import visualize
from src.data import make_dataset
from src.reports import make_report

import pandas as pd
import numpy as np

In [77]:
import os
from datetime import datetime,timedelta

import pandas as pd
import numpy as np
import scipy.stats as stats

class bpeace2:
    
    def __init__(self):
        self.study='bpeace2'
    
    def process_weekly_surveys(self, data_dir = '../data/raw/bpeace2/beiwe/'):
        '''
        Processes raw weekly survey answers and timing. The encoding is:
        - eQ2L3J08ChlsdSXXKOoOjyLJ: morning
        - 7TaT8zapOWO0xdtONnsY8CE0: evening
        
        Parameters:
        - 
        
        Returns 
        '''
        # defining some variables for ease of understanding
        parent_dir = '../data/raw/bpeace2/beiwe/survey_answers/'
        morning_survey_id = 'eQ2L3J08ChlsdSXXKOoOjyLJ'
        evening_survey_id = '7TaT8zapOWO0xdtONnsY8CE0'
        
        # defining the final dataframes to append to
        evening_survey_df = pd.DataFrame()
        morning_survey_df = pd.DataFrame()
        
        # Morning Survey Data
        # -------------------
        # looping through the participants and then all their data
        for participant in os.listdir(parent_dir):
            # making sure we don't read from any hidden directories/files
            if len(participant) == 8:
                pid = participant
                participant_df = pd.DataFrame(columns=['ID','Content','Stress','Lonely','Sad','Energy','TST','SOL','NAW','Restful'])
            
                for file in os.listdir(f'{parent_dir}{participant}/survey_answers/{morning_survey_id}/'):
                    # reading raw data
                    df = pd.read_csv(f'{parent_dir}{participant}/survey_answers/{morning_survey_id}/{file}')
                    # adding new row
                    try:
                        participant_df.loc[datetime.strptime(file[:-4],'%Y-%m-%d %H_%M_%S')] = [pid,df.loc[4,'answer'],df.loc[5,'answer'],df.loc[6,'answer'],df.loc[7,'answer'],df.loc[8,'answer'],
                                                                                               df.loc[0,'answer'],df.loc[1,'answer'],df.loc[2,'answer'],df.loc[3,'answer']]
                    except KeyError:
                        print(f'Problem with morning survey {file} for Participant {pid} - Participant most likely did not answer a question')
                        #self.move_to_purgatory(f'{parent_dir}{participant}/survey_answers/{morning_survey_id}/{file}',f'../../data/purgatory/{self.study}-{pid}-survey-morning-{file}')
            
                # appending participant df to overall df
                morning_survey_df = morning_survey_df.append(participant_df)
            else:
                print(f'Directory {participant} is not valid')
        
        # replacing string values with numeric
        morning_survey_df.replace({'Not at all':0,'A little bit':1,'Quite a bit':2,'Very Much':3},inplace=True)
        morning_survey_df.replace({'Low energy':0, 'Somewhat low energy':1,'Neutral':2,'Somewhat high energy':3,'High Energy':4},inplace=True)
        
        # Evening Survey Data
        # -------------------
        for participant in os.listdir(parent_dir):
            if len(participant) == 8:
                pid = participant
                # less columns
                participant_df = pd.DataFrame(columns=['ID','Content','Stress','Lonely','Sad','Energy'])
            
                for file in os.listdir(f'{parent_dir}{participant}/survey_answers/{evening_survey_id}/'):
                    df = pd.read_csv(f'{parent_dir}{participant}/survey_answers/{evening_survey_id}/{file}')
                    try:
                        participant_df.loc[datetime.strptime(file[:-4],'%Y-%m-%d %H_%M_%S')] = [pid,df.loc[0,'answer'],df.loc[1,'answer'],df.loc[2,'answer'],df.loc[3,'answer'],df.loc[4,'answer']]
                    except KeyError:
                        print(f'Problem with evening survey {file} for Participant {pid} - Participant most likely did not answer a question')
                        #self.move_to_purgatory(f'{parent_dir}{participant}/survey_answers/{evening_survey_id}/{file}',f'../../data/purgatory/{self.study}-{pid}-survey-evening-{file}')
            
                evening_survey_df = evening_survey_df.append(participant_df)
            else:
                print(f'Directory {participant} is not valid')
                
        evening_survey_df.replace({'Not at all':0,'A little bit':1,'Quite a bit':2,'Very Much':3},inplace=True)
        evening_survey_df.replace({'Low energy':0, 'Somewhat low energy':1,'Neutral':2,'Somewhat high energy':3,'High Energy':4},inplace=True)
        
        return morning_survey_df, evening_survey_df
    
    def process_gps(self, data_dir = '/Volumes/HEF_Dissertation_Research/utx000/extension/data/beiwe/gps/'):
        '''
        Processes the raw gps data into one csv file for each participant and saves into /data/processed/

        Returns 
        '''
        gps_df = pd.DataFrame()
        for participant in os.listdir(data_dir):
            if len(participant) == 8:
                pid = participant
                print(f'Working for Participant: {pid}')
                participant_df = pd.DataFrame()
                for file in os.listdir(f'{data_dir}{pid}/gps/'):
                    if file[-1] == 'v':
                        try:
                            hourly_df = pd.read_csv(f'{data_dir}{pid}/gps/{file}',usecols=[1,2,3,4,5])
                        except KeyError:
                            print(f'Problem with evening survey {file} for Participant {pid}')
                            #self.move_to_purgatory(f'{parent_dir}{participant}/survey_answers/{evening_survey_id}/{file}',f'../../data/purgatory/{self.study}-{pid}-survey-evening-{file}')
                    
                        if len(hourly_df) > 0:
                            participant_df = participant_df.append(hourly_df,ignore_index=True)
                    
                participant_df['Time'] = pd.to_datetime(participant_df['UTC time']) - timedelta(hours=5)
                participant_df.drop(['UTC time'],axis=1,inplace=True)
                participant_df.set_index('Time',inplace=True)
                participant_df = participant_df.resample('5T').mean()
                
                gps_df = gps_df.append(participant_df)
                
        gps_df.columns = ['Lat','Long','Alt','Accuracy']
        return gps_df
    
    

In [78]:
test = bpeace2()

In [79]:
df = test.process_gps()

Working for Participant: 2xtqkfz1
Working for Participant: 4i7679py
Working for Participant: 5fvmg226
Working for Participant: 745vq78e
Working for Participant: 9jtzsuu8
Working for Participant: 9xmhtq74
Working for Participant: awa8uces
Working for Participant: e73a1pd5
Working for Participant: hcpu5myv
Working for Participant: hxj6brwj
Working for Participant: i31pt4b4
Working for Participant: idbkjh8u
Working for Participant: itmylz3g
Working for Participant: kyj367pi
Working for Participant: lkkjddam
Working for Participant: mm69prai
Working for Participant: olaxadz5
Working for Participant: qh34m4r9
Working for Participant: rnse61g4
Working for Participant: tlmlq19s
Working for Participant: tmexej5v
Working for Participant: vr9j5rry
Working for Participant: xdbdrk6e
Working for Participant: xlw5ntd5
Working for Participant: xxvnhauv


In [66]:
df

In [67]:
df1['Time'] = pd.to_datetime(df1['UTC time']) - timedelta(hours=5)
df1.drop(['UTC time'],axis=1,inplace=True)
df1.set_index('Time',inplace=True)
df1.resample('5T').mode()

KeyError: 'UTC time'

In [68]:
result = frame.resample("M").apply({'a':'sum', 'b':lambda x: stats.mode(x)[0]})

AttributeError: 'DatetimeIndexResampler' object has no attribute 'mode'

In [74]:
df2 = df1.resample('1h').apply({lambda x: stats.mode(x)[0]})

In [75]:
df2

Unnamed: 0_level_0,latitude,longitude,altitude,accuracy
Unnamed: 0_level_1,<lambda>,<lambda>,<lambda>,<lambda>
Time,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
2020-06-01 10:00:00,32.923,-96.9628,152.328,65
2020-06-01 11:00:00,[],[],[],[]
2020-06-01 12:00:00,[],[],[],[]
2020-06-01 13:00:00,[],[],[],[]
2020-06-01 14:00:00,[],[],[],[]
...,...,...,...,...
2020-08-31 11:00:00,[],[],[],[]
2020-08-31 12:00:00,[],[],[],[]
2020-08-31 13:00:00,32.8964,-96.971,152,2500
2020-08-31 14:00:00,[],[],[],[]


In [76]:
df2.plot()

TypeError: no numeric data to plot