# Exploring the UTx000 Extension Beiwe Data
(Known as BPEACE2 in the [GH repo](https://github.com/intelligent-environments-lab/utx000))

# Determining if participants were home when completing EMAs
We want to use the GPS data and timestamps of completed EMAs to see if the participant was home when they submitted their EMA. 

In [1]:
import warnings
warnings.filterwarnings('ignore')

# Package Import

In [2]:
import sys
sys.path.append('../')

from src.features import build_features
from src.visualization import visualize

import pandas as pd
pd.set_option('display.max_columns', 200)
import numpy as np

from datetime import datetime, timedelta
import math

import matplotlib.pyplot as plt
import matplotlib.dates as mdates

import geopy.distance

# Data Import

## GPS Data
The GPS data are available in the ```processed``` directory and already downsampled to 1-minute increments.

In [3]:
gps = pd.read_csv('../data/processed/beiwe-gps-ux_s20.csv', index_col="timestamp", parse_dates=True, infer_datetime_format=True)
gps.drop(["utc","altitude","accuracy"],axis="columns",inplace=True)
gps.dropna(inplace=True)
gps.tail()

Unnamed: 0_level_0,lat,long,beiwe
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2020-09-02 04:16:00,30.36721,-97.79321,zdpffrox
2020-09-02 04:26:00,30.36725,-97.79324,zdpffrox
2020-09-02 04:27:00,30.36728,-97.79331,zdpffrox
2020-09-02 04:29:00,30.36718,-97.79324,zdpffrox
2020-09-02 04:37:00,30.36718,-97.79324,zdpffrox


In [4]:
print(f"Number of participants: {len(gps['beiwe'].unique())}")

Number of participants: 52


## Address Information
We will need the address information from participants in order to determine if the participant is home or not.

In [5]:
info = pd.read_excel('../data/raw/utx000/admin/id_crossover.xlsx',sheet_name='beacon')
info.drop(["return_date","volume","housemates","roommates","n_rooms","no2_sensor","original_start","original_end","original_move","original_address","second address","lat3","long3","third address"],axis="columns",inplace=True)
info.dropna(subset=["beacon","lat","long"],inplace=True)
info.head()

Unnamed: 0,redcap,beiwe,beacon,start_date,end_date,move_date,lat,long,lat2,long2
0,70.0,2xtqkfz1,32.0,2020-06-12 10:57:00,2020-06-29,NaT,32.92309,-96.9628,,
1,37.0,4i7679py,44.0,2020-06-16 15:28:00,2020-08-22,2020-08-21,33.03598,-96.65369,30.288375,-97.743444
2,38.0,5fvmg226,23.0,2020-06-04 11:52:00,2020-09-30,NaT,30.2415,-97.71547,,
3,55.0,745vq78e,5.0,2020-06-09 10:21:00,2020-08-23,NaT,30.28798,-97.749045,,
4,36.0,9jtzsuu8,15.0,2020-06-08 15:08:00,2020-09-09,2020-08-24,33.142,-97.1135,30.291221,-97.746498


In [6]:
info = info[info["lat2"].isnull()].drop(["lat2","long2"],axis="columns")
info.sort_values("redcap",inplace=True)

In [7]:
print(f"Number of participants: {len(info['beiwe'].unique())}")

Number of participants: 14


## EMA Data
We need the morning and evening EMAs because we are interested in mood, not sleep quality this time ;) 

In [4]:
morning_ema = pd.read_csv("../data/processed/beiwe-morning_ema-ux_s20.csv",parse_dates=["timestamp"],infer_datetime_format=True)
morning_ema.drop(["tst","sol","naw","restful"],axis="columns",inplace=True) # don't need and dropping to combine
evening_ema = pd.read_csv("../data/processed/beiwe-evening_ema-ux_s20.csv",parse_dates=["timestamp"],infer_datetime_format=True)
ema = morning_ema.append(evening_ema)
ema.reset_index(inplace=True,drop=True)
ema.dropna(inplace=True)
ema.head()

Unnamed: 0,timestamp,beiwe,content,stress,lonely,sad,energy,redcap,beacon
0,2020-05-13 09:10:27,qh34m4r9,3.0,0.0,0.0,0.0,1.0,68,19.0
1,2020-05-13 09:15:49,awa8uces,0.0,2.0,1.0,1.0,1.0,28,26.0
2,2020-05-13 09:42:19,xxvnhauv,1.0,1.0,1.0,3.0,0.0,21,22.0
5,2020-05-13 10:32:23,mm69prai,1.0,0.0,1.0,1.0,1.0,62,13.0
6,2020-05-13 10:35:08,tlmlq19s,2.0,1.0,0.0,0.0,3.0,47,36.0


## Homestay
Data from Peter

In [8]:
homestay = pd.read_csv("../data/processed/beiwe-homestay-ux_s20.csv",index_col=0,parse_dates=["start","end"],infer_datetime_format=True)
homestay.head()

Unnamed: 0,home.lat,home.long,start,end,nth.stay,beiwe,when,t
0,30.288063,-97.748945,2020-06-09 21:08:03,2020-06-09 23:03:48,1,745vq78e,nomove,6945.0
1,30.288063,-97.748945,2020-06-10 00:00:33,2020-06-10 11:25:42,2,745vq78e,nomove,41109.0
2,30.288063,-97.748945,2020-06-10 12:51:43,2020-06-12 00:34:05,3,745vq78e,nomove,128542.0
3,30.288063,-97.748945,2020-06-12 01:01:25,2020-06-12 02:29:20,4,745vq78e,nomove,5275.0
4,30.288063,-97.748945,2020-06-12 03:02:26,2020-06-12 03:55:25,5,745vq78e,nomove,3179.0


# Getting Location When EMA is Completed
We can create a function that gets the coordinates of the participants' locations at a time `t`.

## My Algorithm

In [10]:
def get_coordinates(t, gps, pt, id_var="beiwe", window=10):
    """
    Gets GPS coordinates for a given participant
    
    Inputs:
    - t: datetime corresponding to the time point of interest
    - gps: dataframe of GPS coordinates for all participants
    - pt: string of participant
    - id_var: string of identifying variable
    - window: integer of plus/minus time to look for GPS coordinates
    
    Returns mean coordinates during plus/minus window as [lat,long] and gps timepoint
    """
    gps_by_pt = gps[gps[id_var] == pt] # gps data for given participant
    if len(gps_by_pt) > 0:
        timeframe = [t - timedelta(minutes=window),t + timedelta(minutes=window)] # getting timeframe to average gps coordinates over
        gps_by_pt_in_window = gps_by_pt[timeframe[0]:timeframe[-1]] # restricting to timeframe
        if len(gps_by_pt_in_window) > 0:
            return [np.nanmean(gps_by_pt_in_window["lat"]), np.nanmean(gps_by_pt_in_window["long"])], gps_by_pt_in_window.index[-1] # returning mean lat/long coordinates
        
    return [np.nan, np.nan], np.nan

# Getting Time Only When Participants are Home
We can use the GPS coordinates from Beiwe and the addresses to filter out the data so that we can determine the time that participants are home.

## My Algorithm

In [11]:
def get_time_when_home(gps_df,info_df,radius=100,verbose=False):
    """returns gps data only from times when participants are home"""
    gps_with_distance = pd.DataFrame()
    for pt in info["beiwe"].unique():
        # getting data by pt
        gps_pt = gps_df[gps_df['beiwe'] == pt]
        info_pt = info_df[info_df['beiwe'] == pt]
        if verbose:
            print(f'Working for Participant {pt} - Beacon', int(info_pt['beacon'].values[0]))
        # getting pt address points
        lat_pt1 = info_pt['lat'].values[0]
        long_pt1 = info_pt['long'].values[0]
        coords_add_1 = (lat_pt1, long_pt1)
        # Getting distances to address from coordinates
        d1 = []
        for lat, long in zip(gps_pt["lat"].values,gps_pt["long"].values):
            d1.append(geopy.distance.distance(coords_add_1, (lat,long)).m)
            
        gps_pt["d1"] = d1
        gps_with_distance = gps_with_distance.append(gps_pt)
        
    return gps_with_distance[(gps_with_distance["d1"] < radius)], gps_with_distance

In [12]:
gps_home, gps_dist = get_time_when_home(gps,info,verbose=True)
print("Number of datapoints:",len(gps_home))

Working for Participant rnse61g4 - Beacon 34
Working for Participant hxj6brwj - Beacon 28
Working for Participant xdbdrk6e - Beacon 24
Working for Participant awa8uces - Beacon 26
Working for Participant rj4lxgvp - Beacon 48
Working for Participant nvtfpaor - Beacon 46
Working for Participant vr9j5rry - Beacon 25
Working for Participant 5fvmg226 - Beacon 23
Working for Participant tmexej5v - Beacon 29
Working for Participant xlw5ntd5 - Beacon 10
Working for Participant 745vq78e - Beacon 5
Working for Participant mm69prai - Beacon 13
Working for Participant 2xtqkfz1 - Beacon 32
Working for Participant i31pt4b4 - Beacon 11
Number of datapoints: 88911


# Checking if Participant was Home When Survey was Submitted

## My Algorithm

In [13]:
def get_ema_location(gps_df,ema_df):
    """appends the location to the ema dataframe"""
    ema_with_loc = pd.DataFrame()
    # looping through each participant
    for pt in gps_df["beiwe"].unique():
        # gps
        gps_home_by_pt = gps_df[gps_df["beiwe"] == pt]
        gps_home_by_pt["time"] = gps_home_by_pt.index
        gps_home_by_pt["dt"] = (gps_home_by_pt["time"] - gps_home_by_pt["time"].shift(1)).dt.total_seconds() / 60
        # ema
        ema_by_pt = ema_df[ema_df["beiwe"] == pt]
        lats = []
        longs = []
        # looping through participant EMA submissions
        for submission in ema_by_pt["timestamp"]:
            loc, timestamp = get_coordinates(submission,gps_home_by_pt,pt)
            # if one of the loc coordinates is NaN, then the pt had no home gps data - this works because we are passing home gps ONLY
            if math.isnan(loc[0]):
                lats.append(np.nan)
                longs.append(np.nan)
            else:
                lats.append(loc[0])
                longs.append(loc[1])
        # appending location and adding to overall df
        ema_by_pt["lat"] = lats
        ema_by_pt["long"] = longs
        ema_with_loc = ema_with_loc.append(ema_by_pt.dropna())
        
    return ema_with_loc

### Getting the Numbers

In [14]:
ema_loc = get_ema_location(gps_home,ema)
ema_loc.head()

Unnamed: 0,timestamp,beiwe,content,stress,lonely,sad,energy,redcap,beacon,lat,long
57,2020-05-17 09:54:09,rnse61g4,3,0.0,0.0,0.0,2.0,15,34.0,30.28575,-97.744063
331,2020-05-31 20:54:26,rnse61g4,2,0.0,0.0,1.0,2.0,15,34.0,30.285645,-97.744165
931,2020-06-24 10:30:45,rnse61g4,2,1.0,0.0,0.0,2.0,15,34.0,30.285702,-97.744122
1187,2020-07-05 11:07:38,rnse61g4,2,0.0,0.0,0.0,1.0,15,34.0,30.2857,-97.7441
1649,2020-07-24 11:13:19,rnse61g4,3,0.0,0.0,0.0,3.0,15,34.0,30.28565,-97.744207


In [15]:
print("Number of Participants:", len(ema_loc["beiwe"].unique()))
print("Number of Surveys Completed at Home:",len(ema_loc))
print("Breakdown:")
n = ema_loc["beiwe"].value_counts()
for index, val in zip(n.index,n.values):
    print(f"\t{index} - {val}")

Number of Participants: 12
Number of Surveys Completed at Home: 661
Breakdown:
	xdbdrk6e - 92
	hxj6brwj - 81
	awa8uces - 75
	tmexej5v - 63
	mm69prai - 58
	745vq78e - 54
	xlw5ntd5 - 49
	vr9j5rry - 47
	5fvmg226 - 45
	i31pt4b4 - 39
	2xtqkfz1 - 37
	rnse61g4 - 21


### Saving Results

In [16]:
ema_loc.to_csv("../data/processed/beiwe-ema_at_home-ux_s20.csv",index=False)

## With Peter's Data

In [6]:
def get_emas_when_home(ema_df,homestay_df):
    """
    Returns only the emas that were completed at home
    """
    df_ema = ema_df.copy()
    df_home = homestay_df.copy()
    home = []
    time_at_home = []
    for pt in df_ema["beiwe"].unique():
        ema_pt = df_ema[df_ema["beiwe"] == pt]
        homestay_pt = homestay_df[homestay_df["beiwe"] == pt]
        for submission in ema_pt["timestamp"]:
            found = False
            for s, e in zip(homestay_pt["start"],homestay_pt["end"]):
                if submission > s and submission < e:
                    home.append(1)
                    found = True
                    time_at_home.append((submission - s).total_seconds())
                    break

            if found == False:
                home.append(0)
                time_at_home.append(0)


    df_ema["home"] = home
    df_ema["time_at_home"] = time_at_home
    ema_loc_homestay = df_ema[df_ema["home"] == 1]
    ema_loc_homestay.drop(["home"],axis="columns",inplace=True)
    return ema_loc_homestay

### Saving Results

In [13]:
ema_loc_homestay = get_emas_when_home(ema,homestay)
ema_loc_homestay.to_csv("../data/processed/beiwe-ema_at_home_v2-ux_s20.csv",index=False)

In [15]:
ema_evening_homestay = get_emas_when_home(evening_ema,homestay)
ema_evening_homestay.to_csv("../data/processed/beiwe-ema_evening_at_home-ux_s20.csv",index=False)

In [16]:
ema_morning_homestay = get_emas_when_home(morning_ema,homestay)
ema_morning_homestay.to_csv("../data/processed/beiwe-ema_morning_at_home-ux_s20.csv",index=False)

## Comparison

In [19]:
print("Number of surveys from my algorithm:", len(ema_loc))
print("Number of surveys from Peter's algorithm:", len(ema_loc_homestay))
emas_at_home = ema_loc.merge(right=ema_loc_homestay,on="timestamp")
print("Number of merged surveys:", len(emas_at_home))

Number of surveys from my algorithm: 661
Number of surveys from Peter's algorithm: 821
Number of merged surveys: 225


<div class="alert alert-block alert-warning">

The two algorithms seem to return a different set of EMAs which is interesting although I trust Peter's algorithm more than mine :) 
    
</div>

# Related Analysis

## Inspecting Distances and Coordinates of Addresses
Checking to see if the address coordinates really make sense...

In [20]:
def get_common_coordinates(gps_df,pt):
    """gets the commonly occuring GPS coordinates"""
    gps_by_pt = gps_df[gps_df["beiwe"] == pt]
    
    try:
        lats = list(gps_by_pt["lat"].values)
        lat_mode = max(set(lats), key=lats.count)
        longs = list(gps_by_pt["long"].values)
        long_mode = max(set(longs), key=longs.count)
    except ValueError as e:
        print(e)
        return [np.nan,np.nan]
    
    return [lat_mode,long_mode]

In [21]:
def inpsect_distances(gps_with_d, pt, byvar="beiwe",ylim=1000):
    """Plots distances for participant"""
    df_to_plot = gps_with_d[gps_with_d[byvar] == pt]
    fig, ax = plt.subplots(figsize=(24,6))
    ax.scatter(df_to_plot.index,df_to_plot["d1"],s=5,color="black")
    # x-axis
    ax.set_xlim([datetime(2020,5,1),datetime(2020,9,1)])
    # y-axis
    ax.set_ylim([0,ylim])
    for loc in ["top","right"]:
        ax.spines[loc].set_visible(False)
        
    plt.show()
    plt.close()

In [22]:
for pt in info["beiwe"].unique():
    coords = get_common_coordinates(gps,pt)
    info_by_pt = info[info["beiwe"] == pt]
    print(f"Participant {pt}:\n\tFrom Address:\t({round(info_by_pt['lat'].values[0],6)},{round(info_by_pt['long'].values[0],6)})\n\tEstimate:\t({round(coords[0],6)},{round(coords[1],6)})")
    #inpsect_distances(gps_dist,pt,ylim=400)

Participant rnse61g4:
	From Address:	(30.2857,-97.7441)
	Estimate:	(30.2857,-97.74433)
Participant hxj6brwj:
	From Address:	(30.28007,-97.74115)
	Estimate:	(30.28007,-97.74115)
Participant xdbdrk6e:
	From Address:	(33.13661,-96.62536)
	Estimate:	(33.13661,-96.62536)
Participant awa8uces:
	From Address:	(30.04939,-95.5055)
	Estimate:	(30.04939,-95.5055)
max() arg is an empty sequence
Participant rj4lxgvp:
	From Address:	(30.286894,-97.748909)
	Estimate:	(nan,nan)
max() arg is an empty sequence
Participant nvtfpaor:
	From Address:	(27.602539,-99.463737)
	Estimate:	(nan,nan)
Participant vr9j5rry:
	From Address:	(30.35748,-97.75179)
	Estimate:	(30.35748,-97.75179)
Participant 5fvmg226:
	From Address:	(30.2415,-97.71547)
	Estimate:	(30.2415,-97.71547)
Participant tmexej5v:
	From Address:	(30.39634,-97.64425)
	Estimate:	(30.39634,-97.64424)
Participant xlw5ntd5:
	From Address:	(30.35162,-97.61234)
	Estimate:	(30.35162,-97.61234)
Participant 745vq78e:
	From Address:	(30.28798,-97.749045)
	Est