# 4 Detailed Preprocessing of Passive Data

This notebook shows the analysis of situational context using EMA and passive sensing data

1. **Load Data**: Load necessary data from pickle files.
2. **Preprocess EMA**:

In [1]:
import os
import sys
import regex as re
# If your current working directory is the notebooks directory, use this:
notebook_dir = os.getcwd()  # current working directory
src_path = os.path.abspath(os.path.join(notebook_dir, '..', 'src'))
sys.path.append(src_path)

# Add the parent directory to sys.path
parent_dir = os.path.abspath(os.path.join(notebook_dir, '..'))
sys.path.append(parent_dir)
import glob
import pickle
from IPython.display import Markdown
from config import datapath, preprocessed_path

import pandas as pd
import numpy as np
import datetime as dt

from sklearn.cluster import DBSCAN
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances_argmin_min
from math import radians, cos, sin, asin, sqrt
import statistics  # Make sure this is imported


import matplotlib.pyplot as plt
from matplotlib import rcParams
import seaborn as sns 
import matplotlib.patches as mpatches

sns.set_context("notebook", rc={"axes.labelsize": 14, "xtick.labelsize": 14, "ytick.labelsize": 14})
sns.set_style("whitegrid", {'axes.grid': True})
%matplotlib inline
import plotly.express as px


In [2]:
backup_path = preprocessed_path + "backup_data_passive_actual.feather"
df_backup = pd.read_feather(backup_path)

with open(preprocessed_path + '/ema_data.pkl', 'rb') as file:
    df_ema_framework = pickle.load(file)

with open(preprocessed_path + '/ema_content.pkl', 'rb') as file:
    df_ema_content = pickle.load(file)  

with open(preprocessed_path + '/monitoring_data.pkl', 'rb') as file:
    df_monitoring = pickle.load(file)

In [3]:
df_backup.head()

Unnamed: 0,customer,type,startTimestamp,endTimestamp,doubleValue,longValue,booleanValue,dateValue,stringValue,userReliability,...,Sleep_actual_days_with_data,Sleep_data_coverage_per,Heart_Rate_actual_days_with_data,Heart_Rate_data_coverage_per,ema_relative_start_phase0,ema_relative_end_phase0,ema_relative_start_phase1,ema_relative_end_phase1,ema_relative_start_phase2,ema_relative_end_phase2
0,4MLe,Steps,2023-05-17 18:44:00,2023-05-17 18:45:00,6.0,,False,,-99,,...,329.0,71.366594,343.0,74.403471,2023-05-17,2023-06-01,2023-10-27,2023-11-11,2024-08-15,2024-08-18
1,4MLe,ActiveBurnedCalories,2023-05-17 18:44:00,2023-05-17 18:45:00,0.14,,False,,-99,,...,329.0,71.366594,343.0,74.403471,2023-05-17,2023-06-01,2023-10-27,2023-11-11,2024-08-15,2024-08-18
2,4MLe,CoveredDistance,2023-05-17 18:44:00,2023-05-17 18:45:00,4.62,,False,,-99,,...,329.0,71.366594,343.0,74.403471,2023-05-17,2023-06-01,2023-10-27,2023-11-11,2024-08-15,2024-08-18
3,4MLe,HeartRate,2023-05-17 18:58:01,2023-05-17 18:58:38,,74.0,False,,-99,,...,329.0,71.366594,343.0,74.403471,2023-05-17,2023-06-01,2023-10-27,2023-11-11,2024-08-15,2024-08-18
4,4MLe,ActiveBurnedCalories,2023-05-17 19:04:00,2023-05-17 19:05:00,0.36,,False,,-99,,...,329.0,71.366594,343.0,74.403471,2023-05-17,2023-06-01,2023-10-27,2023-11-11,2024-08-15,2024-08-18


In [4]:
# Configurations
# Check min. amount of EMA data available to map to passive data

timedelta_hours = 2
assess = 0

#GPS data
speed_limit = 1.4
max_distance = 150 
kms_per_radian = 6371.0088
epsilon = 0.05/kms_per_radian
min_samples = 30
min_nights_obs = 5
min_f_home = 0.5



## 1. Prepare passive features

In [5]:
df_pass_act = df_backup.copy()

In [6]:
# Only keep data that were collected during the first assessment phase
df_pass_act_base = df_pass_act[df_pass_act.startTimestamp <= df_pass_act.ema_base_end]

### 1.1 Calculate GPS features

In [7]:
df_pass_act_loc =df_pass_act_base[df_pass_act_base.type.isin(["Latitude", "Longitude"])][["customer", "startTimestamp", "type", "doubleValue"]]

In [8]:
df_loc = df_pass_act_loc.pivot_table(
    index=["customer", "startTimestamp"],
    columns="type",
    values=["doubleValue"],
    aggfunc='first'  # Using 'first' since each type should theoretically have only one entry per customer and timestamp
)

# Flatten the MultiIndex in columns
df_loc.columns = ['_'.join(col).strip() for col in df_loc.columns.values]

df_loc = df_loc.rename_axis(None, axis=1).reset_index()

# Rename the columns for clarity
df_loc = df_loc.rename(columns={
    'doubleValue_Latitude': 'Latitude',
    'doubleValue_Longitude': 'Longitude',
})

In [9]:
import pandas as pd
import numpy as np
from sklearn.cluster import DBSCAN

class HomeClusterExtractor:
    def __init__(self, df, speed_limit, max_distance, epsilon, min_samples, min_nights_obs, min_f_home):
        self.df = df.copy()
        self.speed_limit = speed_limit
        self.max_distance = max_distance
        self.epsilon = epsilon
        self.min_samples = min_samples
        self.min_nights_obs = min_nights_obs
        self.min_f_home = min_f_home
        
        self.df['hour_gps'] = self.df['startTimestamp'].dt.hour
        self.df['day_gps'] = self.df['startTimestamp'].dt.date

    def calculate_distances_and_speeds(self):
        """Calculate distances and speeds for each customer."""
        self.df['distance'], self.df['time_diff'], self.df['speed'] = np.nan, np.nan, np.nan

        for customer in self.df['customer'].unique():
            mask = self.df['customer'] == customer
            customer_data = self.df.loc[mask]

            # Calculate distances between consecutive points
            distances = self._calculate_distances(customer_data)
            time_diffs = customer_data['startTimestamp'].diff().dt.total_seconds().fillna(0)
            speeds = distances / time_diffs.replace(0, np.nan)

            self.df.loc[mask, 'distance'] = distances
            self.df.loc[mask, 'time_diff'] = time_diffs
            self.df.loc[mask, 'speed'] = speeds

    def calculate_transition(self):
        """Calculate transition (movement) status based on speed and distance."""
        self.df['transition'] = np.where(
            (self.df['speed'] >= self.speed_limit) | (self.df['distance'] >= self.max_distance),
            1,  # In transition
            0   # Not in transition
        )

    def _calculate_distances(self, df):
        """Helper method to calculate distances using haversine formula."""
        coords = df[['Latitude', 'Longitude']].values
        distances = np.array([
            self._haversine(coords[i-1][1], coords[i-1][0], coords[i][1], coords[i][0])
            for i in range(1, len(coords))
        ])
        return np.append(distances, 0)  # Append 0 for the last point

    def _haversine(self, lon1, lat1, lon2, lat2):
        """Haversine formula to calculate distance between two lat/lon points in meters."""
        R = 6371000  # Radius of Earth in meters
        phi_1 = np.radians(lat1)
        phi_2 = np.radians(lat2)
        delta_phi = np.radians(lat2 - lat1)
        delta_lambda = np.radians(lon2 - lon1)
        a = np.sin(delta_phi/2.0)**2 + np.cos(phi_1) * np.cos(phi_2) * np.sin(delta_lambda/2.0)**2
        c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1-a))
        meters = R * c  # Output distance in meters
        return meters

    def extract_stationary_points(self):
        """Filter stationary points based on speed and distance."""
        return self.df[(self.df['speed'] < self.speed_limit) & (self.df['distance'] < self.max_distance)]

    def apply_clustering(self, df):
        """Apply DBSCAN clustering on stationary points."""
        return df.groupby('customer').apply(self._apply_dbscan).reset_index(drop=True)

    def _apply_dbscan(self, df):
        """Helper method to apply DBSCAN clustering."""
        clustering_model = DBSCAN(eps=self.epsilon, min_samples=self.min_samples, metric="haversine")
        cluster_labels = clustering_model.fit_predict(df[['Longitude', 'Latitude']].apply(np.radians))
        return pd.DataFrame({'cluster': cluster_labels}, index=df.index)

    def find_home_cluster(self, geodata_clusters):
        """Identify the home cluster based on nighttime data."""
        # Filter for night hours
        geodata_night = geodata_clusters.loc[
            (geodata_clusters['hour_gps'] >= 20) | (geodata_clusters['hour_gps'] <= 7)
        ].copy()

        # Filter out any rows with a clusterID of -1
        geodata_night = geodata_night[geodata_night['clusterID'] != 'None']

        # Calculate the mode of clusterID per user during night hours
        geodata_night['home'] = geodata_night.groupby('customer')['clusterID'].transform(
            lambda x: statistics.mode(x)
        )

        # Calculate various metrics to validate the home cluster
        geodata_night['nights_with_obs'] = geodata_night.groupby('customer')['day_gps'].transform('nunique')
        geodata_night['night_obs'] = geodata_night.groupby('customer')['day_gps'].transform('size')
        geodata_night['n_home'] = geodata_night.groupby('customer')['home'].transform(lambda x: x.value_counts().iloc[0])
        geodata_night['f_home'] = geodata_night['n_home'] / geodata_night['night_obs']

        # Update the 'home' label based on conditions
        geodata_night['home'] = geodata_night.apply(
            lambda x: x['home'] if x['nights_with_obs'] >= self.min_nights_obs and x['f_home'] > self.min_f_home else None, axis=1)

        # Extract a mapping of userID to home cluster
        user_home_mapping = geodata_night[['customer', 'home']].drop_duplicates()

        # Merging back to the full dataset
        return pd.merge(geodata_clusters, user_home_mapping, on='customer', how='left')
    
    def determine_if_at_home(self, df):
        """Determine if a person is at home."""
        df['at_home'] = df.apply(
            lambda x: 1 if x['clusterID'] == x['home'] else (0 if pd.notna(x['home']) else -1), axis=1
        )
        return df

    def run(self):
        """Run the full extraction process."""
        self.calculate_distances_and_speeds()
        self.calculate_transition()

        # Ensure transition is correctly added to geodata_clusters
        geodata_clusters = self.df.copy()

        stationary_df = self.extract_stationary_points()
        geodata_cluster_df = self.apply_clustering(stationary_df)

        geodata_clusters = pd.concat([geodata_clusters.reset_index(drop=True), geodata_cluster_df['cluster']], axis=1)
        geodata_clusters = geodata_clusters[geodata_clusters['cluster'] != -1]
        geodata_clusters['clusterID'] = geodata_clusters['customer'].astype(str) + '00' + geodata_clusters['cluster'].astype(str)

        geodata_clusters = self.find_home_cluster(geodata_clusters)
        geodata_clusters = self.determine_if_at_home(geodata_clusters)

        return geodata_clusters


In [10]:
# Instantiate the class
extractor = HomeClusterExtractor(
    df=df_loc,
    speed_limit=speed_limit,
    max_distance=max_distance,
    epsilon=epsilon,
    min_samples=min_samples,
    min_nights_obs=min_nights_obs,
    min_f_home=min_f_home
)

# Run the extraction process
home_clusters = extractor.run()

In [11]:
home_clusters.transition.describe()

count    1.013856e+06
mean     6.309279e-01
std      4.825538e-01
min      0.000000e+00
25%      0.000000e+00
50%      1.000000e+00
75%      1.000000e+00
max      1.000000e+00
Name: transition, dtype: float64

In [12]:
home_clusters_red = home_clusters[["customer", "startTimestamp", "at_home","transition" ]]

In [13]:
home_clusters

Unnamed: 0,customer,startTimestamp,Latitude,Longitude,hour_gps,day_gps,distance,time_diff,speed,transition,cluster,clusterID,home,at_home
0,05kz,2023-10-10 19:35:07,58.255878,-29.427252,19,2023-10-10,0.000000,0.0,,0,0.0,05kz000.0,05kz000.0,1
1,05kz,2023-10-10 19:35:11,58.255878,-29.427252,19,2023-10-10,0.000000,4.0,0.000000,0,0.0,05kz000.0,05kz000.0,1
2,05kz,2023-10-10 19:35:15,58.255878,-29.427252,19,2023-10-10,16.842597,4.0,4.210649,1,0.0,05kz000.0,05kz000.0,1
3,05kz,2023-10-10 23:25:08,58.255878,-29.427252,23,2023-10-10,3150.442913,13461.0,0.234042,1,0.0,05kz000.0,05kz000.0,1
4,05kz,2023-10-11 15:54:59,58.253108,-29.481662,15,2023-10-11,443.166520,8528.0,0.051966,1,0.0,05kz000.0,05kz000.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1013851,zgxc,2023-10-27 16:41:13,-51.442148,-91.319991,16,2023-10-27,10.837161,37.0,0.292896,0,,zgxc00nan,,-1
1013852,zgxc,2023-10-27 16:41:38,-51.442058,-91.319931,16,2023-10-27,3.612388,25.0,0.144496,0,,zgxc00nan,,-1
1013853,zgxc,2023-10-27 16:41:43,-51.442028,-91.319911,16,2023-10-27,7.518023,5.0,1.503605,1,,zgxc00nan,,-1
1013854,zgxc,2023-10-27 16:42:36,-51.442088,-91.319961,16,2023-10-27,15.041611,53.0,0.283804,0,,zgxc00nan,,-1


## 2. Prepare EMA data

In [14]:
df_ema_udi = df_ema_content[["customer", "createdAt_day", "quest_create", "unique_day_id", "assess"]]

In [15]:
# Group by customer and unique_day_id and calculate the minimum quest_create
df_min_quest = df_ema_udi.groupby(['customer', 'unique_day_id'])['quest_create'].min().reset_index()

# Rename the column to sensor_block_end
df_min_quest.rename(columns={'quest_create': 'sensor_block_end'}, inplace=True)

# Merge the minimum quest_create back to the original DataFrame
df_ema_udi = pd.merge(df_ema_udi, df_min_quest, on=['customer', 'unique_day_id'], how='left')

# Create the sensor_block_start column, which is 2 hours before quest_create
df_ema_udi.drop(columns=['quest_create'], inplace=True)
df_ema_udi = df_ema_udi.drop_duplicates()

In [16]:
df_ema_udi['sensor_block_start'] = df_ema_udi['sensor_block_end'] - pd.Timedelta(hours=2)


In [17]:
# Only include first assessment phase
df_ema_udi_base = df_ema_udi.loc[df_ema_udi.assess == 0]

## 3. Merge EMA to passive data

In [18]:
import pandas as pd
import numpy as np

class EMAMapper:
    def __init__(self, df_ema, df_data, df_home_clusters=None):
        self.df_ema = df_ema.copy()
        self.df_data = df_data.copy()

        if df_home_clusters is not None:
            self.df_home_clusters = df_home_clusters.copy()
            self.df_home_clusters['startTimestamp'] = pd.to_datetime(self.df_home_clusters['startTimestamp'])
        else:
            self.df_home_clusters = None

        self.df_ema['sensor_block_start'] = pd.to_datetime(self.df_ema['sensor_block_start'])
        self.df_ema['sensor_block_end'] = pd.to_datetime(self.df_ema['sensor_block_end'])
        self.df_data['startTimestamp'] = pd.to_datetime(self.df_data['startTimestamp'])
        if 'endTimestamp' in self.df_data.columns:
            self.df_data['endTimestamp'] = pd.to_datetime(self.df_data['endTimestamp'])

        self.outlier_distances = []  # To collect filtered outlier distances

    def _haversine(self, lon1, lat1, lon2, lat2):
        lon1, lat1, lon2, lat2 = map(np.radians, [lon1, lat1, lon2, lat2])

        dlon = lon2 - lon1
        dlat = lat2 - lat1

        a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2
        c = 2 * np.arcsin(np.sqrt(a))
        r = 6371  # Radius of Earth in kilometers
        distance = c * r

        return distance

    def count_gps_rows_within_blocks_and_calculate_distances(self, max_distance_km=50):
        gps_counts = []
        total_distances = []

        df_gps = self.df_data[self.df_data['type'].isin(['Latitude', 'Longitude'])]
        df_gps = df_gps.pivot_table(index=["customer", "startTimestamp"], columns="type", values="doubleValue", aggfunc='first').reset_index()

        for idx, ema_row in self.df_ema.iterrows():
            sensor_block_start = ema_row['sensor_block_start']
            sensor_block_end = ema_row['sensor_block_end']

            df_filtered = df_gps[(df_gps['startTimestamp'] >= sensor_block_start) & 
                                 (df_gps['startTimestamp'] <= sensor_block_end)]

            if df_filtered.empty:
                gps_counts.append(0)
                total_distances.append(0)
            else:
                df_filtered = df_filtered[(df_filtered['Latitude'] != 0) & (df_filtered['Longitude'] != 0)]
                df_filtered = df_filtered.dropna(subset=['Latitude', 'Longitude'])

                gps_count = df_filtered.shape[0]
                gps_counts.append(gps_count)

                latitudes = df_filtered['Latitude'].values
                longitudes = df_filtered['Longitude'].values

                distances = []
                for i in range(1, len(latitudes)):
                    distance = self._haversine(longitudes[i-1], latitudes[i-1], longitudes[i], latitudes[i])

                    # Apply max distance outlier detection
                    if distance > max_distance_km:
                        self.outlier_distances.append({
                            'customer': df_filtered.iloc[i]['customer'],
                            'unique_day_id': ema_row['unique_day_id'],
                            'segment': i,
                            'distance_km': distance,
                            'reason': 'Exceeds max distance'
                        })
                        distances.append(0)  # Set outlier distances to 0
                    else:
                        distances.append(distance)

                total_distance = np.sum(distances)
                total_distances.append(total_distance)

        self.df_ema['n_GPS'] = gps_counts
        self.df_ema['total_distance_km'] = total_distances

        return self.df_ema

    def map_steps_to_ema(self):
        n_steps_values = []

        df_steps = self.df_data[self.df_data['type'] == 'Steps']

        for idx, ema_row in self.df_ema.iterrows():
            sensor_block_start = ema_row['sensor_block_start']
            sensor_block_end = ema_row['sensor_block_end']

            df_filtered = df_steps[(df_steps['startTimestamp'] < sensor_block_end) & 
                                   (df_steps['endTimestamp'] > sensor_block_start)]

            if df_filtered.empty:
                n_steps_values.append(0)
            else:
                overlap_start = df_filtered['startTimestamp'].combine(sensor_block_start, max)
                overlap_end = df_filtered['endTimestamp'].combine(sensor_block_end, min)

                overlap_duration = (overlap_end - overlap_start).dt.total_seconds()
                step_duration = (df_filtered['endTimestamp'] - df_filtered['startTimestamp']).dt.total_seconds()

                proportion = overlap_duration / step_duration
                weighted_value = proportion * df_filtered['doubleValue']

                n_steps = weighted_value.sum()
                n_steps_values.append(round(n_steps))

        self.df_ema['n_steps'] = n_steps_values

        return self.df_ema

    def map_at_home_to_ema(self):
        """
        Maps the 'at_home' variable to EMA blocks based on whether the startTimestamp from the df_home_clusters falls within the blocks.

        Returns:
        - df_ema with an additional column 'at_home_percentage' containing the proportion of time spent at home within each block.
        - df_ema with an additional column 'at_home_binary' indicating if the person was at home (1), not at home (0), or unknown (-1).
        """
        if self.df_home_clusters is None:
            raise ValueError("df_home_clusters is not provided during initialization.")

        at_home_percentage_values = []
        at_home_binary_values = []

        for idx, ema_row in self.df_ema.iterrows():
            sensor_block_start = ema_row['sensor_block_start']
            sensor_block_end = ema_row['sensor_block_end']
            customer = ema_row['customer']

            df_filtered = self.df_home_clusters[
                (self.df_home_clusters['startTimestamp'] >= sensor_block_start) & 
                (self.df_home_clusters['startTimestamp'] <= sensor_block_end) & 
                (self.df_home_clusters['customer'] == customer)
            ]

            if df_filtered.empty:
                at_home_percentage_values.append(0)
                at_home_binary_values.append(-1)  # Unknown
            else:
                # Exclude unknown (-1) values from the calculation
                valid_at_home = df_filtered[df_filtered['at_home'] != -1]

                if valid_at_home.empty:
                    at_home_percentage_values.append(0)
                    at_home_binary_values.append(-1)  # Unknown
                else:
                    # Calculate the proportion of time spent at home within the block
                    at_home_percentage = valid_at_home['at_home'].mean() * 100  # Convert proportion to percentage
                    at_home_percentage_values.append(at_home_percentage)

                    # Determine binary at_home status
                    at_home_status = 1 if valid_at_home['at_home'].mean() > 0 else 0
                    at_home_binary_values.append(at_home_status)

        self.df_ema['at_home_percentage'] = at_home_percentage_values
        self.df_ema['at_home_binary'] = at_home_binary_values

        return self.df_ema


In [19]:
# Assuming df_ema, df_data, and df_home_clusters_red are already defined DataFrames
ema_mapper = EMAMapper(df_ema_udi_base, df_pass_act_base, df_home_clusters=home_clusters_red)

# Mapping steps and GPS data
df_ema_with_mapped_values = ema_mapper.map_steps_to_ema()
df_ema_with_gps_counts_distances = ema_mapper.count_gps_rows_within_blocks_and_calculate_distances(max_distance_km=50)

# Mapping the 'at_home' variable
df_ema_with_at_home = ema_mapper.map_at_home_to_ema()

In [20]:
df_ema_with_at_home

Unnamed: 0,customer,createdAt_day,unique_day_id,assess,sensor_block_end,sensor_block_start,n_steps,n_GPS,total_distance_km,at_home_percentage,at_home_binary
0,MYAi,2023-09-19,20230919_1,0,2023-09-19 07:31:20.352,2023-09-19 05:31:20.352,295,115,0.438162,100.0,1
32,MYAi,2023-09-19,20230919_6,0,2023-09-19 17:14:33.463,2023-09-19 15:14:33.463,11126,720,49.047973,100.0,1
33,MYAi,2023-09-28,20230928_6,0,2023-09-28 18:29:55.737,2023-09-28 16:29:55.737,15424,263,60.085323,100.0,1
35,MYAi,2023-09-25,20230925_6,0,2023-09-25 17:08:34.592,2023-09-25 15:08:34.592,15565,439,42.653878,0.0,-1
39,MYAi,2023-09-22,20230922_6,0,2023-09-22 17:06:14.166,2023-09-22 15:06:14.166,6507,233,20.542444,100.0,1
...,...,...,...,...,...,...,...,...,...,...,...
712412,xwB7,2023-07-21,20230721_2,0,2023-07-21 08:11:31.637,2023-07-21 06:11:31.637,320,4,0.043616,0.0,-1
712443,xwB7,2023-07-21,20230721_4,0,2023-07-21 11:40:30.657,2023-07-21 09:40:30.657,8228,72,1.023709,100.0,1
712472,xwB7,2023-07-20,20230720_5,0,2023-07-20 13:46:15.775,2023-07-20 11:46:15.775,9443,256,22.752635,0.0,-1
712502,xwB7,2023-07-20,20230720_7,0,2023-07-20 16:58:19.651,2023-07-20 14:58:19.651,7404,124,2.877848,0.0,-1


In [21]:
df_ema_with_at_home.at_home_percentage.describe()

count    16693.000000
mean        40.620886
std         48.818515
min          0.000000
25%          0.000000
50%          0.000000
75%        100.000000
max        100.000000
Name: at_home_percentage, dtype: float64

In [22]:
with open(preprocessed_path + '/map_ema_passive.pkl', 'wb') as file:
    pickle.dump(df_ema_with_at_home, file)