# 4 Detailed Preprocessing of Passive Data

This notebook shows the analysis of situational context using EMA and passive sensing data

1. **Load Data**: Load necessary data from pickle files.
2. **Preprocess EMA**:

In [1]:
import os
import sys
import regex as re
# If your current working directory is the notebooks directory, use this:
notebook_dir = os.getcwd()  # current working directory
src_path = os.path.abspath(os.path.join(notebook_dir, '..', 'src'))
sys.path.append(src_path)

# Add the parent directory to sys.path
parent_dir = os.path.abspath(os.path.join(notebook_dir, '..'))
sys.path.append(parent_dir)
import glob
import pickle
from IPython.display import Markdown
from config import datapath, preprocessed_path

import pandas as pd
import numpy as np
import datetime as dt

from sklearn.cluster import DBSCAN
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances_argmin_min
from math import radians, cos, sin, asin, sqrt
import statistics  # Make sure this is imported


import matplotlib.pyplot as plt
from matplotlib import rcParams
import seaborn as sns 
import matplotlib.patches as mpatches

sns.set_context("notebook", rc={"axes.labelsize": 14, "xtick.labelsize": 14, "ytick.labelsize": 14})
sns.set_style("whitegrid", {'axes.grid': True})
%matplotlib inline
import plotly.express as px


In [2]:
backup_path = preprocessed_path + "backup_data_passive_actual.feather"
df_backup = pd.read_feather(backup_path)

with open(preprocessed_path + '/ema_data.pkl', 'rb') as file:
    df_ema_framework = pickle.load(file)

with open(preprocessed_path + '/ema_content.pkl', 'rb') as file:
    df_ema_content = pickle.load(file)  

with open(preprocessed_path + '/monitoring_data.pkl', 'rb') as file:
    df_monitoring = pickle.load(file)

In [17]:
# Configurations
# Check min. amount of EMA data available to map to passive data

timedelta_hours = 2
assess = 0

#GPS data
speed_limit = 1.4
max_distance = 150 
kms_per_radian = 6371000
epsilon = 100/kms_per_radian
min_samples = 10
min_cluster_size = 20
min_nights_obs = 4
min_f_home = 0.5

# EMA
assessment_phase = [0] #1,2
min_num_daily = 4
min_days_data = 7

In [91]:
df_ema1 = df_ema_content.loc[df_ema_content.study.isin([24,25])] # first assessment phase
df_ema1 = df_ema1.loc[df_ema1["n_quest"] >= min_num_daily]
df_ema1["n_days_min"] = df_ema1.groupby("customer")['quest_complete_day'].transform("nunique")
df_ema1 = df_ema1.loc[df_ema1.n_days_min >= min_days_data]
df_ema1_customers = df_ema1.customer.unique().tolist()

## 1. Prepare passive features

In [92]:
df_pass_act = df_backup.copy()

KeyboardInterrupt: 

In [None]:
# Only keep data that were collected during the first assessment phase
df_pass_act_base = df_pass_act[df_pass_act.startTimestamp <= df_pass_act.ema_base_end]

In [None]:
df_pass_act_base = df_pass_act_base.loc[df_pass_act_base.customer.isin(df_ema1_customers)]

### 1.1 Calculate GPS features

In [None]:
df_pass_act_loc =df_pass_act_base[df_pass_act_base.type.isin(["Latitude", "Longitude"])][["customer", "startTimestamp", "type", "doubleValue"]]

In [None]:
df_loc = df_pass_act_loc.pivot_table(
    index=["customer", "startTimestamp"],
    columns="type",
    values=["doubleValue"],
    aggfunc='first'  # Using 'first' since each type should theoretically have only one entry per customer and timestamp
)

# Flatten the MultiIndex in columns
df_loc.columns = ['_'.join(col).strip() for col in df_loc.columns.values]

df_loc = df_loc.rename_axis(None, axis=1).reset_index()

# Rename the columns for clarity
df_loc = df_loc.rename(columns={
    'doubleValue_Latitude': 'Latitude',
    'doubleValue_Longitude': 'Longitude',
})

In [None]:
import pandas as pd
import numpy as np
from sklearn.cluster import DBSCAN
import hdbscan
import statistics

class HomeClusterExtractor:
    def __init__(self, df, speed_limit, max_distance, epsilon, min_samples, min_nights_obs, min_f_home, clustering_method='dbscan', normalize_min_samples=False, min_data_points=10):
        self.df = df.copy()
        self.speed_limit = speed_limit
        self.max_distance = max_distance
        self.epsilon = epsilon
        self.min_samples = min_samples
        self.min_nights_obs = min_nights_obs
        self.min_f_home = min_f_home
        self.clustering_method = clustering_method
        self.normalize_min_samples = normalize_min_samples
        self.min_data_points = min_data_points  # Minimum data points threshold

        self.df['hour_gps'] = self.df['startTimestamp'].dt.hour
        self.df['day_gps'] = self.df['startTimestamp'].dt.date

    def calculate_distances_and_speeds(self):
        """Calculate distances and speeds for each customer."""
        self.df['distance'], self.df['time_diff'], self.df['speed'] = np.nan, np.nan, np.nan

        for customer in self.df['customer'].unique():
            mask = self.df['customer'] == customer
            customer_data = self.df.loc[mask]

            distances = self._calculate_distances(customer_data)
            time_diffs = customer_data['startTimestamp'].diff().dt.total_seconds().fillna(0)
            speeds = distances / time_diffs.replace(0, np.nan)

            self.df.loc[mask, 'distance'] = distances
            self.df.loc[mask, 'time_diff'] = time_diffs
            self.df.loc[mask, 'speed'] = speeds

    def calculate_stationary_and_transition(self):
        """Determine stationary points and transition status based on speed and distance."""
        self.df = self.df[self.df['speed'] <= 220 * 1000 / 3600]  # Filter out points with speed > 220 km/h
        self.df['stationary'] = (self.df['speed'] < self.speed_limit) & (self.df['distance'] < self.max_distance)
        self.df['transition'] = np.where(self.df['stationary'], 0, 1)
        return self.df

    def _calculate_distances(self, df):
        """Helper method to calculate distances using haversine formula."""
        coords = df[['Latitude', 'Longitude']].values
        distances = np.array([
            self._haversine(coords[i-1][1], coords[i-1][0], coords[i][1], coords[i][0])
            for i in range(1, len(coords))
        ])
        return np.append(distances, 0)

    def _haversine(self, lon1, lat1, lon2, lat2):
        """Haversine formula to calculate distance between two lat/lon points in meters."""
        R = 6371000
        lon1, lat1, lon2, lat2 = map(np.radians, [lon1, lat1, lon2, lat2])
        dlon = lon2 - lon1
        dlat = lat2 - lat1
        a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2
        c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
        return R * c

    def apply_clustering(self, df):
        """Apply clustering based on the selected method."""
        df_cleaned = df.dropna(subset=['Longitude', 'Latitude'])

        return df_cleaned.groupby('customer').apply(self._apply_clustering_method).reset_index(drop=True)

    def _apply_clustering_method(self, df):
        """Helper method to apply the chosen clustering method."""
        customer_point_count = len(df)

        # Skip clustering for customers with too few points
        if customer_point_count < self.min_data_points:
            print(f"Customer {df['customer'].iloc[0]} has too few data points ({customer_point_count}). Skipping clustering.")
            return pd.DataFrame({'cluster': [-1] * customer_point_count}, index=df.index)

        # Use normalized min_samples or a default value
        if self.normalize_min_samples:
            min_samples = max(2, int(customer_point_count * 0.02))  # 3% of points, with a minimum of 2
        else:
            min_samples = self.min_samples

        if self.clustering_method == 'dbscan':
            clustering_model = DBSCAN(eps=self.epsilon, min_samples=min_samples, metric="haversine")
            cluster_labels = clustering_model.fit_predict(df[['Longitude', 'Latitude']].apply(np.radians))
            cluster_labels = cluster_labels.astype(int)
            return pd.DataFrame({'cluster': cluster_labels}, index=df.index)
        elif self.clustering_method == 'hdbscan':
            min_cluster_size = max(2, min(min_samples, customer_point_count))
            clustering_model = hdbscan.HDBSCAN(min_cluster_size=min_cluster_size, metric='haversine')
            cluster_labels = clustering_model.fit_predict(df[['Longitude', 'Latitude']].apply(np.radians))
            cluster_labels = cluster_labels.astype(int)
            return pd.DataFrame({'cluster': cluster_labels}, index=df.index)
        else:
            raise ValueError(f"Invalid clustering method: {self.clustering_method}")

    def data_quality_check(self):
        """Filter out customers with insufficient data points."""
        customer_counts = self.df.groupby('customer').size().reset_index(name='point_count')
        valid_customers = customer_counts[customer_counts['point_count'] >= self.min_data_points]['customer']
        self.df = self.df[self.df['customer'].isin(valid_customers)]
        print(f"Data quality check: {len(valid_customers)} customers with sufficient data retained.")
        
    def find_home_cluster(self, geodata_clusters):
        """Identify the home cluster based on nighttime data, with fallback to the largest cluster from all data points."""

        # Filter for night hours
        geodata_night = geodata_clusters.loc[
            (geodata_clusters['hour_gps'] >= 20) | (geodata_clusters['hour_gps'] <= 7)
        ].copy()

        # Initialize the 'home' column to None
        geodata_clusters['home'] = None

        # Time-based home cluster assignment: most frequent cluster at night
        if not geodata_night.empty:
            # Only exclude noise from home assignment during night hours
            valid_clusters_night = geodata_night[geodata_night['cluster'] != -1].copy()

            if not valid_clusters_night.empty:
                # Calculate the most frequent cluster (mode) per customer at night
                valid_clusters_night['home'] = valid_clusters_night.groupby('customer')['cluster'].transform(
                    lambda x: statistics.mode(x) if len(x) > 0 else None
                )

                # Calculate the number of unique nights with observations for each customer
                valid_clusters_night['nights_with_obs'] = valid_clusters_night.groupby('customer')['day_gps'].transform('nunique')

                # Count the number of points in the identified home cluster (most frequent) per customer
                valid_clusters_night['n_home'] = valid_clusters_night.groupby(['customer', 'home'])['day_gps'].transform('size')

                # Calculate the total number of night-time points for each customer
                valid_clusters_night['night_obs'] = valid_clusters_night.groupby('customer')['day_gps'].transform('size')

                # Calculate the fraction of night-time points spent at home
                valid_clusters_night['f_home'] = valid_clusters_night['n_home'] / valid_clusters_night['night_obs']

                # Apply both conditions: Minimum nights observed and minimum fraction of time spent at home
                valid_clusters_night['home'] = valid_clusters_night.apply(
                    lambda x: x['home'] if (x['nights_with_obs'] >= self.min_nights_obs) and (x['f_home'] >= self.min_f_home) else None, axis=1
                )

                # Merge the time-based home assignment back into the main dataframe
                home_mapping = valid_clusters_night[['customer', 'home']].drop_duplicates(subset=['customer'])
                geodata_clusters = pd.merge(geodata_clusters, home_mapping, on='customer', how='left', suffixes=('', '_temp'))
                geodata_clusters['home'] = geodata_clusters['home'].combine_first(geodata_clusters['home_temp'])
                geodata_clusters.drop(columns=['home_temp'], inplace=True)

        # Fallback: Assign the largest cluster per customer from **all data points** if no home is found
        no_home_customers = geodata_clusters.loc[geodata_clusters['home'].isna(), 'customer'].unique()
        print(f"Customers with no home after time-based method: {len(no_home_customers)}")

        if len(no_home_customers) > 0:
            # Consider all points (not just night-time) for customers with no home cluster
            fallback_home_clusters = (
                geodata_clusters[geodata_clusters['customer'].isin(no_home_customers) & (geodata_clusters['cluster'] != -1)]
                .groupby(['customer', 'cluster'])
                .size()
                .reset_index(name='cluster_size')
            )

            if not fallback_home_clusters.empty:
                # Take the largest cluster per customer based on **all data points**
                fallback_home_clusters = fallback_home_clusters.loc[
                    fallback_home_clusters.groupby('customer')['cluster_size'].idxmax()
                ]

                # Assign the fallback home clusters
                fallback_home_clusters['home'] = fallback_home_clusters['cluster']

                # Merge fallback home clusters back to the main dataset
                fallback_home_mapping = fallback_home_clusters[['customer', 'home']].drop_duplicates()
                geodata_clusters = pd.merge(geodata_clusters, fallback_home_mapping, on='customer', how='left', suffixes=('', '_fallback'))

                # Fill any remaining NaNs in the 'home' column with the fallback cluster
                geodata_clusters['home'] = geodata_clusters['home'].combine_first(geodata_clusters['home_fallback'])
                geodata_clusters.drop(columns=['home_fallback'], inplace=True)
                print(f"Fallback home clusters assigned: {len(fallback_home_clusters)}")

        # For customers that still have no home cluster after the fallback
        final_no_home = geodata_clusters.loc[geodata_clusters['home'].isna(), 'customer'].unique()
        print(f"Warning: {len(final_no_home)} customers still do not have a home cluster.")

        # Create homeID by combining customer and home cluster
        geodata_clusters['homeID'] = geodata_clusters.apply(
            lambda x: f"{x['customer']}00{int(x['home'])}" if pd.notna(x['home']) else None, axis=1
        )

        return geodata_clusters



    def determine_if_at_home(self, df):
        """Determine if a person is at home, handling unclustered points (-1) properly."""
        
        # Convert cluster to integer before creating the clusterID and homeID
        df['cluster'] = df['cluster'].astype(int)
        df['home'] = df['home'].astype(int, errors='ignore')  # Handle NaNs gracefully

        # Create clusterID and homeID, ensuring no decimal points
        df['clusterID'] = df.apply(lambda x: f"{x['customer']}00{int(x['cluster'])}" if x['cluster'] != -1 else None, axis=1)
        df['homeID'] = df.apply(lambda x: f"{x['customer']}00{int(x['home'])}" if pd.notna(x['home']) else None, axis=1)

        # Check if a person is at home (-1 if no valid cluster/home)
        df['at_home'] = df.apply(
            lambda x: -1 if x['cluster'] == -1 else (1 if x['clusterID'] == x['homeID'] else 0), axis=1
        )
        return df


    def run(self):
        """Run the full extraction process."""
        self.data_quality_check()
        self.calculate_distances_and_speeds()
        self.df = self.calculate_stationary_and_transition()

        # Apply clustering based on all data (not just stationary points)
        geodata_cluster_df = self.apply_clustering(self.df)

        # Merge clustering results back to the original dataframe, including transition points
        geodata_clusters = pd.concat([self.df.reset_index(drop=True), geodata_cluster_df[['cluster']]], axis=1)
        geodata_clusters['cluster'].fillna(-1, inplace=True)

        # Find home cluster with fallback
        geodata_clusters = self.find_home_cluster(geodata_clusters)

        # Determine if the person is at home
        geodata_clusters = self.determine_if_at_home(geodata_clusters)

        return geodata_clusters


In [76]:
# Example usage with HDBSCAN and normalized min_samples:
extractor = HomeClusterExtractor(df_loc, speed_limit=speed_limit, max_distance=max_distance, epsilon=epsilon, min_samples=min_samples, 
                                 min_nights_obs = min_nights_obs, min_f_home=min_f_home, clustering_method='dbscan', 
                                 normalize_min_samples=False, min_data_points=50)
result = extractor.run()



Data quality check: 166 customers with sufficient data retained.
Customers with no home after time-based method: 22
Fallback home clusters assigned: 22


In [77]:
(result.loc[result.cluster ==-1]["customer"].count()/result.shape[0])*100

16.861500091068507

In [47]:
home_clusters_red = result[["customer", "startTimestamp", "at_home","transition", "distance", "time_diff", "speed" ]]

## 2. Prepare EMA data

In [93]:
df_ema_udi = df_ema_content[["customer", "createdAt_day", "quest_create", "unique_day_id", "assess"]]

In [94]:
# Group by customer and unique_day_id and calculate the minimum quest_create
df_min_quest = df_ema_udi.groupby(['customer', 'unique_day_id'])['quest_create'].min().reset_index()

# Rename the column to sensor_block_end
df_min_quest.rename(columns={'quest_create': 'sensor_block_end'}, inplace=True)

# Merge the minimum quest_create back to the original DataFrame
df_ema_udi = pd.merge(df_ema_udi, df_min_quest, on=['customer', 'unique_day_id'], how='left')

# Create the sensor_block_start column, which is 2 hours before quest_create
df_ema_udi.drop(columns=['quest_create'], inplace=True)
df_ema_udi = df_ema_udi.drop_duplicates()

In [95]:
df_ema_udi['sensor_block_start'] = df_ema_udi['sensor_block_end'] - pd.Timedelta(hours=2)


In [96]:
# Only include first assessment phase
df_ema_udi_base = df_ema_udi.loc[df_ema_udi.assess == 0]

In [97]:
df_ema_udi_test = df_ema_udi_base.head(100)

In [83]:
df_pass_act_test = df_pass_act_base.head(100)

## 3. Merge EMA to passive data

In [84]:
import pandas as pd
import numpy as np

class EMAMapper:
    def __init__(self, df_ema, df_data, df_home_clusters=None):
        self.df_ema = df_ema.copy()
        self.df_data = df_data.copy()

        if df_home_clusters is not None:
            self.df_home_clusters = df_home_clusters.copy()
            self.df_home_clusters['startTimestamp'] = pd.to_datetime(self.df_home_clusters['startTimestamp'])
        else:
            self.df_home_clusters = None

        self.df_ema['sensor_block_start'] = pd.to_datetime(self.df_ema['sensor_block_start'])
        self.df_ema['sensor_block_end'] = pd.to_datetime(self.df_ema['sensor_block_end'])
        self.df_data['startTimestamp'] = pd.to_datetime(self.df_data['startTimestamp'])

    def map_steps_to_ema(self):
        """Map steps to EMA blocks."""
        n_steps_values = []

        df_steps = self.df_data[self.df_data['type'] == 'Steps']

        for idx, ema_row in self.df_ema.iterrows():
            sensor_block_start = ema_row['sensor_block_start']
            sensor_block_end = ema_row['sensor_block_end']

            # Use df_steps, not df_filtered, for filtering
            df_filtered = df_steps[(df_steps['startTimestamp'] < sensor_block_end) & 
                                   (df_steps['endTimestamp'] > sensor_block_start)]

            if df_filtered.empty:
                n_steps_values.append(0)
            else:
                overlap_start = df_filtered['startTimestamp'].combine(sensor_block_start, max)
                overlap_end = df_filtered['endTimestamp'].combine(sensor_block_end, min)

                overlap_duration = (overlap_end - overlap_start).dt.total_seconds()
                step_duration = (df_filtered['endTimestamp'] - df_filtered['startTimestamp']).dt.total_seconds()

                proportion = overlap_duration / step_duration
                weighted_value = proportion * df_filtered['doubleValue']

                n_steps = weighted_value.sum()
                n_steps_values.append(round(n_steps))

        self.df_ema['n_steps'] = n_steps_values
        return self.df_ema

    def map_gps_and_transition_to_ema(self):
        """
        Map GPS, stationary, and transition data to EMA blocks in one process, using the predefined 'sensor_block_start' and 'sensor_block_end'.

        Returns:
        - df_ema with additional columns: 'n_GPS', 'total_distance_km', 'transition', 'transition_minutes', 'at_home_minute', and 'at_home_binary'.
        """
        if self.df_home_clusters is None:
            raise ValueError("df_home_clusters is not provided during initialization.")

        gps_counts = []
        total_distances = []
        transition_values = []
        transition_minute_values = []
        at_home_minute_values = []
        at_home_binary_values = []

        for idx, ema_row in self.df_ema.iterrows():
            sensor_block_start = ema_row['sensor_block_start']  # Use predefined block start
            sensor_block_end = ema_row['sensor_block_end']      # Use predefined block end
            customer = ema_row['customer']

            # Filter the home clusters for the current customer and time window
            df_filtered = self.df_home_clusters[
                (self.df_home_clusters['customer'] == customer) &
                (self.df_home_clusters['startTimestamp'] >= sensor_block_start) &
                (self.df_home_clusters['startTimestamp'] <= sensor_block_end)
            ]

            if df_filtered.empty:
                # No data for the block, set to default values
                gps_counts.append(0)
                total_distances.append(0)
                transition_values.append(-1)  # No data
                transition_minute_values.append(0)
                at_home_minute_values.append(0)
                at_home_binary_values.append(-1)  # No data
            else:
                # Count GPS points
                gps_count = df_filtered.shape[0]
                gps_counts.append(gps_count)

                # Calculate total distance (sum of distances)
                total_distance = df_filtered['distance'].sum() / 1000  # Convert to kilometers
                total_distances.append(total_distance)

                # Calculate time differences between consecutive GPS points within the block
                df_filtered.loc[:, 'time_diff'] = df_filtered['startTimestamp'].diff().dt.total_seconds().fillna(0)

                # Calculate transition minutes and at_home minutes
                transition_minutes = df_filtered[df_filtered['transition'] == 1]['time_diff'].sum() / 60  # Convert seconds to minutes
                at_home_minutes = df_filtered[df_filtered['at_home'] == 1]['time_diff'].sum() / 60  # Convert seconds to minutes

                transition_minute_values.append(transition_minutes)
                at_home_minute_values.append(at_home_minutes)

                # Calculate at_home_binary
                if df_filtered['at_home'].eq(1).any():
                    # At least one GPS point at home
                    at_home_binary_values.append(1)
                else:
                    # There is data, but no valid home cluster, or all points are not at home
                    at_home_binary_values.append(0)

                # Transition status
                if transition_minutes > 0:
                    transition_status = 1  # Some transition occurred
                else:
                    transition_status = 0  # No transition occurred
                transition_values.append(transition_status)

        # Update the df_ema DataFrame with new columns
        self.df_ema['n_GPS'] = gps_counts
        self.df_ema['total_distance_km'] = total_distances
        self.df_ema['transition'] = transition_values
        self.df_ema['transition_minutes'] = transition_minute_values
        self.df_ema['at_home_minute'] = at_home_minute_values
        self.df_ema['at_home_binary'] = at_home_binary_values

        return self.df_ema



In [85]:

# Step 2: Map EMA data
ema_mapper = EMAMapper(df_ema_udi_base, df_pass_act_base, df_home_clusters=home_clusters_red)

# Step 3: Map steps, GPS data, and transitions
df_ema_with_steps = ema_mapper.map_steps_to_ema()
df_ema_with_gps_and_transition = ema_mapper.map_gps_and_transition_to_ema()
df_ema_with_at_home = ema_mapper.map_gps_and_transition_to_ema()

# Final DataFrame with all the mapped data
df_ema_with_at_home.head()


Unnamed: 0,customer,createdAt_day,unique_day_id,assess,sensor_block_end,sensor_block_start,n_steps,n_GPS,total_distance_km,transition,transition_minutes,at_home_minute,at_home_binary
0,MYAi,2023-09-19,20230919_1,0,2023-09-19 07:31:20.352,2023-09-19 05:31:20.352,295,115,0.445619,0,0.0,118.0,1
32,MYAi,2023-09-19,20230919_6,0,2023-09-19 17:14:33.463,2023-09-19 15:14:33.463,11126,286,16.757672,1,18.75,0.0,0
33,MYAi,2023-09-28,20230928_6,0,2023-09-28 18:29:55.737,2023-09-28 16:29:55.737,12097,2,19.321239,1,35.283333,0.0,0
35,MYAi,2023-09-25,20230925_6,0,2023-09-25 17:08:34.592,2023-09-25 15:08:34.592,13281,0,0.0,-1,0.0,0.0,-1
39,MYAi,2023-09-22,20230922_6,0,2023-09-22 17:06:14.166,2023-09-22 15:06:14.166,6328,85,6.81599,1,12.2,0.0,0


In [86]:
df_ema_with_at_home.groupby("at_home_binary")["customer"].count()

at_home_binary
-1    9783
 0    2540
 1    4879
Name: customer, dtype: int64

In [89]:
df_ema_with_at_home.groupby("transition")["customer"].count()

transition
-1    9783
 0    3155
 1    4264
Name: customer, dtype: int64

In [87]:
df_ema_with_at_home.transition_minutes.describe()

count    17202.000000
mean         3.662431
std         11.599654
min          0.000000
25%          0.000000
50%          0.000000
75%          0.000000
max        116.833333
Name: transition_minutes, dtype: float64

In [90]:
with open(preprocessed_path + '/map_ema_passive.pkl', 'wb') as file:
    pickle.dump(df_ema_with_at_home, file)