# 4 Detailed Preprocessing of Passive Data

This notebook shows the analysis of situational context using EMA and passive sensing data

1. **Load Data**: Load necessary data from pickle files.
2. **Preprocess EMA**:

In [1]:
import os
import sys
import regex as re
# If your current working directory is the notebooks directory, use this:
notebook_dir = os.getcwd()  # current working directory
src_path = os.path.abspath(os.path.join(notebook_dir, '..', 'src'))
sys.path.append(src_path)

# Add the parent directory to sys.path
parent_dir = os.path.abspath(os.path.join(notebook_dir, '..'))
sys.path.append(parent_dir)
import glob
import pickle
from IPython.display import Markdown
from config import datapath, preprocessed_path

import pandas as pd
import numpy as np
import datetime as dt

from sklearn.cluster import DBSCAN
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances_argmin_min
from math import radians, cos, sin, asin, sqrt
import statistics  # Make sure this is imported


import matplotlib.pyplot as plt
from matplotlib import rcParams
import seaborn as sns 
import matplotlib.patches as mpatches

sns.set_context("notebook", rc={"axes.labelsize": 14, "xtick.labelsize": 14, "ytick.labelsize": 14})
sns.set_style("whitegrid", {'axes.grid': True})
%matplotlib inline
import plotly.express as px


In [2]:
backup_path = preprocessed_path + "backup_data_passive_actual.feather"
df_backup = pd.read_feather(backup_path)

with open(preprocessed_path + '/ema_data.pkl', 'rb') as file:
    df_ema_framework = pickle.load(file)

with open(preprocessed_path + '/ema_content.pkl', 'rb') as file:
    df_ema_content = pickle.load(file)  

with open(preprocessed_path + '/monitoring_data.pkl', 'rb') as file:
    df_monitoring = pickle.load(file)

In [25]:
# Configurations
# Check min. amount of EMA data available to map to passive data

timedelta_hours = 2
assess = 0

#GPS data
speed_limit = 1.4
max_distance = 150 
kms_per_radian = 6371000
epsilon = 100/kms_per_radian
min_samples = 10
min_nights_obs = 7
min_f_home = 0.5

# EMA
assessment_phase = [0] #1,2
min_num_daily = 4
min_days_data = 7

In [26]:
df_ema1 = df_ema_content.loc[df_ema_content.study.isin([24,25])] # first assessment phase
df_ema1 = df_ema1.loc[df_ema1["n_quest"] >= min_num_daily]
df_ema1["n_days_min"] = df_ema1.groupby("customer")['quest_complete_day'].transform("nunique")
df_ema1 = df_ema1.loc[df_ema1.n_days_min >= min_days_data]
df_ema1_customers = df_ema1.customer.unique().tolist()

## 1. Prepare passive features

In [27]:
df_pass_act = df_backup.copy()

In [28]:
# Only keep data that were collected during the first assessment phase
df_pass_act_base = df_pass_act[df_pass_act.startTimestamp <= df_pass_act.ema_base_end]

In [None]:
df_pass_act_base = df_pass_act_base.loc[df_pass_act_base.customer.isin(df_ema1_customers)]

### 1.1 Calculate GPS features

In [29]:
df_pass_act_loc =df_pass_act_base[df_pass_act_base.type.isin(["Latitude", "Longitude"])][["customer", "startTimestamp", "type", "doubleValue"]]

In [30]:
df_loc = df_pass_act_loc.pivot_table(
    index=["customer", "startTimestamp"],
    columns="type",
    values=["doubleValue"],
    aggfunc='first'  # Using 'first' since each type should theoretically have only one entry per customer and timestamp
)

# Flatten the MultiIndex in columns
df_loc.columns = ['_'.join(col).strip() for col in df_loc.columns.values]

df_loc = df_loc.rename_axis(None, axis=1).reset_index()

# Rename the columns for clarity
df_loc = df_loc.rename(columns={
    'doubleValue_Latitude': 'Latitude',
    'doubleValue_Longitude': 'Longitude',
})

In [31]:
import pandas as pd
import numpy as np
from sklearn.cluster import DBSCAN
import statistics

class HomeClusterExtractor:
    def __init__(self, df, speed_limit, max_distance, epsilon, min_samples, min_nights_obs, min_f_home):
        self.df = df.copy()
        self.speed_limit = speed_limit  # Speed limit in meters per second (m/s)
        self.max_distance = max_distance  # Maximum distance to consider for stationary points (in meters)
        self.epsilon = epsilon  # Epsilon value for DBSCAN clustering
        self.min_samples = min_samples  # Minimum samples for DBSCAN clustering
        self.min_nights_obs = min_nights_obs  # Minimum nights of observation to identify home cluster
        self.min_f_home = min_f_home  # Minimum fraction of time spent at home for it to be considered home
        
        self.df['hour_gps'] = self.df['startTimestamp'].dt.hour
        self.df['day_gps'] = self.df['startTimestamp'].dt.date

    def calculate_distances_and_speeds(self):
        """Calculate distances and speeds for each customer."""
        self.df['distance'], self.df['time_diff'], self.df['speed'] = np.nan, np.nan, np.nan

        for customer in self.df['customer'].unique():
            mask = self.df['customer'] == customer
            customer_data = self.df.loc[mask]

            # Calculate distances between consecutive points
            distances = self._calculate_distances(customer_data)
            time_diffs = customer_data['startTimestamp'].diff().dt.total_seconds().fillna(0)
            speeds = distances / time_diffs.replace(0, np.nan)

            self.df.loc[mask, 'distance'] = distances
            self.df.loc[mask, 'time_diff'] = time_diffs
            self.df.loc[mask, 'speed'] = speeds

    def calculate_stationary_and_transition(self):
        """
        Determine stationary points and transition status based on speed and distance.
        All points not classified as stationary will be marked as in transition.

        Returns:
        - Updates self.df with 'stationary' and 'transition' columns.
        """
        # Filter out points with speed > 220 km/h
        self.df = self.df[self.df['speed'] <= 220 * 1000 / 3600]  # 220 km/h converted to m/s

        # Calculate stationary status: True for stationary points, False for transition points
        self.df['stationary'] = (self.df['speed'] < self.speed_limit) & (self.df['distance'] < self.max_distance)

        # Calculate transition status: Not stationary means in transition
        self.df['transition'] = np.where(self.df['stationary'], 0, 1)

        return self.df

    def _calculate_distances(self, df):
        """Helper method to calculate distances using haversine formula."""
        coords = df[['Latitude', 'Longitude']].values
        distances = np.array([
            self._haversine(coords[i-1][1], coords[i-1][0], coords[i][1], coords[i][0])
            for i in range(1, len(coords))
        ])
        return np.append(distances, 0)  # Append 0 for the last point

    def _haversine(self, lon1, lat1, lon2, lat2):
        # Haversine formula to calculate distance between two lat/lon points in meters
        R = 6371000  # Radius of Earth in meters
        lon1, lat1, lon2, lat2 = map(np.radians, [lon1, lat1, lon2, lat2])

        dlon = lon2 - lon1
        dlat = lat2 - lat1

        a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2
        c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
        distance = R * c  # Output distance in meters

        return distance

    def apply_clustering(self, df):
        """Apply DBSCAN clustering on stationary points."""
        return df.groupby('customer').apply(self._apply_dbscan).reset_index(drop=True)

    def _apply_dbscan(self, df):
        """Helper method to apply DBSCAN clustering."""
        clustering_model = DBSCAN(eps=self.epsilon, min_samples=self.min_samples, metric="haversine")
        cluster_labels = clustering_model.fit_predict(df[['Longitude', 'Latitude']].apply(np.radians))

        # Convert cluster labels to integers
        cluster_labels = cluster_labels.astype(int)

        return pd.DataFrame({'cluster': cluster_labels}, index=df.index)

    def find_home_cluster(self, geodata_clusters):
        """Identify the home cluster based on nighttime data."""
        # Filter for night hours
        geodata_night = geodata_clusters.loc[
            (geodata_clusters['hour_gps'] >= 20) | (geodata_clusters['hour_gps'] <= 7)
        ].copy()

        # Filter out any rows with a clusterID of -1
        geodata_night = geodata_night[geodata_night['cluster'] != -1]

        # Calculate the mode of clusterID per user during night hours
        geodata_night['home'] = geodata_night.groupby('customer')['clusterID'].transform(
            lambda x: statistics.mode(x)
        )

        # Ensure 'home' is treated as a string without floating-point suffix
        geodata_night['home'] = geodata_night['home'].astype(str).str.replace('.0', '')

        # Calculate various metrics to validate the home cluster
        geodata_night['nights_with_obs'] = geodata_night.groupby('customer')['day_gps'].transform('nunique')
        geodata_night['night_obs'] = geodata_night.groupby('customer')['day_gps'].transform('size')
        geodata_night['n_home'] = geodata_night.groupby('customer')['home'].transform(lambda x: x.value_counts().iloc[0])
        geodata_night['f_home'] = geodata_night['n_home'] / geodata_night['night_obs']

        # Update the 'home' label based on conditions
        geodata_night['home'] = geodata_night.apply(
            lambda x: x['home'] if x['nights_with_obs'] >= self.min_nights_obs and x['f_home'] > self.min_f_home else None, axis=1)

        # Extract a mapping of userID to home cluster
        user_home_mapping = geodata_night[['customer', 'home']].drop_duplicates()

        # Merging back to the full dataset
        return pd.merge(geodata_clusters, user_home_mapping, on='customer', how='left')

    def determine_if_at_home(self, df):
        """Determine if a person is at home."""
        # Ensure that both 'clusterID' and 'home' are treated as strings for comparison
        df['clusterID'] = df['clusterID'].astype(str).str.replace('.0', '')
        df['home'] = df['home'].astype(str).str.replace('.0', '')

        # Create a column 'at_home' that compares 'clusterID' with 'home'
        df['at_home'] = df.apply(
            lambda x: 1 if x['clusterID'] == x['home'] else (0 if pd.notna(x['home']) else -1), axis=1
        )
        return df

    def run(self):
        """Run the full extraction process."""
        self.calculate_distances_and_speeds()

        # Calculate stationary and transition points
        self.df = self.calculate_stationary_and_transition()

        # Apply clustering only to stationary points
        stationary_df = self.df[self.df['stationary']]  # Filter only stationary points
        geodata_cluster_df = self.apply_clustering(stationary_df)

        # Merge clustering results back to the original dataframe, including transition points
        geodata_clusters = pd.concat([self.df.reset_index(drop=True), geodata_cluster_df[['cluster']]], axis=1)

        # Fill NaN cluster labels for transition points with a placeholder (e.g., -1)
        geodata_clusters['cluster'].fillna(-1, inplace=True)

        # Create clusterID only for non-transition points
        geodata_clusters['clusterID'] = geodata_clusters.apply(
            lambda x: f"{x['customer']}00{int(x['cluster'])}" if x['cluster'] != -1 else None, axis=1
        )

        # Identify home clusters
        geodata_clusters = self.find_home_cluster(geodata_clusters)

        # Determine if the person is at home
        geodata_clusters = self.determine_if_at_home(geodata_clusters)

        return geodata_clusters


In [32]:
# Instantiate the class
extractor = HomeClusterExtractor(
    df=df_loc,
    speed_limit=speed_limit,
    max_distance=max_distance,
    epsilon=epsilon,
    min_samples=min_samples,
    min_nights_obs=min_nights_obs,
    min_f_home=min_f_home
)

# Run the extraction process
home_clusters = extractor.run()

In [33]:
home_clusters_red = home_clusters[["customer", "startTimestamp", "at_home","transition", "distance", "time_diff", "speed" ]]

In [34]:
home_clusters.home.nunique()

70

## 2. Prepare EMA data

In [35]:
df_ema_udi = df_ema_content[["customer", "createdAt_day", "quest_create", "unique_day_id", "assess"]]

In [36]:
# Group by customer and unique_day_id and calculate the minimum quest_create
df_min_quest = df_ema_udi.groupby(['customer', 'unique_day_id'])['quest_create'].min().reset_index()

# Rename the column to sensor_block_end
df_min_quest.rename(columns={'quest_create': 'sensor_block_end'}, inplace=True)

# Merge the minimum quest_create back to the original DataFrame
df_ema_udi = pd.merge(df_ema_udi, df_min_quest, on=['customer', 'unique_day_id'], how='left')

# Create the sensor_block_start column, which is 2 hours before quest_create
df_ema_udi.drop(columns=['quest_create'], inplace=True)
df_ema_udi = df_ema_udi.drop_duplicates()

In [37]:
df_ema_udi['sensor_block_start'] = df_ema_udi['sensor_block_end'] - pd.Timedelta(hours=2)


In [38]:
# Only include first assessment phase
df_ema_udi_base = df_ema_udi.loc[df_ema_udi.assess == 0]

In [39]:
df_ema_udi_test = df_ema_udi_base.head(100)

In [40]:
df_pass_act_test = df_pass_act_base.head(100)

## 3. Merge EMA to passive data

In [41]:
import pandas as pd
import numpy as np

class EMAMapper:
    def __init__(self, df_ema, df_data, df_home_clusters=None):
        self.df_ema = df_ema.copy()
        self.df_data = df_data.copy()

        if df_home_clusters is not None:
            self.df_home_clusters = df_home_clusters.copy()
            self.df_home_clusters['startTimestamp'] = pd.to_datetime(self.df_home_clusters['startTimestamp'])
        else:
            self.df_home_clusters = None

        self.df_ema['sensor_block_start'] = pd.to_datetime(self.df_ema['sensor_block_start'])
        self.df_ema['sensor_block_end'] = pd.to_datetime(self.df_ema['sensor_block_end'])
        self.df_data['startTimestamp'] = pd.to_datetime(self.df_data['startTimestamp'])

    def map_steps_to_ema(self):
        """Map steps to EMA blocks."""
        n_steps_values = []

        df_steps = self.df_data[self.df_data['type'] == 'Steps']

        for idx, ema_row in self.df_ema.iterrows():
            sensor_block_start = ema_row['sensor_block_start']
            sensor_block_end = ema_row['sensor_block_end']

            # Use df_steps, not df_filtered, for filtering
            df_filtered = df_steps[(df_steps['startTimestamp'] < sensor_block_end) & 
                                   (df_steps['endTimestamp'] > sensor_block_start)]

            if df_filtered.empty:
                n_steps_values.append(0)
            else:
                overlap_start = df_filtered['startTimestamp'].combine(sensor_block_start, max)
                overlap_end = df_filtered['endTimestamp'].combine(sensor_block_end, min)

                overlap_duration = (overlap_end - overlap_start).dt.total_seconds()
                step_duration = (df_filtered['endTimestamp'] - df_filtered['startTimestamp']).dt.total_seconds()

                proportion = overlap_duration / step_duration
                weighted_value = proportion * df_filtered['doubleValue']

                n_steps = weighted_value.sum()
                n_steps_values.append(round(n_steps))

        self.df_ema['n_steps'] = n_steps_values
        return self.df_ema


    def map_gps_and_transition_to_ema(self):
        """
        Map GPS, stationary, and transition data to EMA blocks in one process.

        Returns:
        - df_ema with additional columns: 'n_GPS', 'total_distance_km', 'transition', 'transition_minutes', 'at_home_minute', and 'at_home_binary'.
        """
        if self.df_home_clusters is None:
            raise ValueError("df_home_clusters is not provided during initialization.")

        gps_counts = []
        total_distances = []
        transition_values = []
        transition_minute_values = []
        at_home_minute_values = []
        at_home_binary_values = []

        for idx, ema_row in self.df_ema.iterrows():
            sensor_block_start = ema_row['sensor_block_start']
            sensor_block_end = ema_row['sensor_block_end']
            customer = ema_row['customer']

            # Filter the home clusters for the current EMA block and customer
            df_filtered = self.df_home_clusters[
                (self.df_home_clusters['startTimestamp'] >= sensor_block_start) & 
                (self.df_home_clusters['startTimestamp'] <= sensor_block_end) & 
                (self.df_home_clusters['customer'] == customer)
            ]

            if df_filtered.empty:
                # If no GPS data is found in the block, set to default values
                gps_counts.append(0)
                total_distances.append(0)
                transition_values.append(-1)  # Unknown
                transition_minute_values.append(0)
                at_home_minute_values.append(0)
                at_home_binary_values.append(-1)  # Unknown (no data)
            else:
                # Count GPS points
                gps_count = df_filtered.shape[0]
                gps_counts.append(gps_count)

                # Calculate total distance (sum of distances)
                total_distance = df_filtered['distance'].sum() / 1000  # Convert to kilometers
                total_distances.append(total_distance)

                # Calculate transition data
                transition_minutes = df_filtered[df_filtered['transition'] == 1]['time_diff'].sum() / 60  # Seconds to minutes
                transition_minute_values.append(transition_minutes)
                transition_status = 1 if transition_minutes > 0 else 0
                transition_values.append(transition_status)

                # Calculate at_home_minute
                at_home_minutes = df_filtered[df_filtered['at_home'] == 1]['time_diff'].sum() / 60  # Seconds to minutes
                at_home_minute_values.append(at_home_minutes)

                # Calculate at_home_binary: 1 if at least one row has at_home = 1, otherwise 0 or -1 if unknown
                if df_filtered['at_home'].eq(1).any():
                    at_home_binary_values.append(1)  # At least one point at home
                elif df_filtered['at_home'].eq(0).all():
                    at_home_binary_values.append(0)  # All points not at home
                else:
                    at_home_binary_values.append(-1)  # Unknown (if no valid data)

        self.df_ema['n_GPS'] = gps_counts
        self.df_ema['total_distance_km'] = total_distances
        self.df_ema['transition'] = transition_values
        self.df_ema['transition_minutes'] = transition_minute_values
        self.df_ema['at_home_minute'] = at_home_minute_values
        self.df_ema['at_home_binary'] = at_home_binary_values

        return self.df_ema


In [42]:

# Step 2: Map EMA data
ema_mapper = EMAMapper(df_ema_udi_base, df_pass_act_base, df_home_clusters=home_clusters_red)

# Step 3: Map steps, GPS data, and transitions
df_ema_with_steps = ema_mapper.map_steps_to_ema()
df_ema_with_gps_and_transition = ema_mapper.map_gps_and_transition_to_ema()
df_ema_with_at_home = ema_mapper.map_gps_and_transition_to_ema()

# Final DataFrame with all the mapped data
df_ema_with_at_home.head()


Unnamed: 0,customer,createdAt_day,unique_day_id,assess,sensor_block_end,sensor_block_start,n_steps,n_GPS,total_distance_km,transition,transition_minutes,at_home_minute,at_home_binary
0,MYAi,2023-09-19,20230919_1,0,2023-09-19 07:31:20.352,2023-09-19 05:31:20.352,295,115,0.445619,0,0.0,0.0,0
32,MYAi,2023-09-19,20230919_6,0,2023-09-19 17:14:33.463,2023-09-19 15:14:33.463,11126,286,16.757672,1,18.233333,0.0,0
33,MYAi,2023-09-28,20230928_6,0,2023-09-28 18:29:55.737,2023-09-28 16:29:55.737,15424,2,19.321239,1,60.05,0.0,0
35,MYAi,2023-09-25,20230925_6,0,2023-09-25 17:08:34.592,2023-09-25 15:08:34.592,15565,0,0.0,-1,0.0,0.0,-1
39,MYAi,2023-09-22,20230922_6,0,2023-09-22 17:06:14.166,2023-09-22 15:06:14.166,6507,85,6.81599,1,11.55,0.0,0


In [43]:
df_ema_with_at_home.at_home_minute.describe()

count    17087.000000
mean        18.095328
std         90.882889
min          0.000000
25%          0.000000
50%          0.000000
75%          0.000000
max       3301.950000
Name: at_home_minute, dtype: float64

In [45]:
with open(preprocessed_path + '/map_ema_passive.pkl', 'wb') as file:
    pickle.dump(df_ema_with_at_home, file)