# 04 Detailed Preprocessing of Passive Data

This notebook shows the analysis of situational context using EMA and passive sensing data

1. **Load Data**: Load necessary data from pickle files.
2. **Preprocess EMA**:

In [1]:
import os
import sys
import regex as re
# If your current working directory is the notebooks directory, use this:
notebook_dir = os.getcwd()  # current working directory
src_path = os.path.abspath(os.path.join(notebook_dir, '..', 'src'))
sys.path.append(src_path)

# Add the parent directory to sys.path
parent_dir = os.path.abspath(os.path.join(notebook_dir, '..'))
sys.path.append(parent_dir)
import glob
import pickle
from IPython.display import Markdown
from config import datapath, preprocessed_path

import pandas as pd
import numpy as np
import datetime as dt

import matplotlib.pyplot as plt
from matplotlib import rcParams
import seaborn as sns 
import matplotlib.patches as mpatches

sns.set_context("notebook", rc={"axes.labelsize": 14, "xtick.labelsize": 14, "ytick.labelsize": 14})
sns.set_style("whitegrid", {'axes.grid': True})
%matplotlib inline
import plotly.express as px


In [2]:
backup_path = preprocessed_path + "backup_data_passive.feather"
df_backup = pd.read_feather(backup_path)

with open(preprocessed_path + '/ema_data.pkl', 'rb') as file:
    df_ema_framework = pickle.load(file)

with open(preprocessed_path + '/ema_content.pkl', 'rb') as file:
    df_ema_content = pickle.load(file)  

with open(preprocessed_path + '/monitoring_data.pkl', 'rb') as file:
    df_monitoring = pickle.load(file)

In [3]:
# Configurations
# Check min. amount of EMA data available to map to passive data

timedelta_hours = 2
assess = 0

#GPS data
speed_limit = 1.4
max_distance = 150 
kms_per_radian = 6371.0088
epsilon = 0.03/kms_per_radian
min_samples = 10
min_nights_obs = 5
min_f_home = 0.5



## 1. Prepare passive features

In [4]:
df_pass_act = df_backup.copy()

In [5]:
# Only keep data that were collected during the first assessment phase
df_pass_act_base = df_pass_act[df_pass_act.startTimestamp_day <= df_pass_act.ema_relative_end_phase0]

### 1.1 Calculate GPS features

In [8]:
df_pass_act_loc =df_pass_act_base[df_pass_act_base.type.isin(["Latitude", "Longitude"])][["customer", "startTimestamp", "type", "doubleValue"]]

In [10]:
df_loc = df_pass_act_loc.pivot_table(
    index=["customer", "startTimestamp"],
    columns="type",
    values=["doubleValue"],
    aggfunc='first'  # Using 'first' since each type should theoretically have only one entry per customer and timestamp
)

# Flatten the MultiIndex in columns
df_loc.columns = ['_'.join(col).strip() for col in df_loc.columns.values]

df_loc = df_loc.rename_axis(None, axis=1).reset_index()

# Rename the columns for clarity
df_loc = df_loc.rename(columns={
    'doubleValue_Latitude': 'Latitude',
    'doubleValue_Longitude': 'Longitude',
})

In [None]:

class HomeClusterExtractor:
    def __init__(self, df, speed_limit, max_distance, epsilon, min_samples, min_nights_obs, min_f_home):
        self.df = df.copy()
        self.speed_limit = speed_limit
        self.max_distance = max_distance
        self.epsilon = epsilon
        self.min_samples = min_samples
        self.min_nights_obs = min_nights_obs
        self.min_f_home = min_f_home

    def calculate_distances_and_speeds(self):
        """Calculate distances and speeds for each customer."""
        self.df['distance'], self.df['time_diff'], self.df['speed'] = np.nan, np.nan, np.nan

        for customer in self.df['customer'].unique():
            mask = self.df['customer'] == customer
            customer_data = self.df.loc[mask]

            # Calculate distances between consecutive points
            distances = self._calculate_distances(customer_data)
            time_diffs = customer_data['startTimestamp'].diff().dt.total_seconds().fillna(0)
            speeds = distances / time_diffs.replace(0, np.nan)

            self.df.loc[mask, 'distance'] = distances
            self.df.loc[mask, 'time_diff'] = time_diffs
            self.df.loc[mask, 'speed'] = speeds

    def _calculate_distances(self, df):
        """Helper method to calculate distances using haversine formula."""
        coords = df[['Latitude', 'Longitude']].values
        distances = np.array([
            haversine(coords[i-1][1], coords[i-1][0], coords[i][1], coords[i][0])
            for i in range(1, len(coords))
        ])
        return np.append(distances, 0)  # Append 0 for the last point

    def extract_stationary_points(self):
        """Filter stationary points based on speed and distance."""
        return self.df[(self.df['speed'] < self.speed_limit) & (self.df['distance'] < self.max_distance)]

    def apply_clustering(self, df):
        """Apply DBSCAN clustering on stationary points."""
        return df.groupby('customer').apply(self._apply_dbscan).reset_index(drop=True)

    def _apply_dbscan(self, df):
        """Helper method to apply DBSCAN clustering."""
        clustering_model = DBSCAN(eps=self.epsilon, min_samples=self.min_samples, metric="haversine")
        cluster_labels = clustering_model.fit_predict(df[['Longitude', 'Latitude']].apply(np.radians))
        return pd.DataFrame({'cluster_100m': cluster_labels}, index=df.index)

    def find_home_cluster(self, geodata_clusters):
        """Identify the home cluster based on nighttime data."""
        geodata_night = geodata_clusters[(geodata_clusters['hour_gps'] >= 20) | (geodata_clusters['hour_gps'] <= 6)]

        geodata_night['home'] = geodata_night.groupby('customer')['clusterID'].transform(
            lambda x: statistics.mode(x)
        )
        geodata_night['nights_with_obs'] = geodata_night.groupby('customer')['day_gps'].transform('nunique')
        geodata_night['night_obs'] = geodata_night.groupby('customer')['day_gps'].transform('size')
        geodata_night['n_home'] = geodata_night.groupby('customer')['home'].transform(lambda x: x.value_counts().iloc[0])
        geodata_night['f_home'] = geodata_night['n_home'] / geodata_night['night_obs']

        geodata_night['home'] = geodata_night.apply(
            lambda x: x['home'] if x['nights_with_obs'] >= self.min_nights_obs and x['f_home'] > self.min_f_home else None, axis=1
        )

        user_home_mapping = geodata_night[['customer', 'home']].drop_duplicates()
        return pd.merge(geodata_clusters, user_home_mapping, on='customer', how='left')

    def run(self):
        """Run the full extraction process."""
        self.calculate_distances_and_speeds()
        stationary_df = self.extract_stationary_points()
        geodata_cluster_df = self.apply_clustering(stationary_df)

        geodata_clusters = pd.concat([stationary_df.reset_index(drop=True), geodata_cluster_df['cluster_100m']], axis=1)
        geodata_clusters['clusterID'] = geodata_clusters['customer'].astype(str) + '00' + geodata_clusters['cluster_100m'].astype(str)

        return self.find_home_cluster(geodata_clusters)

# Example usage:
extractor = HomeClusterExtractor(df_speed, speed_limit=30, max_distance=50, epsilon=0.001, min_samples=5, min_nights_obs=3, min_f_home=0.7)
home_clusters = extractor.run()


## 2. Prepare EMA data

In [12]:
df_ema_udi = df_ema_content[["customer", "createdAt_day", "quest_create", "unique_day_id", "assess"]]

In [13]:
# Group by customer and unique_day_id and calculate the minimum quest_create
df_min_quest = df_ema_udi.groupby(['customer', 'unique_day_id'])['quest_create'].min().reset_index()

# Rename the column to sensor_block_end
df_min_quest.rename(columns={'quest_create': 'sensor_block_end'}, inplace=True)

# Merge the minimum quest_create back to the original DataFrame
df_ema_udi = pd.merge(df_ema_udi, df_min_quest, on=['customer', 'unique_day_id'], how='left')

# Create the sensor_block_start column, which is 2 hours before quest_create
df_ema_udi.drop(columns=['quest_create'], inplace=True)
df_ema_udi = df_ema_udi.drop_duplicates()

In [14]:
df_ema_udi['sensor_block_start'] = df_ema_udi['sensor_block_end'] - pd.Timedelta(hours=2)


In [15]:
# Only include first assessment phase
df_ema_udi_base = df_ema_udi.loc[df_ema_udi.assess == 0]

## 3. Merge EMA to passive data

In [19]:
import pandas as pd

class EMAMapper:
    def __init__(self, df_ema, df_data):
        """
        Initialize the EMAMapper class with EMA blocks and combined data.

        Parameters:
        - df_ema: DataFrame containing the EMA blocks with 'sensor_block_start' and 'sensor_block_end'.
        - df_data: DataFrame containing various data types with 'type', 'startTimestamp', and 'endTimestamp'.
        """
        self.df_ema = df_ema.copy()
        self.df_data = df_data.copy()

        # Ensure datetime format
        self.df_ema['sensor_block_start'] = pd.to_datetime(self.df_ema['sensor_block_start'])
        self.df_ema['sensor_block_end'] = pd.to_datetime(self.df_ema['sensor_block_end'])
        self.df_data['startTimestamp'] = pd.to_datetime(self.df_data['startTimestamp'])
        if 'endTimestamp' in self.df_data.columns:
            self.df_data['endTimestamp'] = pd.to_datetime(self.df_data['endTimestamp'])

    def map_steps_to_ema(self):
        """
        Maps Steps data to EMA blocks based on time overlap and calculates n_steps.
        Rounds n_steps to the nearest whole number and fills with 0 if no Steps data is found within the block.

        Returns:
        - df_ema with an additional column 'n_steps' containing the summed and rounded Steps values.
        """

        # Initialize a list to store the mapped step values
        n_steps_values = []

        # Filter df_data for Steps data only
        df_steps = self.df_data[self.df_data['type'] == 'Steps']

        # Iterate over each row in df_ema
        for idx, ema_row in self.df_ema.iterrows():
            # Extract the current block's start and end times
            sensor_block_start = ema_row['sensor_block_start']
            sensor_block_end = ema_row['sensor_block_end']

            # Filter the Steps data within the current EMA block
            df_filtered = df_steps[(df_steps['startTimestamp'] < sensor_block_end) & 
                                   (df_steps['endTimestamp'] > sensor_block_start)]

            if df_filtered.empty:
                n_steps_values.append(0)
            else:
                # Calculate overlap start and end using Pandas functions
                overlap_start = df_filtered['startTimestamp'].combine(sensor_block_start, max)
                overlap_end = df_filtered['endTimestamp'].combine(sensor_block_end, min)

                # Calculate overlap duration
                overlap_duration = (overlap_end - overlap_start).dt.total_seconds()

                # Calculate step duration
                step_duration = (df_filtered['endTimestamp'] - df_filtered['startTimestamp']).dt.total_seconds()

                # Calculate proportion and weighted value
                proportion = overlap_duration / step_duration
                weighted_value = proportion * df_filtered['doubleValue']

                # Sum the weighted values to get the total number of steps for this EMA block
                n_steps = weighted_value.sum()

                # Round n_steps to the nearest whole number
                n_steps_values.append(round(n_steps))

        # Assign the rounded n_steps values back to the df_ema
        self.df_ema['n_steps'] = n_steps_values

        return self.df_ema


    def count_gps_rows_within_blocks(self):
        """
        Counts the number of GPS rows (n_GPS) where startTimestamp lies between sensor_block_start and sensor_block_end.
        Fills with 0 if no GPS data is found within the block.

        Returns:
        - df_ema with an additional column 'n_GPS' containing the count of GPS rows within each block.
        """

        # Initialize a list to store the counts of GPS rows
        gps_counts = []

        # Filter df_data for GPS data (Latitude and Longitude)
        df_gps = self.df_data[self.df_data['type'].isin(['Latitude', 'Longitude'])]

        # Iterate over each row in df_ema
        for idx, ema_row in self.df_ema.iterrows():
            # Extract the current block's start and end times
            sensor_block_start = ema_row['sensor_block_start']
            sensor_block_end = ema_row['sensor_block_end']

            # Filter the data within the current EMA block for GPS data
            df_filtered = df_gps[(df_gps['startTimestamp'] >= sensor_block_start) & 
                                 (df_gps['startTimestamp'] <= sensor_block_end)]

            # Count GPS entries (each GPS entry has two rows: Latitude and Longitude)
            gps_count = df_filtered.shape[0] // 2 if not df_filtered.empty else 0

            # Append the count to the list
            gps_counts.append(gps_count)

        # Assign the counts back to the df_ema
        self.df_ema['n_GPS'] = gps_counts

        return self.df_ema

In [21]:
# Initialize the EMAMapper class
ema_mapper = EMAMapper(df_ema_udi_base, df_pass_act_base)

# Map passive steps to EMA blocks
df_ema_with_mapped_values = ema_mapper.map_steps_to_ema()

# Count GPS rows within EMA blocks
df_ema_with_gps_counts = ema_mapper.count_gps_rows_within_blocks()

In [25]:
df_test= df_ema_with_gps_counts.loc[df_ema_with_gps_counts.n_GPS == 0]

In [29]:
df_test1 = df_ema_with_gps_counts.loc[df_ema_with_gps_counts.n_steps == 0]

In [30]:
df_test1

Unnamed: 0,customer,createdAt_day,unique_day_id,assess,sensor_block_end,sensor_block_start,n_steps,n_GPS
1765,MYAi,2023-09-30,20230930_1,0,2023-09-30 09:00:56.682,2023-09-30 07:00:56.682,0,0
10313,f1J2,2023-08-28,20230828_1,0,2023-08-28 08:05:01.498,2023-08-28 06:05:01.498,0,62
10315,f1J2,2023-09-03,20230903_1,0,2023-09-03 08:14:41.018,2023-09-03 06:14:41.018,0,0
10476,f1J2,2023-09-02,20230902_2,0,2023-09-02 09:39:34.210,2023-09-02 07:39:34.210,0,44
11065,f1J2,2023-09-05,20230905_1,0,2023-09-05 07:36:58.530,2023-09-05 05:36:58.530,0,0
...,...,...,...,...,...,...,...,...
677894,asYV,2023-06-30,20230630_1,0,2023-06-30 08:08:19.130,2023-06-30 06:08:19.130,0,17
677926,WAu8,2023-11-11,20231111_5,0,2023-11-11 16:46:18.889,2023-11-11 14:46:18.889,0,0
677959,WAu8,2023-11-11,20231111_3,0,2023-11-11 13:08:02.992,2023-11-11 11:08:02.992,0,0
677995,WAu8,2023-11-10,20231110_7,0,2023-11-10 20:22:50.383,2023-11-10 18:22:50.383,0,0
