# 4 Detailed Preprocessing of Passive Data

This notebook shows the analysis of situational context using EMA and passive sensing data.

1. **Load Data**: Load necessary data from pickle files.
2. **Preprocess EMA**:

In [1]:
import os
import sys
import regex as re
from tqdm import tqdm
from intervaltree import IntervalTree

%load_ext autoreload
%autoreload 2
import logging
# Configure logging
logging.basicConfig(level=logging.INFO)

# If your current working directory is the notebooks directory, use this:
notebook_dir = os.getcwd()  # current working directory
src_path = os.path.abspath(os.path.join(notebook_dir, '..', 'src'))
sys.path.append(src_path)

# Add the parent directory to sys.path
parent_dir = os.path.abspath(os.path.join(notebook_dir, '..'))
sys.path.append(parent_dir)
import glob
import pickle
from IPython.display import Markdown
from server_config import datapath, preprocessed_path_freezed, redcap_path, preprocessed_path

import pandas as pd
import numpy as np
import datetime as dt
import EMA_Mapper
import gps_features
from missing_data import summarize_missing_data

from sklearn.cluster import DBSCAN
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances_argmin_min
from math import radians, cos, sin, asin, sqrt
import statistics  # Make sure this is imported
import hdbscan

import matplotlib.pyplot as plt
from matplotlib import rcParams
import seaborn as sns 
import matplotlib.patches as mpatches
# Ensure matplotlib displays plots inline (if using Jupyter Notebook)
%matplotlib inline

import warnings
# Suppress only SettingWithCopyWarning
warnings.simplefilter(action='ignore', category=pd.errors.SettingWithCopyWarning)

ModuleNotFoundError: No module named 'intervaltree'

In [None]:
def _read_pickle(path):
    """Open ``path`` in binary mode and return the unpickled object."""
    # NOTE: unpickling can execute arbitrary code; only load files produced by this project.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Passive sensing backup, stored as feather
backup_path = preprocessed_path_freezed + "/backup_data_passive_actual.feather"
df_backup = pd.read_feather(backup_path)

# Pickled EMA, monitoring and REDCap objects from the frozen preprocessing folder
df_ema_framework = _read_pickle(preprocessed_path_freezed + '/ema_data.pkl')
df_ema_content = _read_pickle(preprocessed_path_freezed + '/ema_content.pkl')
df_monitoring = _read_pickle(preprocessed_path_freezed + '/monitoring_data.pkl')
df_redcap = _read_pickle(preprocessed_path_freezed + '/redcap_data.pkl')

# SPSS export; its unique `for_id` values are kept as the list of frozen IDs
sp1_path = redcap_path + "/baseline_T5_data_incl_ns_freezed_241120.sav"
df_sp1 = pd.read_spss(sp1_path)
freezed_ids = df_sp1["for_id"].unique().tolist()

In [None]:
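# Disabled alternative: load the same inputs from `preprocessed_path` instead of `preprocessed_path_freezed`.
#backup_path = preprocessed_path+ "/backup_data_passive_actual.feather"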
#df_backup = pd.read_feather(backup_path)

#with open(preprocessed_path + '/ema_data.pkl', 'rb') as file:
#    df_ema_framework = pickle.load(file)

#with open(preprocessed_path + '/ema_content.pkl', 'rb') as file:
#    df_ema_content = pickle.load(file)  

#with open(preprocessed_path + '/monitoring_data.pkl', 'rb') as file:
#    df_monitoring = pickle.load(file)
    
#with open(preprocessed_path + '/redcap_data.pkl', 'rb') as file:
#    df_redcap = pickle.load(file)

#sp1_path = redcap_path + "/baseline_T5_data_incl_ns_freezed_241120.sav"
#df_sp1 = pd.read_spss(sp1_path)
#freezed_ids = df_sp1.for_id.unique().tolist()

In [None]:
# Configurations
# Check min. amount of EMA data available to map to passive data
# All values below are notebook-level constants read by later cells.

#GPS data
# Passed to gps_features.HomeClusterExtractor; units are not visible here
# (presumably m/s and metres respectively — TODO confirm in gps_features).
speed_limit = 1.4
max_distance = 150 
# NOTE(review): 6371000 matches Earth's mean radius in metres, so despite the
# name this is metres (not km) per radian.
kms_per_radian = 6371000
# 100 divided by the radius above; passed as `epsilon` to the extractor
# (presumably a 100 m neighbourhood expressed in radians — TODO confirm).
epsilon = 100/kms_per_radian
min_samples = 10
# NOTE(review): min_cluster_size is not referenced by the cells visible in this notebook.
min_cluster_size = 20
min_nights_obs = 4
min_f_home = 0.5

# EMA
assessment_phase = [0] #1,2
# Minimum number of distinct questionnaires per customer and day (compared against n_quest)
min_num_daily = 4
# Minimum number of distinct questionnaire days per customer (compared against n_days_min)
min_days_data = 7


#Passive to EMA matching
# Presumably the length (hours) of the passive-data window before each EMA prompt — TODO confirm
timedelta_hours = 2
# Presumably the assessment phase index used for matching — TODO confirm
assess = 0

## Filter for participants with sufficient data

In [None]:
# first assessment phase finished
# Keep EMA rows of participants whose status is one of the values below
# (original note: first assessment phase finished).
# `.copy()` makes df_ema an independent frame: the following cells add columns to it,
# which on a bare `.loc` slice triggers pandas' SettingWithCopyWarning.
df_ema = df_ema_content.loc[
    df_ema_content["status"].isin([
        "Abgeschlossen",
        "Post_Erhebung_1",
        "Erhebung_2_aktiv",
        "Post_Erhebung_2",
        "Erhebung_3_aktiv",
        "Dropout",
    ])
].copy()


In [None]:
# Split the questionnaire creation timestamp into its midnight-normalised day and its hour of day
quest_created_at = df_ema["quest_create"]
df_ema["quest_create_day"] = quest_created_at.dt.normalize()
df_ema["quest_create_hour"] = quest_created_at.dt.hour

In [None]:
# Context columns carried along for every questionnaire instance
extra_cols = [
    "assess",
    "study",
    "quest_create",
    "weekend",
    "quest_nr",
    "weekday",
    "season",
    "time_of_day",
    "quest_create_day",
    "quest_create_hour",
]

# One row per (customer, unique_day_id), taking the first value of each context column
per_questionnaire = df_ema.groupby(["customer", "unique_day_id"])[extra_cols]
aggregated_info = per_questionnaire.first().reset_index()


In [None]:
# Restrict to the eight PANAS fear / guilt / hostility / sadness items
panas_items = [
    'panas_fear1', 'panas_fear2',
    'panas_guilt1', 'panas_guilt2',
    'panas_hostility1', 'panas_hostility2',
    'panas_sadness1', 'panas_sadness2',
]
df_ema_panas = df_ema.loc[df_ema["quest_title"].isin(panas_items)]

# Long -> wide: one row per (customer, unique_day_id, assess), one column per item.
# 'first' is used because each item is expected to occur once per group.
df_piv = df_ema_panas.pivot_table(
    index=["customer", "unique_day_id", "assess"],
    columns="quest_title",
    values="choice_text",
    aggfunc='first',
)

# Replace the named column index by a plain list of item names
df_piv.columns = df_piv.columns.tolist()

# Turn the index levels back into columns and drop exact duplicate rows
df_piv = df_piv.reset_index().drop_duplicates()


In [None]:
# Items exported for the TAI data set: ta_*, panas_* and er_* questions
tai_items = [
    'ta_behavioral_2', 'ta_kognitiv', 'ta_kognitiv_2', 'ta_behavioral',
    'panas_selfassurance', 'panas_joviality2', 'panas_fatigue',
    'panas_joviality1', 'panas_fear1', 'panas_hostility2',
    'panas_serenity2', 'panas_shyness', 'panas_hostility1',
    'panas_guilt1', 'panas_fear2', 'panas_sadness1', 'panas_guilt2',
    'panas_loneliness', 'panas_serenity1', 'panas_sadness2',
    'panas_attentiveness',
    'er_intensity', 'er_control', 'er_distraction', 'er_reappraisal',
    'er_rumination', 'er_relaxation', 'er_suppression', 'er_acceptance',
]
df_ema_TAI = df_ema.loc[df_ema["quest_title"].isin(tai_items)]

# Long -> wide: one row per (customer, unique_day_id, assess), one column per item.
# 'first' is used because each item is expected to occur once per group.
df_piv_tai = df_ema_TAI.pivot_table(
    index=["customer", "unique_day_id", "assess"],
    columns="quest_title",
    values="choice_text",
    aggfunc='first',
)

# Replace the named column index by a plain list of item names
df_piv_tai.columns = df_piv_tai.columns.tolist()

# Turn the index levels back into columns and drop exact duplicate rows
df_piv_tai = df_piv_tai.reset_index().drop_duplicates()


In [None]:
# Attach the per-questionnaire context columns to both wide tables (inner join on the keys)
merge_keys = ["customer", "assess", "unique_day_id"]
df_piv_tai = aggregated_info.merge(df_piv_tai, on=merge_keys)
df_piv = aggregated_info.merge(df_piv, on=merge_keys)


In [None]:
# Write the TAI table to CSV without the row index
df_piv_tai_csv_path = preprocessed_path + '/ema_tai_benni.csv'
df_piv_tai.to_csv(path_or_buf=df_piv_tai_csv_path, index=False)

In [None]:
# Keep only rows whose `study` code is 24 or 25 (first assessment phase)
in_first_phase_study = df_piv["study"].isin([24, 25])
df_piv = df_piv.loc[in_first_phase_study]


In [None]:

# Items that enter the `mean_na` score
na_scale = [
    'panas_fear1', 'panas_fear2',
    'panas_guilt1', 'panas_guilt2',
    'panas_hostility1', 'panas_hostility2',
    'panas_sadness1', 'panas_sadness2',
]

# Step 1: coerce the item responses to numbers; non-numeric entries become NaN
df_piv[na_scale] = df_piv[na_scale].apply(pd.to_numeric, errors='coerce')

# Complete cases only (every item answered).
# NOTE(review): df_piv_clean is not used by the cells visible here — mean_na below is
# computed on df_piv, so rows with unanswered items still get a mean over the answered
# ones. Confirm which behaviour is intended.
df_piv_clean = df_piv.dropna(subset=na_scale, how='any')

# Step 2: per (customer, unique_day_id) take the group mean of each item,
# then average across the items of each row
na_item_means = df_piv.groupby(['customer', 'unique_day_id'])[na_scale].transform('mean')
df_piv['mean_na'] = na_item_means.mean(axis=1)

In [None]:

# Number of distinct questionnaires ('unique_day_id') per 'customer' and 'quest_create_day',
# broadcast back onto every row of that customer-day
questionnaires_per_day = df_piv.groupby(['customer', 'quest_create_day'])['unique_day_id']
df_piv['n_quest'] = questionnaires_per_day.transform('nunique')

In [None]:
# Apply the data-sufficiency criteria from the configuration cell.
# 1) Keep rows from customer-days with at least `min_num_daily` questionnaires.
#    `.copy()` gives an independent frame so the column assignment below does not
#    write into a slice (pandas SettingWithCopyWarning).
df_piv = df_piv.loc[df_piv["n_quest"] >= min_num_daily].copy()

# 2) Count the remaining distinct days per customer and keep customers with
#    at least `min_days_data` of them.
df_piv["n_days_min"] = df_piv.groupby("customer")['quest_create_day'].transform("nunique")
df_piv = df_piv.loc[df_piv["n_days_min"] >= min_days_data]

# Customers that pass both criteria
df_ema1_customers = df_piv["customer"].unique().tolist()

## 1. Prepare passive features

In [None]:
# Work on a deep copy so the loaded backup frame stays untouched
df_pass_act = df_backup.copy(deep=True)

In [None]:
# Latest start timestamp in the passive data
df_pass_act["startTimestamp"].max()

In [None]:
# Only keep passive data from the first assessment phase:
# rows starting no later than one day after `ema_base_end`
baseline_cutoff = df_pass_act["ema_base_end"] + pd.Timedelta(days=1)
df_pass_act_base = df_pass_act[df_pass_act["startTimestamp"] <= baseline_cutoff]

In [None]:
# Restrict the passive data to customers that passed the EMA sufficiency filter
has_enough_ema = df_pass_act_base["customer"].isin(df_ema1_customers)
df_pass_act_base = df_pass_act_base.loc[has_enough_ema]

In [None]:
# Number of distinct customers left in the passive data
df_pass_act_base["customer"].nunique()

### 1.1 Calculate GPS features

In [None]:
# Latitude / longitude readings only, reduced to the columns needed for the pivot
is_coordinate = df_pass_act_base["type"].isin(["Latitude", "Longitude"])
df_pass_act_loc = df_pass_act_base[is_coordinate][["customer", "startTimestamp", "type", "doubleValue"]]

In [None]:
# Long -> wide: one row per (customer, startTimestamp) with the reading types
# ('Latitude', 'Longitude') as columns. 'first' is used because each type is
# expected to occur once per customer and timestamp.
# Pivoting on the single value column directly yields columns named after the
# types, so no flattening or renaming of 'doubleValue_*' labels is required.
df_loc = df_pass_act_loc.pivot_table(
    index=["customer", "startTimestamp"],
    columns="type",
    values="doubleValue",
    aggfunc='first',
)

# Drop the column-axis name and turn the index levels back into columns
df_loc = df_loc.rename_axis(None, axis=1).reset_index()

In [None]:
# Number of distinct customers with location rows
df_loc["customer"].nunique()

In [None]:
# Location rows per customer
customer_counts = df_loc['customer'].value_counts()

# Summary statistics of those counts
distribution_description = customer_counts.describe()
print("Description of the distribution of number of rows per customer:")
print(distribution_description)

# Quantiles in 5 % steps (0.05, 0.10, ..., 1.0)
quantile_levels = [step / 20 for step in range(1, 21)]
percentiles = customer_counts.quantile(quantile_levels)
print("\nPercentiles:")
print(percentiles)

# Candidate inclusion threshold: mean minus one standard deviation, floored at 0
count_mean = customer_counts.mean()
count_std = customer_counts.std()
threshold = max(0, count_mean - count_std)
print(f"\nMinimum threshold for inclusion: {threshold}")

In [None]:
# Run the project's HomeClusterExtractor on the location rows.
# This call selects DBSCAN clustering and passes min_samples un-normalised;
# thresholds come from the configuration cell, except min_data_points=50.
extractor = gps_features.HomeClusterExtractor(
    df_loc,
    speed_limit=speed_limit,
    max_distance=max_distance,
    epsilon=epsilon,
    min_samples=min_samples,
    min_nights_obs=min_nights_obs,
    min_f_home=min_f_home,
    clustering_method='dbscan',
    normalize_min_samples=False,
    min_data_points=50,
)
result = extractor.run()


In [None]:
# 20 customers did not have enough GPS data (i.e. fewer than 50 data points), so no home cluster could be computed for them.


In [None]:
# Reduce the extractor output to the columns used downstream.
# `.copy()` detaches the selection from `result`: a later cell overwrites the
# `customer` column of this frame, which on a bare slice raises SettingWithCopyWarning.
home_cluster_columns = [
    "customer", "startTimestamp", "at_home", "transition", "distance",
    "stationary", "time_diff", "speed", "clusterID", "homeID",
]
home_clusters_red = result[home_cluster_columns].copy()

In [None]:
# Customers present in the reduced extractor output
gps_customer_list = home_clusters_red["customer"].unique().tolist()

## 2. Prepare EMA data

In [None]:
# Columns of the filtered EMA table that are needed to build the sensor blocks
ema_block_columns = [
    "customer",
    "quest_create_day",
    "quest_create",
    "unique_day_id",
    "assess",
    "quest_create_hour",
    "weekday",
    "weekend",
    "season",
    "time_of_day",
    "n_quest",
    "mean_na",
]
df_ema_udi = df_piv[ema_block_columns]

In [None]:
# Keep EMA rows only for customers that appear in the GPS extractor output
has_gps = df_ema_udi["customer"].isin(gps_customer_list)
df_ema_udi = df_ema_udi.loc[has_gps]

In [None]:
block_keys = ['customer', 'unique_day_id']

# Earliest quest_create per (customer, unique_day_id), stored as sensor_block_end
df_min_quest = (
    df_ema_udi.groupby(block_keys)['quest_create']
    .min()
    .reset_index()
    .rename(columns={'quest_create': 'sensor_block_end'})
)

# Attach sensor_block_end to every row, then drop the per-row quest_create
# and collapse rows that have become identical
df_ema_udi = (
    df_ema_udi.merge(df_min_quest, on=block_keys, how='left')
    .drop(columns=['quest_create'])
    .drop_duplicates()
)

In [None]:
# Prepare the mapping of passive data by creating sensor blocks:
# each block starts `timedelta_hours` (configuration cell) before its sensor_block_end.
# The configured value is used instead of a hard-coded 2 h so that changing the
# configuration actually changes the window.
df_ema_udi['sensor_block_start'] = df_ema_udi['sensor_block_end'] - pd.Timedelta(hours=timedelta_hours)


In [None]:
# Only include the first assessment phase (assess == 0)
# NOTE(review): the configuration cell also defines `assessment_phase` / `assess`;
# this filter uses the literal 0 — confirm whether it should follow the config.
is_first_phase = df_ema_udi["assess"] == 0
df_ema_udi_base = df_ema_udi.loc[is_first_phase]

In [None]:
# Build a block identifier by concatenating customer and unique_day_id,
# then keep the first row of each (customer, unique_blocks) pair.
# `assign` returns a new frame, so the upstream slice is not modified.
df_ema_udi_base = df_ema_udi_base.assign(
    unique_blocks=lambda frame: frame["customer"] + frame["unique_day_id"]
).drop_duplicates(subset=["customer", "unique_blocks"])

In [None]:
# Add the REDCap columns to every EMA block (left join keeps all EMA rows)
df_ema_udi_merged = df_ema_udi_base.merge(df_redcap, on="customer", how="left")

In [None]:
# The join can repeat a block if a customer has several REDCap rows; keep one row per block
block_id_columns = ["customer", "unique_blocks"]
df_ema_udi_merged = df_ema_udi_merged.drop_duplicates(subset=block_id_columns)

## 3. Merge EMA to passive data

In [None]:
# Make the 'customer' key comparable across the three inputs of the mapper:
# cast to str and strip surrounding whitespace
for _frame in (df_ema_udi_merged, df_pass_act_base, home_clusters_red):
    _frame['customer'] = _frame['customer'].astype(str).str.strip()

In [None]:
# Build the mapper from the EMA blocks, the passive data and the home-cluster table
ema_mapper = EMA_Mapper.EMAMapper(
    df_ema_udi_merged,
    df_pass_act_base,
    home_clusters_red,
)

# Execute all mappings implemented by the mapper
ema_mapper.run_mappings()

# Fetch the resulting enriched EMA DataFrame
df_ema_enriched = ema_mapper.get_result()

### Include weather data

In [None]:
import openmeteo_requests

import requests_cache
import pandas as pd
from retry_requests import retry

# Setup the Open-Meteo API client with cache and retry on error
cache_session = requests_cache.CachedSession('.cache', expire_after = -1)
retry_session = retry(cache_session, retries = 5, backoff_factor = 0.2)
openmeteo = openmeteo_requests.Client(session = retry_session)

# Make sure all required weather variables are listed here
# The order of variables in hourly or daily is important to assign them correctly below
url = "https://archive-api.open-meteo.com/v1/archive"
params = {
    "latitude": 52.5244,
    "longitude": 13.4105,
    "start_date": "2023-05-01",
    "end_date": "2024-12-31",
    "daily": ["temperature_2m_max", "temperature_2m_min", "temperature_2m_mean", "apparent_temperature_max", "apparent_temperature_min", "apparent_temperature_mean", "sunshine_duration", "precipitation_sum", "precipitation_hours"],
    "timezone": "Europe/Berlin"
}
responses = openmeteo.weather_api(url, params=params)

# Process first location. Add a for-loop for multiple locations or weather models
response = responses[0]
print(f"Coordinates {response.Latitude()}°N {response.Longitude()}°E")
print(f"Elevation {response.Elevation()} m asl")
print(f"Timezone {response.Timezone()} {response.TimezoneAbbreviation()}")
print(f"Timezone difference to GMT+0 {response.UtcOffsetSeconds()} s")

# Process daily data. The order of variables needs to be the same as requested.
daily = response.Daily()
daily_temperature_2m_max = daily.Variables(0).ValuesAsNumpy()
daily_temperature_2m_min = daily.Variables(1).ValuesAsNumpy()
daily_temperature_2m_mean = daily.Variables(2).ValuesAsNumpy()
daily_apparent_temperature_max = daily.Variables(3).ValuesAsNumpy()
daily_apparent_temperature_min = daily.Variables(4).ValuesAsNumpy()
daily_apparent_temperature_mean = daily.Variables(5).ValuesAsNumpy()
daily_sunshine_duration = daily.Variables(6).ValuesAsNumpy()
daily_precipitation_sum = daily.Variables(7).ValuesAsNumpy()
daily_precipitation_hours = daily.Variables(8).ValuesAsNumpy()

# The API returns epoch seconds in UTC. Because a local timezone was requested,
# each daily bucket starts at local midnight, which is still the previous calendar
# day in UTC. Shifting by the response's UTC offset makes every timestamp carry the
# local calendar date, so a later `.dt.date` join key is not off by one day.
utc_offset_seconds = response.UtcOffsetSeconds()
daily_data = {"date": pd.date_range(
    start = pd.to_datetime(daily.Time() + utc_offset_seconds, unit = "s", utc = True),
    end = pd.to_datetime(daily.TimeEnd() + utc_offset_seconds, unit = "s", utc = True),
    freq = pd.Timedelta(seconds = daily.Interval()),
    inclusive = "left"
)}

# Only three of the requested variables are kept; sunshine duration is
# converted from seconds to hours.
daily_data["apparent_temperature_mean"] = daily_apparent_temperature_mean
daily_data["sunshine_duration"] = daily_sunshine_duration/3600
daily_data["precipitation_hours"] = daily_precipitation_hours

daily_dataframe = pd.DataFrame(data = daily_data)

In [None]:
# Build plain-date join keys on both sides: the calendar date of each daily weather
# timestamp and the calendar date of each EMA questionnaire day.
# NOTE(review): the weather date is taken from the UTC-typed timestamp; verify that
# these timestamps fall on the intended local calendar day.
daily_dataframe['date'] = pd.to_datetime(daily_dataframe['date'], utc=True)
daily_dataframe['assessment_day'] = daily_dataframe['date'].dt.date
df_ema_enriched["quest_create_day"] = pd.to_datetime(df_ema_enriched["quest_create_day"]).dt.date

# Left join keeps every EMA record, with or without matching weather;
# the helper columns from the weather table are removed afterwards
df_ema_weather = df_ema_enriched.merge(
    daily_dataframe,
    how='left',
    left_on="quest_create_day",
    right_on='assessment_day',
).drop(columns=['date', 'assessment_day'])

### Analyze missingness

In [None]:
# Remove customers with missing person-static information:
# a missing `age` is used as the indicator, and every row of such a customer is dropped

rows_without_age = df_ema_weather.loc[df_ema_weather["age"].isna(), ["customer", "for_id", "age"]]
missing_static = rows_without_age["customer"].unique().tolist()
df_ema_weather = df_ema_weather[~df_ema_weather["customer"].isin(missing_static)]

In [None]:
# (rows, columns) of the EMA/weather table at this point in the notebook
df_ema_weather.shape

In [None]:
# Feature groups handed to summarize_missing_data in the following cells

# Minutes per activity code 102-107
feature_group_pa = [
    'activity_102_minutes',
    'activity_103_minutes',
    'activity_104_minutes',
    'activity_105_minutes',
    'activity_106_minutes',
    'activity_107_minutes',
]

# Heart-rate features: summary statistics followed by the zone features
_hr_statistics = [
    'hr_mean',
    'hr_min',
    'hr_max',
    'hr_std',
    'hr_median',
    'range_heartrate',
    'iqr_heartrate',
]
_hr_zones = [
    'hr_zone_resting',
    'hr_zone_moderate',
    'hr_zone_vigorous',
]
feature_group_hr = _hr_statistics + _hr_zones

feature_group_gps = [
    'n_GPS',
    'total_distance_km',
    'at_home_minute',
    'time_in_transition_minutes',
    'time_stationary_minutes',
]

feature_group_weather = [
    "apparent_temperature_mean",
    "sunshine_duration",
    "precipitation_hours",
]

feature_group_person_static = [
    'age',
    'somatic_description',
    'psychotropic_description',
    'employability_description_simple',
    'prior_treatment_description_simple',
    'ema_smartphone_description',
]

# Every column inspected for missingness: the groups above, in order, plus three extra
# heart-rate features (inserted before the zones) and the step / calorie counts
columns_to_check = (
    feature_group_pa
    + _hr_statistics
    + ['skewness_heartrate', 'kurtosis_heartrate', 'hr_peak_counts']
    + _hr_zones
    + feature_group_gps
    + feature_group_weather
    + ['n_steps', 'calories_burned']
    + feature_group_person_static
)

In [None]:
# Missing-data summary for the EMA/weather table before participant exclusion
group_missing_df = summarize_missing_data(
    df=df_ema_weather,
    customer_id_col="customer",
    columns_to_check=columns_to_check,
    feature_group_pa=feature_group_pa,
    feature_group_hr=feature_group_hr,
    feature_group_gps=feature_group_gps,
    feature_group_weather=feature_group_weather,
    feature_group_person_static=feature_group_person_static,
)


In [None]:
# A "condition beep" is a beep without a home-location value (at_home_minute == -1)
# although more than 625 steps were recorded in the same block.
# The flag is computed on df_ema_weather itself — the frame that is grouped below.
# Looking the step counts up in a different frame via index labels is only correct
# if both frames happen to share identical labels for identical rows.
condition_flag = (df_ema_weather['at_home_minute'] == -1) & (df_ema_weather['n_steps'] > 625)

# Calculate total beeps and condition beeps per participant
beep_counts = (
    df_ema_weather
    .assign(condition_beep=condition_flag)
    .groupby('customer')
    .agg(
        total_beeps=('sensor_block_end', 'count'),
        condition_beeps=('condition_beep', 'sum'),
    )
    .reset_index()
)

# Calculate the ratio of condition beeps to total beeps
beep_counts['condition_beeps_ratio'] = beep_counts['condition_beeps'] / beep_counts['total_beeps']

# Identify participants where condition beeps >50% of total beeps
high_condition_participants = beep_counts[beep_counts['condition_beeps_ratio'] > 0.50]['customer']

# Remove those participants
df_ema_enriched_filtered = df_ema_weather[~df_ema_weather['customer'].isin(high_condition_participants)].copy()


In [None]:
# Total number of condition beeps across all participants
beep_counts["condition_beeps"].sum()

In [None]:
# Missing-data summary again, now for the table after participant exclusion
group_missing_df = summarize_missing_data(
    df=df_ema_enriched_filtered,
    customer_id_col="customer",
    columns_to_check=columns_to_check,
    feature_group_pa=feature_group_pa,
    feature_group_hr=feature_group_hr,
    feature_group_gps=feature_group_gps,
    feature_group_weather=feature_group_weather,
    feature_group_person_static=feature_group_person_static,
)


### GPS condition

In [None]:
# Classify each block by whether its GPS-derived value carries the -1 sentinel
# and, if so, whether more than 625 steps were recorded in the block.
_steps = df_ema_enriched_filtered['n_steps']
_home_is_sentinel = df_ema_enriched_filtered['at_home_minute'] == -1
_gps_is_sentinel = df_ema_enriched_filtered['n_GPS'] == -1

# Conditions based on the home-location feature
conditions_home = [
    _home_is_sentinel & (_steps > 625),
    _home_is_sentinel & (_steps <= 625),
    ~_home_is_sentinel,
]

# Same scheme based on the GPS point count
conditions = [
    _gps_is_sentinel & (_steps > 625),
    _gps_is_sentinel & (_steps <= 625),
    ~_gps_is_sentinel,
]

# Labels, in the same order as the conditions
choices = ['Steps>625', 'Steps<=625', 'GPS_present']

# Rows matching none of the conditions are labelled 'Unknown'
df_ema_enriched_filtered['missing_GPS_home'] = np.select(conditions_home, choices, default='Unknown')
df_ema_enriched_filtered['missing_GPS'] = np.select(conditions, choices, default='Unknown')


In [None]:
# Blocks labelled 'Steps>625' for the home-location feature
df_ema_enriched_filtered[df_ema_enriched_filtered['missing_GPS_home'] == 'Steps>625']

### Steps condition

In [None]:
# Heart-rate and activity columns that are inspected alongside n_steps
cols_to_check = [
    'hr_mean',
    'activity_102_minutes',
    'activity_103_minutes',
    'activity_104_minutes',
    'activity_105_minutes',
    'activity_106_minutes',
    'activity_107_minutes',
]

# True where every one of those columns carries the -1 sentinel
all_negative = df_ema_enriched_filtered[cols_to_check].eq(-1).all(axis=1)

_steps_is_sentinel = df_ema_enriched_filtered['n_steps'] == -1

# Conditions for the new "missing_steps" column, in label order:
#   step_missing - n_steps is -1 and all inspected columns are -1
#   step_zero    - n_steps is -1 but at least one inspected column is not -1
#   not_missing  - n_steps is not -1
conditions_steps = [
    _steps_is_sentinel & all_negative,
    _steps_is_sentinel & ~all_negative,
    ~_steps_is_sentinel,
]
choices_steps = ['step_missing', 'step_zero', 'not_missing']

# Rows matching none of the conditions are labelled 'Unknown'
df_ema_enriched_filtered['missing_steps'] = np.select(conditions_steps, choices_steps, default='Unknown')


### PA condition

In [None]:
# Sentinel (-1) indicators for the three columns involved
_pa_is_sentinel = df_ema_enriched_filtered['activity_102_minutes'] == -1
_steps_is_sentinel = df_ema_enriched_filtered['n_steps'] == -1
_hr_is_sentinel = df_ema_enriched_filtered['hr_mean'] == -1

# Conditions for the new "missing_pa" column, in label order:
#   not_missing - 'activity_102_minutes' is not -1
#   pa_missing  - it is -1 and both 'n_steps' and 'hr_mean' are -1
#   pa_zero     - it is -1 and at least one of 'n_steps' / 'hr_mean' is not -1
conditions_pa = [
    ~_pa_is_sentinel,
    _pa_is_sentinel & (_steps_is_sentinel & _hr_is_sentinel),
    _pa_is_sentinel & ~(_steps_is_sentinel & _hr_is_sentinel),
]
choices_pa = ['not_missing', 'pa_missing', 'pa_zero']

# Rows matching none of the conditions are labelled 'Unknown'
df_ema_enriched_filtered['missing_pa'] = np.select(conditions_pa, choices_pa, default='Unknown')


In [None]:
# Number of distinct EMA blocks in the final table
df_ema_enriched_filtered["unique_blocks"].nunique()

In [None]:
# Persist the final EMA/passive mapping in the frozen preprocessing folder
frozen_output_path = preprocessed_path_freezed + '/map_ema_passive.pkl'
with open(frozen_output_path, 'wb') as out_handle:
    pickle.dump(df_ema_enriched_filtered, out_handle)

In [None]:
# Persist the same mapping in the regular preprocessing folder
output_path = preprocessed_path + '/map_ema_passive.pkl'
with open(output_path, 'wb') as out_handle:
    pickle.dump(df_ema_enriched_filtered, out_handle)