# DGPS 2024: Situation Assessment in PREACT

This notebook shows the analysis of situational context using EMA and passive sensing data.

1. **Load Data**: Load necessary data from pickle files.
2. **Preprocess EMA**:
- Keep only first assessment phase 
- Remove all entries that have no complete EMA assessment 
- Remove all participants with too little data 
- Create blocks of assessment 
3. **Perform Item Analysis according to Siepe et al. (2022)**

In [1]:
import os
import sys
import regex as re
# If your current working directory is the notebooks directory, use this:
notebook_dir = os.getcwd()  # current working directory
src_path = os.path.abspath(os.path.join(notebook_dir, '..', 'src'))
sys.path.append(src_path)

# Add the parent directory to sys.path
parent_dir = os.path.abspath(os.path.join(notebook_dir, '..'))
sys.path.append(parent_dir)
import glob
import pickle
from IPython.display import Markdown
from server_config import datapath, preprocessed_path

import pandas as pd
import numpy as np
import datetime as dt
from scipy.stats import entropy

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder


import matplotlib.pyplot as plt
from matplotlib import rcParams
import seaborn as sns 
import matplotlib.patches as mpatches

sns.set_context("notebook", rc={"axes.labelsize": 14, "xtick.labelsize": 14, "ytick.labelsize": 14})
sns.set_style("whitegrid", {'axes.grid': True})
%matplotlib inline
import plotly.express as px
import plotly.graph_objs as go
from plotly.subplots import make_subplots


ModuleNotFoundError: No module named 'config'

In [None]:
# Colour palettes shared by the figures in this notebook.

# Hex colour for every simplified (English) situation label.
situation_colors = dict([
    ("other", "#1f77b4"),
    ("care work", "#ff7f0e"),
    ("travelling", "#2ca02c"),
    ("chores", "#d62728"),
    ("eating - drinking - selfcare", "#9467bd"),  # label as produced by the situation mapping
    ("active leisure", "#8c564b"),
    ("smartphone - social media", "#e377c2"),  # label as produced by the situation mapping
    ("passive leisure", "#7f7f7f"),
    ("work or study", "#bcbd22"),
])

# Hex colour for every diagnosis label (paired light/dark palette).
diagnosis_colors = dict([
    ("Depressive Disorder", "#a6cee3"),                # light blue
    ("Social Anxiety Disorder", "#1f78b4"),            # dark blue
    ("Obsessive-Compulsive Disorder", "#b2df8a"),      # light green
    ("Generalized Anxiety Disorder", "#33a02c"),       # dark green
    ("Agoraphobia and/or Panic Disorder", "#fb9a99"),  # light pink
    ("Post-Traumatic Stress Disorder", "#e31a1c"),     # red
    ("Specific Phobia", "#fdbf6f"),
])





In [None]:
# Plotting

def get_situation_color(category):
    """Return the hex colour registered for a situation category.

    The colour is looked up in the module-level ``situation_colors`` dict;
    categories without an entry are drawn in black (``'#000000'``).
    """
    if category in situation_colors:
        return situation_colors[category]
    return '#000000'

def get_diagnosis_color(category):
    """Return the hex colour registered for a diagnosis category.

    The colour is looked up in the module-level ``diagnosis_colors`` dict;
    categories without an entry are drawn in black (``'#000000'``).
    """
    if category in diagnosis_colors:
        return diagnosis_colors[category]
    return '#000000'


In [None]:
#backup_path = preprocessed_path + "backup_data_passive.feather"
#df_backup = pd.read_feather(backup_path)

# Load the preprocessed tables.
# Paths are built with os.path.join so they are valid whether or not
# `preprocessed_path` ends with a path separator (previously the first file
# was appended without a separator while the others were prefixed with '/').
# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only load pickles written by this project's own preprocessing pipeline.
def _load_pickle(filename):
    """Unpickle and return the object stored in `filename` inside `preprocessed_path`."""
    with open(os.path.join(preprocessed_path, filename), 'rb') as file:
        return pickle.load(file)


df_ema_framework = _load_pickle('ema_data.pkl')
df_ema_content = _load_pickle('ema_content.pkl')
df_monitoring = _load_pickle('monitoring_data.pkl')
df_redcap = _load_pickle('redcap_data.pkl')
df_ema_passive = _load_pickle('map_ema_passive.pkl')

In [None]:
# Preview only the first rows instead of rendering the whole table.
df_ema_passive.head()

In [None]:
# (A stray expression referencing the undefined name `b` was removed from this
# cell; it raised NameError on a fresh kernel and stopped "Run All".)

### Configurations

In [None]:
# EMA inclusion criteria
# Values of the `assess` column to keep when restricting to an assessment phase.
assessment_phase = [0] #1,2
# Minimum value of `n_quest` an entry must have to be kept.
min_num_daily = 4
# Minimum number of distinct `quest_complete_day` values a participant must have.
min_days_data = 7

# Stationary filtering (not used in the cells visible here)
max_distance = 150  # presumably metres - TODO confirm unit
speed_limit = 1.4  # Max allowed speed in m/s

# DBSCAN
# 6371.0088 km is the mean Earth radius (not the equatorial radius, ~6378.1 km);
# dividing a distance in km by it converts the distance to radians.
kms_per_radian = 6371.0088
# Neighbourhood radius: 0.03 km (30 m) expressed in radians.
epsilon = 0.03/kms_per_radian
min_samples = 10

# Kmeans
DKmeans = 500  # NOTE(review): meaning/unit not visible in this notebook - confirm

# Home feature (night-time) - presumably thresholds for detecting the home location; TODO confirm
min_nights_obs = 4
min_f_home = 0.5 

### 1. Include only patients with finished assessments and enough quests

In [None]:
# Keep EMA content only for participants whose status is one of the values
# below (i.e. the first assessment phase is finished).
finished_first_phase_status = [
    "Abgeschlossen",
    "Post_Erhebung_1",
    "Erhebung_2_aktiv",
    "Post_Erhebung_2",
    "Erhebung_3_aktiv",
    "Dropout",
]
df_ema = df_ema_content.loc[df_ema_content["status"].isin(finished_first_phase_status)]

In [None]:
# Split the EMA content by assessment phase using the `study` identifier.
df_ema1 = df_ema.loc[df_ema.study.isin([24,25])] # first assessment phase
df_ema2 = df_ema.loc[df_ema.study.isin([33,34])] # second assessment phase
# NOTE(review): the study IDs below are identical to those of the second phase,
# so df_ema3 is the same selection as df_ema2. This looks like a copy-paste
# slip - confirm the study IDs that belong to the third assessment phase.
df_ema3 = df_ema.loc[df_ema.study.isin([33,34])] # third assessment phase

In [None]:
# Keep rows whose `n_quest` reaches the configured minimum.
# .copy() detaches the filtered frame from df_ema so that adding a column
# below does not raise SettingWithCopyWarning.
df_ema1 = df_ema1.loc[df_ema1["n_quest"] >= min_num_daily].copy()
# Number of distinct completion days each participant contributes.
df_ema1["n_days_min"] = df_ema1.groupby("customer")['quest_complete_day'].transform("nunique")
# Keep participants with at least `min_days_data` such days.
df_ema1 = df_ema1.loc[df_ema1.n_days_min >= min_days_data]
df_ema1_customers = df_ema1.customer.unique().tolist()

In [None]:
# Keep rows whose `n_quest` reaches the configured minimum.
# .copy() detaches the filtered frame from df_ema so that adding a column
# below does not raise SettingWithCopyWarning.
df_ema2 = df_ema2.loc[df_ema2["n_quest"] >= min_num_daily].copy()
# Number of distinct completion days each participant contributes.
df_ema2["n_days_min"] = df_ema2.groupby("customer")['quest_complete_day'].transform("nunique")
# Keep participants with at least `min_days_data` such days.
df_ema2 = df_ema2.loc[df_ema2.n_days_min >= min_days_data]
df_ema2_customers = df_ema2.customer.unique().tolist()

In [None]:
# Keep rows whose `n_quest` reaches the configured minimum.
# .copy() detaches the filtered frame from df_ema so that adding a column
# below does not raise SettingWithCopyWarning.
df_ema3 = df_ema3.loc[df_ema3["n_quest"] >= min_num_daily].copy()
# Number of distinct completion days each participant contributes.
df_ema3["n_days_min"] = df_ema3.groupby("customer")['quest_complete_day'].transform("nunique")
# Keep participants with at least `min_days_data` such days.
df_ema3 = df_ema3.loc[df_ema3.n_days_min >= min_days_data]
df_ema3_customers = df_ema3.customer.unique().tolist()

### 2. Pivot table to get assessments merged together

In [None]:
# Rows of the social-contact item only.
df_social = df_ema.loc[df_ema["quest_title"] == "event_social2"]

In [None]:
# Rows of the situation item only.
df_sit = df_ema.loc[df_ema["quest_title"] == "situation1"]

In [None]:
# All remaining items (everything except the social-contact and situation items).
df_nosit = df_ema.loc[~df_ema['quest_title'].isin(['event_social2', 'situation1'])]


In [None]:
# Reshape the situation answers to wide format: one row per selected answer
# (choice_id) with the answer text in a column named after the item.
df_sit = df_sit.pivot_table(
    values="choice_text",
    index=["customer", "unique_day_id", "quest_create", "choice_id"],
    columns="quest_title",
    aggfunc='first',  # one text per index combination is expected
)

# Replace the named column index by a plain list of item names.
df_sit.columns = df_sit.columns.tolist()

# Turn the index levels back into columns and keep each selected answer once.
df_sit = (
    df_sit
    .reset_index()
    .drop_duplicates(subset=['customer', 'unique_day_id', 'choice_id'])
)

# Number of situation answers given within the same (customer, unique_day_id).
df_sit['situation_count'] = (
    df_sit.groupby(["customer", "unique_day_id"])["choice_id"].transform('count')
)

In [None]:
# Reshape the social-contact answers to wide format: one row per selected
# answer (choice_id) with the answer text in a column named after the item.
df_social = df_social.pivot_table(
    index=["customer", "unique_day_id", "choice_id"],
    columns="quest_title",
    values="choice_text",
    aggfunc='first'  # one text per index combination is expected
)

# Replace the named column index by a plain list of item names.
df_social.columns = [col for col in df_social.columns.values]

# Reset the index to turn the MultiIndex into columns
df_social = df_social.reset_index()
# De-duplicate the social-contact table itself (this line previously
# de-duplicated df_sit a second time, leaving df_social untouched).
df_social = df_social.drop_duplicates(subset=['customer', 'unique_day_id', 'choice_id'])

# Number of social-contact answers given within the same (customer, unique_day_id).
df_social['social_contact_count'] = df_social.groupby(["customer","unique_day_id"])["choice_id"].transform('count')

In [None]:
# The answer id is no longer needed once the contacts have been counted.
df_social = df_social.drop('choice_id', axis=1)

In [None]:
# Reshape all remaining items to wide format: one row per combination of the
# index columns below, one column per item (quest_title).
df_piv = df_nosit.pivot_table(
    values="choice_text",
    index=[
        "customer", "unique_day_id", "assess", "quest_complete_day",
        "absolute_day_index", "relative_day_index", "weekend", "quest_nr", "weekday",
    ],
    columns="quest_title",
    aggfunc='first',  # one text per index combination is expected
)

# Replace the named column index by a plain list of item names.
df_piv.columns = df_piv.columns.tolist()

# Turn the index levels back into columns and remove fully identical rows.
df_piv = df_piv.reset_index().drop_duplicates()


In [None]:
# Columns written to the CSV export, grouped by content.
benni_list = (
    # identifiers and day indices
    ['customer', 'unique_day_id', 'assess', 'quest_complete_day',
     'absolute_day_index', 'relative_day_index']
    # er_* items
    + ['er_acceptance', 'er_control', 'er_distraction', 'er_intensity',
       'er_reappraisal', 'er_relaxation', 'er_rumination', 'er_suppression']
    # panas_* items
    + ['panas_attentiveness', 'panas_fatigue', 'panas_fear1', 'panas_fear2',
       'panas_guilt1', 'panas_guilt2', 'panas_hostility1', 'panas_hostility2',
       'panas_joviality1', 'panas_joviality2', 'panas_loneliness',
       'panas_sadness1', 'panas_sadness2', 'panas_selfassurance',
       'panas_serenity1', 'panas_serenity2', 'panas_shyness']
    # ta_* items
    + ['ta_behavioral', 'ta_behavioral_2', 'ta_kognitiv', 'ta_kognitiv_2']
    # scale means
    + ['mean_pa', 'mean_na']
)

In [None]:

# Items that make up the positive-affect (PA) and negative-affect (NA) scales.
pa_scale = [
    'panas_attentiveness', 'panas_joviality1', 'panas_joviality2',
    'panas_selfassurance', 'panas_serenity1', 'panas_serenity2',
]
na_scale = [
    'panas_fatigue', 'panas_fear1', 'panas_fear2', 'panas_guilt1',
    'panas_guilt2', 'panas_hostility1', 'panas_hostility2',
    'panas_loneliness', 'panas_sadness1', 'panas_sadness2', 'panas_shyness',
]

# Convert every scale item to a number; values that cannot be parsed become NaN.
panas_items = pa_scale + na_scale
df_piv[panas_items] = df_piv[panas_items].apply(pd.to_numeric, errors='coerce')

# For each scale: average every item within (customer, unique_day_id), then
# average those item means across the items of the scale.
for scale_column, scale_items in (('mean_pa', pa_scale), ('mean_na', na_scale)):
    df_piv[scale_column] = (
        df_piv
        .groupby(['customer', 'unique_day_id'])[scale_items]
        .transform('mean')
        .mean(axis=1)
    )


In [None]:
# Export the selected columns for all rows with assess == 1 to a CSV file in
# the current working directory.
df_piv_benni = df_piv.loc[df_piv["assess"] == 1, benni_list]
df_piv_benni.to_csv("data_benni_19092024.csv")

In [None]:
# Attach the situation answers and then the social-contact answers.
# Both joins are right joins on (customer, unique_day_id): the result keeps
# every row of the right-hand table, i.e. only assessments that have a
# situation answer and a social-contact answer survive.
# NOTE(review): df_sit and df_social can hold several rows per
# (customer, unique_day_id), so each assessment is repeated for every
# situation x social-contact pair - confirm downstream counts account for this.
df_piv_merged = (
    df_piv
    .merge(df_sit, how="right", on=["customer", "unique_day_id"])
    .merge(df_social, how="right", on=["customer", "unique_day_id"])
)

In [None]:
# German answer text of the situation item -> simplified English label.
# The labels match the keys of `situation_colors` (hyphens instead of commas).
situation_mapping = {
    'Etwas Anderes':
        'other',
    'Kümmern um Kinder / Angehörige':
        'care work',
    'Unterwegs (z.B. in der U-Bahn)':
        'travelling',
    'Hausarbeit oder Erledigungen':
        'chores',
    'Essen/ Trinken/ Körperpflege':
        'eating - drinking - selfcare',
    'Freizeitaktivität, eher aktiv (z.B. Sport, Unternehmungen)':
        'active leisure',
    'Smartphone/ Soziale Medien':
        'smartphone - social media',
    'Freizeitaktivität, eher passiv (z.B. Film schauen, Lesen)':
        'passive leisure',
    'Arbeit oder Studium':
        'work or study',
}

# Translate the answers; texts without an entry in the mapping become NaN.
simplified_labels = df_piv_merged['situation1'].map(situation_mapping)
df_piv_merged['situation1_simplified'] = simplified_labels

In [None]:
# Keep rows that (a) belong to the configured assessment phase and
# (b) come from participants with enough data in the first assessment phase,
# then remove fully identical rows.
in_selected_phase = df_piv_merged["assess"].isin(assessment_phase)
has_enough_data = df_piv_merged["customer"].isin(df_ema1_customers)
df_piv_merged = df_piv_merged.loc[in_selected_phase & has_enough_data].drop_duplicates()

### 3. Create binary negative affect score

In [None]:

# Step 1: per-participant mode of the two affect scores
def calculate_customer_modes(df):
    """Return the mode of ``mean_pa`` and ``mean_na`` for every customer.

    The result is indexed by ``customer`` and has the columns ``mean_pa`` and
    ``mean_na``. When several values are equally frequent the first one
    returned by ``Series.mode`` is used; a customer without any valid value
    gets ``None``.
    """
    def _first_mode(values):
        candidates = values.mode()
        if candidates.empty:
            return None
        return candidates[0]

    per_customer = df.groupby('customer')
    return per_customer.agg({'mean_pa': _first_mode, 'mean_na': _first_mode})

# Step 2: compute the per-customer modes and join them onto every row.
# Columns coming from `modes` receive the suffix '_mode'
# (mean_pa_mode, mean_na_mode); the original columns keep their names.
modes = calculate_customer_modes(df_piv_merged)
df_piv_merged = pd.merge(df_piv_merged, modes, on='customer', suffixes=('', '_mode'))

# Step 3: binary labels relative to each participant's mode
def label_rows(df):
    """Flag assessments whose affect is worse than the participant's mode.

    Expects the columns ``mean_pa``, ``mean_pa_mode``, ``mean_na`` and
    ``mean_na_mode``. Returns a new DataFrame with two additional integer
    columns (the input frame is left unmodified):

    - ``mean_pa_label``: 1 if ``mean_pa`` is strictly below ``mean_pa_mode``,
      otherwise 0.
    - ``mean_na_label``: 1 if ``mean_na`` is strictly above ``mean_na_mode``,
      otherwise 0.

    Comparisons that involve NaN evaluate to False and therefore yield 0.
    """
    return df.assign(
        mean_pa_label=(df['mean_pa'] < df['mean_pa_mode']).astype(int),
        mean_na_label=(df['mean_na'] > df['mean_na_mode']).astype(int),
    )

# Add the binary mean_pa_label / mean_na_label columns to the merged table.
df_piv_merged = label_rows(df_piv_merged)


In [None]:
# Number of rows (non-null `customer` entries) for each mean_na_label value.
df_piv_merged.groupby("mean_na_label")["customer"].count()

### 4. Analyse situation (evaluation) variables

#### 4.1 Blau Index

In [None]:
# Situation-related columns only.
situation_columns = ["customer", "situation1", 'situation1_simplified', "absolute_day_index"]
df_piv_merged_sit = df_piv_merged.loc[:, situation_columns]

In [None]:
# Blau index of the pooled sample: 1 - sum(p_i^2), where p_i is the share of
# situation i among all non-missing `situation1` entries of df_piv_merged.
pooled_counts = df_piv_merged['situation1'].value_counts()
pooled_shares = pooled_counts / pooled_counts.sum()
blau_index_across_individuals = 1 - np.sum(pooled_shares ** 2)

print(f"Blau Index Across Individuals: {blau_index_across_individuals:.4f}")


In [None]:
# Step 1: Blau index for a single group of rows
def calculate_blau_index(group, column='situation1'):
    """Return the Blau diversity index of one categorical column.

    The index is ``1 - sum(p_i ** 2)`` where ``p_i`` is the share of category
    ``i`` among the non-missing values of ``group[column]``. It is 0 when all
    values are identical and grows as values spread over more categories.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of one individual (or any other grouping).
    column : str, default 'situation1'
        Name of the categorical column to evaluate.

    Returns
    -------
    float
        The Blau index, or NaN when the column holds no non-missing value
        (previously such groups were reported as 1.0, i.e. maximum diversity).
    """
    counts = group[column].value_counts()
    n_observations = counts.sum()
    if n_observations == 0:
        # No data: diversity is undefined rather than maximal.
        return np.nan
    proportions = counts / n_observations
    return 1 - np.sum(proportions ** 2)

# Step 2: one Blau index per participant
blau_index_within_individuals = (
    df_piv_merged
    .groupby('customer')
    .apply(calculate_blau_index)
    .reset_index(name='blau_index')
)
# Step 3: attach the participant-level index to every row of that participant
df_piv_merged = df_piv_merged.merge(blau_index_within_individuals, how='left', on='customer')

## 5. Match with Redcap data

In [None]:
# Add the REDCap columns to every EMA row (all rows of df_piv_merged are kept).
df_redcap_full = df_redcap.merge(df_piv_merged, how="right", on="customer")

In [None]:
# Total over all customers of the number of distinct unique_day_id values.
df_piv_merged.groupby("customer")["unique_day_id"].nunique().sum()

In [None]:
# Mean number of distinct unique_day_id values per customer.
df_piv_merged.groupby("customer")["unique_day_id"].nunique().mean()

In [None]:
# Standard deviation of the number of distinct unique_day_id values per customer.
df_piv_merged.groupby("customer")["unique_day_id"].nunique().std()

#### 5.1 Situation distribution across categories

In [None]:
# Share of assessments, per diagnosis, in which each situation was reported.

# Columns that identify one reported situation within one assessment.
occurrence_keys = ['scid_cv_description', 'customer', 'unique_day_id', 'situation1_simplified']

# One row per (diagnosis, customer, assessment, situation).
# df_redcap_full repeats an assessment once per social-contact answer (and per
# matching REDCap row), so rows are de-duplicated before counting; otherwise a
# situation would be counted several times for the same assessment while the
# denominator below counts every assessment only once.
# Rows with a missing key are dropped, as a groupby on these keys would do.
situation_occurrences = (
    df_redcap_full
    .loc[df_redcap_full['situation1_simplified'].isin(situation_colors.keys()), occurrence_keys]
    .dropna(subset=occurrence_keys)
    .drop_duplicates()
)

# Number of assessments per diagnosis in which each situation occurred
situation_diagnosis_dist = (
    situation_occurrences
    .groupby(['scid_cv_description', 'situation1_simplified'])
    .size()
    .reset_index(name='count')
)

# Calculate the total number of unique assessments per customer, then sum per diagnosis
assessment_counts = df_redcap_full.groupby(['scid_cv_description', 'customer'])['unique_day_id'].nunique().reset_index(name='unique_days_per_customer')
assessment_counts = assessment_counts.groupby('scid_cv_description')['unique_days_per_customer'].sum().reset_index(name='total_assessments')

# Merge the counts with the assessment counts
situation_diagnosis_dist = pd.merge(situation_diagnosis_dist, assessment_counts, on='scid_cv_description')

# Percentage of the diagnosis group's assessments that include the situation
situation_diagnosis_dist['normalized_count'] = situation_diagnosis_dist['count'] / situation_diagnosis_dist['total_assessments'] * 100

# Define the three categories to be included in the plot
selected_situations = ["smartphone - social media", "travelling", "eating - drinking - selfcare"]

# Filter the situation_diagnosis_dist DataFrame to only include these three categories
filtered_situation_diagnosis_dist = situation_diagnosis_dist[situation_diagnosis_dist['situation1_simplified'].isin(selected_situations)]

# Create an empty figure
fig = go.Figure()

# One line+marker trace per selected situation, diagnoses on the x-axis
for situation in selected_situations:
    filtered_data = filtered_situation_diagnosis_dist[filtered_situation_diagnosis_dist['situation1_simplified'] == situation]
    trace_color = situation_colors.get(situation, '#d3d3d3')  # light grey if the situation has no colour
    fig.add_trace(
        go.Scatter(
            x=filtered_data['scid_cv_description'],
            y=filtered_data['normalized_count'],
            mode='lines+markers',
            name=situation,
            marker=dict(color=trace_color, size=8),
            line=dict(color=trace_color, width=2)
        )
    )

# Update layout to adjust figure size and titles
fig.update_layout(
    width=1200,
    height=600,
    font=dict(size=16),
    title_text="Normalized Distribution of Selected Situations per Diagnosis (by Unique Assessments)",
    xaxis_title="Diagnosis",
    yaxis_title="Normalized Count (%)",  # values are percentages of assessments
    legend_title_text="Situation Type",
)

# Show the plot
fig.show()


In [None]:


# Bar charts of the three most frequent situations for three diagnoses.
diagnoses_of_interest = ['Depressive Disorder', 'Social Anxiety Disorder', 'Obsessive-Compulsive Disorder']
is_of_interest = situation_diagnosis_dist['scid_cv_description'].isin(diagnoses_of_interest)
situation_diagnosis_dist = situation_diagnosis_dist[is_of_interest]

# Per diagnosis, keep the three rows with the largest normalized count.
top_situations = (
    situation_diagnosis_dist
    .groupby('scid_cv_description')
    .apply(lambda per_diagnosis: per_diagnosis.nlargest(3, 'normalized_count'))
    .reset_index(drop=True)
)

# One panel per diagnosis; the titles follow the order of diagnoses_of_interest.
fig = make_subplots(
    rows=1,
    cols=3,
    subplot_titles=("Depressive", "Social Anxiety", "OCD"),
    column_widths=[0.3, 0.3, 0.3],
)

for column, diagnosis in enumerate(diagnoses_of_interest, start=1):
    diagnosis_data = top_situations[top_situations['scid_cv_description'] == diagnosis]
    if diagnosis_data.empty:
        continue  # nothing to draw for this diagnosis
    bar_colors = [situation_colors[sit] for sit in diagnosis_data['situation1_simplified']]
    fig.add_trace(
        go.Bar(
            x=diagnosis_data['situation1_simplified'],
            y=diagnosis_data['normalized_count'],
            marker_color=bar_colors,
            textposition='none',  # no value labels on the bars
        ),
        row=1, col=column,
    )

fig.update_layout(
    height=400,
    width=1000,
    showlegend=False,
    font=dict(size=14),
    margin=dict(l=50, r=50, t=50, b=50),
)

# All panels share a 0-100 range; only the first one carries the axis title.
fig.update_yaxes(title_text='Normalized Count (%)', range=[0, 100], row=1, col=1)
for column in (2, 3):
    fig.update_yaxes(range=[0, 100], row=1, col=column)

fig.show()


In [None]:
# One row per (customer, assessment, situation).
# df_redcap_full repeats an assessment once per social-contact answer, which
# would inflate the counts below and repeat labels inside a combination.
# Rows without a simplified label are dropped so that sorting the labels of a
# combination cannot fail on NaN.
situation_rows = (
    df_redcap_full
    .dropna(subset=['situation1_simplified'])
    .drop_duplicates(subset=['customer', 'unique_day_id', 'situation1_simplified'])
)

# Assessments in which exactly one situation was selected
single_situations = situation_rows[situation_rows['situation_count'] == 1]

# Three most frequent single situations
top_single_situations = single_situations['situation1_simplified'].value_counts().nlargest(3).reset_index()
top_single_situations.columns = ['situation1_simplified', 'count']

# Situations that were selected together with others (situation_count > 1)
frequent_with_others = situation_rows[situation_rows['situation_count'] > 1]
top_frequent_with_others = frequent_with_others['situation1_simplified'].value_counts().nlargest(3).reset_index()
top_frequent_with_others.columns = ['situation1_simplified', 'count']

# Per assessment, the alphabetically sorted, comma-separated set of situations
combinations = (
    frequent_with_others
    .groupby(['customer', 'unique_day_id'])['situation1_simplified']
    .apply(lambda x: ', '.join(sorted(x)))
    .reset_index()
)

# Get the top 3 combinations
combinations_count = combinations['situation1_simplified'].value_counts().nlargest(3).reset_index()
combinations_count.columns = ['combination', 'count']

# Prepare data for the stacked bar plot
situation_combinations = combinations_count['combination'].apply(lambda x: x.split(', '))
top_combination_data = pd.DataFrame({
    'Combination': [f'Combination {i+1}' for i in range(len(combinations_count))],
    'Situations': situation_combinations,
    'Count': combinations_count['count']
})

In [None]:

from sklearn.linear_model import LinearRegression

# One row per distinct (customer, blau_index, bsi_gsi_base, age) combination,
# with infinite values treated as missing and incomplete rows removed.
df_clean = (
    df_redcap_full[['customer', 'blau_index', 'bsi_gsi_base', 'age']]
    .replace([np.inf, -np.inf], np.nan)
    .dropna()
    .drop_duplicates()
)

# Two panels side by side that share the y-axis (Blau index).
fig = make_subplots(
    rows=1,
    cols=2,
    subplot_titles=("BLAU Index vs Symptom Severity", "BLAU Index vs Age"),
    shared_yaxes=True,
    column_widths=[0.45, 0.45],
    horizontal_spacing=0.05,
)


def _add_scatter_with_trend(figure, data, x_column, panel, marker_color, trace_name):
    """Draw Blau index against `x_column` in `panel` plus a fitted straight line.

    Returns the fitted LinearRegression model and its predictions for `data`.
    """
    figure.add_trace(
        go.Scatter(
            x=data[x_column],
            y=data['blau_index'],
            mode='markers',
            marker=dict(color=marker_color),
            name=trace_name,
        ),
        row=1, col=panel,
    )
    model = LinearRegression().fit(data[[x_column]], data['blau_index'])
    trendline = model.predict(data[[x_column]])
    figure.add_trace(
        go.Scatter(
            x=data[x_column],
            y=trendline,
            mode='lines',
            line=dict(color='red'),
            showlegend=False,
        ),
        row=1, col=panel,
    )
    return model, trendline


# Left panel: symptom severity; right panel: age.
model_bsi, trendline_bsi = _add_scatter_with_trend(fig, df_clean, 'bsi_gsi_base', 1, 'blue', 'Symptom Severity')
model_age, trendline_age = _add_scatter_with_trend(fig, df_clean, 'age', 2, 'green', 'Age')

# Figure size and title
fig.update_layout(
    width=1200,
    height=400,
    title_text="Scatterplots of BLAU Index vs Symptom Severity and Age",
    title_font=dict(size=20),
)

# Axis labels
fig.update_xaxes(title_text="Symptom Severity (BSI GSI)", row=1, col=1)
fig.update_xaxes(title_text="Age", row=1, col=2)
fig.update_yaxes(title_text="BLAU Index", row=1, col=1)

fig.show()


#### 5.3 Situation complexity/ clustering

In [None]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# One row per assessment, i.e. per (customer, unique_day_id)
df_unique_per_assessment = df_piv_merged.drop_duplicates(subset=['customer', 'unique_day_id'])

# Number of assessments; denominator of both percentages below
total_unique_customer_day_id_combinations = df_unique_per_assessment.shape[0]

# One row per (assessment, situation). df_piv_merged repeats an assessment once
# per social-contact answer, so counting its raw rows would count a situation
# several times per assessment while the denominator counts it once.
df_unique_situations = df_piv_merged.drop_duplicates(
    subset=['customer', 'unique_day_id', 'situation1_simplified']
)

# Left plot: number of assessments in which each situation category was selected
situation_counts = df_unique_situations.groupby("situation1_simplified")["customer"].count().reset_index()
situation_counts = situation_counts.sort_values(by="customer", ascending=False)

# Percentage of assessments that include the category (several categories can
# be selected per assessment, so the percentages may add up to more than 100)
situation_counts['percentage'] = (situation_counts['customer'] / total_unique_customer_day_id_combinations) * 100

# Right plot: number of assessments per number of selected situations
situation_count_counts = df_unique_per_assessment.groupby("situation_count")["customer"].count().reset_index()

# Percentage of assessments with that number of selected situations
situation_count_counts['percentage'] = (situation_count_counts['customer'] / total_unique_customer_day_id_combinations) * 100

# Make sure every value between the smallest and largest situation_count is represented
all_situation_counts = list(range(int(df_unique_per_assessment["situation_count"].min()), int(df_unique_per_assessment["situation_count"].max()) + 1))
situation_count_counts = situation_count_counts.set_index("situation_count").reindex(all_situation_counts, fill_value=0).reset_index()

# Create subplots
fig = make_subplots(
    rows=1, cols=2,  # 1 row, 2 columns
    subplot_titles=("", "")
)

# Bar plot per situation category, coloured by category
fig.add_trace(
    go.Bar(
        x=situation_counts["situation1_simplified"],
        y=situation_counts["percentage"],
        marker=dict(color=[situation_colors[sit] for sit in situation_counts["situation1_simplified"]])
    ),
    row=1, col=1
)

# Bar plot per number of selected situations
fig.add_trace(
    go.Bar(
        x=situation_count_counts["situation_count"],
        y=situation_count_counts["percentage"],
        marker=dict(color=situation_colors['other']),  # Using the color for 'other' as an example
        width=0.7
    ),
    row=1, col=2
)

# Update layout
fig.update_layout(
    height=600,
    width=1200,
    showlegend=False,
    title_text="",
    margin=dict(l=100, r=100, t=100, b=150),
    plot_bgcolor='rgba(0,0,0,0)',  # Transparent background
    barmode='group'
)

# Use the same 0-70 % range on both y-axes so the panels are comparable
fig.update_yaxes(range=[0, 70], title_text="Percentage (%)", row=1, col=1)
fig.update_yaxes(range=[0, 70], title_text="Percentage (%)", row=1, col=2)

# Customize the x-axis labels
fig.update_xaxes(title_text="", row=1, col=1)
fig.update_xaxes(title_text="", row=1, col=2)

# Ensure all x-axis ticks are shown for situation_count
fig.update_xaxes(tickmode='linear', dtick=1, row=1, col=2)

# Show the plot
fig.show()



In [None]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px

# df_piv_merged repeats an assessment once per social-contact answer (and once
# per selected situation), so both panels count de-duplicated rows.

# Left panel: one row per (assessment, situation)
situation_counts = df_piv_merged[["customer", "unique_day_id","situation1_simplified", "situation_count"]]
situation_counts = situation_counts.drop_duplicates(subset=["customer", "unique_day_id", "situation1_simplified"])
situation_counts = situation_counts.groupby("situation1_simplified")["customer"].count().reset_index()
situation_counts = situation_counts.sort_values(by="customer", ascending=False)

# Right panel: one row per assessment, so each assessment is counted once
# regardless of how many situations it contains
assessment_rows = df_piv_merged.drop_duplicates(subset=["customer", "unique_day_id"])
situation_count_counts = assessment_rows.groupby("situation_count")["customer"].count().reset_index()

# Make sure every value between the smallest and largest situation_count is represented
all_situation_counts = list(range(int(assessment_rows["situation_count"].min()), int(assessment_rows["situation_count"].max()) + 1))
situation_count_counts = situation_count_counts.set_index("situation_count").reindex(all_situation_counts, fill_value=0).reset_index()

# Create subplots
fig = make_subplots(
    rows=1, cols=2,  # 1 row, 2 columns
    subplot_titles=("Situation category count", "Situation count per assessment")
)

# Bar plot per situation category, coloured by category
fig.add_trace(
    go.Bar(
        x=situation_counts["situation1_simplified"],
        y=situation_counts["customer"],
        marker=dict(color=[situation_colors[sit] for sit in situation_counts["situation1_simplified"]])
    ),
    row=1, col=1
)

# Bar plot per number of selected situations
fig.add_trace(
    go.Bar(
        x=situation_count_counts["situation_count"],
        y=situation_count_counts["customer"],
        marker=dict(color=situation_colors['other']),  # Using the color for 'other' as an example
        width=0.7
    ),
    row=1, col=2
)

# Update layout
fig.update_layout(
    height=600,
    width=1200,
    showlegend=False,
    title_text="",
    margin=dict(l=100, r=100, t=100, b=150),
    plot_bgcolor='rgba(0,0,0,0)',  # Transparent background
    barmode='group'
)

# Customize the axes labels
fig.update_xaxes(title_text="", row=1, col=1)
fig.update_yaxes(title_text="Count", row=1, col=1)
fig.update_xaxes(title_text="", row=1, col=2)
fig.update_yaxes(title_text="Count", row=1, col=2)

# Ensure all x-axis ticks are shown for situation_count
fig.update_xaxes(tickmode='linear', dtick=1, row=1, col=2)

# Show the plot
fig.show()

In [None]:
# One (customer, age) row per participant older than 18.
# NOTE(review): the strict '>' excludes participants who are exactly 18 -
# confirm that this is intended.
is_older_than_18 = df_redcap_full["age"] > 18
df_redcap_age = df_redcap_full.loc[is_older_than_18, ["customer", "age"]].drop_duplicates()

In [None]:
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Number of distinct participants per diagnosis
grouped_data = df_redcap_full.groupby("scid_cv_description")["customer"].nunique().reset_index()

# Rename the columns for clarity
grouped_data.columns = ['scid_cv_description', 'unique_customers']

# Colour per diagnosis. get_diagnosis_color falls back to black for a
# diagnosis that is missing from diagnosis_colors, whereas indexing the dict
# directly raised KeyError for any unmapped diagnosis.
colors = [get_diagnosis_color(desc) for desc in grouped_data['scid_cv_description']]

# Create a subplot with 1 row and 1 column
fig = make_subplots(rows=1, cols=1, specs=[[{'type':'domain'}]])

# Add the pie chart to the subplot
fig.add_trace(go.Pie(labels=grouped_data['scid_cv_description'], 
                     values=grouped_data['unique_customers'], 
                     textinfo='label',  # Show only labels (no percentages)
                     textposition='outside',   # Place labels outside the pie chart
                     showlegend=True,
                     marker=dict(colors=colors)),  # Apply the color mapping here
              1, 1)

# Set the figure title; the single annotation is an empty placeholder at the
# centre of the figure (the legend position is left at its default)
fig.update_layout(
    title_text='Unique Customers per SCID CV Description',
    annotations=[dict(text='', x=0.5, y=0.5, font_size=20, showarrow=False)],
)

# Show the plot
fig.show()


In [None]:
import plotly.express as px
import plotly.graph_objects as go

# Violin plot (with embedded box plot) of participant age
fig = px.violin(
    df_redcap_age,
    y='age',
    box=True,
    title="Age Distribution",
    labels={'age': 'Age'},
    width=400,
    height=400,
)

# Report the mean age as text below the plot area
mean_age = df_redcap_age['age'].mean()
fig.add_annotation(
    text=f"Mean Age: {mean_age:.2f}",
    showarrow=False,
    xref="paper",
    yref="paper",
    x=0.5,
    y=-0.15,
    font={'size': 12},
)

# Margins and transparent plot background
fig.update_layout(
    plot_bgcolor='rgba(0,0,0,0)',
    margin={'l': 50, 'r': 50, 't': 50, 'b': 50},
)

fig.show()


## 6. Match with Passive data 

In [None]:
# Add the passive-sensing columns to every EMA/REDCap row
# (right join: all rows of df_redcap_full are kept).
df_final = pd.merge(
    df_ema_passive,
    df_redcap_full,
    how="right",
    on=["customer", "unique_day_id", "assess"],
)

In [None]:
# List all column names of the merged table.
df_final.columns.tolist()

In [None]:
# Hour component of the timestamp in `sensor_block_end`.
df_final['quest_hour'] = df_final['sensor_block_end'].dt.hour
def categorize_time_of_day(hour):
    """Map an hour of the day to a coarse time-of-day label.

    Parameters
    ----------
    hour : number or missing value
        Hour of the day (0-23). NaN/None denotes a missing timestamp.

    Returns
    -------
    str or None
        'morning' for 5 <= hour < 12, 'noon' for 12 <= hour < 14,
        'afternoon' for 14 <= hour < 18 and 'evening' for every other hour
        (18-23 and 0-4). Missing hours return None instead of being
        labelled 'evening', consistent with categorize_season.
    """
    if pd.isna(hour):
        return None
    if 5 <= hour < 12:
        return 'morning'
    elif 12 <= hour < 14:
        return 'noon'
    elif 14 <= hour < 18:
        return 'afternoon'
    else:
        return 'evening'

# Coarse time-of-day label for every row, derived from quest_hour.
df_final['time_of_day'] = df_final['quest_hour'].apply(categorize_time_of_day)


In [None]:

def categorize_season(date):
    """Return the season label for a date, based on its day of the year.

    Day-of-year ranges (start inclusive, end exclusive):
      * 80-172  -> 'spring'
      * 172-264 -> 'summer'
      * 264-355 -> 'autumn'
      * everything else -> 'winter'

    Parameters
    ----------
    date : object exposing ``timetuple()`` (e.g. datetime / pandas Timestamp), or a missing value

    Returns
    -------
    str | None
        The season label, or None when `date` is missing (NaT/None).
    """
    # Missing timestamps carry no season information.
    if pd.isna(date):
        return None

    doy = date.timetuple().tm_yday

    season_ranges = (
        (80, 172, 'spring'),
        (172, 264, 'summer'),
        (264, 355, 'autumn'),
    )
    for first_day, end_day, label in season_ranges:
        if first_day <= doy < end_day:
            return label

    # Days before 80 or from 355 onwards.
    return 'winter'

# Season of each row, derived from the timestamp in `sensor_block_end`.
df_final['season'] = df_final.sensor_block_end.apply(categorize_season)


In [None]:
# Keep only the rows whose situation_count equals 1.
df_final_red = df_final.loc[df_final['situation_count'].eq(1)]

In [None]:
# Mean number of GPS points and steps per simplified situation, shown as two
# side-by-side bar charts. Each panel is ordered from highest to lowest mean.
averages = (
    df_final_red
    .groupby('situation1_simplified')
    .agg(dict.fromkeys(['n_GPS', 'n_steps'], 'mean'))
    .reset_index()
)

# One descending ordering per panel.
averages_sorted_gps = averages.sort_values('n_GPS', ascending=False)
averages_sorted_steps = averages.sort_values('n_steps', ascending=False)

fig = make_subplots(
    rows=1,
    cols=2,
    subplot_titles=("Average Number of GPS Points per Situation", "Average Number of Steps per Situation"),
    horizontal_spacing=0.15,
)

# (sorted frame, value column, trace name, y-axis title) for each panel, left to right.
panels = [
    (averages_sorted_gps, 'n_GPS', 'Number of GPS Points', "Average Number of GPS Points"),
    (averages_sorted_steps, 'n_steps', 'Number of Steps', "Average Number of Steps"),
]

for panel_col, (panel_df, value_col, trace_name, y_title) in enumerate(panels, start=1):
    situations = panel_df['situation1_simplified']
    # Situations without an entry in situation_colors fall back to light grey.
    bar_colors = [situation_colors.get(sit, '#d3d3d3') for sit in situations]
    fig.add_trace(
        go.Bar(
            x=situations,
            y=panel_df[value_col],
            marker=dict(color=bar_colors),
            name=trace_name,
        ),
        row=1, col=panel_col,
    )
    fig.update_xaxes(title_text="Situation", row=1, col=panel_col)
    fig.update_yaxes(title_text=y_title, row=1, col=panel_col)

# Overall size and title; the legend is hidden because the x-axis names the bars.
fig.update_layout(
    height=500,
    width=1000,
    title_text="Average GPS Points and Steps per Situation",
    showlegend=False,
)

fig.show()

In [None]:
 df_final_red_red = df_final.loc[df_final.at_home_binary !=-1]
# Calculate the average of total_distance_km and transition_minutes per situation
averages = df_final_red.groupby('situation1_simplified').agg({
    'total_distance_km': 'mean',
    'transition_minutes': 'mean'
}).reset_index()

# Sort the DataFrame for total_distance_km and transition_minutes
averages_sorted_distance = averages.sort_values(by='total_distance_km', ascending=False)
averages_sorted_transition = averages.sort_values(by='transition_minutes', ascending=False)

# Create a subplot figure with 1 row and 2 columns
fig = make_subplots(
    rows=1, cols=2, 
    specs=[[{"secondary_y": False}, {"secondary_y": False}]],  # Separate y-axes for both subplots
    horizontal_spacing=0.1  # Adjust spacing between subplots
)

# First subplot for total_distance_km
fig.add_trace(
    go.Bar(
        x=averages_sorted_distance['situation1_simplified'], 
        y=averages_sorted_distance['total_distance_km'],
        marker=dict(color=[situation_colors.get(sit, '#d3d3d3') for sit in averages_sorted_distance['situation1_simplified']]),
        name='Total Distance (km)'
    ),
    row=1, col=1
)

# Second subplot for transition_minutes
fig.add_trace(
    go.Bar(
        x=averages_sorted_transition['situation1_simplified'], 
        y=averages_sorted_transition['transition_minutes'],
        marker=dict(color=[situation_colors.get(sit, '#d3d3d3') for sit in averages_sorted_transition['situation1_simplified']]),
        name='Transition Minutes'
    ),
    row=1, col=2
)

# Update layout to set titles and adjust size
fig.update_layout(
    height=500,  # Adjust height to fit both subplots
    width=1000,  # Adjust width
    title_text="",
    showlegend=False  # Hide legend (since color is self-explanatory)
)

# Update axes for both subplots
fig.update_xaxes(title_text="", row=1, col=1)
fig.update_yaxes(title_text="Average Distance Travelled (km)", row=1, col=1)

fig.update_xaxes(title_text="", row=1, col=2)
fig.update_yaxes(title_text="Average Transition Minutes", row=1, col=2)

# Show the plot
fig.show()

In [None]:
 df_final_red_red = df_final.loc[df_final.at_home_binary !=-1]

In [None]:

# Mean at-home minutes and transition minutes per simplified situation, drawn
# as two side-by-side bar charts, each ordered from highest to lowest mean.
averages = (
    df_final_red_red
    .groupby('situation1_simplified')
    .agg(dict.fromkeys(['at_home_minute', 'transition_minutes'], 'mean'))
    .reset_index()
)

# One descending ordering per panel.
averages_sorted_home = averages.sort_values('at_home_minute', ascending=False)
averages_sorted_transition = averages.sort_values('transition_minutes', ascending=False)

# Two panels in one row, neither with a secondary y-axis.
fig = make_subplots(
    rows=1,
    cols=2,
    specs=[[{"secondary_y": False}, {"secondary_y": False}]],
    horizontal_spacing=0.1,
)

# (sorted frame, value column, trace name, y-axis title) for each panel, left to right.
panels = [
    (averages_sorted_home, 'at_home_minute', 'At Home Minutes', "Average At Home Minutes"),
    (averages_sorted_transition, 'transition_minutes', 'Transition Minutes', "Average Transition Minutes"),
]

for panel_col, (panel_df, value_col, trace_name, y_title) in enumerate(panels, start=1):
    situations = panel_df['situation1_simplified']
    # Situations without an entry in situation_colors fall back to light grey.
    bar_colors = [situation_colors.get(sit, '#d3d3d3') for sit in situations]
    fig.add_trace(
        go.Bar(
            x=situations,
            y=panel_df[value_col],
            marker=dict(color=bar_colors),
            name=trace_name,
        ),
        row=1, col=panel_col,
    )
    # x-axis titles are left empty; the tick labels already name the situations.
    fig.update_xaxes(title_text="", row=1, col=panel_col)
    fig.update_yaxes(title_text=y_title, row=1, col=panel_col)

# Overall size and title; the legend is hidden.
fig.update_layout(
    height=500,
    width=1000,
    title_text="Average At Home and Transition Minutes per Situation",
    showlegend=False,
)

fig.show()


## 7. Multi-label prediction of situations

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MultiLabelBinarizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import ClassifierChain, MultiOutputClassifier
from sklearn.metrics import accuracy_score, hamming_loss, f1_score
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline



In [None]:
# Columns carried into the situation-prediction models: passive-sensing
# features, the reported situation, grouping keys and calendar/time features.
model_columns = [
    'n_steps', 'n_GPS', 'total_distance_km', 'transition_minutes', 'transition', 'situation1',
    'customer', 'unique_day_id', 'at_home_binary', 'at_home_minute',
    "weekend", "weekday", "season", "quest_hour", "time_of_day",
]
data = df_final[model_columns]

# Features treated as continuous by the preprocessing pipeline.
numerical_features = [
    "n_steps",
    "n_GPS",
    "total_distance_km",
    "transition_minutes",
    "at_home_minute",
]

# Features that are one-hot encoded by the preprocessing pipeline.
categorical_features = [
    "transition",
    "at_home_binary",
    "weekend",
    "weekday",
    "season",
    "quest_hour",
    "time_of_day",
]

# Same categorical set without the at-home indicator.
categorical_features_add = [name for name in categorical_features if name != "at_home_binary"]


In [None]:
# Step 1: Identify the most frequent categories in 'situation1'.
# NOTE: despite its name, `top_6_situations` holds the N_TOP_SITUATIONS (= 9)
# most frequent categories; the name is kept because the filtering cell below
# refers to it.
N_TOP_SITUATIONS = 9
top_6_situations = data['situation1'].value_counts().nlargest(N_TOP_SITUATIONS).index.tolist()

In [None]:

# Restrict the data to the situations selected in `top_6_situations`.
filtered_df = data.loc[data['situation1'].isin(top_6_situations)]

# One indicator column per remaining situation ('situation_<name>'), appended
# next to the original columns.
situation_dummies = pd.get_dummies(filtered_df['situation1'], prefix='situation')
df_with_dummies = pd.concat([filtered_df, situation_dummies], axis=1)

# Collapse to one row per (customer, unique_day_id):
#   * feature columns take the group's first value ('first'),
#   * situation indicators take 'max', so a situation reported in any row of
#     the group is flagged for that group.
first_value_columns = [
    'n_steps', 'n_GPS', 'total_distance_km', 'transition_minutes',
    'transition', 'at_home_binary', 'at_home_minute',
    'weekend', 'weekday', 'season', 'quest_hour', 'time_of_day',
]
aggregation_spec = dict.fromkeys(first_value_columns, 'first')
aggregation_spec.update(dict.fromkeys(situation_dummies.columns, 'max'))

aggregated_df = (
    df_with_dummies
    .groupby(['customer', 'unique_day_id'])
    .agg(aggregation_spec)
    .reset_index()
)

# Rows x columns of the reshaped frame.
aggregated_df.shape


In [None]:
# Drop the groups whose at_home_binary equals -1.
aggregated_df = aggregated_df.loc[aggregated_df['at_home_binary'].ne(-1)]

In [None]:
import numpy as np
from sklearn.preprocessing import FunctionTransformer, StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import ClassifierChain
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, hamming_loss, f1_score, classification_report

# ---------------------------------------------------------------------------
# Multi-label situation prediction with an ensemble of classifier chains.
# ---------------------------------------------------------------------------

# Numerical features: log(1 + x) followed by standardisation.
log_transformer = FunctionTransformer(np.log1p, validate=True)
numeric_pipeline = Pipeline(steps=[
    ('log', log_transformer),
    ('scale', StandardScaler()),
])

# Categorical features: dense one-hot encoding; categories unseen during
# fitting are ignored at transform time.
categorical_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_pipeline, numerical_features),
        ('cat', categorical_encoder, categorical_features),
    ])

# Base estimator that each classifier chain uses for its links.
base_model = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42)

# Targets are the situation indicator columns; features are everything else
# except the grouping keys.
label_columns = situation_dummies.columns.tolist()
X = aggregated_df.drop(columns=['customer', 'unique_day_id'] + label_columns)
y_filtered = aggregated_df[label_columns]

# NOTE(review): the split is row-wise, so rows of the same customer can land
# in both train and test - confirm this is acceptable for the evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y_filtered, test_size=0.2, random_state=42)

# Number of chains in the ensemble and the per-label vote counter.
n_chains = 10
y_pred_ensemble = np.zeros((X_test.shape[0], y_test.shape[1]))

# Each chain uses a different random label order (seeded by the loop index).
for i in range(n_chains):
    print(f"Training classifier chain {i+1}/{n_chains}...")
    chain = ClassifierChain(base_model, order='random', random_state=i)

    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', chain),
    ])
    pipeline.fit(X_train, y_train)

    # Add this chain's predictions to the running vote count.
    y_pred = pipeline.predict(X_test)
    y_pred_ensemble += y_pred

# A label is predicted when at least half of the chains voted for it.
vote_threshold = n_chains / 2
y_pred_final = (y_pred_ensemble >= vote_threshold).astype(int)

# Evaluation metrics.
accuracy = accuracy_score(y_test, y_pred_final)
hamming = hamming_loss(y_test, y_pred_final)
f1_micro = f1_score(y_test, y_pred_final, average='micro')

print(f'Accuracy: {accuracy}')
print(f'Hamming Loss: {hamming}')
print(f'Micro-averaged F1-score: {f1_micro}')

print("Classification Report:")
print(classification_report(y_test, y_pred_final, target_names=y_filtered.columns, zero_division=0))


def subset_accuracy_score(y_true, y_pred):
    """Return the share of rows whose complete label vector is predicted exactly."""
    exact_row_match = np.all(y_true == y_pred, axis=1)
    return np.mean(exact_row_match)


subset_accuracy = subset_accuracy_score(y_test, y_pred_final)
print(f'Subset Accuracy (Exact Match Ratio): {subset_accuracy}')


In [None]:
import numpy as np
from sklearn.preprocessing import FunctionTransformer, StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, f1_score, roc_auc_score

# Columns for the `mean_na_label` model: passive-sensing features, the target,
# the reported situation, grouping keys and calendar/time features.
model_columns = [
    'n_steps', 'n_GPS', 'total_distance_km', 'transition_minutes', 'transition', 'mean_na_label', 'situation1',
    'customer', 'unique_day_id', 'at_home_binary', 'at_home_minute',
    "weekend", "weekday", "season", "quest_hour", "time_of_day",
]
data = df_final[model_columns]

# Features treated as continuous by the preprocessing pipeline.
numerical_features = [
    "n_steps",
    "n_GPS",
    "total_distance_km",
    "transition_minutes",
    "at_home_minute",
]

# Features that are one-hot encoded by the preprocessing pipeline.
categorical_features = [
    "transition",
    "at_home_binary",
    "weekend",
    "weekday",
    "season",
    "quest_hour",
    "time_of_day",
]


In [None]:
# One indicator column per situation ('situation_<name>'), appended next to
# the original columns.
situation_dummies = pd.get_dummies(data['situation1'], prefix='situation')
df_with_dummies = pd.concat([data, situation_dummies], axis=1)

# Collapse to one row per (customer, unique_day_id):
#   * feature columns and the target `mean_na_label` take the group's first
#     value ('first'),
#   * situation indicators take 'max', so a situation reported in any row of
#     the group is flagged for that group.
first_value_columns = [
    'n_steps', 'n_GPS', 'total_distance_km', 'transition_minutes',
    'transition', 'at_home_binary', 'at_home_minute',
    'weekend', 'weekday', 'season', 'quest_hour', 'time_of_day',
    'mean_na_label',
]
aggregation_spec = dict.fromkeys(first_value_columns, 'first')
aggregation_spec.update(dict.fromkeys(situation_dummies.columns, 'max'))

aggregated_df = (
    df_with_dummies
    .groupby(['customer', 'unique_day_id'])
    .agg(aggregation_spec)
    .reset_index()
)

# Rows x columns of the reshaped frame.
aggregated_df.shape


In [None]:
# Drop the groups whose at_home_binary equals -1.
aggregated_df = aggregated_df.loc[aggregated_df['at_home_binary'].ne(-1)]

In [None]:
# ---------------------------------------------------------------------------
# Binary prediction of `mean_na_label` from passive-sensing features,
# calendar/time features and the reported situations.
# ---------------------------------------------------------------------------

# Numerical features: log(1 + x) followed by standardisation.
log_transformer = FunctionTransformer(np.log1p, validate=True)

# Situation indicator columns produced by pd.get_dummies in the previous cell.
situation_features = situation_dummies.columns.tolist()

# ColumnTransformer drops every column that is not listed in a transformer
# (remainder='drop' is the default). The situation indicators are therefore
# passed through explicitly, otherwise they would never reach the model.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline(steps=[
            ('log', log_transformer),
            ('scale', StandardScaler())
        ]), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features),
        ('situation', 'passthrough', situation_features),
    ])

# Random forest with balanced class weights as the binary classifier.
classifier = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42)

pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', classifier)])

# X: all columns of aggregated_df except the grouping keys and the target.
X = aggregated_df.drop(columns=['customer', 'unique_day_id', 'mean_na_label'])

# y: the target label (assumed binary - f1_score below uses its default
# binary averaging).
y = aggregated_df['mean_na_label']

# NOTE(review): the split is row-wise, so rows of the same customer can land
# in both train and test - confirm this is acceptable for the evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

pipeline.fit(X_train, y_train)

# Hard class predictions for accuracy / F1 / the classification report.
y_pred = pipeline.predict(X_test)

# ROC-AUC needs a continuous score: use the predicted probability of the
# greater class label (second column of predict_proba, as classes_ is sorted),
# not the thresholded 0/1 predictions.
y_score = pipeline.predict_proba(X_test)[:, 1]

accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_score)

print(f'Accuracy: {accuracy}')
print(f'F1-score: {f1}')
print(f'ROC-AUC score: {roc_auc}')

print("Classification Report:")
print(classification_report(y_test, y_pred, zero_division=0))

In [None]:
# NOTE(review): bare name with no definition visible in this part of the
# notebook. Unless `b` is defined in an earlier cell this raises NameError and
# stops "Run All" here - looks like a leftover; confirm and delete if so.
b

In [None]:
import shap

# Explain the random forest that was fitted inside `pipeline`.
# NOTE(review): assumes `pipeline` is the fitted preprocessing + random-forest
# pipeline from the mean_na_label model cell - confirm the cell order.
fitted_preprocessor = pipeline.named_steps['preprocessor']
fitted_forest = pipeline.named_steps['classifier']

# The forest was trained on the preprocessed matrix, so SHAP must be given the
# same representation (scaled numerical + one-hot columns), not the raw X_test.
X_test_transformed = pd.DataFrame(
    fitted_preprocessor.transform(X_test),
    columns=fitted_preprocessor.get_feature_names_out(),
    index=X_test.index,
)

explainer = shap.TreeExplainer(fitted_forest)
shap_values = explainer.shap_values(X_test_transformed)

# Class whose SHAP values are shown (index into the classifier's classes_).
class_idx = 0

# Depending on the shap version, classifiers return either a list with one
# (n_samples, n_features) array per class or a single array with a trailing
# class axis. Normalise both to one 2-D array for the chosen class.
if isinstance(shap_values, list):
    class_shap_values = shap_values[class_idx]
else:
    shap_array = np.asarray(shap_values)
    class_shap_values = shap_array[..., class_idx] if shap_array.ndim == 3 else shap_array

expected_values = np.atleast_1d(explainer.expected_value)
base_value = expected_values[class_idx] if expected_values.size > 1 else expected_values[0]

# Force plot for the first test sample. It has to be passed to display()
# because it is not the last expression of the cell.
shap.initjs()
display(shap.force_plot(base_value, class_shap_values[0], X_test_transformed.iloc[0, :]))

# Summary plot of SHAP values across all test samples.
shap.summary_plot(class_shap_values, X_test_transformed)
