In [1]:
import pandas as pd
import ast
import numpy as np

In [2]:
from server_config import DATA_PATH

### 1. Therapist data

In [3]:
# Load the therapist table (semicolon-separated CSV).
# low_memory must be the boolean False: the string "False" is truthy, so it
# would leave pandas' chunked (low-memory) type inference switched on.
df_th = pd.read_csv(
    DATA_PATH + "/df_th_nopr_experience_corrected_200624_LB.csv",
    sep=";",
    low_memory=False,
)

In [4]:
# Therapist columns kept for the analysis, in output order: the ID, the
# method / applied-method indicators, demographics, and the ten BFI items.
keep_cols_th = [
    "therapist_id",
    *[f"t0_method_t_{method}" for method in ("CBT", "PDT", "ST", "AP")],
    *[f"t0_appliedmethod_t_{method}" for method in ("VT", "TP", "AP", "ST", "Other")],
    "t0_location_t",
    "t0_gender_t",
    "t0_relationship_t",
    "t0_sexualorientation_t",
    "t0_children_t",
    "t0_age_t",
    "t0_experience_t",
    "t0_license_t",
    *[f"t0_bfi_t_{item}" for item in range(1, 10 + 1)],
]

In [5]:
# Suffixes of the five indicator columns built from the applied-method column.
labels = ["VT", "TP", "AP", "ST", "Other"]

col = (
    df_th["t0_appliedmethod_t_A"]
    # 1) Strings such as "[True, False, ...]" are parsed into Python objects;
    #    non-string values pass through unchanged.
    .map(lambda raw: ast.literal_eval(raw) if isinstance(raw, str) else raw)
    # 2) A one-element list/tuple that wraps another list/tuple is unwrapped:
    #    [[True, False, ...]] -> [True, False, ...]
    .map(
        lambda seq: seq[0]
        if isinstance(seq, (list, tuple))
        and len(seq) == 1
        and isinstance(seq[0], (list, tuple))
        else seq
    )
)

# 3) Safety: replace missing/invalid with all-False
def ensure_five(x):
    """Return ``x`` as a list of five flags, or five ``False`` values.

    Parameters
    ----------
    x : object
        One cell of the applied-method column after parsing.

    Returns
    -------
    list
        ``list(x)`` when ``x`` is a list or tuple of length 5; otherwise
        ``[False] * 5``.  The fallback covers missing values (NaN/None) as
        well as malformed entries such as sequences of the wrong length, so
        every row yields exactly five values for the indicator DataFrame.
    """
    if isinstance(x, (list, tuple)) and len(x) == 5:
        return list(x)
    # Anything else is treated as "no method recorded".  Calling pd.isna on a
    # sequence returns an array whose truth value is ambiguous, so it is not
    # used as the test here.
    return [False] * 5

# Normalise every entry to a five-element list.
col = col.map(ensure_five)

# 4) One integer indicator column per label, aligned to df_th's index and
#    named with the "t0_appliedmethod_t_" prefix.
method_df = pd.DataFrame(col.tolist(), index=df_th.index, columns=labels)
method_df = method_df.astype(int).add_prefix("t0_appliedmethod_t_")

In [6]:
# Attach the indicator columns to the therapist frame (index-aligned left join).
df_th = df_th.join(method_df, how="left")


In [7]:
# Restrict the therapist frame to the analysis columns.  An explicit copy is
# taken because later cells assign new columns to df_th and drop columns in
# place; working on an independent frame avoids SettingWithCopyWarning and
# matches how df_pat_clean is created with .copy().
df_th = df_th[keep_cols_th].copy()

In [None]:
# BFI-10 therapist items (noted as 1–5 in the original): coerce to numeric,
# reverse-code five of them, then add one two-item SUM per trait to df_th.
bfi_items = [f"t0_bfi_t_{i}" for i in range(1, 10 + 1)]
df_th[bfi_items] = df_th[bfi_items].apply(lambda s: pd.to_numeric(s, errors="coerce"))

# Reverse-coded items: 1, 3, 4, 5, 7 (reversed value = 6 - value)
rev_items = [f"t0_bfi_t_{i}" for i in (1, 3, 4, 5, 7)]

# Scratch frame holding the items plus their reversed "_r" versions.  The
# helper columns never enter df_th, so nothing has to be dropped afterwards.
bfi_scored = df_th[bfi_items].join((6 - df_th[rev_items]).add_suffix("_r"))

# Subscale sums (2 items each)
df_th["t0_bfi_t_extraversion"] = bfi_scored[["t0_bfi_t_1_r", "t0_bfi_t_6"]].sum(axis=1)
df_th["t0_bfi_t_agreeableness"] = bfi_scored[["t0_bfi_t_2", "t0_bfi_t_7_r"]].sum(axis=1)
df_th["t0_bfi_t_conscientiousness"] = bfi_scored[["t0_bfi_t_3_r", "t0_bfi_t_8"]].sum(axis=1)
df_th["t0_bfi_t_neuroticism"] = bfi_scored[["t0_bfi_t_4_r", "t0_bfi_t_9"]].sum(axis=1)
df_th["t0_bfi_t_openness"] = bfi_scored[["t0_bfi_t_5_r", "t0_bfi_t_10"]].sum(axis=1)


### 2. Patient data

In [8]:
# Load the raw patient export (JSON) into a DataFrame.
df_pat = pd.read_json(
    path_or_buf=DATA_PATH + "/TONI_pat_corrected_20241025_raw.json"
)

In [9]:
# Remove every column whose name starts with one of these prefixes.
unused_prefixes = ('t2', 't3', 't4', 'u_', 'w', 'dropout')
unused_cols = [name for name in df_pat.columns if name.startswith(unused_prefixes)]
df_pat = df_pat.drop(columns=unused_cols)

In [10]:
# Patient columns kept for the analysis, in output order.  Numbered
# questionnaire items are generated as <prefix>1 … <prefix>n.
keep_cols = [
    # IDs / study
    "patient_id",
    "therapist_id",

    # Demographics / baseline
    "t0_age_p",
    "t0_gender_p",
    "t0_school_p",
    "t0_employment_p",
    "t0_location_p",
    *[f"t0_therapy_p_{i}" for i in range(1, 3 + 1)],
    "t0_familyhistory_p",
    "t0_meds_p",
    "t0_relationship_p",
    "t0_ses_p",
    "t0_distance_p",
    *[f"t0_migration_p_{i}" for i in range(1, 3 + 1)],
    *[f"surveys.t0.answers.1585.discrimination_{i}" for i in range(1, 5 + 1)],
    *[f"t0_disease_p_{letter}" for letter in "ABCDEFG"],

    # t0 scales
    *[f"t0_phq_p_{i}" for i in range(1, 8 + 1)],
    *[f"t0_gad_p_{i}" for i in range(1, 7 + 1)],
    *[f"t0_swls_p_{i}" for i in range(1, 5 + 1)],
    *[f"t0_aqol_p_{i}" for i in range(1, 35 + 1)],
    *[f"t0_ede_p_{i}" for i in range(1, 8 + 1)],
    "t0_audit_p_filter",
    *[f"t0_audit_p_{i}" for i in range(1, 3 + 1)],
    "t0_dudit_p_filter",
    *[f"t0_dudit_p_{i}" for i in range(1, 4 + 1)],
    *[f"t0_msfq_p_{i}" for i in range(1, 5 + 1)],
    *[f"t0_pid_p_{i}" for i in range(1, 36 + 1)],
    *[f"t0_opd_p_{i}" for i in range(1, 12 + 1)],
    *[f"t0_ace_p_{i}" for i in range(1, 10 + 1)],
    *[f"t0_ask_p_{i}" for i in range(1, 3 + 1)],
    *[f"t0_mhse_p_{i}" for i in range(1, 6 + 1)],

    # t1 scales
    *[f"t1_phq_p_{i}" for i in range(1, 8 + 1)],
    *[f"t1_gad_p_{i}" for i in range(1, 7 + 1)],

    # alliance / matching
    "t0_change_tp_1",
    "t0_change_tp_2",
    "t0_matching_tp_1",
    *[f"t1_wai_tp_{i}" for i in range(1, 8 + 1)],
    *[f"t1_wai_p_{i}" for i in range(1, 10 + 1)],
    "t1_matching_tp_1",
    "t0_matching_p_1",
    "t1_matching_p_1",
]


In [11]:
# Restrict the patient frame to the analysis columns, in keep_cols order.
df_pat = df_pat.loc[:, keep_cols]

#### 2.1 Calculate Subscales for self-report data

#### Outcome T1 (PHQ + GAD)

In [12]:
# Item columns of the T1 questionnaires: 8 PHQ items and 7 GAD items.
phq_cols_t1 = [f"t1_phq_p_{item}" for item in range(1, 8 + 1)]
gad_cols_t1 = [f"t1_gad_p_{item}" for item in range(1, 7 + 1)]

# Keep only rows in which no T1 PHQ/GAD item is missing; copy so the cleaned
# frame can be extended without touching df_pat.
mask_complete = ~df_pat[phq_cols_t1 + gad_cols_t1].isna().any(axis=1)
df_pat_clean = df_pat[mask_complete].copy()

In [13]:
# Row-wise sum scores of the T1 PHQ and GAD items.
df_pat_clean["t1_phq8_total"] = df_pat_clean.loc[:, phq_cols_t1].sum(axis="columns")
df_pat_clean["t1_gad7_total"] = df_pat_clean.loc[:, gad_cols_t1].sum(axis="columns")

In [14]:
# Combined T1 distress score: PHQ total plus GAD total.
df_pat_clean["t1_distress_sum"] = df_pat_clean["t1_phq8_total"].add(df_pat_clean["t1_gad7_total"])

#### Outcome T0 (PHQ + GAD)

In [15]:
# Item columns of the T0 questionnaires: 8 PHQ items and 7 GAD items.
phq_cols_t0 = [f"t0_phq_p_{item}" for item in range(1, 8 + 1)]
gad_cols_t0 = [f"t0_gad_p_{item}" for item in range(1, 7 + 1)]

In [16]:
# Row-wise sum scores of the T0 PHQ and GAD items.
df_pat_clean["t0_phq8_total"] = df_pat_clean.loc[:, phq_cols_t0].sum(axis="columns")
df_pat_clean["t0_gad7_total"] = df_pat_clean.loc[:, gad_cols_t0].sum(axis="columns")

In [17]:
# Combined T0 distress score: PHQ total plus GAD total.
df_pat_clean["t0_distress_sum"] = df_pat_clean["t0_phq8_total"].add(df_pat_clean["t0_gad7_total"])

#### SWLS Baseline

In [18]:
# The five T0 SWLS item columns.
swls_cols_t0 = [f"t0_swls_p_{item}" for item in (1, 2, 3, 4, 5)]

In [19]:
# Row-wise sum of the T0 SWLS items.
df_pat_clean["t0_swls_total"] = df_pat_clean.loc[:, swls_cols_t0].sum(axis="columns")

#### Aqol Baseline

In [20]:
# AQoL dimension column -> list of its T0 item columns.  Item numbers are
# listed per dimension and expanded to "t0_aqol_p_<n>" names.
aqol_map = {
    f"t0_aqol_p_{dimension}": [f"t0_aqol_p_{number}" for number in numbers]
    for dimension, numbers in {
        "coping": (1, 21, 29),
        "relationship": (2, 4, 9, 10, 23, 31, 34),
        "independent_living": (3, 15, 19, 30),
        "mental_health": (5, 8, 12, 14, 16, 18, 33, 35),
        "pain": (6, 22, 24),
        "self_worth": (7, 13, 26),
        "happiness": (17, 20, 25, 27),
        "senses": (11, 28, 32),
    }.items()
}

In [21]:
# One column per AQoL dimension: the row-wise mean of that dimension's items.
aqol_scores = {
    dimension: df_pat_clean[dimension_items].mean(axis=1)
    for dimension, dimension_items in aqol_map.items()
}
df_pat_clean = df_pat_clean.assign(**aqol_scores)


#### EDE Baseline

In [22]:
# Row-wise sum of the eight T0 EDE items.
ede_items = [f"t0_ede_p_{item}" for item in range(1, 8 + 1)]
df_pat_clean["t0_ede_sum"] = df_pat_clean[ede_items].sum(axis=1)


#### Audit Baseline

In [23]:
# AUDIT total: the sum of the three AUDIT items where the filter item equals 1,
# and 0 for every other row.
aud_items = [f"t0_audit_p_{item}" for item in (1, 2, 3)]
audit_filter_is_one = df_pat_clean["t0_audit_p_filter"].eq(1)
df_pat_clean["t0_audit_sum"] = np.where(
    audit_filter_is_one,
    df_pat_clean[aud_items].sum(axis=1),
    0,
)

#### Dudit Baseline

In [24]:
# sum DUDIT items only if filter == 1; otherwise set total to 0
# All four DUDIT items selected in keep_cols (t0_dudit_p_1 … t0_dudit_p_4)
# enter the sum; the earlier range(1, 4) silently left out item 4.
dud_items = [f"t0_dudit_p_{i}" for i in range(1, 4 + 1)]
df_pat_clean["t0_dudit_sum"] = np.where(df_pat_clean["t0_dudit_p_filter"] == 1,
                                df_pat_clean[dud_items].sum(axis=1),
                                0)

#### MSFQ Baseline

In [None]:
# MSFQ items: coerce to numeric (invalid -> NaN), then build the row-wise sum.
# The scores are written to df_pat_clean like every other patient scale; the
# previous name `df` is not defined anywhere in this notebook.
msfq_cols = [f"t0_msfq_p_{i}" for i in range(1, 6)]
df_pat_clean[msfq_cols] = df_pat_clean[msfq_cols].apply(pd.to_numeric, errors="coerce")

df_pat_clean["t0_msfq_p_sum"] = df_pat_clean[msfq_cols].sum(axis=1)

#### PID5BF+ Baseline

In [25]:

# PID-5 BF+ domains → item columns.
# The domains are interleaved: the k-th domain owns items k, k+6, …, k+30.
pid_map = {
    f"t0_pid_{domain}": [f"t0_pid_p_{item}" for item in range(first_item, 36 + 1, 6)]
    for first_item, domain in enumerate(
        (
            "negative_affect",
            "antagonism",
            "disinhibition",
            "detachment",
            "psychoticism",
            "anankastia",
        ),
        start=1,
    )
}

# 1) Make every PID item column numeric (e.g. "2" → 2, unparseable → NaN)
item_cols = sorted(set(sum(pid_map.values(), [])))
df_pat_clean[item_cols] = df_pat_clean[item_cols].apply(
    lambda series: pd.to_numeric(series, errors="coerce")
)

# 2) Domain scores are row-wise MEANS of each domain's items; mean() skips
#    NaN by default, so a missing item is left out rather than counted as 0.
for pid_domain, pid_items in pid_map.items():
    df_pat_clean[pid_domain] = df_pat_clean[pid_items].mean(axis="columns")


#### OPD Baseline

In [26]:
# OPD-SFK domains → item columns, built from the item numbers of each domain.
opd_map = {
    f"t0_opd_{domain}": [f"t0_opd_p_{number}" for number in numbers]
    for domain, numbers in {
        "self": (1, 2, 5, 8),
        "contact": (4, 6, 10, 11),
        "relationship": (3, 7, 9, 12),
    }.items()
}

# Make every OPD item column numeric (e.g. "2" → 2; unparseable → NaN)
opd_items = sorted(set(sum(opd_map.values(), [])))
df_pat_clean[opd_items] = df_pat_clean[opd_items].apply(
    lambda series: pd.to_numeric(series, errors="coerce")
)

# Domain scores are row-wise MEANS of each domain's items; mean() skips NaN
# by default, so a missing item is left out rather than counted as 0.
for opd_domain, opd_domain_items in opd_map.items():
    df_pat_clean[opd_domain] = df_pat_clean[opd_domain_items].mean(axis="columns")

#### ACE Baseline

In [28]:
# ACE items 1–10: coerce to numeric (e.g. "1" -> 1; unparseable -> NaN),
# then store the row-wise sum.
ace_cols = [f"t0_ace_p_{item}" for item in range(1, 10 + 1)]
df_pat_clean[ace_cols] = df_pat_clean[ace_cols].apply(
    lambda series: pd.to_numeric(series, errors="coerce")
)
df_pat_clean["t0_ace_p_sum"] = df_pat_clean.loc[:, ace_cols].sum(axis="columns")

#### ASK Baseline

In [30]:
# ASK items 1–3: coerce to numeric (unparseable -> NaN), then store the
# row-wise sum.
ask_cols = [f"t0_ask_p_{item}" for item in (1, 2, 3)]
df_pat_clean[ask_cols] = df_pat_clean[ask_cols].apply(
    lambda series: pd.to_numeric(series, errors="coerce")
)
df_pat_clean["t0_ask_p_sum"] = df_pat_clean.loc[:, ask_cols].sum(axis="columns")

#### MHSE Baseline

In [31]:
# MHSE items 1–6: coerce to numeric (unparseable -> NaN), then store the
# row-wise sum.
mhse_cols = [f"t0_mhse_p_{item}" for item in range(1, 6 + 1)]
df_pat_clean[mhse_cols] = df_pat_clean[mhse_cols].apply(
    lambda series: pd.to_numeric(series, errors="coerce")
)
df_pat_clean["t0_mhse_p_sum"] = df_pat_clean.loc[:, mhse_cols].sum(axis="columns")

### 2.2 Preprocess data