# In [0]:
import pandas as pd
import numpy as np
import os

# Absolute path of the merged patient CSV; used as the input of data_to_clips()
# when this file is run as a script, and its base name seeds the output file name.
patient_data = "/Workspace/Users/heranr@pennmedicine.upenn.edu/eRADAR/penn_eradar_CLIP/merged_output.csv"

def calculate_percentiles(df):
    """Compute percentile thresholds for the EHR score and substitution error rate.

    Reads the ``pred_proba`` and ``mlm_sub_error_rate`` columns of *df*.
    Each column is coerced to numeric first; entries that cannot be parsed,
    as well as missing entries, are counted as ``0.0``.

    Returns a dict with the 95th/90th/85th/75th percentiles of ``pred_proba``
    (keys ``ehr_95``, ``ehr_90``, ``ehr_85``, ``ehr_75``) and the 90th
    percentile of ``mlm_sub_error_rate`` (key ``subtitution_90``; the
    spelling is kept because create_percentile_fact() reads that exact key).
    """
    def _numeric_column(name):
        # Unparseable and missing cells both end up as 0.0.
        return pd.to_numeric(df[name], errors='coerce').fillna(0.0)

    ehr_scores = _numeric_column('pred_proba')
    substitution_rates = _numeric_column('mlm_sub_error_rate')

    thresholds = {
        f'ehr_{level}': np.percentile(ehr_scores, level)
        for level in (95, 90, 85, 75)
    }
    thresholds['subtitution_90'] = np.percentile(substitution_rates, 90)
    return thresholds

def create_percentile_fact(percentiles):
    """Render the threshold dict as a single ``percentile_thresholds`` CLIPS fact.

    *percentiles* must provide the keys ``ehr_95``, ``ehr_90``, ``ehr_85``,
    ``ehr_75`` and ``subtitution_90``. Every value is written with six
    decimal places. Returns the fact as one string.
    """
    # (slot name in the emitted fact, key in the input dict), in output order.
    slot_sources = (
        ('ehr_95_percentile_is', 'ehr_95'),
        ('ehr_90_percentile_is', 'ehr_90'),
        ('ehr_85_percentile_is', 'ehr_85'),
        ('ehr_75_percentile_is', 'ehr_75'),
        ('subtitution_90th_percentile_is', 'subtitution_90'),
    )
    rendered_slots = ' '.join(
        f'({slot} {percentiles[key]:.6f})' for slot, key in slot_sources
    )
    return f'(percentile_thresholds {rendered_slots})'


def data_to_clips(data_path, out_path):
    """Convert the merged sample CSV at *data_path* into a CLIPS facts file.

    The output starts with one ``percentile_thresholds`` fact computed over
    the whole table, followed by a blank line. Each CSV row then contributes
    a ``patient``, an ``ehr_risk_analysis`` and a ``speech_analysis`` fact,
    with a blank line between patients. The file ends with exactly one
    newline.

    The CSV must contain the columns ``patient_id``, ``pred_proba``,
    ``unia_repetition_rate``, ``mlm_sub_error_rate`` and ``kw_mean_IFD_imp``.
    (Original author's note: the merged data had 12 samples that met
    requirements.)

    Args:
        data_path: Path of the CSV file to read.
        out_path: Path of the ``.clp`` facts file to write (overwritten).

    Raises:
        KeyError: If a required column is missing.
        ValueError: If a per-row measurement cannot be converted to float.
    """
    df = pd.read_csv(data_path)
    facts = [create_percentile_fact(calculate_percentiles(df)), '']

    # Iterate the columns directly instead of using ``iterrows()``: iterrows
    # builds one Series per row, which upcasts integer patient ids to float
    # whenever every column is numeric (id 101 would be written as "101.0").
    required = ('patient_id', 'pred_proba', 'unia_repetition_rate',
                'mlm_sub_error_rate', 'kw_mean_IFD_imp')
    for raw_id, pred, repetition, substitution, ifd in zip(*(df[col] for col in required)):
        # Escape backslashes and double quotes so the id stays a valid CLIPS
        # string literal.
        pid = str(raw_id).replace('\\', '\\\\').replace('"', '\\"')
        facts.extend([
            f'(patient (patient_id_is "{pid}"))',
            f'(ehr_risk_analysis (patient_id_is "{pid}") (predicted_probability_is {float(pred):.4f}) (ehr_risk_level_is unknown))',
            f'(speech_analysis (patient_id_is "{pid}") (repetition_rate_is {float(repetition):.6f}) (substitution_error_rate_is {float(substitution):.6f}) (ifd_distance_is {float(ifd):.2f}))',
            ''
        ])

    # Explicit encoding keeps the output independent of the platform default.
    with open(out_path, 'w', encoding='utf-8') as f:
        f.write('\n'.join(facts).rstrip() + '\n')


if __name__ == "__main__":
    # The facts file goes to the current working directory and is named after
    # the input CSV's base name without its extension.
    input_stem = os.path.splitext(os.path.basename(patient_data))[0]
    data_to_clips(patient_data, f"{input_stem}_facts_final.clp")
