In [144]:
import Simulation.EHR_sensitive as ehrsens
from Simulation.loaddata import aggregate_dataset
from Simulation.extract_feats import generate_features
from Code.Analysis.compute_qualitymeasures import compute_qualitymeasures
import Code.Beam_Search.beam_search as bs

import pandas as pd
import ast
from collections import Counter

This file is run to:
* Generate the EHR data
* Clean and extract features from the ECG signals
* Aggregate the dataset into the desired format

The only values to change are the ones in the codeblock below:


In [145]:
# Only change these!
# Dataset name handed to the EHR generator and the feature extractor.
dataset = 'CPSC'

# Stage switches: every stage is skipped unless its flag is set to True.
#   ehr_generate -> generate new synthetic EHR data
#   do           -> run the feature extraction process
#   exists       -> build on a feature dataset that already exists on disk
#   aggregate    -> aggregate the dataset of features
ehr_generate = False
do = False
exists = False
aggregate = False

# Preferably, use the datasets as they are in this repository

# Generate EHR


In [146]:
# Optionally regenerate the synthetic EHR table and persist it to disk.
# NOTE(review): the file is written with pandas' default index=True, so the
# reload below presumably picks the saved index up as an extra column --
# confirm this is intended.
if ehr_generate:
    ehrsens.generate_EHR(dataset, vals=97, other=True).to_csv('Data/Synthetic EHR/synthetic_ehr.csv')

# The EHR frame is always taken from the CSV, whether or not it was just regenerated.
df_EHR = pd.read_csv('Data/Synthetic EHR/synthetic_ehr.csv')


# Generate features

In [147]:
# Extract the features for the configured dataset; skipped unless `do` is True.
if do:
    fts = generate_features(dataset)

In [148]:
# Turn the extracted features into a DataFrame and preview the first rows.
if do:
    df_fts = pd.DataFrame.from_dict(fts)
    # An expression nested inside `if` is not the cell's final top-level
    # expression, so Jupyter would not render it; print the preview explicitly.
    print(df_fts.head())

In [149]:
# Persist freshly extracted features (only when the extraction stage ran).
if do:
    if exists:
        # Append the new rows below the features already stored on disk.
        df_old = pd.read_csv('Data/Processed ECG/synth_feats.csv')
        df_new = pd.concat([df_old, df_fts], axis=0)
        df_new.to_csv('Data/Processed ECG/synth_feats.csv', index=False)
    else:
        # No previous file to build on: write the new features as-is.
        df_fts.to_csv('Data/Processed ECG/synth_feats.csv', index=False)

# Aggregate database

In [150]:
# Aggregate the stored features and add, for every row of the aggregated
# frame, how often each AF label occurs in its 'AF' entry.
if aggregate:
    df_old = pd.read_csv('Data/Processed ECG/synth_feats.csv')
    df_agg = aggregate_dataset(df_old)

    # The 'AF' column holds a string-encoded collection of labels (presumably
    # one label per recording -- TODO confirm); parse it and count each label.
    # Counter returns 0 for labels that do not occur.
    label_counts = [Counter(ast.literal_eval(labels)) for labels in df_agg['AF']]
    nonAF = [counts['non atrial fibrillation'] for counts in label_counts]
    AFpers = [counts['persistent atrial fibrillation'] for counts in label_counts]
    AFparo = [counts['paroxysmal atrial fibrillation'] for counts in label_counts]

    df_agg['Non AF'] = nonAF
    df_agg['Persistent AF'] = AFpers
    df_agg['Paroxysmal AF'] = AFparo

    # Write to the location the later cells read the aggregated features from;
    # saving into the working directory would leave those cells on a stale file.
    df_agg.to_csv('Data/Processed ECG/synth_aggregated_feats.csv', index=False)

# Get model class values

In [151]:
# Recomputing the quality measures is opt-in; set to True to run it.
doqm = False

# Aggregated features go in, quality measures are read back from `savepath`.
path = 'Data/Processed ECG/synth_aggregated_feats.csv'
savepath = 'Data/Processed ECG/synth_QualityMeasures.csv'

if doqm:
    # Presumably writes its result to `savepath` -- verify in compute_qualitymeasures.
    compute_qualitymeasures(path, savepath)

# Always work from the stored quality-measures file.
qm = pd.read_csv(savepath)


In [152]:
# Number the EHR rows 0..len(qm)-1 as patient ids. Assigning a list requires
# df_EHR to have exactly len(qm) rows; pandas raises on a length mismatch.
df_EHR['PID'] = list(range(len(qm)))

In [153]:
df_agg = pd.read_csv("Data/Processed ECG/synth_aggregated_feats.csv")

# A patient is kept as an 'AF' case when either AF counter is non-zero.
has_af = (df_agg['Persistent AF'] != 0) | (df_agg['Paroxysmal AF'] != 0)
pid_eval = df_agg.loc[has_af, 'PID'].tolist()
AF_eval = ['AF'] * len(pid_eval)

# Evaluation frame: one row per selected patient id, all labelled 'AF'.
evaluation = pd.DataFrame()
evaluation['PID'] = pid_eval
evaluation['AF'] = AF_eval
    

In [159]:
# Run the beam search once per theta option and store all results in one CSV.
beamdo = True

# Passed to beam_search as `c`: 5% of the number of EHR rows, rounded.
csize = round(0.05 * len(df_EHR))
# Passed to beam_search as `q` (presumably the number of results requested
# per theta -- TODO confirm against beam_search).
resset = 2

if beamdo:
    resultsDct = {'Theta': [], 'Rank': [], 'Description': [], 'QM': [], 'Indexes': []}
    for opt in ['SDSD', 'RMSSD', 'SDRR', 'P-count', 'F-count', 'SDSD_P', 'RMSSD_P', 'SDRR_P', 'SDSD_F', 'RMSSD_F', 'SDRR_F']:
        print(opt)
        resultSet = bs.beam_search(df_EHR, qm, evaluation, theta = opt, q = resset, w = 50, d = 3, b = 3, c = csize)

        # Fill every column once per returned result. Deriving Rank/Theta from
        # `resset` instead would leave the columns with different lengths (and
        # make DataFrame.from_dict fail) whenever beam_search returns a
        # different number of results than requested.
        for rank, res in enumerate(resultSet, start=1):
            resultsDct['Theta'].append(opt)
            resultsDct['Rank'].append(rank)
            # res is indexed as (description, quality measure, indexes).
            resultsDct['Description'].append([res[0]])
            resultsDct['QM'].append(float(res[1]))
            resultsDct['Indexes'].append([int(i) for i in res[2]])

    df_results = pd.DataFrame.from_dict(resultsDct)
    df_results.to_csv('Data/synth_results.csv', index=False)

SDSD
RMSSD
SDRR
P-count
F-count
SDSD_P
RMSSD_P
SDRR_P
SDSD_F
RMSSD_F
SDRR_F


In [160]:
df_results = pd.read_csv("Data/synth_results.csv")
# Dump every stored result, one row (as a Series) at a time.
for _, result_row in df_results.iterrows():
    print(result_row)

Theta                                                       SDSD
Rank                                                           1
Description    [{'description': {'a43': ['Taken'], 'a1': (np....
QM                                                           2.1
Indexes                             [12, 36, 56, 63, 64, 79, 81]
Name: 0, dtype: object
Theta                                                       SDSD
Rank                                                           2
Description    [{'description': {'a43': ['Taken'], 'a1': (np....
QM                                                          2.08
Indexes                                     [12, 36, 56, 64, 79]
Name: 1, dtype: object
Theta                                                      RMSSD
Rank                                                           1
Description    [{'description': {'a43': ['Taken'], 'a1': (np....
QM                                                          2.84
Indexes                             [12, 36,