In [1]:
import _pickle as pickle
import csv
import os
import sys
import numpy as np
import sklearn.model_selection as ms
import tensorflow as tf
import pandas as pd

In [117]:
class Patient(object):
    def __init__(self, patient_id, adm_ids, diag_codes, medication_codes, procedure_codes, medical_history):
        self.patient_id = patient_id
        self.nvisits = len(adm_ids)
        #print(self.nvisits)    
        self.diagnosis_codes = np.concatenate(diag_codes, axis=0)
        self.medication_codes = np.concatenate(medication_codes, axis=0)
        self.procedure_codes = np.concatenate(procedure_codes, axis=0)
        
        self.medical_history = medical_history
        
        
def process_patient(infile, patient_history_dict):

    patients = pd.read_parquet(infile)
    
    patient_dict = {}
    
    count=0
    for rowindex, line in patients.iterrows():
        if count == 100:
            break
        if count % 10000 == 0:
            print(count, end='\r')
            
        patient_id = line['subject_id']
        encounter_id = line['hadm_id']
        readmission = line["label"]
        icd_code = line["diagnos_code"]
        ndc = line["medication_code"]
        procedures = line["procedure_code"]

        if patient_id not in patient_dict:
            patient_dict[patient_id] = []
        patient_dict[patient_id].append(encounter_id)
        patient_dict[patient_id].append(readmission)
        patient_dict[patient_id].append(icd_code)
        patient_dict[patient_id].append(ndc)
        patient_dict[patient_id].append(procedures)
        
        count +=1
    
    for patient_id, information in patient_dict.items():
        if sum(information[1]) > 0:
            medical_history = []
            icd_codes, med_codes, proc_codes = [], [], []
            for visit_no in range(information[0].shape[0]):
                c_hadm = information[0][visit_no]
                c_readmission = information[1][visit_no]
                c_icd_code = information[2][visit_no]
                c_ndc = information[3][visit_no]
                c_procedures = information[4][visit_no]
                
                icd_codes.append(c_icd_code)
                med_codes.append(c_ndc)
                proc_codes.append(c_procedures)
                
                medical_history.extend(c_icd_code)
                medical_history.extend(c_ndc)
                medical_history.extend(c_procedures)
                medical_history.extend(['SEP'])
                
            pat = Patient(patient_id, information[0], icd_codes, med_codes, proc_codes, medical_history)
            patient_history_dict[patient_id] = pat
        
    return patient_history_dict

In [141]:
def count_conditional_prob(patient_object):
    print("Conditional probabilites")
    
    tot_visits = 0
    dx_freqs = {}
    med_freqs = {}
    proc_freqs = {}
    
    dd_freqs = {}
    dm_freqs = {}
    
    
    for patient, pat_object in patient_object.items():
        #tot_visits +=1
        #print(pat_object.nvisits)
        tot_visits += pat_object.nvisits
        
        # Calculate occurences of diganose codes
        for dx_code in pat_object.diagnosis_codes:
            if dx_code not in dx_freqs:
                dx_freqs[dx_code] = 0
            dx_freqs[dx_code] += 1
        
        # Calculate occurences of medications codes
        for med_code in pat_object.medication_codes:
            if med_code not in med_freqs:
                med_freqs[med_code] = 0
            med_freqs[med_code] += 1
            
        # Calculate occurences of procedure_codes
        for proc_code in pat_object.procedure_codes:
            if proc_code == -1:
                continue 
            if proc_code not in proc_freqs:
                proc_freqs[proc_code] = 0
            proc_freqs[proc_code] += 1
        
        # Calculate occurences of diagnos and diagnos
        for dx_code1 in pat_object.diagnosis_codes:
            for dx_code2 in pat_object.diagnosis_codes: # Do only look forward in time
                comb = str(dx_code1) + ',' + str(dx_code2)
                if comb not in dd_freqs:
                    dd_freqs[comb] = 0
                dd_freqs[comb] += 1
        print(dd_freqs)
        print(pat_object.diagnosis_codes)
        break
        # Calculate occurences of diagnos and medications occurences
        for dx_code in pat_object.diagnosis_codes:
            for med_code in pat_object.medication_codes:
                comb = str(dx_code) + ',' + str(med_code)
                if comb not in dm_freqs:
                    dm_freqs[comb] = 0
                dm_freqs[comb] += 1
    
    dx_probs = dict([(k, v / float(tot_visits )) for k, v in dx_freqs.items()]) # P(D)
    med_probs = dict([(k, v / float(tot_visits)) for k, v in med_freqs.items()]) # P(M)
    proc_probs = dict([(k, v / float(tot_visits)) for k, v in proc_freqs.items()]) # P(Procs)
    
    dd_probs = dict([(k, v / float(tot_visits)) for k, v in dd_freqs.items()]) # P(D and D)
    dm_probs = dict([(k, v / float(tot_visits)) for k, v in dm_freqs.items()]) # P(D and M)
    
   # for k, item in dm_probs.items():
    #    if item > 1:
     #       print(k, item)
    #print(dd_probs)
    print(tot_visits)

In [128]:
pat_hist = {}
pat_hist = process_patient('../data/datasets/Synthea/Small_cohorts/train.parquet', pat_hist)

0

In [142]:
count_conditional_prob(pat_hist)

Conditional probabilites
{'59621000,59621000': 1, '59621000,224295006': 1, '59621000,160903007': 31, '59621000,706893006': 4, '59621000,15777000': 1, '59621000,160904001': 1, '59621000,423315002': 1, '224295006,59621000': 1, '224295006,224295006': 1, '224295006,160903007': 31, '224295006,706893006': 4, '224295006,15777000': 1, '224295006,160904001': 1, '224295006,423315002': 1, '160903007,59621000': 31, '160903007,224295006': 31, '160903007,160903007': 961, '160903007,706893006': 124, '160903007,15777000': 31, '160903007,160904001': 31, '160903007,423315002': 31, '706893006,59621000': 4, '706893006,224295006': 4, '706893006,160903007': 124, '706893006,706893006': 16, '706893006,15777000': 4, '706893006,160904001': 4, '706893006,423315002': 4, '15777000,59621000': 1, '15777000,224295006': 1, '15777000,160903007': 31, '15777000,706893006': 4, '15777000,15777000': 1, '15777000,160904001': 1, '15777000,423315002': 1, '160904001,59621000': 1, '160904001,224295006': 1, '160904001,160903007':