In [6]:
import pandas as pd
# Cohort table: one row per patient to build features for. Later cells read its
# Patient_ID and Ålder columns and right-join the ICD table onto it by Patient_ID.
df_cohort = pd.read_csv('data_gen/cohort.csv')

In [7]:
# Recorded BMI measurements, newest observation first (get_BMI reads the first
# matching row per patient).
df_BMI = pd.read_csv('data/BMI.csv')
# Coerce to numeric the same way the height/weight/blood-pressure loaders do, so
# stray text becomes NaN instead of making the range comparison below raise.
df_BMI['Värde'] = pd.to_numeric(df_BMI['Värde'], errors='coerce')
# Drop unrealistic values (bounds are exclusive; NaN fails both comparisons and
# is dropped as well). Filtering and sorting are chained so that no in-place
# operation is performed on a filtered copy, and re-running the cell is safe.
df_BMI = (
    df_BMI[(df_BMI['Värde'] > 3) & (df_BMI['Värde'] < 70)]
    .sort_values(by='Observation_datum', ascending=False)
)


# Height and weight values (cm, kg), newest observation first.
# Values are force-converted to numbers; anything that cannot be converted
# becomes NaN and is then dropped, so the first row per patient (the only one
# get_BMI_w_h reads) is the latest usable measurement and not an unparsable one.
df_height = (
    pd.read_csv('data/Height.csv')
    .sort_values(by='Observation_datum', ascending=False)[['Patient_ID', 'Värde']]
    .assign(**{'Värde': lambda frame: pd.to_numeric(frame['Värde'], errors='coerce')})
    .dropna(subset=['Värde'])
)
df_weight = (
    pd.read_csv('data/Weight.csv')
    .sort_values(by='Observation_datum', ascending=False)[['Patient_ID', 'Värde']]
    .assign(**{'Värde': lambda frame: pd.to_numeric(frame['Värde'], errors='coerce')})
    .dropna(subset=['Värde'])
)


# HbA1c lab results, ordered so the newest observation comes first.
df_HbA1c = pd.read_csv('data/HbA1c.csv')
df_HbA1c = df_HbA1c.sort_values('Observation_datum', ascending=False)
print('Enheter HbA1C')
display(df_HbA1c['Enhet'].value_counts())


# Demographics table; get_mortality and get_gender read its AvlidenDatum and
# Kön columns.
df_age_sex = pd.read_csv('data/age_sex.csv')


# Glucose measurements. The unit breakdown is displayed before any filtering.
df_glucose = pd.read_csv('data/Glucose2.csv')
print('Enheter P-glucos/glukos')
display(df_glucose['Enhet'].value_counts())
# Keep only readings below 80; rows with a missing Värde fail the comparison
# and are discarded too.
df_glucose = df_glucose.loc[df_glucose['Värde'] < 80]


# Blood pressure readings (systolic and diastolic rows share one file and are
# told apart by Kod_Text).
df_blood_pressure = pd.read_csv('data/BloodPressure.csv')
# Turn Värde into numbers; entries that cannot be parsed become NaN and are removed.
df_blood_pressure['Värde'] = pd.to_numeric(df_blood_pressure['Värde'], errors='coerce')
df_blood_pressure = df_blood_pressure.dropna(subset=['Värde'])
# Discard implausible readings (500 or above).
df_blood_pressure = df_blood_pressure.loc[df_blood_pressure['Värde'] < 500]

print('Enheter Bloodpressure')
display(df_blood_pressure['Kod_Text'].value_counts())

# One frame per measurement type, each sorted newest-first so that the first
# matching row per patient is the latest reading. Filtering happens before
# sorting, exactly as in the original cell.
df_blood_pressure_systoliskt = (
    df_blood_pressure
    .loc[lambda frame: frame['Kod_Text'] == 'Blodtryck systoliskt']
    .sort_values(by='Observation_datum', ascending=False)
)
df_blood_pressure_diastoliskt = (
    df_blood_pressure
    .loc[lambda frame: frame['Kod_Text'] == 'Blodtryck diastoliskt']
    .sort_values(by='Observation_datum', ascending=False)
)


# Baseline ICD codes.
df_ICD = pd.read_csv('data/BASELINE_ICDs.csv')
# Right-join onto the cohort so every cohort patient is kept, then reduce to
# one row per (patient, code) pair and keep only the two columns used later.
df_ICD = df_ICD.merge(df_cohort, how='right', on='Patient_ID')
df_ICD = df_ICD.drop_duplicates(subset=['Patient_ID', 'Kod'])
df_ICD = df_ICD[['Patient_ID', 'Kod']]
# The ten most frequent codes (value_counts sorts by descending frequency);
# these become the indicator columns of the final dataset.
ICD_list = df_ICD['Kod'].value_counts().index[:10].tolist()

Enheter HbA1C


mmol/mol                3861
(taget)/                   4
Name: Enhet, dtype: int64

Enheter P-glucos/glukos


mmol/l                  259404
Name: Enhet, dtype: int64

Enheter Bloodpressure


Blodtryck systoliskt     353238
Blodtryck diastoliskt    352587
Name: Kod_Text, dtype: int64

# Support functions

In [9]:
def get_ICDs(Patient_ID):
    """Return the ICD indicator list for one patient.

    Selects the patient's rows from the module-level ``df_ICD`` frame and hands
    them to ``each_ICD``, whose result is returned unchanged.
    """
    patient_rows = df_ICD[df_ICD['Patient_ID'] == Patient_ID]
    return each_ICD(patient_rows)

def each_ICD(results, icd_codes=None):
    """Build a 0/1 indicator list telling which ICD codes occur in ``results``.

    Parameters
    ----------
    results : DataFrame
        Rows to inspect; only the ``Kod`` column is read.
    icd_codes : iterable, optional
        Codes to test, in output order. Defaults to the module-level
        ``ICD_list`` so existing one-argument calls behave as before.

    Returns
    -------
    list of int
        One entry per code: 1 if at least one row has that code, else 0.
    """
    if icd_codes is None:
        icd_codes = ICD_list
    # Collect the codes once instead of filtering the frame once per code.
    codes_present = set(results['Kod'].tolist())
    return [1 if code in codes_present else 0 for code in icd_codes]
    
    
def get_BMI(Patient_ID):
    """Return the patient's BMI as a float.

    The patient's ``Värde`` entries in the module-level ``df_BMI`` frame are
    scanned in stored order and the first one that converts to a non-NaN float
    is returned. If the patient has no rows there, or none of them is usable,
    the result of ``get_BMI_w_h`` (BMI computed from weight and height) is
    returned instead.
    """
    values = df_BMI.loc[df_BMI['Patient_ID'] == Patient_ID, 'Värde']
    # Iterate over scalars: calling float() on a one-element row array is
    # deprecated by NumPy and would be silently swallowed by a catch-all except.
    for raw in values:
        try:
            value = float(raw)
        except (TypeError, ValueError):
            continue
        if not pd.isna(value):
            return value
    # Nothing usable in the BMI data set: check weight and height and
    # calculate if possible (never falls through to an implicit None).
    return get_BMI_w_h(Patient_ID)
    
def get_BMI_w_h(Patient_ID):
    """Compute BMI from the patient's height (cm) and weight (kg).

    Uses the first numeric ``Värde`` per patient in the module-level
    ``df_height`` and ``df_weight`` frames (in their stored order); entries that
    are missing or not numeric are skipped rather than turning the result
    into NaN.

    Returns
    -------
    float
        weight / (height / 100) ** 2, or NaN when either measurement is
        unavailable, the height is not positive, or the result falls outside
        the sanity range 3..100.
    """
    heights = pd.to_numeric(
        df_height.loc[df_height['Patient_ID'] == Patient_ID, 'Värde'], errors='coerce'
    ).dropna()
    weights = pd.to_numeric(
        df_weight.loc[df_weight['Patient_ID'] == Patient_ID, 'Värde'], errors='coerce'
    ).dropna()
    if heights.empty or weights.empty:
        return float('NaN')

    height = float(heights.iloc[0])
    weight = float(weights.iloc[0])
    # A zero height would divide by zero and a negative one would square into
    # a plausible-looking value, so reject both explicitly.
    if height <= 0:
        return float('NaN')

    BMI = weight / ((height / 100) ** 2)
    # Sanity check.
    # NOTE(review): the BMI data set itself is filtered to 3 < value < 70 when
    # loaded, while this accepts up to 100 - confirm which bound is intended.
    if BMI < 3 or BMI > 100:
        return float('NaN')
    return BMI
    
def get_blood_pressure(df, Patient_ID):
    """Return the first ``Värde`` stored for the patient in ``df`` as a float.

    ``df`` is whichever blood-pressure frame the caller passes in; the value
    comes from the first matching row in that frame's current order. NaN is
    returned when the patient has no rows.
    """
    values = df[df['Patient_ID'] == Patient_ID]['Värde']
    if len(values) == 0:
        return float('NaN')
    return float(values.iloc[0])
    

def get_glucose(Patient_ID):
    """Return the mean of all the patient's ``Värde`` entries in ``df_glucose``.

    Unlike the other getters this averages every stored measurement instead of
    picking one row. NaN is returned when the patient has no rows.
    """
    patient_rows = df_glucose[df_glucose['Patient_ID'] == Patient_ID]
    if len(patient_rows) == 0:
        return float('NaN')
    return float(patient_rows['Värde'].mean())

def get_HbA1c(Patient_ID):
    """Return the patient's HbA1c value as a float, or NaN if none is usable.

    The patient's ``Värde`` entries in the module-level ``df_HbA1c`` frame are
    scanned in stored order and the first one that converts to a non-NaN float
    is returned; text entries and missing values are skipped.

    NOTE(review): rows are not filtered by ``Enhet``; the unit listing shown
    when the data is loaded contains a few rows that are not 'mmol/mol' -
    confirm whether those should be excluded.
    """
    values = df_HbA1c.loc[df_HbA1c['Patient_ID'] == Patient_ID, 'Värde']
    # Iterate over scalars: calling float() on a one-element row array is
    # deprecated by NumPy and would be silently swallowed by a catch-all except.
    for raw in values:
        try:
            value = float(raw)
        except (TypeError, ValueError):
            continue
        if not pd.isna(value):
            return value
    return float('NaN')
    
# expired: 1 = died, 0 = survived ("-" placeholder in AvlidenDatum)
def get_mortality(Patient_ID):
    """Return 1 if the patient has a death date recorded, 0 otherwise.

    Reads ``AvlidenDatum`` from the patient's first row in the module-level
    ``df_age_sex`` frame. A value consisting of a dash, with any amount of
    surrounding whitespace, means no death date and yields 0; anything else
    yields 1.

    NOTE(review): an empty (NaN) cell still counts as died, as it did before
    this change - confirm that is intended.

    Raises
    ------
    Exception
        If the patient has no row in ``df_age_sex`` or the column is absent.
    """
    results = df_age_sex[df_age_sex.Patient_ID == Patient_ID]
    try:
        death_date = results['AvlidenDatum'].iloc[0]
    except (IndexError, KeyError) as err:
        raise Exception('Error: AvlidenDatum missing, should not happen') from err
    # Compare on the stripped text so the result does not depend on how many
    # padding spaces the export wrote after the dash.
    return 0 if str(death_date).strip() == '-' else 1


# Should only be one value per patient, but invalid data is still reported
# Gender, 1=man=male, 0=kvinna=female
def get_gender(Patient_ID):
    """Return 1 for 'M' and 0 for 'K', read from the patient's ``Kön`` value.

    The value is taken from the patient's first row in the module-level
    ``df_age_sex`` frame.

    Raises
    ------
    Exception
        If the patient has no row, the column is absent, or the value is
        neither 'M' nor 'K'.
    """
    results = df_age_sex[df_age_sex.Patient_ID == Patient_ID]
    try:
        sex_code = results['Kön'].iloc[0]
    except (IndexError, KeyError) as err:
        # Only lookup failures are translated; unrelated errors and
        # interrupts are no longer swallowed by a catch-all handler.
        raise Exception('Error: Kön missing, should not happen') from err
    if sex_code == 'M':
        return 1
    if sex_code == 'K':
        return 0
    raise Exception('Error: Kön missing, should not happen')

In [10]:
def gen_dataset(df):
    """Assemble the feature list for a single cohort row.

    ``df`` is one row of the cohort (it is indexed by ``'Patient_ID'`` and
    ``'Ålder'``). The returned list holds, in order: patient id, age, gender,
    HbA1c, glucose, diastolic blood pressure, systolic blood pressure, BMI and
    mortality, followed by the ICD indicators from ``get_ICDs``.
    """
    patient = df['Patient_ID']
    features = [patient, df['Ålder']]
    # The getters are called in the same order as the output columns.
    features.append(get_gender(patient))
    features.append(get_HbA1c(patient))
    features.append(get_glucose(patient))
    features.append(get_blood_pressure(df_blood_pressure_diastoliskt, patient))
    features.append(get_blood_pressure(df_blood_pressure_systoliskt, patient))
    features.append(get_BMI(patient))
    features.append(get_mortality(patient))
    return features + get_ICDs(patient)


# One feature list per cohort row, collected into a plain list of lists.
res = df_cohort.apply(gen_dataset, axis=1).tolist()


# Column names follow the order in which gen_dataset emits its values; the
# trailing columns are the ICD codes in ICD_list.
samples_transformed = pd.DataFrame(
    data=res,
    columns=[
        'Patient_ID',
        'age',
        'gender',
        'HbA1c',
        'P-glucose',
        'blood_pressure_diastoliskt',
        'blood_pressure_systoliskt',
        'BMI',
        'expired',
        *ICD_list,
    ],
)

In [12]:
# Write the feature table without the Patient_ID column and without the index.
samples_transformed.drop('Patient_ID', axis=1).to_csv(
    './data_gen/samples_transformed.csv',
    index=False,
)