In [1]:
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import psycopg2
import getpass
import seaborn as sns
# for configuring connection 
from configobj import ConfigObj
import os

%matplotlib inline

In [2]:
datadir = '/home/mei/nas/docker/dataset/EICU/eicu-collaborative-research-database-2.0/'
porcesseddir2 = '/home/mei/nas/docker/processedData_2/'

In [3]:
df=pd.read_csv(porcesseddir2 + 'embedded_pastHistory_sum.csv',index_col=0)

In [4]:
from fasterrisk.fasterrisk import RiskScoreOptimizer, RiskScoreClassifier

In [5]:
gender_distr_nan = df.groupby('gender', dropna = False)['gender'].size().to_frame('NoPatients')
gender_distr_nan.head()

Unnamed: 0_level_0,NoPatients
gender,Unnamed: 1_level_1
Female,25454
Male,28137
Unknown,3
,18


In [6]:
distr_nan = df.groupby('unitdischargestatus', dropna = False)['unitdischargestatus'].size().to_frame('NoPatients')
distr_nan.head()

Unnamed: 0_level_0,NoPatients
unitdischargestatus,Unnamed: 1_level_1
Alive,50826
Expired,2783
,3


In [7]:
distr_nan = df.groupby('apacheadmissiondx', dropna = False)['apacheadmissiondx'].size().to_frame('NoPatients')
print(distr_nan)

                                                    NoPatients
apacheadmissiondx                                             
ARDS-adult respiratory distress syndrome, non-c...         203
Abdomen only trauma                                         34
Abdomen/extremity trauma                                     7
Abdomen/face trauma                                          1
Abdomen/multiple trauma                                     29
...                                                        ...
Vascular surgery, other                                      8
Ventriculostomy                                              1
Weaning from mechanical ventilation (transfer f...          34
Whipple-surgery for pancreatic cancer                        1
NaN                                                        120

[302 rows x 1 columns]


In [8]:
distr_nan = df.groupby('admissionweight', dropna = False)['admissionweight'].size().to_frame('NoPatients')
print(distr_nan)

                 NoPatients
admissionweight            
0.0                       3
0.5                       6
0.6                       1
1.5                       1
1.9                       1
...                     ...
713.0                     1
735.0                     1
855.0                     1
953.0                     1
NaN                     961

[2934 rows x 1 columns]


In [9]:
distr_nan = df.groupby('dischargeweight', dropna = False)['dischargeweight'].size().to_frame('NoPatients')
print(distr_nan)

                 NoPatients
dischargeweight            
0.00                      5
1.00                      4
1.81                      1
4.00                      1
8.50                      1
...                     ...
305.60                    1
318.74                    1
328.20                    1
340.00                    1
NaN                   20468

[4356 rows x 1 columns]


# preprocessing data

In [5]:
data=df.copy()
# only keeps male and female 
data = data[data['gender'].isin(['Male', 'Female'])]
# drop rows where 'unitdischargestatus','unitdischargelocation','apacheadmissiondx' is NaN
data = data.dropna(subset=['unitdischargestatus','unitdischargelocation','apacheadmissiondx'])


### encode apacheadmissiondx to graph structure

In [6]:
data['apacheadmissiondx'] = data['apacheadmissiondx'].str.replace(" ", "", regex=True)
data['apacheadmissiondx'] = data['apacheadmissiondx'].str.replace(',', '/')
split = data['apacheadmissiondx'].str.split('/')
data['first'] = split.str[0]
data['second'] = split.str[1]

In [7]:
import networkx as nx
from node2vec import Node2Vec  

# Create the graph
G = nx.DiGraph()

# Adding edges based on 'first', 'second', and 'third' relationships
for _, row in data.iterrows():
    if pd.notna(row['first']) and pd.notna(row['second']):
        G.add_edge(row['first'], row['second'])
    elif pd.notna(row['first']):
        G.add_edge(row['first'], row['first'])
        
# Generate embeddings using node2vec
node2vec = Node2Vec(G, dimensions=16, walk_length=10, num_walks=100,workers=4)
model = node2vec.fit(window=5,min_count=1, batch_words=4)

# Obtain node embeddings
node_embeddings = {node: model.wv[node] for node in G.nodes()}

# Map embeddings back to the dataset
def get_embedding(row):
    if pd.notna(row['first']) and pd.notna(row['second']):

        first_embed = node_embeddings.get(row['first'])
        second_embed = node_embeddings.get(row['second'])
        combined_embedding = [(f + s) / 2 for f, s in zip(first_embed, second_embed)]
    elif pd.notna(row['first']):

        combined_embedding = node_embeddings.get(row['first'])
    return combined_embedding


data['apacheadmissiondx_embedding'] = data.apply(get_embedding, axis=1)


Computing transition probabilities: 100%|██████████| 357/357 [00:00<00:00, 74566.33it/s]
Generating walks (CPU: 1): 100%|██████████| 25/25 [00:00<00:00, 35.48it/s]
Generating walks (CPU: 3): 100%|██████████| 25/25 [00:00<00:00, 36.28it/s]
Generating walks (CPU: 2): 100%|██████████| 25/25 [00:00<00:00, 35.34it/s]
Generating walks (CPU: 4): 100%|██████████| 25/25 [00:00<00:00, 36.12it/s]


In [None]:
def classify_discharge_location(location):
    high_risk = ['ICU', 'Other ICU', 'Other ICU (CABG)', 'Operating Room']
    medium_risk = ['Telemetry', 'Other Hospital', 'Other External',  'Other Internal','Step-Down Unit (SDU)']
    low_risk = ['Nursing Home', 'Skilled Nursing Facility', 'Floor', 'Acute Care/Floor','Rehabilitation']
    home= ['Home']
    death = ['Death']
    
    if location in high_risk:
        return '3'
    elif location in medium_risk:
        return '2'
    elif location in low_risk:
        return '1'
    elif location in death:
        return '4'
    elif location in home:
        return '0'
    else:
        return 'Unknown'

data['discharge_risk_category'] = data['unitdischargelocation'].apply(classify_discharge_location)

In [72]:
data = data[data['discharge_risk_category'] != 'Unknown']
dataset=data.drop(columns=['unitadmitsource','apacheadmissiondx','first','second','unitdischargelocation'])
dataset['gender'] = dataset['gender'].map({'Male': 0, 'Female': 1})
dataset['unitdischargestatus'] = dataset['unitdischargestatus'].map({'Alive': -1, 'Expired': 1})


In [73]:
dataset.head()

Unnamed: 0,patientunitstayid,gender,age,admissionweight,dischargeweight,unitdischargeoffset,unitdischargestatus,max_pooled_embedding,apacheadmissiondx_embedding,discharge_risk_category
0,141168,1,70,84.3,85.8,3596,1,[ 1.2620435 -0.40905985 2.139895 3.092707...,"[-0.030633728951215744, -0.015972275286912918,...",4
1,141265,0,67,100.0,91.8,6068,-1,[ 0.2664819 -0.5805186 0.06527747 0.296352...,"[0.0007660947740077972, 0.017550092190504074, ...",1
2,141266,0,73,120.4,112.9,1501,-1,[ 0.45944792 -0.32571676 0.28904593 0.498963...,"[-0.04389399290084839, -0.002439655363559723, ...",1
3,141276,1,59,156.6,156.6,1684,-1,[ 0.2664819 -0.5805186 0.06527747 0.296352...,"[0.03449591249227524, -0.01944483444094658, 0....",0
4,141284,0,63,,88.5,2076,-1,[ 0.45944792 -0.62813926 0.16925086 0.403259...,"[-2.948194, -1.5538762, 2.7141335, 0.51363313,...",1


In [None]:
# dataset.to_csv(porcesseddir2 + 'embedded_dataset.csv')
# dataset=pd.read_csv(porcesseddir2 + 'embedded_dataset.csv')

## load dataset

In [74]:
from sklearn.model_selection import train_test_split

X=dataset.drop(columns=['discharge_risk_category','unitdischargestatus'])
y=dataset[['unitdischargestatus']]

In [83]:
X.head()

Unnamed: 0,patientunitstayid,gender,age,admissionweight,dischargeweight,unitdischargeoffset,max_pooled_embedding,apacheadmissiondx_embedding
0,141168,1,70,84.3,85.8,3596,[ 1.2620435 -0.40905985 2.139895 3.092707...,"[-0.030633728951215744, -0.015972275286912918,..."
1,141265,0,67,100.0,91.8,6068,[ 0.2664819 -0.5805186 0.06527747 0.296352...,"[0.0007660947740077972, 0.017550092190504074, ..."
2,141266,0,73,120.4,112.9,1501,[ 0.45944792 -0.32571676 0.28904593 0.498963...,"[-0.04389399290084839, -0.002439655363559723, ..."
3,141276,1,59,156.6,156.6,1684,[ 0.2664819 -0.5805186 0.06527747 0.296352...,"[0.03449591249227524, -0.01944483444094658, 0...."
4,141284,0,63,,88.5,2076,[ 0.45944792 -0.62813926 0.16925086 0.403259...,"[-2.948194, -1.5538762, 2.7141335, 0.51363313,..."


In [78]:
y = y['unitdischargestatus'].values


In [81]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Train Risk Score Models


In [82]:
sparsity =6
parent_size = 6

RiskScoreOptimizer_m = RiskScoreOptimizer(X = X_train, y = y_train, k = sparsity, parent_size = parent_size)

  return mean(axis=axis, dtype=dtype, out=out, **kwargs)


TypeError: loop of ufunc does not support argument 0 of type float which has no callable sqrt method

In [None]:
start_time = time.time()
RiskScoreOptimizer_m.optimize()
print("Optimization takes {:.2f} seconds.".format(time.time() - start_time))