In [128]:
import numpy as np
import pandas as pd
import os
import shutil
import csv
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.python.keras.preprocessing.sequence import pad_sequences
import sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer as imputer
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import glob
from driver import stack_list, stack_labels, return_shapes, return_to_original, split_probs




In [2]:
df_list = []
labels = []

for f in sorted(glob.glob("training/" + "/*.psv")):
    df_list.append(pd.read_csv(f,sep='|'))
    labels.append(df_list[len(df_list)-1]["SepsisLabel"])
    

#Making sure that imputation does not removes the EtC02 column
#Also removing the sepsislable column from the original data
for i in df_list:
    i.iloc[0,i.columns.get_loc('EtCO2')] = 0
    i.drop('SepsisLabel', axis=1, inplace=True)
    
    
df_list2 = []
labels2 = []

for f in sorted(glob.glob("training_setB/" + "/*.psv")):
    df_list2.append(pd.read_csv(f,sep='|'))
    labels2.append(df_list2[len(df_list2)-1]["SepsisLabel"])
    
    
for i in df_list2:
    i.drop('SepsisLabel', axis=1 , inplace=True)

In [74]:
# x_train, x_test, y_train, y_test = train_test_split(df_list,labels,test_size=0.4)
x_train = df_list
y_train = labels

x_test = df_list2
y_test = labels2

#Getting the datas original shapes before stacking tjem
train_shapes = return_shapes(x_train)
test_shapes = return_shapes(x_test)

#Normalizing and stacking the normalized data
x_train = stack_list(x_train)
x_train = (x_train-x_train.mean()) / x_train.std()
x_test = stack_list(x_test)
x_test = (x_test - x_test.mean()) / x_test.std()

y_train = stack_labels(y_train)
# y_test = stack_labels(y_test)


# #Shuffling the train data for better randomized result
# randomizer = np.random.permutation(len(x_train))
# x_train = x_train[randomizer]
# y_train = y_train[randomizer]

In [130]:
classifier = sklearn.ensemble.RandomForestClassifier(n_estimators=60)
classifier.fit(x_train, y_train)

('Creating the model is done ! Now commencing to the prediction part...')
predicted_labels = return_to_original(classifier.predict(x_test),test_shapes)
probabilities = split_probs(return_to_original(classifier.predict_proba(x_test),test_shapes))
# real_values = return_to_original(y_test, test_shapes)


print('This is the length of the predicted_labels list: {}'.format(len(predicted_labels)))
print('This is the length of the predicted_probabilities list: {}'.format(len(probabilities)))

print('*************************************************************************************************')
ran_val = np.random.randint(low=5, high= len(predicted_labels)-1)
print('This is the shape of predicted_labels for patient no {}: {}'.format(ran_val,predicted_labels[ran_val].shape))
print('This is the shape of predicted_probabilities for patient no {}: {}'.format(ran_val,probabilities[ran_val].shape))
print('And the real shape of this patient no {} is : {}'.format(ran_val,test_shapes[ran_val]))

This is the length of the predicted_labels list: 20000
This is the length of the predicted_probabilities list: 20000
*************************************************************************************************
This is the shape of predicted_labels for patient no 9543: (22,)
This is the shape of predicted_probabilities for patient no 9543: (22,)
And the real shape of this patient no 9543 is : 22


In [138]:
predicted_labels = classifier.predict(x_test)
result = []
index = 0
    
for i,k in enumerate(test_shapes):
    result.append(predicted_labels[index:index+k])
    index += k
    

print(len(predicted_labels))
print(len(result))
print(result[1].shape)
print(result[1])
print(result[2])




761995
20000
(25,)
[0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0]


In [139]:
# predicted_labels = classifier.predict(x_test)
predicted_labels = result

check_list(y_train)
check_list(y_test)

print('**********************')

check_list(predicted_labels)
check_list(real_values)

773079
751215
**********************
14666
20000


In [140]:
print(predicted_labels[28])
print(predicted_labels[12])

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]


In [55]:
#Writing files to a directory under predictions
output_directory = "predictions"
if os.path.exists(output_directory):
    shutil.rmtree(output_directory)
    os.mkdir(output_directory)
else: 
    os.mkdir(output_directory)

for i,f in enumerate(predicted_labels):
    with open('predictions/p1%05d.psv' % i, 'w') as f:
        f.write('PredictedProbability|PredictedLabel\n')
        for(s,l) in zip(probabilities[i],predicted_labels[i]):
            f.write('%g|%d\n' % (s,l))
    

In [56]:
def predict_outcomes():
    df_list = []
    labels = []

    for f in sorted(glob.glob("training/" + "/*.psv")):
        df_list.append(pd.read_csv(f,sep='|'))
        labels.append(df_list[len(df_list)-1]["SepsisLabel"])


    #Making sure that imputation does not removes the EtC02 column
    #Also removing the sepsislable column from the original data
    for i in df_list:
        i.iloc[0,i.columns.get_loc('EtCO2')] = 0
        i.drop('SepsisLabel', axis=1, inplace=True)


    df_list2 = []
    labels2 = []

    for f in sorted(glob.glob("training_setB/" + "/*.psv")):
        df_list2.append(pd.read_csv(f,sep='|'))
        labels2.append(df_list2[len(df_list2)-1]["SepsisLabel"])


    for i in df_list2:
        i.drop('SepsisLabel', axis=1 , inplace=True)


    x_train = df_list
    y_train = labels

    x_test = df_list2
    y_test = labels2

    #Getting the datas original shapes before stacking tjem
    train_shapes = return_shapes(x_train)
    test_shapes = return_shapes(x_test)

    #Normalizing and stacking the normalized data
    x_train = stack_list(x_train)
    x_train = (x_train-x_train.mean()) / x_train.std()
    x_test = stack_list(x_test)
    x_test = (x_test - x_test.mean()) / x_test.std()

    y_train = stack_labels(y_train)
    y_test = stack_labels(y_test)
    
    randomizer = np.random.permutation(len(x_train))
    x_train = x_train[randomizer]
    y_train = y_train[randomizer]

    classifier = sklearn.ensemble.RandomForestClassifier(n_estimators=100)
    classifier.fit(x_train, y_train)
    
    
    #Checking the dimensions of the data, as well as returning them to their original shapes
    predicted_labels = return_to_original(classifier.predict(x_test),test_shapes)
    probabilities = split_probs(return_to_original(classifier.predict_proba(x_test),test_shapes))
    real_values = return_to_original(y_test, test_shapes)

    print('This is the length of the predicted_labels list: {}'.format(len(predicted_labels)))
    print('This is the length of the predicted_probabilities list: {}'.format(len(probabilities)))

    ran_val = np.random.randint(low=5, high= len(predicted_labels)-1)
    print('This is the shape of predicted_labels for patient no {}: {}'.format(ran_val,predicted_labels[ran_val].shape))
    print('This is the shape of predicted_probabilities for patient no {}: {}'.format(ran_val,probabilities[ran_val].shape))
    print('And the real shape of this patient no {} is : {}'.format(ran_val,test_shapes[ran_val]))
    
    
    #Writing files to a directory under predictions
    output_directory = "predictions"
    if os.path.exists(output_directory):
        shutil.rmtree(output_directory)
        os.mkdir(output_directory)
    else: 
        os.mkdir(output_directory)

    for i,f in enumerate(predicted_labels):
        with open('predictions/p1%05d.psv' % i, 'w') as f:
            f.write('PredictedProbability|PredictedLabel\n')
            for(s,l) in zip(probabilities[i],predicted_labels[i]):
                f.write('%g|%d\n' % (s,l))
    
    print('Done')
    return predicted_labels, probabilities

In [73]:
#Returns the detection hour of the patient.
def detection_hour(label, prediction):
    if np.max(label) == 0 or np.max(prediction) == 0:
        return 500
    
    real_time = np.argmax(label)
    prediction_time = np.argmax(prediction)
    
    return prediction_time - real_time
    
#Checks how many 0's present in the current list.
def check_list(list):
    counter = 0
    for i in list:
        if np.max(i) == 0:
            counter+= 1
        
    print(counter)
    
    return