In [1]:
import numpy as np
import pandas as pd
import os
import shutil
import csv
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.python.keras.preprocessing.sequence import pad_sequences
import sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer as imputer
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import glob
from driver import stack_list, stack_labels, return_shapes, return_to_original, split_probs




In [2]:
# --- Set A ("training/"): one feature frame and one label series per patient ---
df_list = []
labels = []

for psv_path in sorted(glob.glob("training/" + "/*.psv")):
    patient = pd.read_csv(psv_path, sep='|')
    # pop() returns the SepsisLabel column and removes it from the features.
    labels.append(patient.pop('SepsisLabel'))
    # Write a 0 into the first EtCO2 row so the column is never entirely
    # missing; this is meant to keep the later imputation from dropping EtCO2.
    patient.iloc[0, patient.columns.get_loc('EtCO2')] = 0
    df_list.append(patient)


# --- Set B ("training_setB/"): same layout, but EtCO2 is left untouched ---
df_list2 = [
    pd.read_csv(psv_path, sep='|')
    for psv_path in sorted(glob.glob("training_setB/" + "/*.psv"))
]
labels2 = [patient.pop('SepsisLabel') for patient in df_list2]

In [3]:
# Set A is used for training and set B for testing (no random split).
# The inputs are read straight from df_list / df_list2 (never from a previously
# rebound x_train / x_test), so re-running this cell gives the same result.

# Getting the data's original shapes before stacking them, so that stacked
# predictions can later be split back per patient.
train_shapes = return_shapes(df_list)
test_shapes = return_shapes(df_list2)

# Stacking the per-patient data.
x_train = stack_list(df_list)
x_test = stack_list(df_list2)

# Normalizing: the statistics are computed on the training data ONLY and then
# applied to both sets. Standardising the test set with its own mean/std would
# put it on a different scale from the one the classifier is fitted on.
train_mean = x_train.mean()
train_std = x_train.std()
x_train = (x_train - train_mean) / train_std
x_test = (x_test - train_mean) / train_std

y_train = stack_labels(labels)
y_test = stack_labels(labels2)

In [6]:
# A fixed random_state makes the fitted forest, and therefore every prediction
# derived from it, reproducible across kernel restarts.
classifier = sklearn.ensemble.RandomForestClassifier(n_estimators=60, random_state=0)
classifier.fit(x_train, y_train)

# Predict on the stacked test rows, then split the results back per patient.
predicted_labels = return_to_original(classifier.predict(x_test), test_shapes)
probabilities = split_probs(return_to_original(classifier.predict_proba(x_test), test_shapes))
real_values = return_to_original(y_test, test_shapes)


print('This is the length of the predicted_labels list: {}'.format(len(predicted_labels)))
print('This is the length of the predicted_probabilities list: {}'.format(len(probabilities)))

# Spot-check one randomly chosen patient: predictions and probabilities must
# have as many entries as the patient originally had rows. Every patient index
# is eligible; the check is skipped when there is nothing to check.
if len(predicted_labels) > 0:
    ran_val = np.random.randint(low=0, high=len(predicted_labels))
    print('This is the shape of predicted_labels for patient no {}: {}'.format(ran_val,predicted_labels[ran_val].shape))
    print('This is the shape of predicted_probabilities for patient no {}: {}'.format(ran_val,probabilities[ran_val].shape))
    print('And the real shape of this patient no {} is : {}'.format(ran_val,test_shapes[ran_val]))

This is the length of the predicted_labels list: 20000
This is the length of the predicted_probabilities list: 20000
This is the shape of predicted_labels for patient no 17360: (36,)
This is the shape of predicted_probabilities for patient no 17360: (36,)
And the real shape of this patient no 17360 is : 36


In [29]:
# Writing files to a directory under predictions
output_directory = "predictions"

# Start from an empty directory so files from an earlier run cannot mix in.
if os.path.exists(output_directory):
    shutil.rmtree(output_directory)
os.mkdir(output_directory)

# One file per patient, named after the patient's 0-based position in
# predicted_labels.
# NOTE(review): the names are generated from the position, not taken from the
# input file names - confirm that whatever consumes this directory pairs files
# by sorted order rather than by name.
for patient_idx in range(len(predicted_labels)):
    prediction_path = os.path.join(output_directory, 'p1%05d.psv' % patient_idx)
    with open(prediction_path, 'w') as prediction_file:
        prediction_file.write('PredictedProbability|PredictedLabel\n')
        for score, label in zip(probabilities[patient_idx], predicted_labels[patient_idx]):
            prediction_file.write('%g|%d\n' % (score, label))
    

In [None]:
def _read_patient_files(directory):
    """Read every ``*.psv`` file in ``directory``, sorted by path.

    Returns a ``(frames, labels)`` pair: one feature DataFrame per file with
    the ``SepsisLabel`` column removed, and the removed label Series.
    """
    frames = []
    labels = []
    for psv_path in sorted(glob.glob(os.path.join(directory, "*.psv"))):
        frame = pd.read_csv(psv_path, sep='|')
        labels.append(frame.pop('SepsisLabel'))
        frames.append(frame)
    return frames, labels


def _write_prediction_files(output_directory, probabilities, predicted_labels):
    """Recreate ``output_directory`` and write one ``p1NNNNN.psv`` file per patient."""
    # Start from an empty directory so files from an earlier run cannot mix in.
    if os.path.exists(output_directory):
        shutil.rmtree(output_directory)
    os.mkdir(output_directory)

    for patient_idx in range(len(predicted_labels)):
        prediction_path = os.path.join(output_directory, 'p1%05d.psv' % patient_idx)
        with open(prediction_path, 'w') as prediction_file:
            prediction_file.write('PredictedProbability|PredictedLabel\n')
            for score, label in zip(probabilities[patient_idx], predicted_labels[patient_idx]):
                prediction_file.write('%g|%d\n' % (score, label))


def predict_outcomes(train_directory="training", test_directory="training_setB",
                     output_directory="predictions", n_estimators=100,
                     random_state=None):
    """Train a random forest on one patient set and write predictions for another.

    Called without arguments this trains on ``training/``, predicts for
    ``training_setB/`` and writes the result to ``predictions/``.

    Parameters
    ----------
    train_directory : str
        Directory holding the ``*.psv`` training files.
    test_directory : str
        Directory holding the ``*.psv`` files to predict.
    output_directory : str
        Directory that is deleted (if present), recreated and filled with one
        ``PredictedProbability|PredictedLabel`` file per test patient.
    n_estimators : int
        Number of trees in the forest.
    random_state : int or None
        Seed passed to the forest; ``None`` keeps the unseeded behaviour.

    Returns
    -------
    tuple
        ``(predicted_labels, probabilities)``, both split back per patient.
    """
    x_train, y_train = _read_patient_files(train_directory)
    x_test, _ = _read_patient_files(test_directory)

    # Making sure that imputation does not remove the EtCO2 column: a 0 is
    # written into the first EtCO2 row of every TRAINING patient. The test
    # patients are intentionally left as they are.
    for frame in x_train:
        frame.iloc[0, frame.columns.get_loc('EtCO2')] = 0

    # Getting the test data's original shapes before stacking, so the stacked
    # predictions can be split back per patient afterwards.
    test_shapes = return_shapes(x_test)

    x_train = stack_list(x_train)
    x_test = stack_list(x_test)

    # Normalizing with statistics of the training data only; the same shift and
    # scale are applied to the test data so both sets share one feature scale.
    train_mean = x_train.mean()
    train_std = x_train.std()
    x_train = (x_train - train_mean) / train_std
    x_test = (x_test - train_mean) / train_std

    y_train = stack_labels(y_train)

    classifier = sklearn.ensemble.RandomForestClassifier(
        n_estimators=n_estimators, random_state=random_state)
    classifier.fit(x_train, y_train)

    # Returning the stacked predictions to their original per-patient shapes.
    predicted_labels = return_to_original(classifier.predict(x_test), test_shapes)
    probabilities = split_probs(return_to_original(classifier.predict_proba(x_test), test_shapes))

    print('This is the length of the predicted_labels list: {}'.format(len(predicted_labels)))
    print('This is the length of the predicted_probabilities list: {}'.format(len(probabilities)))

    # Checking the dimensions of one randomly chosen patient; every index is
    # eligible and the check is skipped when there are no patients at all.
    if len(predicted_labels) > 0:
        ran_val = np.random.randint(low=0, high=len(predicted_labels))
        print('This is the shape of predicted_labels for patient no {}: {}'.format(ran_val,predicted_labels[ran_val].shape))
        print('This is the shape of predicted_probabilities for patient no {}: {}'.format(ran_val,probabilities[ran_val].shape))
        print('And the real shape of this patient no {} is : {}'.format(ran_val,test_shapes[ran_val]))

    _write_prediction_files(output_directory, probabilities, predicted_labels)

    print('Done')
    return predicted_labels, probabilities

In [None]:
import numpy as np, os, sys

def load_challenge_data(file):
    """Load one pipe-delimited patient file as a 2-D float array.

    The first line of ``file`` is read as the ``|``-separated header; every
    following line is parsed as numeric data. When the last header entry is
    ``SepsisLabel`` the last data column is dropped.

    Parameters
    ----------
    file : str
        Path of the file to read.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_rows, n_feature_columns)``; always two-dimensional,
        also for a file that contains a single data row.
    """
    with open(file, 'r') as f:
        header = f.readline().strip()
        column_names = header.split('|')
        # ndmin=2 keeps a one-row file as shape (1, n_columns). Without it
        # loadtxt returns a 1-D array, the column slice below raises
        # IndexError and len() would report columns instead of rows.
        data = np.loadtxt(f, delimiter='|', ndmin=2)

    # Ignore SepsisLabel column if present.
    if column_names[-1] == 'SepsisLabel':
        data = data[:, :-1]

    return data

def save_challenge_predictions(file, scores, labels):
    """Write paired scores and labels to ``file`` in pipe-delimited form.

    The file starts with the header ``PredictedProbability|PredictedLabel``
    followed by one ``score|label`` line per pair; the score is formatted with
    ``%g`` and the label with ``%d``. Pairing stops at the shorter of the two
    sequences. An existing file is overwritten.
    """
    with open(file, 'w') as out:
        out.write('PredictedProbability|PredictedLabel\n')
        out.writelines('%g|%d\n' % pair for pair in zip(scores, labels))

if __name__ == '__main__':
    # The directories are fixed here and no command-line argument is ever read.
    # The former `len(sys.argv) != 3` guard was therefore removed: it passed or
    # failed purely depending on how the interpreter/kernel had been launched.
    input_directory = "training_setB"
    output_directory = "predictions2"

    # Find files: regular, non-hidden files whose name ends in 'psv'.
    files = [
        name for name in os.listdir(input_directory)
        if os.path.isfile(os.path.join(input_directory, name))
        and not name.lower().startswith('.')
        and name.lower().endswith('psv')
    ]

    if not os.path.isdir(output_directory):
        os.mkdir(output_directory)

    # NOTE(review): `classifier` is not created in this cell; it has to exist
    # in the kernel already (fitted in an earlier cell). It is also fed raw,
    # zero-imputed rows here - confirm that this matches the preprocessing
    # the classifier was fitted with.
    #
    # predict_proba returns one column per entry of classifier.classes_, so the
    # probability of sepsis is the column that belongs to class 1 - not
    # necessarily (and for 0/1 labels never) column 0.
    sepsis_columns = np.flatnonzero(classifier.classes_ == 1)
    if sepsis_columns.size == 0:
        raise ValueError('The classifier has no class 1 (sepsis); cannot report a sepsis probability.')
    sepsis_column = int(sepsis_columns[0])

    # Missing values are replaced by a constant (0 for numeric data).
    imp = imputer(missing_values = np.nan, strategy = 'constant')

    # Iterate over files.
    print('Predicting sepsis labels...')
    num_files = len(files)
    for i, f in enumerate(files):
        print('    {}/{}...'.format(i+1, num_files))

        # Load data.
        input_file = os.path.join(input_directory, f)
        data = load_challenge_data(input_file)

        # Make predictions.
        num_rows = len(data)
        scores = np.zeros(num_rows)
        labels = np.zeros(num_rows)
        if num_rows > 0:
            # Constant imputation and forest prediction both work row by row,
            # so one pass over the whole file yields the same value for row t
            # as imputing and predicting the prefix data[:t+1] would.
            # If the imputation is ever changed to a strategy that looks at
            # other rows (mean, median, ...), go back to prefix-wise processing
            # so that row t never depends on later rows.
            current_data = imp.fit_transform(data)
            scores[:] = classifier.predict_proba(current_data)[:, sepsis_column]
            labels[:] = classifier.predict(current_data)

        # Save results under the same file name as the input.
        output_file = os.path.join(output_directory, f)
        save_challenge_predictions(output_file, scores, labels)

    print('Done.')