We did provide the file ``` requirements.txt ``` though. 

In [2]:
#imports
import data
import preprocessing
from sklearn.linear_model import RidgeCV
from sklearn.metrics import r2_score, roc_auc_score
from sklearn.ensemble import GradientBoostingClassifier 
import prepare_submission
from sklearn.model_selection import train_test_split
import importlib
import copy
from sklearn.model_selection import KFold
import threading

### Data Pre Processing

As described above, the train and test features contain a lot of missing values. Therefore, we need to develop a
strategy to deal with these missing values. In a nutshell, we are imputing with the median value for each feature
and standardizing each feature to zero mean and unit variance. This happens in three steps:

1. ```preprocessing.prepare_features()``` fills in missing values for each patient based on that patient's own data. This means
that if patient ```i``` has a missing feature at a certain timestamp, it will be filled in with the median from the other
timestamps. Important: if a patient is missing a value for a feature at all timestamps, those values are left as
missing and will be filled in during the next step. Finally, each patient is flattened into a single row vector.

2. ```preprocessing.impute_features()``` looks at the flattened vectors of patients in the train and test datasets. Here,
missing values are again imputed with the median, so if patient ```i``` was missing values for blood pressure at all
timestamps, then those are filled in here with the median blood pressure of all patients for each timestamp ```j```.
After calling this method, the train and test features will not contain any ```np.nan``` anymore.

3. ```preprocessing.standardize_features()``` standardizes the features to unit variance and zero mean over train and
test data.


In [None]:
# Set to True to hold out part of the training data as a local validation set;
# leave False to train on all training data and predict on the real test set.
split = False

raw_train_features = data.get_training_features()
raw_train_labels = data.get_training_labels()
raw_test_features = data.get_test_features()

if not split:
    # Pick up edits made to preprocessing.py since it was first imported.
    importlib.reload(preprocessing)

    # Reshape the features to one row per patient and impute on a
    # per-patient level.
    reshaped_train_features = preprocessing.prepare_features(
        raw_train_features, appendix="train", read_from_file=False)
    reshaped_test_features = preprocessing.prepare_features(
        raw_test_features, appendix='test')

    # Fill in the data points that are still missing.
    train, test = preprocessing.impute_features(
        reshaped_train_features, reshaped_test_features)

    # Standardize the features.
    train_features, test_features = preprocessing.standardize_features(train, test)
    train_labels = raw_train_labels

else:
    # Splitting strategy so that we can give a score to our models.
    del raw_test_features  # not needed in this mode, so free the memory

    reshaped_train_features = preprocessing.prepare_features(
        raw_train_features, read_from_file=False)

    # Hold out a third of the patients. The split is seeded like every other
    # stochastic component in this notebook so that scores are reproducible.
    s_train_features, s_test_features, s_train_labels, s_test_labels = train_test_split(
        copy.deepcopy(reshaped_train_features),
        copy.deepcopy(raw_train_labels),
        test_size=0.33,
        random_state=0,
    )

    # Fill in values that were not imputed in the previous step because a
    # patient was missing all of them.
    train, test = preprocessing.impute_features(s_train_features, s_test_features)

    train_features, test_features = preprocessing.standardize_features(train, test)
    train_labels = s_train_labels
    test_labels = s_test_labels

In [None]:
# Prepare the submission instance that the subtask cells below add their
# predictions to. It is constructed from the index of the test features.
# NOTE(review): presumably that index identifies the patients/rows of the
# output table -- confirm against prepare_submission.Submission.
our_submission = prepare_submission.Submission(test_features.index)

## Subtask 1

For this model, we use GradientBoostingClassifier, which fits a number of decision trees on the data. Trees are great
for this application because they offer non-linearity without us having to use a neural network.
When researching the best methods for fitting trees to our data, we read that GBCs offer a very good model
because additional trees are fitted to predict the negative gradient of the cost function.

In [None]:

# Targets of subtask 1; one classifier is trained per label.
relevant_labels_1 = ['LABEL_BaseExcess', 'LABEL_Fibrinogen', 'LABEL_AST', 'LABEL_Alkalinephos',
                     'LABEL_Bilirubin_total', 'LABEL_Lactate', 'LABEL_TroponinI', 'LABEL_SaO2',
                     'LABEL_Bilirubin_direct', 'LABEL_EtCO2']

# Per-label containers. `models` and `threads` are filled in below;
# `probabilities_1` is filled in by the prediction cell.
models = {label: None for label in relevant_labels_1}
probabilities_1 = {label: None for label in relevant_labels_1}
threads = {label: None for label in relevant_labels_1}

# NOTE(review): newer scikit-learn releases rename 'deviance' to 'log_loss';
# keep this in sync with the version pinned in requirements.txt.
model_params = {'loss': 'deviance', 'random_state': 0}

# Fit all models, each in its own thread so the fits can overlap.
for label in relevant_labels_1:
    models[label] = GradientBoostingClassifier(**model_params)
    threads[label] = threading.Thread(
        target=models[label].fit,
        args=(train_features, train_labels[label]),
    )
    threads[label].start()

# Block until every model has finished fitting.
for thread in threads.values():
    thread.join()

In [None]:
# Predict on the test set: for every subtask-1 label keep column 1 of the
# classifier's predict_proba output.
probabilities_1.update(
    (name, clf.predict_proba(test_features)[:, 1])
    for name, clf in models.items()
)

# Reload the submission module, then hand the per-label probabilities to the
# submission object.
importlib.reload(prepare_submission)
our_submission.add_task_1_dict(probabilities_1)

## Subtask 2

Here we use the same strategy as in task 1.

In [None]:
# Target of subtask 2.
label_2 = 'LABEL_Sepsis'
model_params = {'loss': 'deviance', 'random_state': 0}

# Same classifier family as in subtask 1, but with 200 boosting stages.
model = GradientBoostingClassifier(**model_params, n_estimators=200)
model.fit(train_features, train_labels[label_2])

# Compute predictions on the test set.
probabilities_2 = model.predict_proba(test_features)

# Hand column 1 of the predicted probabilities to the submission object.
importlib.reload(prepare_submission)
our_submission.add_task_2(probabilities_2[:, 1])

## Subtask 3

The idea here is to fit a linear regressor with a ridge penalty determined with cross validation.

In [None]:

# Regression targets of subtask 3; one ridge model is trained per label.
relevant_labels_3 = ['LABEL_RRate', 'LABEL_ABPm', 'LABEL_SpO2', 'LABEL_Heartrate']

# Seeded 5-fold splitter used by RidgeCV to pick the ridge penalty.
splitter = KFold(n_splits=5, random_state=0, shuffle=True)


# Build model
def scorer(estimator, X, y):
    """Return the R^2 score of ``estimator``'s predictions for ``X`` against ``y``."""
    return r2_score(y, estimator.predict(X))


general_parameters = {'fit_intercept': True, 'scoring': scorer, 'cv': splitter}
models = {label: RidgeCV(**general_parameters) for label in relevant_labels_3}

# Fit model. `train_labels` is indexed per label rather than being overwritten
# with a column subset, so the earlier subtask cells can still be re-run.
for label in relevant_labels_3:
    models[label].fit(train_features, train_labels[label])

# Make predictions
submission_predictions = {
    label: model.predict(test_features) for label, model in models.items()
}

# Store prediction
our_submission.add_task_3(submission_predictions)

### How do the models perform?
Now if we set the variable split to True in the beginning of the notebook, we can split the training data into a train set and a validation set. This way we can see how our models perform.

In [None]:
if split:
    # Validation mode: score the predictions against the held-out labels.
    # score_submission is not among the notebook's top-level imports, so it is
    # imported here. NOTE(review): assumes a project-local score_submission
    # module sits next to the other helper modules -- confirm.
    import score_submission
    importlib.reload(score_submission)
    score_submission.get_score(test_labels, our_submission.data)
else:
    # Submission mode: let the submission object save itself, then also write
    # its data as a plain csv and as a zip-compressed csv with three decimals.
    our_submission.save()
    our_submission.data.to_csv('final_submission.csv', index=True, float_format='%.3f')
    our_submission.data.to_csv('final_submission.zip', index=True, float_format='%.3f',
                               compression='zip')