In [14]:
# import packages
from statistics import mean
import pandas as pd
import numpy as np

# import Sklearn packages
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
import sklearn.metrics as metrics
from sklearn.datasets import load_digits
from sklearn.feature_selection import SelectKBest
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import GradientBoostingClassifier

# Read data: training features, training labels and the features to predict on.
df_features = pd.read_csv("train_features.csv")
df_labels = pd.read_csv("train_labels.csv")
df_test = pd.read_csv("test_features.csv")

# Feature columns used in subtask 1.
features_subtask1 = [
    "BaseExcess",
    "Fibrinogen",
    "AST",
    "Alkalinephos",
    "Bilirubin_total",
    "Lactate",
    "TroponinI",
    "SaO2",
    "Bilirubin_direct",
    "EtCO2",
]
# Matching label column names ("LABEL_" + feature name, in the same order).
label_subtask1 = [
    "LABEL_BaseExcess",
    "LABEL_Fibrinogen",
    "LABEL_AST",
    "LABEL_Alkalinephos",
    "LABEL_Bilirubin_total",
    "LABEL_Lactate",
    "LABEL_TroponinI",
    "LABEL_SaO2",
    "LABEL_Bilirubin_direct",
    "LABEL_EtCO2",
]

# Unique patient ids, in order of first appearance, and how many there are (train).
patientID = df_features["pid"].drop_duplicates(keep="first")
nbrpatient = len(patientID)

# Same for the test set.
patientIDTest = df_test["pid"].drop_duplicates(keep="first")
nbrpatientTest = len(patientIDTest)

In [15]:
# FOR X_TRAIN

# preprocess 1:
# Fill missing values with the per-column median of the rows whose "Age" falls
# in the same age band. Bands are 5 years wide and inclusive on both ends
# (10-15, 15-20, ..., 90-95); the last band is 95-101.
#
# NOTE(review): neighbouring bands share their boundary age (e.g. Age == 15 is in
# both 10-15 and 15-20), so a boundary row is filled by whichever of its two bands
# is processed first. The processing order below (10, 20, ..., 90, then
# 15, 25, ..., 95) is kept deliberately so the result is unchanged.
# Rows with Age < 10 or Age > 101 belong to no band and are only handled by
# the column-mean fallback in preprocess 2.
train_age = df_features["Age"]
band_lower_bounds = list(range(10, 100, 10)) + list(range(15, 100, 10))

# All masks are built before any filling takes place.
band_masks = [
    (train_age >= lower) & (train_age <= (101 if lower == 95 else lower + 5))
    for lower in band_lower_bounds
]

for in_band in band_masks:
    band_rows = df_features[in_band]
    df_features[in_band] = band_rows.fillna(band_rows.median())

# preprocess 2:
# Any value still missing (a column that is entirely NaN inside a band has a NaN
# median; the original notes mention HCO3) is replaced by the mean of the whole column.
df_features.fillna(df_features.mean(), inplace=True)

In [16]:
# FOR X_TEST

# preprocess 1:
# Fill missing values with the per-column median of the test rows whose "Age"
# falls in the same age band. Bands are 5 years wide and inclusive on both ends
# (10-15, 15-20, ..., 90-95); the last band is 95-101.
#
# NOTE(review): neighbouring bands share their boundary age (e.g. Age == 15 is in
# both 10-15 and 15-20), so a boundary row is filled by whichever of its two bands
# is processed first. The processing order below (10, 20, ..., 90, then
# 15, 25, ..., 95) is kept deliberately so the result is unchanged.
# NOTE(review): the medians and means used here are computed from df_test itself,
# not from the training data.
test_age = df_test["Age"]
band_lower_bounds = list(range(10, 100, 10)) + list(range(15, 100, 10))

# All masks are built before any filling takes place.
band_masks = [
    (test_age >= lower) & (test_age <= (101 if lower == 95 else lower + 5))
    for lower in band_lower_bounds
]

for in_band in band_masks:
    band_rows = df_test[in_band]
    df_test[in_band] = band_rows.fillna(band_rows.median())

# preprocess 2:
# Whatever is still missing is replaced by the mean of the whole column.
df_test.fillna(df_test.mean(), inplace=True)

In [17]:
# Subtask 1: for every lab test in `features_subtask1`, train one classifier on the
# training patients and store a score in [0, 1] for every test patient.
#
# Fixes compared with the previous version of this cell:
#   * the test "age" feature was built from the TRAIN age array;
#   * the per-patient minimum of Temp/RRate/Glucose/Chloride was overwritten by the
#     maximum because both were written to the same column;
#   * the scaler was re-fitted on the test set, so train and test were mapped with
#     different transformations before being fed to the same model.

HOURS_PER_PATIENT = 12  # the reshape below requires exactly 12 consecutive rows per patient
HOUR_COLUMNS = ["Hour " + str(hour) for hour in range(1, HOURS_PER_PATIENT + 1)]
MIN_MAX_COLUMNS = ["Temp", "RRate", "Glucose", "Chloride"]


def _per_patient(df, column, n_patients):
    """Return `column` of `df` as an (n_patients, HOURS_PER_PATIENT) array.

    Assumes the rows of `df` are grouped by patient, HOURS_PER_PATIENT rows each;
    `reshape` raises ValueError if the total row count does not match.
    """
    return df[column].to_numpy().reshape(n_patients, HOURS_PER_PATIENT)


def _shared_features(df, n_patients):
    """Build the features that do not depend on the lab test being predicted.

    Returns a DataFrame with one row per patient holding the age (taken from the
    patient's first row) and the min and max over the hours of each column in
    MIN_MAX_COLUMNS.
    """
    shared = pd.DataFrame({"age": _per_patient(df, "Age", n_patients)[:, 0]})
    for column in MIN_MAX_COLUMNS:
        hourly = _per_patient(df, column, n_patients)
        shared[column + "_min"] = hourly.min(axis=1)
        shared[column + "_max"] = hourly.max(axis=1)
    return shared


def _design_matrix(df, lab, shared, n_patients):
    """Return the model input for `lab`: hourly values, their std, and `shared`."""
    hourly = pd.DataFrame(_per_patient(df, lab, n_patients), columns=HOUR_COLUMNS)
    # The standard deviation is taken over the hourly values only, so it has to be
    # computed before any other column is attached.
    hourly["std Dev"] = hourly.std(axis=1)
    return pd.concat([hourly, shared], axis=1)


# These do not depend on the lab test, so they are computed once, outside the loop.
shared_train = _shared_features(df_features, nbrpatient)
shared_test = _shared_features(df_test, nbrpatientTest)

Results1 = pd.DataFrame()
Results1["pid"] = patientIDTest
for i in features_subtask1:
    X = _design_matrix(df_features, i, shared_train, nbrpatient)
    X_test = _design_matrix(df_test, i, shared_test, nbrpatientTest)

    # Rescale every feature to [-1, 1]. The scaler is fitted on the training data
    # only and the same mapping is applied to the test data (test values may
    # therefore fall slightly outside [-1, 1]).
    scaler = MinMaxScaler(feature_range=(-1, 1)).fit(X)
    X = scaler.transform(X)
    X_test = scaler.transform(X_test)

    # NOTE(review): assumes df_labels has one row per patient, in the same patient
    # order as df_features - confirm against the data files.
    Y = df_labels["LABEL_" + i]

    # SVC was tried before but was slower and less accurate (original author's note).
    model1 = GradientBoostingClassifier(random_state=0)  # fixed seed for reproducible runs
    model1.fit(X, Y)

    # Squash the raw decision scores into (0, 1) with the sigmoid function.
    Y_predicted = model1.decision_function(X_test)
    Y_predicted = 1 / (1 + np.exp(-Y_predicted))
    Results1["LABEL_" + i] = Y_predicted  # storing the results for subtask 1

# Finally save the predictions as prediction.csv inside prediction.zip.
compression_opts = dict(method='zip', archive_name='prediction.csv')
Results1.to_csv('prediction.zip', index=False, float_format='%.3f', compression=compression_opts)