In [17]:
# import packages
import pandas as pd
import numpy as np
from tqdm import tqdm
import pickle


# import Sklearn packages
from sklearn.datasets import load_linnerud
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputRegressor
from sklearn.linear_model import Ridge

# Read data
df_features = pd.read_csv("train_features.csv")
df_labels = pd.read_csv("train_labels.csv")

In [18]:
all_features = ['pid','Time','Age','EtCO2', 'PTT', 'BUN', 'Lactate', 'Temp', 'Hgb', 'HCO3', 'BaseExcess', 'RRate', 'Fibrinogen', 'Phosphate', 'WBC', 'Creatinine', 'PaCO2', 'AST', 'FiO2', 'Platelets', 'SaO2', 'Glucose', 'ABPm', 'Magnesium', 'Potassium', 'ABPd', 'Calcium', 'Alkalinephos', 'SpO2', 'Bilirubin_direct', 'Chloride', 'Hct', 'Heartrate', 'Bilirubin_total', 'TroponinI', 'ABPs', 'pH']

features = ['EtCO2', 'PTT', 'BUN', 'Lactate', 'Temp', 'Hgb', 'HCO3', 'BaseExcess', 'RRate', 'Fibrinogen', 'Phosphate', 'WBC', 'Creatinine', 'PaCO2', 'AST', 'FiO2', 'Platelets', 'SaO2', 'Glucose', 'ABPm', 'Magnesium', 'Potassium', 'ABPd', 'Calcium', 'Alkalinephos', 'SpO2', 'Bilirubin_direct', 'Chloride', 'Hct', 'Heartrate', 'Bilirubin_total', 'TroponinI', 'ABPs', 'pH']

labels_test = ['LABEL_BaseExcess', 'LABEL_Fibrinogen', 'LABEL_AST', 'LABEL_Alkalinephos', 'LABEL_Bilirubin_total', 'LABEL_Lactate', 'LABEL_TroponinI', 'LABEL_SaO2', 'LABEL_Bilirubin_direct', 'LABEL_EtCO2', 'LABEL_Sepsis']
label_measure = ['LABEL_RRate', 'LABEL_ABPm', 'LABEL_SpO2','LABEL_Heartrate']

In [19]:
#we isolate the patient who dont have undergo test
df_test = df_labels[labels_test]
untested_patients = []

for index, row in tqdm(df_test.iterrows()):
    if not True in df_test.values[index]:
        untested_patients.append(df_labels['pid'][index])

print(str(100*len(untested_patients)/len(df_labels)) + '% of patient are untested')

18995it [00:01, 13032.47it/s]

41.4108976046328% of patient are untested





In [20]:
df_features_untested = df_features[df_features['pid'].isin(untested_patients)]
df_features_tested = df_features[~df_features['pid'].isin(untested_patients)]

 we check if there is difference between tested and untested patient

In [21]:
median_untested = []
median_tested = []
for feature in features:
    feature_median = np.nanmedian(df_features_untested[feature])
    median_untested.append(feature_median)
    feature_median = np.nanmedian(df_features_tested[feature])
    median_tested.append(feature_median)

In [22]:
def std_perso(aMedian,aList):
    val = 0
    num = 0
    for element in aList:
        if not np.isnan(element):
            val = val + (element-aMedian)**2
            num = num + 1
    if num != 0:
        val =(val/num)**(0.5)
        return val
    else:
        return val

In [23]:
std_untested = []
for index,feature in tqdm(enumerate(features)):
    feature_median = std_perso(median_untested[index],df_features_untested[feature])
    std_untested.append(feature_median)

34it [00:04,  6.93it/s]


In [24]:
std_tested = []
for index,feature in tqdm(enumerate(features)):
    feature_median = std_perso(median_tested[index],df_features_tested[feature])
    std_tested.append(feature_median)

34it [00:07,  4.78it/s]


In [25]:
print('feature : median diff | std difference')
for index, feature in enumerate(features):
    print(feature +' : '+str(std_untested[index])+'-'+str(std_tested[index])+' | '+str(median_untested[index])+'-'+str(median_tested[index])) 

feature : median diff | std difference
EtCO2 : 8.881188558074962-7.608736952272735 | 35.0-33.0
PTT : 26.108717114691775-27.513056440535625 | 30.0-33.2
BUN : 18.61003499972745-22.279982569428654 | 16.0-17.0
Lactate : 1.4893767783665541-2.6880309156320457 | 1.75-2.2
Temp : 0.7958491878168131-0.9385414999067422 | 37.0-37.0
Hgb : 2.0660329435437914-2.0747726313124017 | 10.8-10.4
HCO3 : 4.166118602394253-4.47275515561897 | 24.0-23.0
BaseExcess : 3.3765570133818907-4.333454202621264 | 0.0--1.0
RRate : 4.678503215257392-5.275542717209728 | 18.0-18.0
Fibrinogen : 125.09142767141546-137.085682530986 | 270.0-229.0
Phosphate : 1.1920750299635157-1.4943754619015706 | 3.4-3.5
WBC : 5.8492929530657145-11.844241409943043 | 10.1-10.7
Creatinine : 1.945863658714876-1.986326546749564 | 0.9-0.99
PaCO2 : 7.591250035890977-9.241907947573985 | 41.0-40.0
AST : 76.10149838149387-818.8694909769968 | 25.0-43.0
FiO2 : 54.65486271566962-0.21422680442725414 | 0.5-0.5
Platelets : 100.64102381251584-106.926196758057

In [27]:
def median_perso(aMedian,aList):
    val = aMedian
    num = 0
    for element in aList:
        if not np.isnan(element):
            val = val + (element-aMedian)
            num = num + 1
    if num != 0:
        return (val/num)
    else:
        return val

In [28]:
#we create a matrix containing for each row all the data of each patient that we found relevent
number_of_patients = int(df_features.shape[0]/12)
X_median = []

for k in tqdm(range(0,number_of_patients)):
    patient_data = []
    age = df_features['Age'][k*12]
    patient_data.append(age)
    patient_features = df_features[features][k*12:(k+1)*12]
    for index,feature in enumerate(features):
        feature_median_dev = median_perso(median_untested[index],patient_features[feature])
        patient_data.append(feature_median_dev)
        feature_std_dev = std_perso(median_untested[index],patient_features[feature])-std_untested[index]
        patient_data.append(feature_std_dev)
    X_median.append(patient_data)

import pickle
filename = 'data_median.pkl'
outfile = open(filename,'wb')
pickle.dump(X_median,outfile)
outfile.close()

100%|██████████| 18995/18995 [07:37<00:00, 41.55it/s]


In [80]:
pickle_off = open("data_median.pkl","rb")
X = pickle.load(pickle_off)

In [82]:
print(len(X))
print(len(features))
len(X.values[0])
#X = X.dropna()
#X

18995
34


AttributeError: 'list' object has no attribute 'values'

In [83]:
dummy = ['pid','LABEL_RRate', 'LABEL_ABPm', 'LABEL_SpO2', 'LABEL_Heartrate']
label_train = ['LABEL_RRate', 'LABEL_ABPm', 'LABEL_SpO2', 'LABEL_Heartrate']

Y = df_labels[dummy]
#Y = Y[Y['pid'].isin(X['pid'])]

#X = X.iloc[: , 1:]

In [85]:
X[0]

[34.0,
 35.0,
 -8.881188558074962,
 30.0,
 -26.108717114691775,
 1.3333333333333333,
 -14.61003499972745,
 1.75,
 -1.4893767783665541,
 4.375,
 -0.08874240663026556,
 1.366666666666666,
 0.16928955369607435,
 9.333333333333334,
 -2.5331254405388006,
 -0.6666666666666666,
 -2.2218564750026393,
 0.5,
 -2.6785032152573924,
 270.0,
 -125.09142767141546,
 4.6,
 0.00792497003648407,
 -1.4999999999999998,
 -0.9245256150756926,
 0.5,
 -1.5458636587148762,
 16.0,
 -4.945498724826386,
 25.0,
 -76.10149838149387,
 0.05000000000000002,
 -54.56826017529118,
 143.0,
 -37.64102381251584,
 98.0,
 -7.447173281166348,
 120.0,
 -46.87045814043436,
 -5.916666666666667,
 -1.4735383317090296,
 0.28000000000000014,
 -0.05099703039148945,
 1.2733333333333332,
 -0.45018541928067934,
 -7.5,
 0.3308404151525721,
 7.6,
 -2.0599303819097075,
 71.0,
 -68.49647826999966,
 10.166666666666666,
 -0.6017536959807392,
 0.2,
 -0.29201284661924304,
 41.333333333333336,
 0.6027209736135886,
 -3.3000000000000007,
 2.95965877

In [86]:
from sklearn.datasets import make_regression
from sklearn.multioutput import MultiOutputRegressor
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [92]:
# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.1)

In [93]:
# Create the SVR regressor
svr = SVR(epsilon=0.2)

In [94]:
# Create the Multioutput Regressor
mor = MultiOutputRegressor(svr)

# Train the regressor
mor = mor.fit(X_train, y_train)

In [95]:
# Generate predictions for testing data
y_pred = mor.predict(X_test)

In [96]:
from sklearn.metrics import r2_score
r2_score(y_test, y_pred) 

0.23184161339975012