In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from datetime import datetime
# Notebook-wide plot styling: 16x9 figures with fonts scaled by 1.5.
# `sns.set` is only an alias of `set_theme`; the seaborn docs name `set_theme`
# as the preferred interface and warn that the alias may be removed.
sns.set_theme(rc={'figure.figsize': (16, 9)}, font_scale=1.5)

In [2]:
# Read the time_series_375 workbook; the path is relative to the working directory.
# A second workbook, "time_series_test_110_preprocess_en.xlsx", was referenced
# here as a commented-out alternative and is not loaded.
df_375 = pd.read_excel("time_series_375_preprocess_en.xlsx")

# Display the raw frame as the cell output
df_375

Unnamed: 0,PATIENT_ID,RE_DATE,age,gender,Admission time,Discharge time,outcome,Hypersensitive cardiac troponinI,hemoglobin,Serum chloride,...,mean corpuscular hemoglobin,Activation of partial thromboplastin time,Hypersensitive c-reactive protein,HIV antibody quantification,serum sodium,thrombocytocrit,ESR,glutamic-pyruvic transaminase,eGFR,creatinine
0,1.0,2020-01-31 01:09:00,73,1,2020-01-30 22:12:47,2020-02-17 12:40:09,0,,,,...,,,,,,,,,,
1,,2020-01-31 01:25:00,73,1,2020-01-30 22:12:47,2020-02-17 12:40:09,0,,136.0,,...,31.9,,,,,0.12,,,,
2,,2020-01-31 01:44:00,73,1,2020-01-30 22:12:47,2020-02-17 12:40:09,0,,,103.1,...,,,43.1,,137.7,,,16.0,46.6,130.0
3,,2020-01-31 01:45:00,73,1,2020-01-30 22:12:47,2020-02-17 12:40:09,0,,,,...,,,,,,,,,,
4,,2020-01-31 01:56:00,73,1,2020-01-30 22:12:47,2020-02-17 12:40:09,0,19.9,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6115,,2020-02-16 11:21:00,68,1,2020-02-08 23:25:01,2020-02-19 01:31:58,1,84.9,,,...,,,,,,,,,,
6116,,2020-02-16 12:04:00,68,1,2020-02-08 23:25:01,2020-02-19 01:31:58,1,,,,...,,,,,,,,,,
6117,,2020-02-16 12:14:00,68,1,2020-02-08 23:25:01,2020-02-19 01:31:58,1,,,105.2,...,,,267.0,,139.3,,,17.0,88.6,77.0
6118,,2020-02-16 14:11:00,68,1,2020-02-08 23:25:01,2020-02-19 01:31:58,1,,155.0,,...,31.6,,,,,,,,,


In [3]:
# Columns retained for the analysis: patient descriptors plus the chosen biomarkers
biomarkers = [
    'Hypersensitive cardiac troponinI',
    'Prothrombin time',
    'procalcitonin',
    '(%)lymphocyte',
    'D-D dimer',
    'Lactate dehydrogenase',
    'Interleukin 6',
    'Hypersensitive c-reactive protein',
]
patient_data = ['PATIENT_ID', 'RE_DATE', 'age', 'outcome']

df_test = (
    df_375.loc[:, patient_data + biomarkers]
    .assign(
        # Keep only the calendar date of each record (time of day reset to midnight)
        RE_DATE=lambda frame: frame['RE_DATE'].dt.normalize(),
        # Propagate the last seen patient ID forward over the blank ID cells
        PATIENT_ID=lambda frame: frame['PATIENT_ID'].ffill(),
    )
    # Every remaining missing value becomes 0.
    # NOTE(review): from here on a missing measurement and a genuine 0 reading
    # cannot be told apart.
    .fillna(0)
)

df_test

Unnamed: 0,PATIENT_ID,RE_DATE,age,outcome,Hypersensitive cardiac troponinI,Prothrombin time,procalcitonin,(%)lymphocyte,D-D dimer,Lactate dehydrogenase,Interleukin 6,Hypersensitive c-reactive protein
0,1.0,2020-01-31 00:00:00,73,0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0
1,1.0,2020-01-31 00:00:00,73,0,0.0,0.0,0.00,22.6,0.0,0.0,0.0,0.0
2,1.0,2020-01-31 00:00:00,73,0,0.0,0.0,0.00,0.0,0.0,306.0,0.0,43.1
3,1.0,2020-01-31 00:00:00,73,0,0.0,13.9,0.00,0.0,2.2,0.0,0.0,0.0
4,1.0,2020-01-31 00:00:00,73,0,19.9,0.0,0.09,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
6115,375.0,2020-02-16 00:00:00,68,1,84.9,0.0,0.00,0.0,0.0,0.0,0.0,0.0
6116,375.0,2020-02-16 00:00:00,68,1,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0
6117,375.0,2020-02-16 00:00:00,68,1,0.0,0.0,0.00,0.0,0.0,783.0,0.0,267.0
6118,375.0,2020-02-16 00:00:00,68,1,0.0,0.0,0.00,6.7,0.0,0.0,0.0,0.0


Numerous studies have examined the relationship between biomarkers and mortality in COVID-19 patients. The specific biomarkers that correlate most strongly with mortality may vary with the patient population, the study design, and other factors.

That being said, some of the most commonly reported biomarkers associated with increased mortality in COVID-19 patients include:

- Hypersensitive cardiac troponin I: This is a marker of heart damage, and elevated levels have been associated with increased mortality in COVID-19 patients.
- Procalcitonin: This is a marker of bacterial infection, and elevated levels have been associated with increased mortality in COVID-19 patients.
- D-dimer: This is a marker of blood clotting, and elevated levels have been associated with increased mortality in COVID-19 patients.
- Lactate dehydrogenase: This is a marker of tissue damage, and elevated levels have been associated with increased mortality in COVID-19 patients.
- C-reactive protein: This is a marker of inflammation, and elevated levels have been associated with increased mortality in COVID-19 patients.
- Interleukin-6: This is a cytokine involved in inflammation and immune response, and elevated levels have been associated with increased mortality in COVID-19 patients.
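- Ferritin: This is a marker of iron metabolism and inflammation, and elevated levels have been associated with increased mortality in COVID-19 patients. (Ferritin is not among the columns selected for analysis in this notebook; the selected set additionally includes prothrombin time and lymphocyte percentage.)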

It's important to note that these biomarkers are neither diagnostic for COVID-19 nor predictive of mortality on their own. Biomarker results should always be interpreted in the context of other clinical factors and patient characteristics.

In [4]:
# Per-patient mean of each biomarker. The 0 entries in df_test are masked to NaN
# first so they are left out of the mean; a biomarker with no non-zero reading
# for a patient therefore comes out as NaN.
# NOTE(review): the means are rounded to one decimal, which is coarse for
# small-valued columns (a mean of 0.04 would become 0.0) - confirm this is intended.
biomarker_columns = [
    'Hypersensitive cardiac troponinI',
    'Prothrombin time',
    'procalcitonin',
    '(%)lymphocyte',
    'D-D dimer',
    'Lactate dehydrogenase',
    'Interleukin 6',
    'Hypersensitive c-reactive protein',
]
biomarker_means = (
    df_test
    .groupby('PATIENT_ID')[biomarker_columns]
    .apply(lambda rows: rows.where(rows != 0).mean())
    .round(1)
    .reset_index()
)

# Collapse the outcome to a single row per patient (mean over that patient's rows)
df_outcome = df_test.groupby('PATIENT_ID', as_index=False)['outcome'].mean()

# Attach the outcome, then discard patients whose biomarker means are all NaN
df_cleaned = (
    biomarker_means
    .merge(df_outcome, on=['PATIENT_ID'])
    .dropna(subset=biomarker_columns, how='all')
)

df_cleaned

Unnamed: 0,PATIENT_ID,Hypersensitive cardiac troponinI,Prothrombin time,procalcitonin,(%)lymphocyte,D-D dimer,Lactate dehydrogenase,Interleukin 6,Hypersensitive c-reactive protein,outcome
0,1.0,19.9,13.5,0.1,22.7,1.3,232.0,,16.4,0.0
1,2.0,6.9,13.3,0.1,13.6,0.7,450.2,38.6,27.4,0.0
2,3.0,,13.6,0.1,26.5,1.0,274.3,47.8,23.0,0.0
3,4.0,4.8,16.3,0.4,18.2,1.3,293.5,,61.4,0.0
4,5.0,5.6,14.6,0.0,30.7,0.4,187.0,1.5,3.9,0.0
...,...,...,...,...,...,...,...,...,...,...
370,371.0,1741.5,14.4,1.5,15.0,2.6,573.0,248.9,152.0,1.0
371,372.0,30.7,17.9,1.6,1.6,4.2,383.7,284.3,232.2,1.0
372,373.0,124.8,14.9,0.6,6.3,,702.0,,205.8,1.0
373,374.0,372.4,23.2,,2.3,21.0,1706.3,23.2,109.8,1.0


In [5]:
# Copy without the identifier column, taken before imputation so it still holds
# NaN for biomarkers a patient has no value for. It was the input to exploratory
# seaborn scatterplots (biomarker vs. outcome) that are no longer run.
df_plot = df_cleaned.drop(columns='PATIENT_ID')

# Replace the remaining NaN biomarker means with 0
df_cleaned = df_cleaned.fillna(value=0)

df_cleaned

Unnamed: 0,PATIENT_ID,Hypersensitive cardiac troponinI,Prothrombin time,procalcitonin,(%)lymphocyte,D-D dimer,Lactate dehydrogenase,Interleukin 6,Hypersensitive c-reactive protein,outcome
0,1.0,19.9,13.5,0.1,22.7,1.3,232.0,0.0,16.4,0.0
1,2.0,6.9,13.3,0.1,13.6,0.7,450.2,38.6,27.4,0.0
2,3.0,0.0,13.6,0.1,26.5,1.0,274.3,47.8,23.0,0.0
3,4.0,4.8,16.3,0.4,18.2,1.3,293.5,0.0,61.4,0.0
4,5.0,5.6,14.6,0.0,30.7,0.4,187.0,1.5,3.9,0.0
...,...,...,...,...,...,...,...,...,...,...
370,371.0,1741.5,14.4,1.5,15.0,2.6,573.0,248.9,152.0,1.0
371,372.0,30.7,17.9,1.6,1.6,4.2,383.7,284.3,232.2,1.0
372,373.0,124.8,14.9,0.6,6.3,0.0,702.0,0.0,205.8,1.0
373,374.0,372.4,23.2,0.0,2.3,21.0,1706.3,23.2,109.8,1.0


In [6]:
from sklearn.model_selection import train_test_split

# Feature matrix: every column except the label and the patient identifier
X = df_cleaned.drop(columns=['outcome', 'PATIENT_ID'])
# Label vector
y = df_cleaned['outcome']

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
# NOTE(review): the split is not stratified on the outcome.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# Report the dimensions of both partitions
print("Training set shape: X_train={}, y_train={}".format(X_train.shape, y_train.shape))
print("Testing set shape: X_test={}, y_test={}".format(X_test.shape, y_test.shape))


Training set shape: X_train=(287, 8), y_train=(287,)
Testing set shape: X_test=(72, 8), y_test=(72,)


In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Build the logistic-regression classifier and fit it on the training split in
# one step (`fit` returns the estimator itself). The iteration cap is raised to
# 10000; the features are passed in unscaled.
logreg = LogisticRegression(max_iter=10000).fit(X_train, y_train)

# Predicted labels for the held-out rows
y_pred = logreg.predict(X_test)

# Share of held-out rows whose predicted label matches the true label
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print('Accuracy:', accuracy)


Accuracy: 0.9583333333333334
