<a href="https://colab.research.google.com/github/krishkankure/covid-gb-classifier/blob/main/gb_binaryclassifier_covid_v1_0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#@title Setup (run once)
import os
import glob

!git clone -b master https://github.com/krishkankure/covid-gb-classifier.git
os.chdir('covid-gb-classifier')
!pip install numpy
!pip install scikit-learn
!pip install lightgbm
!pip install imblearn


In [None]:
#@title Train Model - This is OPTIONAL, you may skip to the next step.
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, f1_score
import sklearn.metrics
import joblib
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier
from imblearn.under_sampling import RandomUnderSampler
import matplotlib.pyplot as plt
from sklearn.calibration import calibration_curve
from sklearn.calibration import CalibratedClassifierCV

# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
SEED = 42                      # one seed for the under-sampler and both splits
ALIVE_SENTINEL = '9999-99-99'  # DATE_DIED placeholder meaning "patient did not die"

MAX_DEPTH = 20
# The original code set `num_leaves = 2^(max_depth)`. In Python `^` is bitwise
# XOR, so that expression evaluated to 2 ^ 20 == 22, not 2**20. The effective
# value (22) is kept and made explicit here.
# NOTE(review): 2**20 would exceed LightGBM's documented num_leaves upper bound
# (131072), so the "intended" power-of-two cannot simply be substituted - pick a
# deliberate value if 22 is not what you want.
NUM_LEAVES = 22
# The original assigned `model.num_iterations = 1000` after construction.
# `num_iterations` is not a constructor argument of LGBMClassifier, so a plain
# attribute assignment is not reported by get_params() and is neither used by
# fit() nor copied when CalibratedClassifierCV clones the estimator; training
# silently used the default number of trees. Pass it through the constructor.
# Expect a noticeably longer training time (10 folds x 1000 DART rounds).
N_ESTIMATORS = 1000
LEARNING_RATE = 0.05

# ---------------------------------------------------------------------------
# Load data and build the binary target
# ---------------------------------------------------------------------------
data = pd.read_csv("covid.csv")  # import covid data set
X = data.drop(columns=['DATE_DIED'])  # features: every column except DATE_DIED

# Target: 0 = no death recorded (sentinel date), 1 = any other value (a death
# date). A direct comparison replaces the previous chained regex replace and
# always yields a clean integer column.
y = (data['DATE_DIED'] != ALIVE_SENTINEL).astype(int)

# ---------------------------------------------------------------------------
# Balance classes, then split 70 / 15 / 15 into train / validation / test
# ---------------------------------------------------------------------------
# NOTE(review): under-sampling happens BEFORE the split, so the validation and
# test sets are class-balanced too. The metrics below and the calibrated
# probabilities therefore describe the balanced data, not the raw file.
undersample = RandomUnderSampler(random_state=SEED)
X_resampled, y_resampled = undersample.fit_resample(X, y)

X_train, X_rem, y_train, y_rem = train_test_split(
    X_resampled, y_resampled, train_size=0.7, random_state=SEED)
X_valid, X_test, y_valid, y_test = train_test_split(
    X_rem, y_rem, test_size=0.5, random_state=SEED)

# ---------------------------------------------------------------------------
# Model
# ---------------------------------------------------------------------------
# add early_stopping_rounds if using gbdt or goss
# NOTE(review): per the LightGBM parameter docs, bagging_fraction only takes
# effect when bagging_freq is non-zero; it is left as-is to avoid changing the
# model further - confirm whether bagging was intended.
model = LGBMClassifier(
    boosting_type='dart',
    objective='binary',
    max_depth=MAX_DEPTH,
    num_leaves=NUM_LEAVES,
    n_estimators=N_ESTIMATORS,
    learning_rate=LEARNING_RATE,
    bagging_fraction=0.6,
    first_metric_only=False,
    verbose=0,
    min_gain_to_split=0.5,
    reg_alpha=0.1,
    reg_lambda=0.1,
    min_child_samples=20,
)

# With cv=10, CalibratedClassifierCV clones `model` and fits the clones itself,
# so the separate `model.fit(...)` the original ran beforehand was discarded
# work and has been removed.
# Isotonic regression calibration; Platt scaling has a long training time.
calibrated_model = CalibratedClassifierCV(model, cv=10, method='isotonic')
calibrated_model.fit(X_train, y_train)

# ---------------------------------------------------------------------------
# Evaluation
# ---------------------------------------------------------------------------
# Validation set accuracy
y_pred = calibrated_model.predict(X_valid)
accuracy = accuracy_score(y_valid, y_pred)

# Test set accuracy
y_pred_test = calibrated_model.predict(X_test)
accuracy_test = accuracy_score(y_test, y_pred_test)

# True positive rate / recall (test set)
tpr_test = recall_score(y_test, y_pred_test)

# F1 score (test set). The original computed this on the validation split while
# printing it under the label "Test F1 Score".
f1 = f1_score(y_test, y_pred_test)

print("Test set accuracy:", accuracy_test)

print("Validation set accuracy:", accuracy)

print("Test F1 Score:", f1)
print("TPR:", tpr_test)

# Probability of the positive class (column 1 = death) for ROC AUC
y_pred_proba = calibrated_model.predict_proba(X_test)[:, 1]

auc = sklearn.metrics.roc_auc_score(y_test, y_pred_proba)
print("AUC ", auc)

# Persist the calibrated model for the prediction cells below
joblib.dump(calibrated_model, 'calibrated_model.joblib')

In [None]:
#@title Define Probability Function, open joblib model
import joblib
import pandas as pd
loaded_calibrated_model = joblib.load('calibrated_model.joblib')# calibrated_model = pickle.load(open('model.pkl', 'rb'))
def getProbability(usmer, med, sex, patient_type, intubed, pneu, age, preg, diab, copd, asth, inms, hype, oth, card,
                  obe, ren, toba, clas, icu, model=None):
    """Return the model's class-0 probability for one patient, as a percentage.

    The twenty positional arguments are placed, in order, into a one-row
    DataFrame whose column names are the ones spelled out below (including the
    dataset's own misspellings 'HIPERTENSION' and 'CLASIFFICATION_FINAL').
    That frame is passed to ``predict_proba`` and the first column of the
    first row is converted to a percentage. The caller labels this value
    "Probability of Survival".

    Parameters
    ----------
    usmer ... icu :
        Feature values for a single patient, one scalar per column.
    model : optional
        Any object exposing ``predict_proba(DataFrame)``. Defaults to the
        module-level ``loaded_calibrated_model``.

    Returns
    -------
    float
        ``predict_proba(...)[0][0] * 100`` rounded to two decimals.
    """
    if model is None:
        model = loaded_calibrated_model

    # Insertion order of this dict defines the column order of the frame.
    features = {
        'USMER': usmer,
        'MEDICAL_UNIT': med,
        'SEX': sex,
        'PATIENT_TYPE': patient_type,
        'INTUBED': intubed,
        'PNEUMONIA': pneu,
        'AGE': age,
        'PREGNANT': preg,
        'DIABETES': diab,
        'COPD': copd,
        'ASTHMA': asth,
        'INMSUPR': inms,
        'HIPERTENSION': hype,  # [sic.]
        'OTHER_DISEASE': oth,
        'CARDIOVASCULAR': card,
        'OBESITY': obe,
        'RENAL_CHRONIC': ren,
        'TOBACCO': toba,
        'CLASIFFICATION_FINAL': clas,  # [sic.]
        'ICU': icu,
    }
    df = pd.DataFrame([features])

    outcome = model.predict_proba(df)
    # Round instead of truncating with int(): truncation is sensitive to float
    # representation error (e.g. 0.57 * 10000 is 5699.999..., which int() turned
    # into 56.99 instead of 57.0).
    return float(round(float(outcome[0][0]) * 100, 2))

# Instructions (READ THIS)



*   Medical Level indicates the level of hospital care received (1-2)
*   Medical Unit indicates the unit in which the patient is being treated, in ascending order of significance

*   Patient_Type refers to the situation of the patient. If the patient has been sent home, enter **1**; if they are receiving hospital care, enter **2**
*   For all other values, 1 represents that they **do** have the condition, 2 represents that they **do not** have the condition
*   Covid Test Results between 1-3 indicate a positive test; values between 4-6 indicate inconclusive test results


In [None]:
#@title Predict - slide the values and then run cell
# Colab form fields: each slider below sets one model input. The `#@param`
# annotations are read by the Colab form UI, so these lines are kept verbatim.
medical_level = 1 #@param {type:"slider", min:1, max:2, step:1}
medical_unit = 7 #@param {type:"slider", min:2, max:12, step:1}
sex = 1 #@param {type:"slider", min:1, max:2, step:1}
patient_type = 1 #@param {type:"slider", min:1, max:2, step:1}
intubed = 2 #@param {type:"slider", min:1, max:2, step:1}
pneumonia = 2 #@param {type:"slider", min:1, max:2, step:1}
age = 19 #@param {type:"slider", min:0, max:99, step:1}
pregnant = 2 #@param {type:"slider", min:1, max:2, step:1}
diabetes = 2 #@param {type:"slider", min:1, max:2, step:1}
copd = 2 #@param {type:"slider", min:1, max:2, step:1}
asthma = 2 #@param {type:"slider", min:1, max:2, step:1}
immunosuppresed = 2 #@param {type:"slider", min:1, max:2, step:1}
hypertension = 2 #@param {type:"slider", min:1, max:2, step:1}
other_disease = 2 #@param {type:"slider", min:1, max:2, step:1}
cardiovascular_disease = 2 #@param {type:"slider", min:1, max:2, step:1}
obese = 2 #@param {type:"slider", min:1, max:2, step:1}
renal_chronic = 2 #@param {type:"slider", min:1, max:2, step:1}
tobacco_user = 2 #@param {type:"slider", min:1, max:2, step:1}
covid_test_results = 5 #@param {type:"slider", min:1, max:6, step:1}
icu = 2 #@param {type:"slider", min:1, max:2, step:1}

# Map every slider onto the getProbability parameter it feeds, by name, so the
# pairing does not depend on argument position.
survival_percent = getProbability(
    usmer=medical_level,
    med=medical_unit,
    sex=sex,
    patient_type=patient_type,
    intubed=intubed,
    pneu=pneumonia,
    age=age,
    preg=pregnant,
    diab=diabetes,
    copd=copd,
    asth=asthma,
    inms=immunosuppresed,
    hype=hypertension,
    oth=other_disease,
    card=cardiovascular_disease,
    obe=obese,
    ren=renal_chronic,
    toba=tobacco_user,
    clas=covid_test_results,
    icu=icu,
)
print("Probability of Survival:", survival_percent, "%")


Probability of Survival: 99.0 %
