# Importing Modules and Packages

In [1]:
import numpy as np

In [2]:
import pandas as pd

In [3]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier

In [5]:
pip install -U imbalanced-learn

Collecting imbalanced-learn
  Downloading imbalanced_learn-0.10.1-py3-none-any.whl (226 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m226.0/226.0 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0m
[?25hCollecting joblib>=1.1.1
  Downloading joblib-1.2.0-py3-none-any.whl (297 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m298.0/298.0 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m MB/s[0m eta [36m0:00:01[0m
Installing collected packages: joblib, imbalanced-learn
  Attempting uninstall: joblib
    Found existing installation: joblib 1.1.0
    Uninstalling joblib-1.1.0:
      Successfully uninstalled joblib-1.1.0
Successfully installed imbalanced-learn-0.10.1 joblib-1.2.0
Note: you may need to restart the kernel to use updated packages.


In [6]:
from imblearn.under_sampling import RandomUnderSampler

In [24]:
pip install sklearn

Collecting sklearn
  Downloading sklearn-0.0.post1.tar.gz (3.6 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hBuilding wheels for collected packages: sklearn
  Building wheel for sklearn (setup.py) ... [?25ldone
[?25h  Created wheel for sklearn: filename=sklearn-0.0.post1-py3-none-any.whl size=2936 sha256=c6e6c6d821c616a5d1f9dc3e5d1a0f5f25e97949e9aae4e119747385256d36e2
  Stored in directory: /home/user/.cache/pip/wheels/f8/e0/3d/9d0c2020c44a519b9f02ab4fa6d2a4a996c98d79ab2f569fa1
Successfully built sklearn
Installing collected packages: sklearn
Successfully installed sklearn-0.0.post1
Note: you may need to restart the kernel to use updated packages.


In [26]:
from sklearn.model_selection import train_test_split

# Loading Data

In [7]:
data = pd.read_csv("corona_tested_individuals_ver_0083.english.csv")

In [13]:
display(data)

Unnamed: 0,test_date,cough,fever,sore_throat,shortness_of_breath,head_ache,corona_result,age_60_and_above,gender,test_indication
0,2020-11-12,0,0,0,0,0,negative,No,male,Other
1,2020-11-12,0,1,0,0,0,negative,No,male,Other
2,2020-11-12,0,0,0,0,0,negative,Yes,female,Other
3,2020-11-12,0,0,0,0,0,negative,No,male,Other
4,2020-11-12,0,1,0,0,0,negative,No,male,Contact with confirmed
...,...,...,...,...,...,...,...,...,...,...
158601,2020-10-30,0,0,0,0,0,negative,,male,Other
158602,2020-10-30,0,0,0,0,0,negative,,female,Other
158603,2020-10-30,0,0,0,0,0,negative,,male,Other
158604,2020-10-30,0,0,0,0,0,negative,,female,Other


In [14]:
data.columns

Index(['test_date', 'cough', 'fever', 'sore_throat', 'shortness_of_breath',
       'head_ache', 'corona_result', 'age_60_and_above', 'gender',
       'test_indication'],
      dtype='object')

# Data Overview

# Model predictors and exact variable names (true = 1, false = 0)
- Age over 60
- Sex
- Cough
- Shortness of breath
- Fever
- Sore throat
- Headache
- Test indication

# Gradient Boosting Classifier
- Learning rate: learning rate shrinks the contribution of each tree by learning_rate. There is a trade-off between learing_rate and n_estimators
- n_estimators: the number of boosting stages to perorm. Gradient boosting is fairly robust to over-fitting so a large number usually results in better performance
- max depth: the maximum depth of the individual regression estimators. The maximum depth limits the number of nodes in the tree. Tune this parameter for best performance; the best value depends on the interaction of the input variables.

In [28]:
def build_model(X,y):
    learning_rate = 0.2
    n_estimators = 200
    max_depth = 3
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
    model = GradientBoostingClassifier(
        learning_rate = learning_rate, \
        n_estimators=n_estimators, \
        max_depth=max_depth
    )
    
    model.fit(X_train, y_train)
    
    return X_train, X_test, y_train, y_test, model

In [29]:
def predict(X_test):
    y_pred = model.predict(X_test)
    return y_pred

In [48]:
X = data[
    [
        #'test_date', 
        'cough', 
        'fever', 
        'sore_throat', 
        'shortness_of_breath',
        'head_ache',
        #'corona_result', 
        #'age_60_and_above', 
        #'gender',
        #'test_indication'
    ]
]

In [49]:
y = data[['corona_result']]

In [50]:
X_train, X_test, y_train, y_test, model = build_model(X,y)

  y = column_or_1d(y, warn=True)


In [52]:
y_pred = predict(X_test)

In [53]:
from sklearn.metrics import accuracy_score, recall_score, precision_score, classification_report, plot_roc_curve

In [62]:
def metrics(y_test, y_pred):
    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy: %.2f%%" % (accuracy * 100.0))
    recall = recall_score(y_test, y_pred)
    print("Accuracy: %.2f%%" % (recall * 100.0))
    precision = precision_score(y_test, y_pred)
    print("Accuracy: %.2f%%" % (precision * 100.0))

In [63]:
from sklearn.metrics import confusion_matrix

In [64]:
from matplotlib import pyplot as plt

In [65]:
def plot_conf_matrix(y_test, y_pred):
    conf_mat = confusion_matrix(y_true=y_test, y_pred=y_pred)
    print('COnfusion Matrix: \n', conf_mat)
    
    labels = ["False", "True"]
    fig = plt.figure()
    
    ax = fig.add_subplot(111)
    
    cax = ax.matshow(conf_mat, cmap=plt.cm.Blues)
    
    fig.colorbar(cax)
    
    ax.set_xticklables(labels)
    ax.set_yticklabels(labels)
    
    plt.xlabel("Predicted")
    plt.ylabel("Expected")
    plt.show()
    
def classification_report_func(y_text, y_pred, target):
    report = classification_report(y_test, y_pred, target_names=['False', 'True'])
    return report

def roc_curve(model, X_test, y_test):
    plot_roc_curve(model, X_test, y_test)
    return plot_roc_curve

In [66]:
X_train, X_test, y_train, y_test, model = build_model(X,y)
y_pred = predict(X_test)
metrics(y_test, y_pred)
classification_report = classification_report_func(y_test, y_pred, target=['False', 'True'])
print(classification_report)
plot_conf_matrix(y_test,y_pred)
roc_curve(modelm, X_test, y_test)

  y = column_or_1d(y, warn=True)


Accuracy: 96.98%


ValueError: Target is multiclass but average='binary'. Please choose another average setting, one of [None, 'micro', 'macro', 'weighted'].