In [1]:
from lib import loading_and_pre_processing_pipeline

feature_data = loading_and_pre_processing_pipeline()
cleaned_feature_data = feature_data.dropna(axis=0)

print(f'{len(feature_data)} records from {len(set(cleaned_feature_data["user_id"].values))} users present. ')
print(f'{len(cleaned_feature_data)} of which are complete.')

288087 records from 7236 users present. 
62313 of which are complete.


In [2]:
feature_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 288087 entries, 0 to 288086
Data columns (total 21 columns):
 #   Column                   Non-Null Count   Dtype         
---  ------                   --------------   -----         
 0   user_id                  288087 non-null  int64         
 1   test_week_start          288087 non-null  object        
 2   rhr_metric               248815 non-null  float64       
 3   steps_metric             258233 non-null  float64       
 4   sleep_duration_metric    162098 non-null  float64       
 5   vaccination_status       220797 non-null  float64       
 6   days_since_last_dose     215895 non-null  float64       
 7   chills                   281190 non-null  object        
 8   body_pain                281190 non-null  object        
 9   loss_of_taste_and_smell  281190 non-null  object        
 10  fatigue                  280131 non-null  object        
 11  cough                    281190 non-null  object        
 12  cold            

In [12]:
df = feature_data.dropna(axis=0, subset=['test_result']).drop(columns=['test_result', 'user_id', 'test_week_start', 'date'])
y = feature_data.dropna(axis=0, subset=['test_result'])[['test_result']].astype(bool)

classes_count = y.value_counts()
classes_count

test_result
False          152903
True             7153
dtype: int64

In [13]:
df

Unnamed: 0,rhr_metric,steps_metric,sleep_duration_metric,vaccination_status,days_since_last_dose,chills,body_pain,loss_of_taste_and_smell,fatigue,cough,cold,diarrhea,sore_throat,asymptomatic,fittness,sex,age
2,,-0.034250,-2.518027,2.0,170.0,False,True,False,False,False,False,True,False,False,377.0,773.0,813.0
16,1.204198,0.221307,0.493535,,,False,False,False,False,False,False,False,False,True,,,
34,2.664284,-0.044351,0.325328,,,False,False,False,False,False,False,False,False,True,375.0,774.0,812.0
36,0.532857,-0.163233,-0.544589,,,False,False,False,False,False,False,False,False,True,375.0,774.0,812.0
37,0.532857,2.377660,-0.795024,,,False,False,False,False,False,False,False,False,True,375.0,774.0,812.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
288076,,,,3.0,207.0,False,False,False,False,False,False,False,False,True,379.0,774.0,813.0
288080,,,,3.0,145.0,False,False,False,False,False,False,False,False,True,378.0,774.0,814.0
288081,,,,3.0,146.0,False,False,False,True,False,False,False,False,False,381.0,773.0,818.0
288083,,,,3.0,176.0,False,False,False,False,False,False,False,False,True,378.0,774.0,813.0


As expected, the dataset is heavily imbalanced with a much higher number of negative than positive test results.
The problems with this are illustrated below:

In [14]:
from sklearn.model_selection import cross_validate
from sklearn.dummy import DummyClassifier
import pandas as pd

dummy_clf = DummyClassifier(strategy="most_frequent")
scoring = ["accuracy", "balanced_accuracy"]

index = []
scores = {"Accuracy": [], "Balanced accuracy": []}

# Score a dummy classifier as baseline
index += ["Dummy classifier"]
cv_result = cross_validate(dummy_clf, df, y, scoring=scoring)
scores["Accuracy"].append(cv_result["test_accuracy"].mean())
scores["Balanced accuracy"].append(cv_result["test_balanced_accuracy"].mean())
print(f"Accuracy score of a dummy classifier: {cv_result['test_accuracy'].mean():.3f}")

pd.DataFrame(scores, index=index)

Accuracy score of a dummy classifier: 0.955


Unnamed: 0,Accuracy,Balanced accuracy
Dummy classifier,0.955309,0.5


This shows that a dummy classifier that classifies all records as the majority class has very high accuracy, simply because it classifies most of the cases correctly. If one corrects for the class imbalance, the accuracy is still 0.5 (random).

In [15]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.compose import make_column_selector as selector
from sklearn.linear_model import LogisticRegression

num_pipe = make_pipeline(
    StandardScaler(), SimpleImputer(strategy="mean", add_indicator=True)
)
cat_pipe = make_pipeline(
    SimpleImputer(strategy="constant", fill_value=False),
    OrdinalEncoder(),
)
preprocessor_linear = make_column_transformer(
    [num_pipe, selector(dtype_include="number")],
    [cat_pipe, selector(dtype_include="object")],
    n_jobs=2,
)
lr_clf = make_pipeline(preprocessor_linear, LogisticRegression(max_iter=1000))

In [19]:
# Train and score logistic regression
index += ["Logistic regression"]
cv_result = cross_validate(lr_clf, df, y, scoring=scoring)
scores["Accuracy"].append(cv_result["test_accuracy"].mean())
scores["Balanced accuracy"].append(cv_result["test_balanced_accuracy"].mean())

pd.DataFrame(scores, index=index)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Unnamed: 0,Accuracy,Balanced accuracy
Dummy classifier,0.955309,0.5
Logistic regression,0.965737,0.675035


So the accuracy of the logistic regression is essentially the same as a dummy baseline, if it is not corrected for the imbalance of classes in the dataset. Correcting for imbalance, the accuracy is better than chance, but still not very high. There are two things to improve the performance of the model:
* First, subsampling the majority class during training to get a balanced training set and
* Second, adjusting the loss function of the classifier to weigh cases of the minority class higher.

In [20]:
lr_clf.set_params(logisticregression__class_weight="balanced")

index += ["Logistic regression with balanced class weights"]
cv_result = cross_validate(lr_clf, df, y, scoring=scoring)
scores["Accuracy"].append(cv_result["test_accuracy"].mean())
scores["Balanced accuracy"].append(cv_result["test_balanced_accuracy"].mean())

pd.DataFrame(scores, index=index)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Unnamed: 0,Accuracy,Balanced accuracy
Dummy classifier,0.955309,0.5
Logistic regression,0.965737,0.675035
Logistic regression with balanced class weights,0.840375,0.858352


In [21]:
from imblearn.pipeline import make_pipeline as make_pipeline_with_sampler
from imblearn.under_sampling import RandomUnderSampler

lr_clf = make_pipeline_with_sampler(
    preprocessor_linear,
    RandomUnderSampler(random_state=42),
    LogisticRegression(max_iter=1000),
)

In [22]:
index += ["Under-sampling + Logistic regression"]
cv_result = cross_validate(lr_clf, df, y, scoring=scoring)
scores["Accuracy"].append(cv_result["test_accuracy"].mean())
scores["Balanced accuracy"].append(cv_result["test_balanced_accuracy"].mean())

pd.DataFrame(scores, index=index)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Unnamed: 0,Accuracy,Balanced accuracy
Dummy classifier,0.955309,0.5
Logistic regression,0.965737,0.675035
Logistic regression with balanced class weights,0.840375,0.858352
Under-sampling + Logistic regression,0.838906,0.859516


The performance of both options is comparable and substantially better than the naive regression approach in terms of balanced accuracy.