In [None]:
from load_data import loading_and_pre_processing_pipeline

feature_data = loading_and_pre_processing_pipeline()
# Complete cases only (rows without any missing value); used for the summary below.
cleaned_feature_data = feature_data.dropna(axis=0)

# Count users on the full frame so the user count refers to the same records
# as the record count printed next to it (counting users on the complete-case
# subset would under-report whenever a user has only incomplete records).
# Note: nunique() ignores missing user_id values.
n_users = feature_data["user_id"].nunique()

print(f"{len(feature_data)} records from {n_users} users present. ")
print(f"{len(cleaned_feature_data)} of which are complete.")

In [None]:
# Print a per-column summary (dtype and non-null count) of the full feature frame.
feature_data.info()

In [None]:
# Only rows with a recorded test result can serve as training examples.
labelled = feature_data.dropna(subset=["test_result"])

# Features: every column except the target and the user_id / test_week_start /
# date columns.
df = labelled.drop(columns=["test_result", "user_id", "test_week_start", "date"])

# Target: kept as a single-column boolean DataFrame.
# NOTE(review): some scikit-learn estimators warn when y is a column vector
# rather than 1-D — consider squeezing to a Series if that shows up.
y = labelled[["test_result"]].astype(bool)

# Number of records per class, to show how imbalanced the target is.
classes_count = y.value_counts()
classes_count

In [None]:
# Display the feature matrix built above (rows with a test result; target,
# user_id, test_week_start and date columns removed).
df

As expected, the dataset is heavily imbalanced with a much higher number of negative than positive test results.
The problems with this are illustrated below:

In [None]:
from sklearn.model_selection import cross_validate
from sklearn.dummy import DummyClassifier
import pandas as pd

# Metrics computed for every model, and the accumulators behind the comparison
# table (one row label in `index` per entry in each `scores` list).
scoring = ["accuracy", "balanced_accuracy"]
index = []
scores = {"Accuracy": [], "Balanced accuracy": []}

# Baseline: a classifier that always predicts the most frequent class.
dummy_clf = DummyClassifier(strategy="most_frequent")

index.append("Dummy classifier")
cv_result = cross_validate(dummy_clf, df, y, scoring=scoring)

# Average each metric over the cross-validation folds.
mean_accuracy = cv_result["test_accuracy"].mean()
mean_balanced_accuracy = cv_result["test_balanced_accuracy"].mean()
scores["Accuracy"].append(mean_accuracy)
scores["Balanced accuracy"].append(mean_balanced_accuracy)
print(f"Accuracy score of a dummy classifier: {mean_accuracy:.3f}")

pd.DataFrame(scores, index=index)

This shows that a dummy classifier that assigns every record to the majority class achieves very high accuracy, simply because most records belong to that class. Once the class imbalance is corrected for, however, its balanced accuracy is only 0.5, i.e. no better than random guessing.

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.compose import make_column_selector as selector
from sklearn.linear_model import LogisticRegression

# Numeric columns: standardise, then fill gaps with the column mean and append
# missing-value indicator columns.
num_pipe = make_pipeline(
    StandardScaler(),
    SimpleImputer(strategy="mean", add_indicator=True),
)

# Object-dtype columns: fill gaps with the constant False, then encode the
# values ordinally.
cat_pipe = make_pipeline(
    SimpleImputer(strategy="constant", fill_value=False),
    OrdinalEncoder(),
)

numeric_columns = selector(dtype_include="number")
object_columns = selector(dtype_include="object")

# Route each group of columns through its own pipeline.
# NOTE(review): columns whose dtype is neither numeric nor object match no
# selector and are dropped by the column transformer's default remainder —
# confirm that is intended for this dataset.
preprocessor_linear = make_column_transformer(
    (num_pipe, numeric_columns),
    (cat_pipe, object_columns),
    n_jobs=2,
)

# Full model: preprocessing followed by a logistic regression.
lr_clf = make_pipeline(preprocessor_linear, LogisticRegression(max_iter=1000))

In [None]:
# Cross-validate the logistic regression and add its row to the comparison table.
index.append("Logistic regression")
cv_result = cross_validate(lr_clf, df, y, scoring=scoring)

# Store the fold-averaged value of each metric under its table column.
for column, key in (
    ("Accuracy", "test_accuracy"),
    ("Balanced accuracy", "test_balanced_accuracy"),
):
    scores[column].append(cv_result[key].mean())

pd.DataFrame(scores, index=index)

So the accuracy of the logistic regression is essentially the same as that of the dummy baseline when it is not corrected for the class imbalance in the dataset. Corrected for imbalance, the accuracy is better than chance, but still not very high. There are two ways to improve the performance of the model:
* First, subsampling the majority class during training to get a balanced training set and
* Second, adjusting the loss function of the classifier to weigh cases of the minority class higher.

In [None]:
# Switch the pipeline's final logistic-regression step to balanced class
# weights (modifies the existing `lr_clf` pipeline in place).
lr_clf.set_params(**{"logisticregression__class_weight": "balanced"})

index.append("Logistic regression with balanced class weights")
cv_result = cross_validate(lr_clf, df, y, scoring=scoring)

# Store the fold-averaged value of each metric under its table column.
for column, key in (
    ("Accuracy", "test_accuracy"),
    ("Balanced accuracy", "test_balanced_accuracy"),
):
    scores[column].append(cv_result[key].mean())

pd.DataFrame(scores, index=index)

In [None]:
from imblearn.pipeline import make_pipeline as make_pipeline_with_sampler
from imblearn.under_sampling import RandomUnderSampler

# Seeded random under-sampler so the resampling is reproducible across runs.
under_sampler = RandomUnderSampler(random_state=42)

# Rebuild the model as an imblearn pipeline: same preprocessing, then
# under-sampling, then a logistic regression with default class weights.
# This replaces the previous `lr_clf`.
lr_clf = make_pipeline_with_sampler(
    preprocessor_linear,
    under_sampler,
    LogisticRegression(max_iter=1000),
)

In [None]:
# Cross-validate the under-sampling pipeline and add its row to the comparison table.
index.append("Under-sampling + Logistic regression")
cv_result = cross_validate(lr_clf, df, y, scoring=scoring)

# Store the fold-averaged value of each metric under its table column.
for column, key in (
    ("Accuracy", "test_accuracy"),
    ("Balanced accuracy", "test_balanced_accuracy"),
):
    scores[column].append(cv_result[key].mean())

pd.DataFrame(scores, index=index)

Both options perform comparably and, in terms of balanced accuracy, substantially better than the plain (unweighted) logistic regression.