In [1]:
%matplotlib inline
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.multioutput import MultiOutputClassifier

from src.data.dataset import ImageDataset

In [2]:
import logging

# Root-logger setup for the notebook: INFO and above, each record prefixed
# with a timestamp that carries millisecond precision.
logging.basicConfig(
    datefmt='%Y-%m-%d %H:%M:%S',
    format='%(asctime)s.%(msecs)03d : %(message)s',
    level=logging.INFO,
)

In [3]:
# Project root, resolved relative to the kernel's current working directory.
# NOTE(review): assumes the notebook is run from a directory one level below
# the project root — confirm.
base_path = os.path.join(os.getcwd(), "..")

# Preprocessing steps passed to ImageDataset as (name, kwargs) pairs.
transformations = [
    ('resize', {'size': (320, 320)}),
    ('flatten', {})
]
image_path = os.path.join(base_path, "data", "raw")
# The label CSV lives under the raw-data directory.
train_csv_path = os.path.join(image_path, "CheXpert-v1.0-small", "train.csv")

# Labels to predict. A multi-label alternative (more than one label switches
# the model cell to MultiOutputClassifier) would be:
# ['Atelectasis', 'Cardiomegaly', 'Consolidation', 'Edema', 'Pleural Effusion']
return_labels = ['Atelectasis']
# Per-label option passed to ImageDataset as `map_option`.
# NOTE(review): "U-one" presumably maps uncertain labels to 1 — confirm
# against ImageDataset.
map_option = {
    'Atelectasis': "U-one"
}
# One [0, 1] row per requested label; passed as `classes=` to partial_fit.
classes = np.array([[0, 1] for _ in return_labels], dtype=np.float32)


## Cross validation

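⚠️ `StratifiedKFold` only supports binary or multiclass targets, so this cross-validation cannot run with a multi-output classifier (i.e. when `return_labels` contains more than one label).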

In [4]:
# Forwarded as ImageDataset's `limit` argument.
# NOTE(review): presumably caps the number of samples used — confirm.
limit = 10000

# Training dataset built from the label CSV, the raw-image base directory and
# the preprocessing / label-mapping configuration defined above.
train_dataset = ImageDataset(
    label_csv_path=train_csv_path,
    image_path_base=image_path,
    limit=limit,
    transformations=transformations,
    map_option=map_option,
    random_state=42,
)

In [5]:
from src.models.sklearn_models import models

# Base estimator taken from the project's model registry.
base_model = models['MultinomialNB']

# With several target labels the base estimator is wrapped so that one
# classifier is fitted per label; with a single label it is used directly.
model = (
    MultiOutputClassifier(base_model, n_jobs=1)
    if len(return_labels) > 1
    else base_model
)


In [6]:
from sklearn.model_selection import StratifiedKFold
from sklearn.base import clone

from sklearn.metrics import roc_auc_score, roc_curve, f1_score, accuracy_score


def _assemble_features(x_features, x_image):
    """Join the tabular and image parts of a batch column-wise.

    Both parts are wrapped in ``pd.DataFrame`` first so that training and
    validation batches are assembled in exactly the same way.
    """
    return pd.concat([pd.DataFrame(x_features), pd.DataFrame(x_image)], axis=1)


batch_size = 500
skfolds = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
# NOTE(review): `without_image=True` presumably skips loading pixel data here,
# leaving image loading to the batch loaders below — confirm against ImageDataset.
X_train, y_train = train_dataset.load(return_labels=return_labels, without_image=True, return_X_y=True)

# Per-fold metrics, in fold order.
accuracy_all = []
auc_all = []

for i, (train_index, test_index) in enumerate(skfolds.split(X_train, y_train), start=1):
    logging.info(f'Cross validate model on iteration: {i}')
    # Unfitted copy of the estimator, so each fold starts from scratch.
    clone_clf = clone(model)
    X_train_folds = X_train.iloc[train_index]
    y_train_folds = y_train.iloc[train_index]
    X_test_folds = X_train.iloc[test_index]
    y_test_folds = y_train.iloc[test_index]

    # Rebuild one dataset per split from the fold's rows, reusing the
    # transformations of the full training dataset.
    train_ds = ImageDataset(label_df=pd.concat([X_train_folds, y_train_folds], axis=1), clean=False,
                            transformations=train_dataset.transformations, random_state=42)
    test_ds = ImageDataset(label_df=pd.concat([X_test_folds, y_test_folds], axis=1), clean=False,
                           transformations=train_dataset.transformations, random_state=42)

    # Incremental training, one batch at a time.
    # NOTE(review): `classes` is a 2-D array with one row per label; when
    # `model` is the bare single-label estimator a 1-D class list may be the
    # intended argument — confirm against the estimator's partial_fit docs.
    for j, (x_features, x_image, y) in enumerate(train_ds.batchloader(batch_size, return_labels), start=1):
        X = _assemble_features(x_features, x_image)
        logging.info(f'Training model on batch {j}')
        clone_clf.partial_fit(X, y, classes=classes)

    # Batched validation: collect targets, hard predictions and probabilities.
    y_true = []
    y_pred = []
    y_pred_proba = []
    for j, (x_features, x_image, y) in enumerate(test_ds.batchloader(batch_size, return_labels), start=1):
        X = _assemble_features(x_features, x_image)
        logging.info(f'Validate model on batch {j}')
        y_true.append(y.to_numpy())
        y_pred_proba.append(clone_clf.predict_proba(X))
        y_pred.append(clone_clf.predict(X))

    # Stitch the per-batch results back together along the sample axis.
    y_true_all = np.concatenate(y_true, axis=0)
    y_pred_all = np.concatenate(y_pred, axis=0)
    y_pred_proba_all = np.concatenate(y_pred_proba, axis=0)

    accuracy = accuracy_score(y_true=y_true_all, y_pred=y_pred_all)
    # Column 1 of the probability matrix is used as the ranking score for AUC.
    auc = roc_auc_score(y_true=y_true_all, y_score=y_pred_proba_all[:, 1])
    logging.info(f'accuracy: {accuracy}, auc: {auc}')

    accuracy_all.append(accuracy)
    auc_all.append(auc)

logging.info(f'accuracy_all: {accuracy_all}')
logging.info(f'auc_all: {auc_all}')

2021-07-25 20:06:12.790 : Cross validate model on iteration: 1
2021-07-25 20:06:39.063 : Training model on batch 1
2021-07-25 20:07:02.895 : Training model on batch 2
2021-07-25 20:07:25.462 : Training model on batch 3
2021-07-25 20:07:47.543 : Training model on batch 4
2021-07-25 20:08:12.665 : Training model on batch 5
