In [1]:
import numpy as np
import pandas as pd
from debiasm import DebiasMClassifier

## Build synthetic data for the example

In [2]:
## Seed numpy's global RNG so every draw below is reproducible.
## NOTE: the three draws must stay in this order (X, then y, then batches),
## otherwise the generated values change.
np.random.seed(123)

## dataset dimensions
n_samples, n_batches, n_features = 96*5, 5, 100

## the read count matrix: uniform [0, 1) draws scaled by 1000, truncated to ints
X = (1000 * np.random.rand(n_samples, n_features)).astype(int)

## the labels: one boolean per sample, True when the uniform draw exceeds 0.5
y = np.greater(np.random.rand(n_samples), 0.5)

## the batches: one integer id per sample, in 0 .. n_batches-1
batches = (n_batches * np.random.rand(n_samples)).astype(int)

In [3]:
## Input convention used in this example: batch ids are ints numbered from '0'
## and they occupy the first column of the input X matrices.
## For now, if you have only one batch, just set the first column to all zeros.

## turn the 1-D batch ids into a single column and prepend it to the counts
X_with_batch = np.concatenate((batches.reshape(-1, 1), X), axis=1)
X_with_batch[:5, :5]

array([[  4, 696, 286, 226, 551],
       [  4, 513, 666, 105, 130],
       [  3, 542,  66, 653, 996],
       [  3,  16, 721,   7,  84],
       [  1, 456, 279, 932, 314]])

In [4]:
## preview the first five labels
y[:5]

array([ True, False, False,  True,  True])

In [5]:
## hold out batch '4' as the validation batch; every other batch is used for training
val_inds = np.equal(batches, 4)

## rows flagged by the mask form the validation split ...
X_val, y_val = X_with_batch[val_inds], y[val_inds]
## ... and the complement of the mask forms the training split
X_train, y_train = X_with_batch[~val_inds], y[~val_inds]

## run DEBIAS-M, using standard sklearn object formats

In [6]:
## sanity check: number of training labels left after removing the validation batch
y_train.shape

(374,)

In [7]:
## sanity check: training rows x (1 batch column + n_features count columns)
X_train.shape

(374, 101)

In [8]:
## Build the classifier and fit it on the training batches.
## Only the held-out inputs (X_val) are passed to the constructor -- y_val is not --
## so the model can account for those domain shifts while training.
dmc = DebiasMClassifier(x_val=X_val) ## give it the held-out inputs

dmc.fit(X_train, y_train)
print('finished training!')

  f"Setting `Trainer(progress_bar_refresh_rate={progress_bar_refresh_rate})` is deprecated in v1.5 and"
  "Setting `Trainer(weights_summary=None)` is deprecated in v1.5 and will be removed"
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
  f"The dataloader, {name}, does not have many workers which may be a bottleneck."
  y_hat = softmax(x)
  f"The dataloader, {name}, does not have many workers which may be a bottleneck."
  f"The number of training samples ({self.num_training_batches}) is smaller than the logging interval"
Trainer was signaled to stop but required minimum epochs (25) or minimum steps (None) has not been met. Training will continue...
Trainer was signaled to stop but required minimum epochs (25) or minimum steps (None) has not been met. Training will continue...
Trainer was signaled to stop but required minimum epochs (25) or minimum steps (None) has not been met. Training will continue...
Trainer was signale

finished training!


## Assess results

In [9]:
from sklearn.metrics import roc_auc_score

## Score the held-out batch using column 1 of the predicted probabilities
## (presumably the positive class -- confirm against DebiasMClassifier).
## The AUC should be roughly 0.5 in this notebook, since the data is all random.
val_scores = dmc.predict_proba(X_val)[:, 1]
roc_auc_score(y_val, val_scores)

0.46737967914438505

## Extract the 'DEBIAS-ed' data

In [12]:
## Run the fitted model's transform on the full matrix (batch column included),
## then preview the top-left 5x5 corner of the result.
X_debiassed = dmc.transform(X_with_batch)
X_debiassed[:5, :5]

array([[0.01457723, 0.00534167, 0.00494237, 0.01204327, 0.01374663],
       [0.01068423, 0.01236932, 0.00228337, 0.00282551, 0.00610285],
       [0.01086351, 0.00140875, 0.01448351, 0.02095727, 0.01629219],
       [0.00031793, 0.01525674, 0.00015392, 0.00175223, 0.00472576],
       [0.00966502, 0.00629951, 0.01727931, 0.00599989, 0.01835283]],
      dtype=float32)