In [1]:
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
import sys
sys.path.append("../src")
from data_proc import *
from baselines import *
from sklearn.linear_model import LinearRegression, LogisticRegression

pd.options.mode.chained_assignment = None

In [2]:
# Path to the raw Warfarin data file (a CSV loaded with pd.read_csv below).
# It is a relative path, so it is resolved against the directory the notebook runs from.
DATA_PATH = "../data/warfarin.csv"

# A quick look at the data

In [3]:
# Read the raw file and preprocess it in one step, bucketing the dosage
# with the three-level discretizer
data = preprocess(pd.read_csv(DATA_PATH), label_discretizer=discretize_label_3)

n_records = len(data)
print("Number of records: {}".format(n_records))
data.head()

Number of records: 5528


Unnamed: 0,daily-dosage,dosage-level,Age,Height (cm),Weight (kg),Asian,African-American,Race-Unknown,Enzyme,Amiodarone,...,CYP2C9-22,CYP2C9-23,CYP2C9-33,CYP2C9-Unknown,warfarin-treatment-3,warfarin-treatment-4,Current Smoker-1,Current Smoker-0,Congestive Heart Failure and/or Cardiomyopathy-1,Congestive Heart Failure and/or Cardiomyopathy-0
0,7.0,2,6.0,193.04,115.7,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,6.0,1,5.0,176.53,144.2,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,7.571429,2,4.0,162.56,77.1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4.0,1,6.0,182.24,90.7,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,6.0,1,5.0,167.64,72.6,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


# Evaluate performance of baseline models

In [4]:
def accuracy(s1, s2):
    """Return the fraction of positions at which `s1` and `s2` agree.

    The element-wise comparison `s1 == s2` must produce an object exposing
    `.mean()` (e.g. a pandas Series or a NumPy array). `s2` may also be a
    scalar, in which case it is compared against every element of `s1`.
    """
    is_match = s1 == s2
    return is_match.mean()

In [5]:
def evaluate_baselines(data_path, label_discretizer):
    """Print dataset statistics and the accuracy of the three baseline models.

    Reads the CSV at `data_path`, passes it through `preprocess` together
    with `label_discretizer`, then prints the number of records, the number
    of distinct dosage levels, the patient count per level, and the fraction
    of right decisions for the fixed-dose, clinical, and pharmacogenetic
    baselines. All results are printed; nothing is returned.
    """
    # Read the raw file and preprocess it in one step
    data = preprocess(pd.read_csv(data_path), label_discretizer)
    print("Number of records: {}".format(len(data)))
    print("Number of dosage levels: {}".format(data['dosage-level'].nunique()))

    # Patient count for each discrete dosage level
    for level, n_patients in data['dosage-level'].value_counts().items():
        print("{}: {}".format(level, n_patients))

    print("Evaluate performance (fraction of right decisions) of baseline models...")

    # Baseline 1: Fixed-dose -- always answer with the most frequent level
    fixed_level = data['dosage-level'].mode()[0]
    fixed_score = accuracy(data['dosage-level'], fixed_level)
    print("Fixed-dose: {}".format(fixed_score))

    # Baseline 2: Clinical Dosing Algorithm
    true_levels = data['dosage-level']
    clinical_levels = clinical_predict(data, label_discretizer)
    print("Clinical Algorithm: {}".format(accuracy(true_levels, clinical_levels)))

    # Baseline 3: Pharmacogenetic Dosing Algorithm
    true_levels = data['dosage-level']
    genetic_levels = genetic_predict(data, label_discretizer)
    print("Pharmacogenetic Algorithm: {}".format(accuracy(true_levels, genetic_levels)))

In [6]:
# Three levels: low, medium, high.
# Use the shared DATA_PATH constant so every cell reads the same file.
evaluate_baselines(DATA_PATH, discretize_label_3)

Number of records: 5528
Number of dosage levels: 3
1: 2992
0: 1835
2: 701
Evaluate performance (fraction of right decisions) of baseline models...
Fixed-dose: 0.5412445730824892
Clinical Algorithm: 0.6069102749638206
Pharmacogenetic Algorithm: 0.6617221418234442


In [7]:
# Nine levels.
# Use the shared DATA_PATH constant so every cell reads the same file.
evaluate_baselines(DATA_PATH, discretize_label_9)

Number of records: 5528
Number of dosage levels: 9
3: 1196
1: 1033
4: 814
2: 805
5: 634
8: 319
7: 251
6: 239
0: 237
Evaluate performance (fraction of right decisions) of baseline models...
Fixed-dose: 0.21635311143270622
Clinical Algorithm: 0.2333574529667149
Pharmacogenetic Algorithm: 0.269356005788712


# Build an oracle for the multi-armed bandit setting
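Following the Lasso Bandit paper, we establish an approximate oracle that estimates the true parameters of each arm using all of the data. Concretely, we train one logistic regression classifier per arm (that arm vs. the rest) on the full dataset with effectively no regularization (a very large `C`), and the oracle picks the arm whose classifier gives the highest predicted probability.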

In [8]:
def evaluate_oracle(data_path, label_discretizer):
    """Fit a one-vs-rest logistic-regression oracle on all data and print its accuracy.

    Reads the CSV at `data_path`, passes it through `preprocess` together
    with `label_discretizer`, and trains one binary logistic classifier per
    dosage level (arm) on every record. Each record is then assigned the
    level whose classifier gives it the highest positive-class probability,
    and the fraction of records assigned their true level is printed.
    Training and scoring deliberately use the same data: this is an oracle,
    not a held-out estimate. Nothing is returned.
    """
    # Load in raw data file, then preprocess data
    data = pd.read_csv(data_path)
    data = preprocess(data, label_discretizer)

    # Distinct dosage levels (i.e. arms/classes) in sorted order. Working with
    # the observed labels instead of range(K) keeps the arm-index -> label
    # mapping correct even when the labels are not exactly 0..K-1.
    levels = np.sort(data['dosage-level'].unique())

    # Prepare training data. The same ndarray is used for fitting and scoring,
    # so scikit-learn does not warn about feature names that are present at
    # predict time but were absent at fit time.
    X = data.drop(['daily-dosage', 'dosage-level'], axis=1).values
    y = [(data['dosage-level'] == level).astype(np.float32).values for level in levels]

    # Train one Logistic classifier per arm
    models = [LogisticRegression(C=100000, solver='liblinear') for _ in levels]  # large C -> effectively no regularization
    for model, target in zip(models, y):
        model.fit(X, target)

    # Row k holds arm k's positive-class probability for every record
    prediction_score = np.array([m.predict_proba(X)[:, 1] for m in models])
    # Choose the arm with highest score and translate its index back to a label
    prediction_class = levels[np.argmax(prediction_score, axis=0)]
    print("oracle accuracy: {}".format(accuracy(data['dosage-level'], prediction_class)))

In [9]:
# Oracle on the three-level labels (low, medium, high)
evaluate_oracle(data_path=DATA_PATH, label_discretizer=discretize_label_3)

oracle accuracy: 0.6863241678726484


In [10]:
# Oracle on the nine-level labels
evaluate_oracle(data_path=DATA_PATH, label_discretizer=discretize_label_9)

oracle accuracy: 0.33375542691751087
