# Bike

In [None]:
import sage
import pickle
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split

In [None]:
# Fetch the bike dataset through sage's dataset helper. Every column except
# the trailing three is used as an input feature.
df = sage.datasets.bike()
feature_names = df.columns[:-3].tolist()

In [None]:
# Carve out test and validation sets, each sized at 10% of the full dataset.
# The last column (total count) is the regression target; the last three
# columns are dropped from the inputs.
holdout_size = int(0.1 * len(df.values))
train, test = train_test_split(
    df.values, test_size=holdout_size, random_state=123)
train, val = train_test_split(
    train, test_size=holdout_size, random_state=123)

# Right-hand sides are evaluated before rebinding, so targets are read from
# the full arrays before the feature slices replace them.
train, Y_train = train[:, :-3].copy(), train[:, -1].copy()
val, Y_val = val[:, :-3].copy(), val[:, -1].copy()
test, Y_test = test[:, :-3].copy(), test[:, -1].copy()

In [None]:
# Restore the previously trained bike model from its pickle file.
with open('trained_models/bike model.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

In [None]:
# Build a marginal imputer from the first 512 test rows, wrap it in a
# permutation estimator using the 'mse' loss, and run it on the test set.
background = test[:512]
imputer = sage.MarginalImputer(model, background)
estimator = sage.PermutationEstimator(imputer, 'mse')
bike_sage = estimator(test, Y_test, thresh=0.01)

In [None]:
# Write the bike SAGE results to the results directory.
bike_sage.save('results/bike_sage.pkl')

# Bank

In [None]:
import sage
import pickle
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import log_loss
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split

In [None]:
# Fetch the bank dataset through sage's dataset helper.
df = sage.datasets.bank()

# Every column but the last is a feature. The columns listed below are the
# categorical ones; their positions are recorded for the CatBoost model.
feature_names = df.columns[:-1].tolist()
categorical_cols = [
    'Job', 'Marital', 'Education', 'Default', 'Housing', 'Loan', 'Contact',
    'Month', 'Prev Outcome',
]
categorical_inds = list(map(feature_names.index, categorical_cols))

In [None]:
# Carve out test and validation sets, each sized at 10% of the full dataset.
# The last column holds the label and is cast to int.
holdout_size = int(0.1 * len(df.values))
train, test = train_test_split(
    df.values, test_size=holdout_size, random_state=123)
train, val = train_test_split(
    train, test_size=holdout_size, random_state=123)

# Right-hand sides are evaluated before rebinding, so labels are read from
# the full arrays before the feature slices replace them.
train, Y_train = train[:, :-1].copy(), train[:, -1].copy().astype(int)
val, Y_val = val[:, :-1].copy(), val[:, -1].copy().astype(int)
test, Y_test = test[:, :-1].copy(), test[:, -1].copy().astype(int)

In [None]:
# Restore the previously trained bank model from its pickle file.
with open('trained_models/bank model.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

In [None]:
# Build a marginal imputer from the first 512 test rows, wrap it in a
# permutation estimator using the 'cross entropy' loss, and run it on the
# test set.
background = test[:512]
imputer = sage.MarginalImputer(model, background)
estimator = sage.PermutationEstimator(imputer, 'cross entropy')
bank_sage = estimator(test, Y_test, thresh=0.01)

In [None]:
# Write the bank SAGE results to the results directory.
bank_sage.save('results/bank_sage.pkl')

# Credit

In [None]:
import sage
import numpy as np
from sklearn.metrics import log_loss
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split

In [None]:
# Fetch the credit dataset through sage's dataset helper.
df = sage.datasets.credit()

# Every column but the last is a feature. The columns listed below are the
# categorical ones; their positions are recorded for the CatBoost model.
feature_names = df.columns[:-1].tolist()
categorical_columns = [
    'Checking Status',
    'Credit History',
    'Purpose',
    'Credit Amount',
    'Savings Account/Bonds',
    'Employment Since',
    'Personal Status',
    'Debtors/Guarantors',
    'Property Type',
    'Other Installment Plans',
    'Housing Ownership',
    'Job',
    'Telephone',
    'Foreign Worker',
]
categorical_inds = list(map(feature_names.index, categorical_columns))

In [None]:
# Carve out test and validation sets, each sized at 10% of the full dataset.
# The last column holds the label and is cast to int.
holdout_size = int(0.1 * len(df.values))
train, test = train_test_split(
    df.values, test_size=holdout_size, random_state=0)
train, val = train_test_split(
    train, test_size=holdout_size, random_state=0)

# Right-hand sides are evaluated before rebinding, so labels are read from
# the full arrays before the feature slices replace them.
train, Y_train = train[:, :-1].copy(), train[:, -1].copy().astype(int)
val, Y_val = val[:, :-1].copy(), val[:, -1].copy().astype(int)
test, Y_test = test[:, :-1].copy(), test[:, -1].copy().astype(int)

In [None]:
# The Credit section's import cell does not import pickle, so import it here
# to keep this cell runnable when the section is executed on its own.
import pickle

# Restore the previously trained credit model from its pickle file.
# NOTE: pickle.load executes arbitrary code from the file; only load model
# files that you produced yourself or otherwise trust.
with open('trained_models/credit model.pkl', 'rb') as f:
    model = pickle.load(f)

In [None]:
# Build a marginal imputer from the first 512 training rows, wrap it in a
# permutation estimator using the 'cross entropy' loss, and run it on the
# test set.
# NOTE(review): the background sample comes from `train` here, whereas the
# Bike and Bank sections draw it from `test` -- confirm this is intended.
background = train[:512]
imputer = sage.MarginalImputer(model, background)
estimator = sage.PermutationEstimator(imputer, 'cross entropy')
credit_sage = estimator(test, Y_test, thresh=0.01)

In [None]:
# Write the credit SAGE results to the results directory.
credit_sage.save('results/credit_sage.pkl')

# BRCA

In [None]:
import sage
import pickle
import numpy as np
import pandas as pd
from sklearn.metrics import log_loss
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [None]:
# Reduced gene set: the columns kept from the expression matrix, in order.
gene_names = [
    'BCL11A', 'IGF1R', 'CCND1', 'CDK6', 'BRCA1',
    'BRCA2', 'EZH2', 'SFTPD', 'CDC5L', 'ADMR',
    'TSPAN2', 'EIF5B', 'ADRA2C', 'MRCL3', 'CCDC69',
    'ADCY4', 'TEX14', 'RRM2B', 'SLC22A5', 'HRH1',
    'SLC25A1', 'CEBPE', 'IWS1', 'FLJ10213', 'PSMD10',
    'MARCH6', 'PDLIM4', 'SNTB1', 'CHCHD1', 'SCMH1',
    'FLJ20489', 'MDP-1', 'FLJ30092', 'YTHDC2', 'LFNG',
    'HOXD10', 'RPS6KA5', 'WDR40B', 'CST9L', 'ISLR',
    'TMBIM1', 'TRABD', 'ARHGAP29', 'C15orf29', 'SCAMP4',
    'TTC31', 'ZNF570', 'RAB42', 'SERPINI2', 'C9orf21',
]

In [None]:
# Load the expression matrix. The second line of the file (row index 1) is
# skipped, the first column becomes the index, and the table is transposed
# so that rows are indexed by sample and columns by gene.
expression = pd.read_table('data/BRCA_TCGA_microarray.txt',
                           sep='\t', header=0,
                           skiprows=lambda x: x == 1, index_col=0).T
# Keep only the first three dash-separated fields of each sample name,
# re-joined with dots.
expression.index = pd.Index(
    ['.'.join(sample.split('-')[:3]) for sample in expression.index])

# Filter for reduced gene set.
expression = expression[gene_names]

# Impute missing values with each gene's mean.
expression = expression.fillna(expression.mean())

# Load labels (headerless two-column table: sample name, label).
labels = pd.read_table('data/TCGA_breast_type.tsv',
                       sep='\t', header=None,
                       index_col=0, names=['Sample', 'Label'])

# Filter for common samples. Index.isin does hashed membership tests, which
# avoids a linear scan of `intersection` for every row.
expression_index = expression.index.values
labels_index = labels.index.values
intersection = np.intersect1d(expression_index, labels_index)
# .copy() makes the filtered frame independent so that adding the 'Label'
# column below is an unambiguous write.
expression = expression.loc[expression.index.isin(intersection)].copy()
labels = labels.loc[labels.index.isin(intersection)]

# Join expression data with labels. A dict gives constant-time lookups;
# setdefault keeps the first label seen for a sample, matching what a
# list.index() lookup would return if a sample were listed more than once.
label_data = labels['Label'].values
label_index = list(labels.index)
first_label = {}
for sample, label in zip(label_index, label_data):
    first_label.setdefault(sample, label)
expression['Label'] = np.array(
    [first_label[sample] for sample in expression.index])
# Replace label strings by integer category codes.
expression['Label'] = pd.Categorical(expression['Label']).codes
data = expression.values

# Carve out test and validation sets, each sized at 20% of the full dataset.
# The last column holds the label code and is cast to int.
holdout_size = int(0.2 * len(data))
train, test = train_test_split(
    data, test_size=holdout_size, random_state=0)
train, val = train_test_split(
    train, test_size=holdout_size, random_state=0)

# Right-hand sides are evaluated before rebinding, so labels are read from
# the full arrays before the feature slices replace them.
train, Y_train = train[:, :-1].copy(), train[:, -1].copy().astype(int)
val, Y_val = val[:, :-1].copy(), val[:, -1].copy().astype(int)
test, Y_test = test[:, :-1].copy(), test[:, -1].copy().astype(int)

# Standardize every split with the per-feature mean and standard deviation
# computed on the training split only.
mean = train.mean(axis=0)
std = train.std(axis=0)
train, val, test = [(split - mean) / std for split in (train, val, test)]

In [None]:
# Restore the previously trained BRCA model from its pickle file.
with open('trained_models/brca model.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

In [None]:
# Build a marginal imputer from the whole (standardized) training split, wrap
# it in a permutation estimator using the 'cross entropy' loss, and run it on
# the test set.
background = train
imputer = sage.MarginalImputer(model, background)
estimator = sage.PermutationEstimator(imputer, 'cross entropy')
brca_sage = estimator(test, Y_test, thresh=0.01)

In [None]:
# Write the BRCA SAGE results to the results directory.
brca_sage.save('results/brca_sage.pkl')

# MNIST

In [None]:
import sage
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
import torchvision.datasets as dsets

In [None]:
# Load the MNIST training set; flatten each image to 784 values and divide
# the pixel values by 255.
train = dsets.MNIST('../data', train=True, download=True)
imgs = train.data.reshape(-1, 784) / 255.0
labels = train.targets

# Apply one random permutation to images and labels alike, then use the
# first 6000 examples for validation and the remainder for training.
inds = torch.randperm(len(train))
imgs, labels = imgs[inds], labels[inds]
val, train = imgs[:6000], imgs[6000:]
Y_val, Y_train = labels[:6000], labels[6000:]

# Load the test set with the same flattening and scaling. The targets are
# read before `test` is rebound from the dataset object to the image tensor.
test = dsets.MNIST('../data', train=False, download=True)
Y_test = test.targets
test = test.data.reshape(-1, 784) / 255.0

# NumPy copies of the test data
test_np = test.cpu().detach().numpy()
Y_test_np = Y_test.cpu().detach().numpy()

In [None]:
# Use GPU index 3 when the machine has it, as the original setup did.
# Otherwise fall back to the first GPU, or to the CPU when CUDA is
# unavailable, instead of failing on a missing device.
if torch.cuda.is_available():
    gpu_index = 3 if torch.cuda.device_count() > 3 else 0
    device = torch.device('cuda', gpu_index)
else:
    device = torch.device('cpu')

# map_location remaps the stored tensors onto `device`, so loading also works
# when the checkpoint was saved from a device that does not exist here.
# NOTE: torch.load unpickles the file; only load checkpoints you trust.
model = torch.load('trained_models/mnist mlp.pt', map_location=device)
model = model.to(device)

# Append a softmax over dim 1 to the loaded network.
# NOTE(review): presumably the saved MLP outputs unnormalized logits and the
# 'cross entropy' loss used downstream expects probabilities -- confirm.
model = nn.Sequential(model, nn.Softmax(dim=1))

In [None]:
# Build a marginal imputer from the first 128 test images, wrap it in a
# permutation estimator using the 'cross entropy' loss, and run it on the
# NumPy test data with a batch size of 512.
background = test_np[:128]
imputer = sage.MarginalImputer(model, background)
estimator = sage.PermutationEstimator(imputer, 'cross entropy')
sage_values = estimator(
    test_np, Y_test_np, batch_size=512, thresh=0.01)

In [None]:
# Write the MNIST SAGE results to the results directory.
sage_values.save('results/mnist_sage.pkl')