# Load data

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
# Read the table. Fields are split on ', ' (comma followed by a space);
# a multi-character separator requires pandas' python parsing engine.
df = pd.read_csv('../data/OnlineNewsPopularity.csv', header=0, sep=', ', engine='python')

# Discard the two columns that are not used as model inputs.
df = df.drop(columns=['url', 'timedelta'])

# Replace the share count by a boolean label: True when it exceeds 1400.
df['shares'] = df['shares'].values > 1400

# Cast the whole table to float, then separate the label from the features.
values = df.values.astype(float)
X_cols = np.array(df.columns) != 'shares'
X = values[:, X_cols]
Y = values[:, ~X_cols]  # mask indexing keeps Y two-dimensional
feature_names = np.array(df.columns)[X_cols]
num_features = X.shape[1]

# Hold out 10% of the rows for testing, then 10% of what remains for
# validation. Both splits use the same fixed seed so they are repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.1, random_state=123)
X_train, X_val, Y_train, Y_val = train_test_split(
    X_train, Y_train, test_size=0.1, random_state=123)

In [3]:
# Fit the scaler on the training split only, then apply those same
# statistics to the train, validation and test splits. Every feature
# column is passed through the scaler.
ss = StandardScaler()
ss.fit(X_train)
X_train, X_val, X_test = (
    ss.transform(split) for split in (X_train, X_val, X_test))

# Train model

In [4]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from torch.utils.data import RandomSampler, BatchSampler
from madgrad import MADGRAD
from copy import deepcopy

In [5]:
# Convert each numpy split into a float32 torch tensor.
X_train_pt, X_val_pt, X_test_pt = (
    torch.tensor(split, dtype=torch.float32)
    for split in (X_train, X_val, X_test))
Y_train_pt, Y_val_pt, Y_test_pt = (
    torch.tensor(split, dtype=torch.float32)
    for split in (Y_train, Y_val, Y_test))

In [6]:
# Set up model
# Prefer GPU index 1 (the device this notebook was written for), but fall
# back to the first GPU or the CPU so that `.to(device)` does not fail on
# machines that have fewer than two GPUs.
if torch.cuda.device_count() > 1:
    device = torch.device('cuda', 1)
elif torch.cuda.is_available():
    device = torch.device('cuda', 0)
else:
    device = torch.device('cpu')

# Two hidden layers of 128 units; the single output is a logit.
model = nn.Sequential(
    nn.Linear(num_features, 128),
    nn.ReLU(inplace=True),
    nn.Linear(128, 128),
    nn.ReLU(inplace=True),
    nn.Linear(128, 1)).to(device)

# Training parameters
batch_size = 256
max_epochs = 100
lr = 2e-4
lookback = 10  # epochs without improvement before stopping early
validation_batch_size = 1000
verbose = True

# Set up train data loader
# Samples are drawn with replacement; the number of draws per epoch is
# rounded up to a whole number of batches, so every batch is full.
train_set = TensorDataset(X_train_pt, Y_train_pt)
random_sampler = RandomSampler(
    train_set, replacement=True,
    num_samples=int(np.ceil(len(X_train) / batch_size))*batch_size)
batch_sampler = BatchSampler(
    random_sampler, batch_size=batch_size, drop_last=True)
train_loader = DataLoader(train_set, batch_sampler=batch_sampler)

# Prepare validation dataset
val_set = TensorDataset(X_val_pt, Y_val_pt)
val_loader = DataLoader(val_set, batch_size=validation_batch_size)

def validate(model, loader, loss_fn):
    """Compute the sample-weighted mean of `loss_fn` over `loader`.

    Args:
      model: module whose parameters determine the device used for the
        evaluation; it is called as `model(x)`.
      loader: iterable yielding `(x, y)` tensor pairs.
      loss_fn: callable `loss_fn(pred, y)`. The weighting by batch length
        yields the per-sample mean when `loss_fn` itself returns the mean
        over its batch (assumed; true for the default reduction of the
        torch loss modules).

    Returns:
      A tensor holding the running mean of the batch losses, each batch
      weighted by its length. A zero tensor is returned for an empty
      loader, so callers can always call `.item()` on the result.
    """
    with torch.no_grad():
        # Setup.
        device = next(model.parameters()).device
        # Start from a tensor (not the int 0) so the return type does not
        # depend on whether the loader produced any batch.
        mean_loss = torch.zeros((), device=device)
        N = 0

        for x, y in loader:
            x = x.to(device)
            y = y.to(device)
            pred = model(x)
            loss = loss_fn(pred, y)
            N += len(x)
            # Incremental weighted-mean update; done out-of-place so the
            # accumulator follows the dtype of the loss.
            mean_loss = mean_loss + len(x) * (loss - mean_loss) / N

    return mean_loss

# Setup for training
device = next(model.parameters()).device
optimizer = MADGRAD(model.parameters(), lr=lr)
loss_fn = nn.BCEWithLogitsLoss()
best_loss = np.inf
best_epoch = 0
best_model = None  # snapshot of the model at the best validation loss
loss_list = []     # validation loss after each epoch

for epoch in range(max_epochs):
    for i, (x, y) in enumerate(train_loader):
        # Prepare data
        x = x.to(device)
        y = y.to(device)

        # Make predictions
        pred = model(x)
        loss = loss_fn(pred, y)

        # Optimizer step; gradients are cleared afterwards so the next
        # backward pass starts from zero.
        loss.backward()
        optimizer.step()
        model.zero_grad()

    # Print progress
    val_loss = validate(model, val_loader, loss_fn).item()
    loss_list.append(val_loss)
    if verbose:
        print('----- Epoch = {} -----'.format(epoch + 1))
        print('Val loss = {:.4f}'.format(val_loss))
        print('')

    # Check if best model
    if val_loss < best_loss:
        best_loss = val_loss
        best_model = deepcopy(model)
        best_epoch = epoch
        if verbose:
            print('New best epoch, loss = {:.4f}'.format(val_loss))
            print('')
    elif epoch - best_epoch == lookback:
        # No improvement for `lookback` consecutive epochs.
        if verbose:
            print('Stopping early')
        break

# Clean up
# `best_model` is still None when no epoch improved on the initial `inf`
# (a NaN validation loss never compares as smaller, and the loop body does
# not run at all for max_epochs == 0). Report that explicitly instead of
# failing on `None.parameters()`.
if best_model is None:
    raise RuntimeError(
        'Training never produced a usable validation loss; '
        'no best model to restore.')

# Point the live model's parameters at the best snapshot's weights.
for param, best_param in zip(model.parameters(),
                             best_model.parameters()):
    param.data = best_param.data

----- Epoch = 1 -----
Val loss = 0.6317

New best epoch, loss = 0.6317

----- Epoch = 2 -----
Val loss = 0.6212

New best epoch, loss = 0.6212

----- Epoch = 3 -----
Val loss = 0.6171

New best epoch, loss = 0.6171

----- Epoch = 4 -----
Val loss = 0.6143

New best epoch, loss = 0.6143

----- Epoch = 5 -----
Val loss = 0.6108

New best epoch, loss = 0.6108

----- Epoch = 6 -----
Val loss = 0.6114

----- Epoch = 7 -----
Val loss = 0.6113

----- Epoch = 8 -----
Val loss = 0.6108

New best epoch, loss = 0.6108

----- Epoch = 9 -----
Val loss = 0.6097

New best epoch, loss = 0.6097

----- Epoch = 10 -----
Val loss = 0.6144

----- Epoch = 11 -----
Val loss = 0.6162

----- Epoch = 12 -----
Val loss = 0.6109

----- Epoch = 13 -----
Val loss = 0.6100

----- Epoch = 14 -----
Val loss = 0.6132

----- Epoch = 15 -----
Val loss = 0.6099

----- Epoch = 16 -----
Val loss = 0.6133

----- Epoch = 17 -----
Val loss = 0.6110

----- Epoch = 18 -----
Val loss = 0.6164

----- Epoch = 19 -----
Val loss = 0.

In [7]:
# Save model
# Put the network in eval mode and move it to the CPU (both calls act on
# the module in place and return it), then serialize the module object.
torch.save(model.eval().cpu(), '../models/news_model.pt')

# Train surrogate

In [8]:
import torch
import torch.nn as nn
from fastshap_torch.utils import MaskLayer1d
from fastshap_torch import Surrogate, SoftCrossEntropyLoss
import matplotlib.pyplot as plt

In [9]:
# Create data
# The trained model's sigmoid outputs become soft two-class targets,
# laid out as [1 - p, p] per row, for the surrogate.
model.to(device)

# Inference only: disable autograd so the forward passes over the full
# training and validation sets do not keep a computation graph in memory.
with torch.no_grad():
    Y_train_surrogate = model(X_train_pt.to(device=device)).sigmoid().cpu().numpy()
    Y_val_surrogate = model(X_val_pt.to(device=device)).sigmoid().cpu().numpy()

Y_train_surrogate = np.concatenate([1 - Y_train_surrogate, Y_train_surrogate], axis=1)
Y_val_surrogate = np.concatenate([1 - Y_val_surrogate, Y_val_surrogate], axis=1)

In [10]:
# Get loss upper bound
# Soft cross entropy obtained when every row is predicted with the same
# distribution `p`, the column-wise mean of the surrogate targets.
p = np.mean(Y_train_surrogate, axis=0)
per_row_ce = (Y_train_surrogate * np.log(p)).sum(axis=1)
soft_ce = -per_row_ce.mean()
print('Loss given no information = {:.4f}'.format(soft_ce))

Loss given no information = 0.6931


In [11]:
# Set up device
# Prefer GPU index 6 (the device this notebook was written for), but fall
# back to the first GPU or the CPU so that `.to(device)` does not fail on
# machines that have fewer than seven GPUs.
if torch.cuda.device_count() > 6:
    device = torch.device('cuda', 6)
elif torch.cuda.is_available():
    device = torch.device('cuda', 0)
else:
    device = torch.device('cpu')

# Create model
# The first linear layer takes 2 * num_features inputs because the mask
# layer is created with append=True (presumably it concatenates the mask
# to the masked features; confirm against MaskLayer1d).
surrogate = nn.Sequential(
    MaskLayer1d(value=0, append=True),
    nn.Linear(2 * num_features, 128),
    nn.ReLU(inplace=True),
    nn.Linear(128, 128),
    nn.ReLU(inplace=True),
    nn.Linear(128, 2)).to(device)

# Set up surrogate object
surr = Surrogate(surrogate, num_features)

In [12]:
# Train
# Options shared by every training stage.
stage_options = dict(
    max_epochs=100,
    validation_samples=50,
    validation_batch_size=10000,
    validation_seed=0,
    bar=False,
    verbose=True)
train_data = (X_train, Y_train_surrogate)
val_data = (X_val, Y_val_surrogate)

# Call surr.train once per batch size, smallest first, and report the
# lowest loss in surr.loss_list after each call.
# NOTE: the loop variable rebinds the notebook-level `batch_size`.
for batch_size in (32, 512, 8192):
    surr.train(train_data,
               val_data,
               batch_size=batch_size,
               loss_fn=SoftCrossEntropyLoss(),
               **stage_options)

    print('Best loss = {:.4f}'.format(min(surr.loss_list)))

----- Epoch = 1 -----
Val loss = 0.6487

New best epoch, loss = 0.6487

----- Epoch = 2 -----
Val loss = 0.6442

New best epoch, loss = 0.6442

----- Epoch = 3 -----
Val loss = 0.6433

New best epoch, loss = 0.6433

----- Epoch = 4 -----
Val loss = 0.6425

New best epoch, loss = 0.6425

----- Epoch = 5 -----
Val loss = 0.6415

New best epoch, loss = 0.6415

----- Epoch = 6 -----
Val loss = 0.6417

----- Epoch = 7 -----
Val loss = 0.6410

New best epoch, loss = 0.6410

----- Epoch = 8 -----
Val loss = 0.6408

New best epoch, loss = 0.6408

----- Epoch = 9 -----
Val loss = 0.6409

----- Epoch = 10 -----
Val loss = 0.6406

New best epoch, loss = 0.6406

----- Epoch = 11 -----
Val loss = 0.6409

----- Epoch = 12 -----
Val loss = 0.6404

New best epoch, loss = 0.6404

----- Epoch = 13 -----
Val loss = 0.6408

----- Epoch = 14 -----
Val loss = 0.6406

----- Epoch = 15 -----
Val loss = 0.6411

----- Epoch = 16 -----
Val loss = 0.6402

New best epoch, loss = 0.6402

----- Epoch = 17 -----
Val 

In [14]:
# Move the surrogate to the CPU and put it in eval mode (both calls act on
# the module in place and return it), then serialize the module object.
torch.save(surrogate.cpu().eval(), '../models/news_surrogate.pt')