In [1]:
import zlib
import requests
from pathlib import Path

import torch
from torch import nn
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
from xynn.autoint import AutoIntClassifier

In [3]:
# Show the library's text diagram of the AutoInt architecture (see output below).
AutoIntClassifier.diagram()


if mlp_hidden_sizes (default)
-----------------------------
X_num ─ Num. embedding ┐ ┌─ Attn ─ ... ─ Attn ─ MLP ─┐
                       ├─┤                           w+ ── output
X_cat ─ Cat. embedding ┘ └────────── MLP ────────────┘

if no mlp_hidden_sizes
----------------------
X_num ─ Num. embedding ┬─ Attn ─ ... ─ Attn ─ Linear ─ output
X_cat ─ Cat. embedding ┘ 

splits are copies and joins are concatenations;
'w+' is weighted element-wise addition;
"Attn" is AutoInt's AttentionInteractionLayer



In [4]:
# Random seed handed to the model constructor (`seed=SEED`).
SEED = 34589

# Download ForestCoverType dataset

In [5]:
# Remote gzip-compressed archive of the dataset.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/covtype/covtype.data.gz"
# Local path for the decompressed file; it is reused on later runs if present.
datapath = Path('../data/forest_cover/forest-cover-type.csv')

In [6]:
# Download and decompress the dataset once; later runs reuse the local copy.
if datapath.exists():
    print("File already exists.")
else:
    print("Downloading file...")
    datapath.parent.mkdir(parents=True, exist_ok=True)
    # A timeout keeps the cell from hanging forever on a stalled connection.
    response = requests.get(url, timeout=60)
    # Fail loudly on an HTTP error instead of handing an error page to zlib.
    response.raise_for_status()
    # MAX_WBITS | 32 lets zlib auto-detect the gzip (or zlib) header.
    data = zlib.decompress(response.content, zlib.MAX_WBITS | 32)
    # Written only after a successful download and decompression, so a failure
    # above never leaves a file that the exists() check would later accept.
    datapath.write_bytes(data)

File already exists.


# Load data and split

In [7]:
# Column names for the header-less data file, in file order:
# integer measurements, then the flag columns, then the label.
int_columns = [
    "Elevation",
    "Aspect",
    "Slope",
    "Horizontal_Distance_To_Hydrology",
    "Vertical_Distance_To_Hydrology",
    "Horizontal_Distance_To_Roadways",
    "Hillshade_9am",
    "Hillshade_Noon",
    "Hillshade_3pm",
    "Horizontal_Distance_To_Fire_Points",
]

# Soil_Type1 .. Soil_Type40
soil_types = [f"Soil_Type{n}" for n in range(1, 41)]

# Wilderness_Area1 .. Wilderness_Area4, followed by all soil types
bool_columns = [f"Wilderness_Area{n}" for n in range(1, 5)] + soil_types

# Name of the label column (last column of the file)
target = "Covertype"

# Despite the name, this list also ends with the label column; it is the
# full set of names for the file's columns.
feature_columns = [*int_columns, *bool_columns, target]

In [8]:
# The file is read without a header row, so column names are supplied explicitly.
# Note: `train` holds the full dataset here; the train/validation split happens later.
train = pd.read_csv(datapath, header=None, names=feature_columns)
train.head()

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,...,Soil_Type32,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40,Covertype
0,2596,51,3,258,0,510,221,232,148,6279,...,0,0,0,0,0,0,0,0,0,5
1,2590,56,2,212,-6,390,220,235,151,6225,...,0,0,0,0,0,0,0,0,0,5
2,2804,139,9,268,65,3180,234,238,135,6121,...,0,0,0,0,0,0,0,0,0,2
3,2785,155,18,242,118,3090,238,238,122,6211,...,0,0,0,0,0,0,0,0,0,2
4,2595,45,2,153,-1,391,220,234,150,6172,...,0,0,0,0,0,0,0,0,0,5


In [9]:
# All input columns (integer and flag columns alike) are used as numeric features.
X_num = train[int_columns + bool_columns]
# Double brackets select a one-column DataFrame, so `y.values` is 2-D with shape (n_rows, 1).
y = train[[target]]

In [10]:
# Hold out 20% of the rows for validation. The split uses its own fixed
# random_state (0), separate from SEED.
(X_num_train, X_num_valid, y_train, y_valid) = train_test_split(
    X_num.values,
    y.values,
    test_size=0.2,
    random_state=0,
)

# No separate categorical inputs: every column is passed through X_num.
X_cat_train = X_cat_valid = None

In [11]:
# Standardize each feature using statistics computed on the training split
# only; the same shift and scale are then applied to the validation split.
mean = X_num_train.mean(axis=0, keepdims=True)
stdv = np.sqrt(X_num_train.var(ddof=1, axis=0, keepdims=True))
# A column that is constant in the training split has zero spread, and dividing
# by it would produce NaN/inf. Scale such columns by 1.0 so they are only centered.
stdv = np.where(stdv > 0, stdv, 1.0)

X_num_train = (X_num_train - mean) / stdv
X_num_valid = (X_num_valid - mean) / stdv

In [12]:
# Sanity check: shapes of the standardized training and validation feature arrays.
X_num_train.shape, X_num_valid.shape

((464809, 54), (116203, 54))

# Model

In [13]:
def accuracy(y_pred, y_true):
    """Return classification accuracy as a percentage (0-100).

    Parameters
    ----------
    y_pred : torch.Tensor
        Per-class scores with classes along dim 1; the predicted class is
        the argmax over that dimension.
    y_true : torch.Tensor
        Class labels, one per sample. Any shape with the same number of
        elements as the predictions is accepted, e.g. (n,) or (n, 1).

    Returns
    -------
    torch.Tensor
        Scalar tensor with the percentage of predictions equal to the labels.

    Raises
    ------
    RuntimeError
        If `y_true` cannot be reshaped to the shape of the predicted classes.
    """
    y_pred = torch.argmax(y_pred, dim=1)
    # Align labels with predictions. Without this, labels shaped (n, 1) would
    # broadcast against (n,) into an (n, n) comparison and give a wrong count.
    y_true = y_true.reshape(y_pred.shape)
    correct = torch.eq(y_pred, y_true).sum()
    return 100 * correct / y_pred.shape[0]

Note: this example uses a GPU. If you don't have one, comment out the `device="cuda"` argument in the cell below.

In [14]:
# Build the AutoInt classifier with an MLP branch (hidden sizes 256-192-128-64).
model = AutoIntClassifier(
    attn_activation=None,
    attn_dropout=0.0,
    attn_normalize=False,
    mlp_hidden_sizes=(256, 192, 128, 64),
    mlp_activation=nn.LeakyReLU,
    mlp_use_bn=True,
    mlp_dropout=0.0,
    mlp_use_skip=True,
    use_leaky_gate=True,
    seed=SEED,
    # Use the GPU when one is available; otherwise fall back to the CPU so the
    # notebook still runs end to end on machines without CUDA.
    device="cuda" if torch.cuda.is_available() else "cpu",
)

In [15]:
# Train the model: Adam optimizer with a step-decayed learning rate, one
# validation set, and early stopping configured on the "accuracy" metric.
model.fit(
    # --- data ---
    X_num=X_num_train,
    X_cat=X_cat_train,
    y=y_train,
    val_sets=[[X_num_valid, X_cat_valid, y_valid]],
    # --- optimization ---
    num_epochs=100,
    batch_size=2048,
    optimizer=torch.optim.Adam,
    opt_kwargs=dict(lr=1e-2),
    scheduler=torch.optim.lr_scheduler.StepLR,
    # gamma = 0.1 ** (1/8), about 0.75, applied every 5 scheduler steps
    sch_kwargs=dict(step_size=5, gamma=0.1 ** 0.125),
    # --- metrics and early stopping ---
    extra_metrics=[("accuracy", accuracy)],
    early_stopping_metric="accuracy",
    early_stopping_mode="max",
    early_stopping_patience=10,
    # --- optional outputs (disabled) ---
    #log_path=f"autoint_forest_log_seed{SEED}.txt",  # save epoch info to file
    #param_path=f"autoint_forest_seed{SEED}.pkl",  # auto-restore best model
    verbose=True,
)

epoch  lrn rate  non-mlp  train loss   val loss   accuracy
──────────────────────────────────────────────────────────
    0    0.0100     0.49      0.3614     0.3636      84.94         
    1    0.0100     0.48      0.2831     0.2732      88.67         
    2    0.0100     0.48      0.2240     0.2287      90.76         
    3    0.0100     0.49      0.2420     0.2083      91.59         
    4    0.0100     0.49      0.1802     0.1835      92.60         
    5   0.00750     0.49      0.1643     0.1598      93.49         
    6   0.00750     0.49      0.1446     0.1498      93.99         
    7   0.00750     0.50      0.1273     0.1451      94.19         
    8   0.00750     0.50      0.1523     0.1448      94.10         
    9   0.00750     0.50      0.1181     0.1336      94.70         
   10   0.00562     0.50      0.1045     0.1203      95.19          
   11   0.00562     0.50      0.1229     0.1186      95.27          
   12   0.00562     0.51      0.1190     0.1185      95.26      