In [1]:
import gzip
import shutil
import sys
import urllib.request
#import wget
from pathlib import Path

import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from torch import nn

In [2]:
from xynn.pnn import PNNPlusClassifier

In [3]:
# Show the model's architecture diagram (the text schematic rendered below this cell).
PNNPlusClassifier.diagram()


if product_type="outer" (default) or product_type="inner"
---------------------------------------------------------
X_num ─ Num. embedding ┐ ┌─────── Linear ────────┬─ MLP ─┐
                       ├─┼─ inner/outer product ─┘       w+ ── output
X_cat ─ Cat. embedding ┘ └───────────────────────── MLP ─┘

if product_type="both"
----------------------   ┌──── Linear ─────┐
X_num ─ Num. embedding ┐ ├─ inner product ─┼─ MLP ─┐
                       ├─┼─ outer product ─┘       w+ ── output
X_cat ─ Cat. embedding ┘ └─────────────────── MLP ─┘

splits are copies and joins are concatenations;
'w+' is weighted element-wise addition



# Download ForestCoverType dataset

In [4]:
# Remote gzipped archive, a short dataset label, and the local path where the
# decompressed CSV is cached.
dataset_name = "forest-cover-type"
url = (
    "https://archive.ics.uci.edu/ml/machine-learning-databases/covtype/covtype.data.gz"
)
out = Path("../data/forest_cover/forest-cover-type.csv")

In [5]:
# Fetch and decompress the dataset once; later runs reuse the cached CSV.
if out.exists():
    print("File already exists.")
else:
    print("Downloading file...")
    out.parent.mkdir(parents=True, exist_ok=True)

    # Scratch files next to the target: the downloaded .gz archive and a
    # partially written CSV. The CSV is only moved to `out` once it has been
    # fully decompressed, so an interrupted run never leaves a truncated file
    # that the `out.exists()` check above would later mistake for a good one.
    tmp_out = out.with_name(out.name + ".gz")
    part_out = out.with_name(out.name + ".part")
    try:
        with urllib.request.urlopen(url) as response, open(tmp_out, "wb") as f_gz:
            shutil.copyfileobj(response, f_gz)
        with gzip.open(tmp_out, "rb") as f_in, open(part_out, "wb") as f_out:
            shutil.copyfileobj(f_in, f_out)
        part_out.replace(out)
    finally:
        # Remove whatever scratch files are still around, on success or failure.
        for leftover in (tmp_out, part_out):
            if leftover.exists():
                leftover.unlink()

File already exists.


# Load data and split

In [6]:
# Label column; it is listed last in `feature_columns` below.
target = "Covertype"

# The ten integer-valued columns, in file order.
int_columns = [
    "Elevation",
    "Aspect",
    "Slope",
    "Horizontal_Distance_To_Hydrology",
    "Vertical_Distance_To_Hydrology",
    "Horizontal_Distance_To_Roadways",
    "Hillshade_9am",
    "Hillshade_Noon",
    "Hillshade_3pm",
    "Horizontal_Distance_To_Fire_Points",
]

# Flag columns: four wilderness areas followed by forty soil types.
soil_types = ["Soil_Type" + str(idx) for idx in range(1, 41)]
bool_columns = [f"Wilderness_Area{idx}" for idx in range(1, 5)] + soil_types

# Complete column list used when reading the file.
# Note: despite the name, this list ends with the target column.
feature_columns = [*int_columns, *bool_columns, target]

In [7]:
# The file is read without a header row, so column names are supplied explicitly.
train = pd.read_csv(out, names=feature_columns, header=None)

# Preview the first five rows.
train.head(n=5)

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,...,Soil_Type32,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40,Covertype
0,2596,51,3,258,0,510,221,232,148,6279,...,0,0,0,0,0,0,0,0,0,5
1,2590,56,2,212,-6,390,220,235,151,6225,...,0,0,0,0,0,0,0,0,0,5
2,2804,139,9,268,65,3180,234,238,135,6121,...,0,0,0,0,0,0,0,0,0,2
3,2785,155,18,242,118,3090,238,238,122,6211,...,0,0,0,0,0,0,0,0,0,2
4,2595,45,2,153,-1,391,220,234,150,6172,...,0,0,0,0,0,0,0,0,0,5


In [8]:
# The label is kept as a one-column frame; every other column is a numeric input.
y = train[[target]]
X_num = train[[*int_columns, *bool_columns]]

In [9]:
# Split protocol cited by the original author:
#   Rory Mitchell, Andrey Adinets, Thejaswi Rao, and Eibe Frank.
#   Xgboost: Scalable GPU accelerated learning. arXiv:1806.11248, 2018.
# Only a train/validation split (80/20, fixed random state) is produced here;
# no separate test set is held out in this cell.

# No categorical inputs are used in this notebook.
X_cat_train = X_cat_valid = None

(X_num_train, X_num_valid, y_train, y_valid) = train_test_split(
    X_num.values,
    y.values,
    random_state=0,
    test_size=0.2,
)

In [10]:
# Standardize each column using statistics computed on the training split only;
# the same shift and scale are then applied to the validation split.
mean = X_num_train.mean(axis=0, keepdims=True)
stdv = np.sqrt(X_num_train.var(ddof=1, axis=0, keepdims=True))

# A column that is constant in the training split has zero spread, and dividing
# by it would fill that column with NaN/inf. Fall back to a unit scale there so
# the centered values pass through unscaled.
stdv = np.where(stdv == 0, 1.0, stdv)

X_num_train = (X_num_train - mean) / stdv
X_num_valid = (X_num_valid - mean) / stdv

In [11]:
# Sanity check: (rows, columns) of the standardized train and validation arrays.
X_num_train.shape, X_num_valid.shape

((464809, 54), (116203, 54))

# Model

In [12]:
# Random seed handed to the model constructor (`seed=SEED`).
SEED = 34589

In [13]:
def accuracy(y_pred, y_true):
    """Return the percentage of rows whose arg-max class equals the label.

    Parameters
    ----------
    y_pred : torch.Tensor
        Per-class scores; the predicted class is the arg-max over dim 1.
    y_true : torch.Tensor
        Class labels, one per row of ``y_pred``. A trailing singleton
        dimension (shape ``(n, 1)``) is accepted and flattened.

    Returns
    -------
    torch.Tensor
        Scalar tensor holding the accuracy in percent.
    """
    predicted = torch.argmax(y_pred, dim=1)
    # Align the labels with the predictions before comparing: an (n, 1) label
    # tensor would otherwise broadcast against (n,) into an (n, n) matrix and
    # silently distort the count of matches.
    labels = y_true.reshape(predicted.shape)
    n_correct = torch.eq(predicted, labels).to(dtype=torch.int).sum()
    return 100 * n_correct / predicted.shape[0]

In [14]:
# Build the classifier: a seeded model whose MLP hidden layers narrow from
# 256 to 64 units, with batch norm and skip connections enabled and no dropout.
model = PNNPlusClassifier(
    seed=SEED,
    weighted_sum=True,
    mlp_hidden_sizes=(256, 192, 128, 64),
    mlp_activation=nn.LeakyReLU,
    mlp_use_bn=True,
    mlp_use_skip=True,
    mlp_leaky_gate=True,
    mlp_dropout=0.0,
)

In [15]:
model.fit(
    # --- data ---
    X_num=X_num_train,
    X_cat=X_cat_train,
    y=y_train,
    val_sets=[[X_num_valid, X_cat_valid, y_valid]],
    # --- optimization ---
    optimizer=torch.optim.Adam,
    opt_kwargs={"lr": 1e-2},
    # StepLR multiplies the learning rate by gamma (0.1 ** 0.125, about 0.75)
    # every 5 epochs.
    scheduler=torch.optim.lr_scheduler.StepLR,
    sch_kwargs={"step_size": 5, "gamma": 0.1 ** 0.125},
    batch_size=2048,
    num_epochs=100,
    # --- validation metric and early stopping ---
    extra_metrics=[("accuracy", accuracy)],
    early_stopping_metric="accuracy",
    early_stopping_mode="max",
    early_stopping_patience=10,
    # --- optional outputs (disabled) ---
    #log_path=f"pnnp_forest_log_seed{SEED}.txt",  # save epoch info to file
    #param_path=f"pnnp_forest_seed{SEED}.pkl",  # auto-restore best model
    verbose=True,
)

epoch  lrn rate  non-mlp  train loss   val loss   accuracy
──────────────────────────────────────────────────────────
    0    0.0100     0.37      0.3713     0.4019      82.66         
    1    0.0100     0.32      0.2911     0.3007      87.79         
    2    0.0100     0.25      0.2126     0.2505      89.86         
    3    0.0100     0.24      0.2437     0.2158      91.13         
    4    0.0100     0.24      0.1974     0.2000      91.90         
    5   0.00750     0.23      0.1704     0.1850      92.54         
    6   0.00750     0.23      0.1338     0.1742      92.89         
    7   0.00750     0.23      0.1437     0.1604      93.51         
    8   0.00750     0.22      0.1651     0.1528      93.86         
    9   0.00750     0.22      0.1459     0.1627      93.46         
   10   0.00562     0.21      0.1276     0.1404      94.45         
   11   0.00562     0.21      0.1164     0.1332      94.71         
   12   0.00562     0.21      0.1117     0.1282      94.93        