In [1]:
import zlib
import requests
from pathlib import Path

import torch
from torch import nn
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
from xynn.mlpnet import MLPClassifier

In [3]:
# Show the ASCII diagram of the MLPClassifier architecture; the output covers
# both the mlp_use_skip=True and mlp_use_skip=False layouts.
MLPClassifier.diagram()



if mlp_use_skip=True (default)
------------------------------
X_num ─ Num. embedding? ┐ ┌─── MLP ──┐
                        ├─┤          w+ ── output
X_cat ─ Cat. embedding ─┘ └─ Linear ─┘

if mlp_use_skip=False
---------------------
X_num ─ Num. embedding? ┐
                        ├─── MLP ── output
X_cat ─ Cat. embedding ─┘

splits are copies and joins are concatenations;
'w+' is weighted element-wise addition;
the numeric embedding is optional



In [4]:
# Seed passed to the model constructor (seed=SEED). Note that the train/valid
# split further down uses its own fixed random_state and does not read this value.
SEED = 34589

# Download ForestCoverType dataset

In [5]:
# Remote gzip-compressed source file and the local path where the decompressed
# copy is stored (the download cell skips the fetch when this path exists).
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/covtype/covtype.data.gz"
datapath = Path('../data/forest_cover/forest-cover-type.csv')

In [6]:
# Fetch and decompress the dataset once; later runs reuse the local copy.
if datapath.exists():
    print("File already exists.")
else:
    print("Downloading file...")
    datapath.parent.mkdir(parents=True, exist_ok=True)
    # The whole body is needed in memory for decompression, so streaming buys
    # nothing here. A timeout keeps a stalled connection from hanging the kernel.
    response = requests.get(url, timeout=60)
    # Fail with a clear HTTP error instead of trying to decompress an error page.
    response.raise_for_status()
    # MAX_WBITS | 32 lets zlib auto-detect a gzip (or zlib) header.
    data = zlib.decompress(response.content, zlib.MAX_WBITS | 32)
    with open(datapath, 'wb') as outfile:
        outfile.write(data)

File already exists.


# Load data and split

In [7]:
# Column layout used to name the fields of the raw data file:
# 10 integer measurements, 4 wilderness-area flags, 40 soil-type flags,
# and finally the class label.
target = "Covertype"

int_columns = [
    "Elevation",
    "Aspect",
    "Slope",
    "Horizontal_Distance_To_Hydrology",
    "Vertical_Distance_To_Hydrology",
    "Horizontal_Distance_To_Roadways",
    "Hillshade_9am",
    "Hillshade_Noon",
    "Hillshade_3pm",
    "Horizontal_Distance_To_Fire_Points",
]

# Soil_Type1 .. Soil_Type40
soil_types = ["Soil_Type" + str(number) for number in range(1, 41)]

# Wilderness_Area1 .. Wilderness_Area4 followed by every soil-type flag.
bool_columns = [f"Wilderness_Area{number}" for number in range(1, 5)]
bool_columns.extend(soil_types)

# Despite the name, this list ends with the target column: it names every
# column of the file, in order.
feature_columns = [*int_columns, *bool_columns, target]

In [8]:
# Read the file with header=None (no row is consumed as a header) and assign
# the column names explicitly from feature_columns.
train = pd.read_csv(datapath, header=None, names=feature_columns)
# Preview the first rows as the cell output.
train.head()

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,...,Soil_Type32,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40,Covertype
0,2596,51,3,258,0,510,221,232,148,6279,...,0,0,0,0,0,0,0,0,0,5
1,2590,56,2,212,-6,390,220,235,151,6225,...,0,0,0,0,0,0,0,0,0,5
2,2804,139,9,268,65,3180,234,238,135,6121,...,0,0,0,0,0,0,0,0,0,2
3,2785,155,18,242,118,3090,238,238,122,6211,...,0,0,0,0,0,0,0,0,0,2
4,2595,45,2,153,-1,391,220,234,150,6172,...,0,0,0,0,0,0,0,0,0,5


In [9]:
# Target is kept as a single-column DataFrame (2-D), not a Series.
y = train[[target]]
# Every input column, integer measurements first and binary flags after,
# is treated as a numeric feature.
X_num = train[[*int_columns, *bool_columns]]

In [10]:
# There are no categorical inputs in this example; the binary flags are
# already part of the numeric matrix.
X_cat_train = X_cat_valid = None

# Hold out 20% of the rows for validation. The split uses a fixed
# random_state of 0, independent of SEED.
X_num_train, X_num_valid, y_train, y_valid = train_test_split(
    X_num.values,
    y.values,
    random_state=0,
    test_size=0.2,
)

In [11]:
# Standardize every feature using statistics from the training split only,
# then apply the same shift/scale to the validation split.
# NOTE: this cell rebinds X_num_train / X_num_valid; re-run the split cell
# before re-running this one so the statistics come from the raw values.
mean = X_num_train.mean(axis=0, keepdims=True)
stdv = X_num_train.std(ddof=1, axis=0, keepdims=True)
# A column that is constant in the training split (possible for very sparse
# indicator columns) has zero std; dividing by it would yield NaN/inf.
# Leave such columns centered but unscaled.
stdv = np.where(stdv > 0, stdv, 1.0)

X_num_train = (X_num_train - mean) / stdv
X_num_valid = (X_num_valid - mean) / stdv

In [12]:
# Sanity check: (rows, features) of the training and validation matrices.
X_num_train.shape, X_num_valid.shape

((464809, 54), (116203, 54))

# Model

In [13]:
def accuracy(y_pred, y_true):
    """Return classification accuracy as a percentage (0-100).

    Parameters
    ----------
    y_pred : torch.Tensor
        Per-class scores with one row per sample; the predicted class is the
        argmax over dim 1.
    y_true : torch.Tensor
        Integer class indices, either flat ``(n,)`` or as a column ``(n, 1)``.

    Returns
    -------
    torch.Tensor
        0-dim tensor holding ``100 * (number correct) / (number of samples)``.
    """
    predicted = torch.argmax(y_pred, dim=1)
    # Flatten the targets: comparing an (n, 1) column against (n,) predictions
    # would silently broadcast to an (n, n) matrix and inflate the count.
    targets = y_true.reshape(-1)
    n_correct = torch.eq(predicted, targets).sum()
    return 100 * n_correct / predicted.shape[0]

In [14]:
# Configure the classifier. Per the diagram printed earlier, mlp_use_skip=True
# adds a Linear branch alongside the MLP, combined by weighted addition.
# NOTE(review): the exact meaning of use_leaky_gate is defined by xynn and is
# not visible in this notebook — check the library docs.
model = MLPClassifier(
    mlp_hidden_sizes=(512, 256, 128, 64),
    mlp_activation=nn.LeakyReLU,
    mlp_use_bn=True,
    mlp_dropout=0.0,
    mlp_use_skip=True,
    use_leaky_gate=True,
    seed=SEED,
)

In [15]:
# Train with Adam and a step learning-rate schedule, tracking accuracy on the
# validation split and using it as the early-stopping metric.
model.fit(
    X_num=X_num_train,
    X_cat=X_cat_train,
    y=y_train,
    optimizer=torch.optim.Adam,
    opt_kwargs={"lr": 1e-2},
    scheduler=torch.optim.lr_scheduler.StepLR,
    # gamma = 0.1 ** (1/8) ~= 0.75 applied every 5 epochs, i.e. the learning
    # rate drops by a factor of 10 every 40 epochs (0.0100 -> 0.00750 -> 0.00562 ...).
    sch_kwargs={"step_size": 5, "gamma": 0.1 ** 0.125},
    val_sets=[[X_num_valid, X_cat_valid, y_valid]],
    extra_metrics=[("accuracy", accuracy)],
    num_epochs=100,
    batch_size=2048,
    # Early stopping watches the "accuracy" metric in "max" mode with a
    # patience of 10 (presumably epochs without improvement — confirm in xynn docs).
    early_stopping_patience=10,
    early_stopping_metric="accuracy",
    early_stopping_mode="max",
    #log_path=f"mlp_forest_log_seed{SEED}.txt",  # log model parameters and epoch info
    #param_path=f"mlp_forest_seed{SEED}.pkl",  # auto-restore best model
    verbose=True,
)

epoch  lrn rate  non-mlp  train loss   val loss   accuracy
──────────────────────────────────────────────────────────
    0    0.0100     0.41      0.3702     0.3854      83.94         
    1    0.0100     0.37      0.2750     0.2869      88.24         
    2    0.0100     0.35      0.2534     0.2499      89.65         
    3    0.0100     0.33      0.2067     0.2110      91.47         
    4    0.0100     0.31      0.1922     0.2013      91.85         
    5   0.00750     0.29      0.1624     0.1772      92.75         
    6   0.00750     0.29      0.1670     0.1613      93.47         
    7   0.00750     0.27      0.1490     0.1547      93.71         
    8   0.00750     0.27      0.1542     0.1647      93.39         
    9   0.00750     0.27      0.1237     0.1528      93.83         
   10   0.00562     0.25      0.1054     0.1363      94.50         
   11   0.00562     0.24      0.1361     0.1275      94.93          
   12   0.00562     0.24      0.1280     0.1277      94.89       