In [1]:
from matplotlib import pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, roc_auc_score
import numpy as np
import pandas as pd
import os
import wget
import gzip
import shutil
from pathlib import Path
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
np.random.seed(0)

### Adult Census Income

In [2]:
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
dataset = 'census-income'
dataset_name = 'census-income'
out = Path(os.getcwd() + '/data/' + dataset_name + '.csv')

out.parent.mkdir(parents=True, exist_ok=True)
if out.exists():
    print("File already exists.")
else:
    print("Downloading file...")
    wget.download(url, out.as_posix())

train = pd.read_csv(out)
target = ' <=50K'
if "Set" not in train.columns:
    train["Set"] = np.random.choice(["train", "valid", "test"], p=[.8, .1, .1], size=(train.shape[0],))

train_indices = train[train.Set == "train"].index
valid_indices = train[train.Set == "valid"].index
test_indices = train[train.Set == "test"].index

nunique = train.nunique()
types = train.dtypes

categorical_columns = []
categorical_dims = {}
for col in train.columns:
    if types[col] == 'object' or nunique[col] < 200:
        # print(col, train[col].nunique())
        l_enc = LabelEncoder()
        train[col] = train[col].fillna("VV_likely")
        train[col] = l_enc.fit_transform(train[col].values)
        categorical_columns.append(col)
        categorical_dims[col] = len(l_enc.classes_)
    else:
        train.fillna(train.loc[train_indices, col].mean(), inplace=True)

train.loc[train[target] == 0, target] = "wealthy"
train.loc[train[target] == 1, target] = "not_wealthy"

unused_feat = ['Set']

features = [col for col in train.columns if col not in unused_feat + [target]]

cat_idxs = [i for i, f in enumerate(features) if f in categorical_columns]

cat_dims = [categorical_dims[f] for i, f in enumerate(features) if f in categorical_columns]

X_train = train[features].values[train_indices]
y_train = train[target].values[train_indices]

X_valid = train[features].values[valid_indices]
y_valid = train[target].values[valid_indices]

X_test = train[features].values[test_indices]
y_test = train[target].values[test_indices]

File already exists.


  train.loc[train[target] == 0, target] = "wealthy"


In [3]:
input_size = len(features)
# Initialize the MLPClassifier with your desired parameters
clf = MLPClassifier(hidden_layer_sizes=(4 * input_size, 2 * input_size), max_iter=10000, early_stopping=True)

# Fit the model on the training data and validate on the validation set
clf.fit(X_train, y_train)

In [4]:
# Predict the labels for the test set
y_pred = clf.predict(X_test)

# Calculate the accuracy score
test_acc = accuracy_score(y_test, y_pred)

print(f"FINAL TEST SCORE FOR {dataset_name} : {test_acc}")


FINAL TEST SCORE FOR census-income : 0.7976737067646159


### Forest 

In [5]:
# Download ForestCoverType dataset
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/covtype/covtype.data.gz"
dataset_name = 'forest-cover-type'
dataset = 'forest-cover-type'
tmp_out = Path('./data/'+dataset_name+'.gz')
out = Path(os.getcwd()+'/data/'+dataset_name+'.csvpyth')

out.parent.mkdir(parents=True, exist_ok=True)
if out.exists():
    print("File already exists.")
else:
    print("Downloading file...")
    wget.download(url, tmp_out.as_posix())
    with gzip.open(tmp_out, 'rb') as f_in:
        with open(out, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)

# Load data and split
target = "Covertype"

bool_columns = [
    "Wilderness_Area1", "Wilderness_Area2", "Wilderness_Area3",
    "Wilderness_Area4", "Soil_Type1", "Soil_Type2", "Soil_Type3", "Soil_Type4",
    "Soil_Type5", "Soil_Type6", "Soil_Type7", "Soil_Type8", "Soil_Type9",
    "Soil_Type10", "Soil_Type11", "Soil_Type12", "Soil_Type13", "Soil_Type14",
    "Soil_Type15", "Soil_Type16", "Soil_Type17", "Soil_Type18", "Soil_Type19",
    "Soil_Type20", "Soil_Type21", "Soil_Type22", "Soil_Type23", "Soil_Type24",
    "Soil_Type25", "Soil_Type26", "Soil_Type27", "Soil_Type28", "Soil_Type29",
    "Soil_Type30", "Soil_Type31", "Soil_Type32", "Soil_Type33", "Soil_Type34",
    "Soil_Type35", "Soil_Type36", "Soil_Type37", "Soil_Type38", "Soil_Type39",
    "Soil_Type40"
]

int_columns = [
    "Elevation", "Aspect", "Slope", "Horizontal_Distance_To_Hydrology",
    "Vertical_Distance_To_Hydrology", "Horizontal_Distance_To_Roadways",
    "Hillshade_9am", "Hillshade_Noon", "Hillshade_3pm",
    "Horizontal_Distance_To_Fire_Points"
]

feature_columns = (
        int_columns + bool_columns + [target])

# train = pd.read_csv('data/covtype.csv', header=None, names=feature_columns)
train = pd.read_csv(out, header=None, names=feature_columns)
# print("number of features")
# print(len(feature_columns))

n_total = len(train)

# Train, val and test split follows
# Rory Mitchell, Andrey Adinets, Thejaswi Rao, and Eibe Frank.
# Xgboost: Scalable GPU accelerated learning. arXiv:1806.11248, 2018.

train_val_indices, test_indices = train_test_split(
    range(n_total), test_size=0.2, random_state=0)
train_indices, valid_indices = train_test_split(
    train_val_indices, test_size=0.2 / 0.6, random_state=0)

categorical_columns = []
categorical_dims =  {}
for col in train.columns[train.dtypes == object]:
    print(col, train[col].nunique())
    l_enc = LabelEncoder()
    train[col] = train[col].fillna("VV_likely")
    train[col] = l_enc.fit_transform(train[col].values)
    categorical_columns.append(col)
    categorical_dims[col] = len(l_enc.classes_)

for col in train.columns[train.dtypes == 'float64']:
    train.fillna(train.loc[train_indices, col].mean(), inplace=True)

unused_feat = []

features = [ col for col in train.columns if col not in unused_feat+[target]]

# print(features)

cat_idxs = [ i for i, f in enumerate(features) if f in categorical_columns]

cat_dims = [ categorical_dims[f] for i, f in enumerate(features) if f in categorical_columns]

X_train = train[features].values[train_indices]
y_train = train[target].values[train_indices]

X_valid = train[features].values[valid_indices]
y_valid = train[target].values[valid_indices]

X_test = train[features].values[test_indices]
y_test = train[target].values[test_indices]

File already exists.


In [6]:
input_size = len(features)
# Initialize the MLPClassifier with your desired parameters
clf = MLPClassifier(hidden_layer_sizes=(4 * input_size, 2 * input_size), max_iter=10000, early_stopping=True)

# Fit the model on the training data and validate on the validation set
clf.fit(X_train, y_train)

# Predict the labels for the test set
y_pred = clf.predict(X_test)

# Calculate the accuracy score
test_acc = accuracy_score(y_test, y_pred)

print(f"FINAL TEST SCORE FOR {dataset_name} : {test_acc}")


FINAL TEST SCORE FOR forest-cover-type : 0.8489195631782313


### Poker

In [7]:
dataset = "poker_hand"
dataset_name = "poker_hand"
target = 'poker_hand_class'

int_columns = [
    "suit1", "rank1", "suit2",
    "rank2", "suit3", "rank3", "suit4", "rank4",
    "suit5", "rank5"
]

feature_columns = (
        int_columns + [target])

dataset_train_1 = 'poker-hand-training-true'
train_1_out = Path(os.getcwd()+'/data/'+dataset_train_1+'.csv')
train_1 = pd.read_csv(train_1_out,
                    header=None, names=feature_columns)

dataset_train_2 = 'poker-hand-testing'
train_2_out = Path(os.getcwd()+'/data/'+dataset_train_2+'.csv')
train_2 = pd.read_csv(train_2_out,
                    header=None, names=feature_columns)

train = pd.concat([train_1, train_2], axis=0)
n_total = len(train)

train_val_indices, test_indices = train_test_split(
    range(n_total), test_size=0.2, random_state=0)
train_indices, valid_indices = train_test_split(
    train_val_indices, test_size=0.2 / 0.6, random_state=0)

categorical_columns = []
categorical_dims = {}
for col in train.columns:
#   print(col, train[col].nunique())
    l_enc = LabelEncoder()
    train[col] = train[col].fillna("VV_likely")
    train[col] = l_enc.fit_transform(train[col].values)
    categorical_columns.append(col)
    categorical_dims[col] = len(l_enc.classes_)

unused_feat = []

features = [col for col in train.columns if col not in unused_feat + [target]]

cat_idxs = [i for i, f in enumerate(features) if f in categorical_columns]

cat_dims = [categorical_dims[f] for i, f in enumerate(features) if f in categorical_columns]

X_train = train[features].values[train_indices]
y_train = train[target].values[train_indices]

X_valid = train[features].values[valid_indices]
y_valid = train[target].values[valid_indices]

X_test = train[features].values[test_indices]
y_test = train[target].values[test_indices]

In [8]:
input_size = len(features)
# Initialize the MLPClassifier with your desired parameters
clf = MLPClassifier(hidden_layer_sizes=(4 * input_size, 2 * input_size), max_iter=10000, early_stopping=True)

# Fit the model on the training data and validate on the validation set
clf.fit(X_train, y_train)

# Predict the labels for the test set
y_pred = clf.predict(X_test)

# Calculate the accuracy score
test_acc = accuracy_score(y_test, y_pred)

print(f"FINAL TEST SCORE FOR {dataset_name} : {test_acc}")


FINAL TEST SCORE FOR poker_hand : 0.9970439312787193


### Mushroom

In [9]:
dataset_name = "mushroom"
target = 'poisonous'

columns = [
    "cap-shape", "cap-surface", "cap-color",
    "bruises", "odor", "gill-attachment", "gill-spacing", "gill-size",
    "gill-color", "stalk-shape", "stalk-root", "stalk-surface-above-ring", "stalk-surface-below-ring",
    "stalk-color-above-ring", "stalk-color-below-ring", "veil-type", "veil-color", "ring-number", "ring-type",
    "spore-print-color", "population", "habitat"
]

feature_columns = (
        [target] + columns)

dataset = 'mushroom'
dataset_out = Path(os.getcwd()+'/data/'+dataset+'.csv')
train = pd.read_csv(dataset_out,
                    header=None, names=feature_columns)

n_total = len(train)

# Train, val and test split follows
# Rory Mitchell, Andrey Adinets, Thejaswi Rao, and Eibe Frank.
# Xgboost: Scalable GPU accelerated learning. arXiv:1806.11248, 2018.

train_val_indices, test_indices = train_test_split(
    range(n_total), test_size=0.2, random_state=0)
train_indices, valid_indices = train_test_split(
    train_val_indices, test_size=0.2 / 0.6, random_state=0)

categorical_columns = []
categorical_dims = {}
for col in train.columns:
    # print(col, train[col].nunique())
    l_enc = LabelEncoder()
    train[col] = train[col].fillna("VV_likely")
    train[col] = l_enc.fit_transform(train[col].values)
    categorical_columns.append(col)
    categorical_dims[col] = len(l_enc.classes_)

unused_feat = []

features = [col for col in train.columns if col not in unused_feat + [target]]

cat_idxs = [i for i, f in enumerate(features) if f in categorical_columns]

cat_dims = [categorical_dims[f] for i, f in enumerate(features) if f in categorical_columns]

X_train = train[features].values[train_indices]
y_train = train[target].values[train_indices]

X_valid = train[features].values[valid_indices]
y_valid = train[target].values[valid_indices]

X_test = train[features].values[test_indices]
y_test = train[target].values[test_indices]

In [10]:
input_size = len(features)
# Initialize the MLPClassifier with your desired parameters
clf = MLPClassifier(hidden_layer_sizes=(4 * input_size, 2 * input_size), max_iter=10000, early_stopping=True)

# Fit the model on the training data and validate on the validation set
clf.fit(X_train, y_train)

# Predict the labels for the test set
y_pred = clf.predict(X_test)

# Calculate the accuracy score
test_acc = accuracy_score(y_test, y_pred)

print(f"FINAL TEST SCORE FOR {dataset_name} : {test_acc}")


FINAL TEST SCORE FOR mushroom : 0.9981538461538462


### Blastchar

In [11]:
dataset = 'blastchar'
dataset_name = 'blastchar'

target = 'Churn'


dataset_out = Path(os.getcwd()+'/data/'+dataset+'.csv')
train = pd.read_csv(dataset_out)

n_total = len(train)

train_val_indices, test_indices = train_test_split(
    range(n_total), test_size=0.2, random_state=0)
train_indices, valid_indices = train_test_split(
    train_val_indices, test_size=0.2 / 0.6, random_state=0)

categorical_columns = []
categorical_dims = {}
for col in train.columns:
    # print(col, train[col].nunique())
    l_enc = LabelEncoder()
    train[col] = train[col].fillna("VV_likely")
    train[col] = l_enc.fit_transform(train[col].values)
    categorical_columns.append(col)
    categorical_dims[col] = len(l_enc.classes_)

unused_feat = []

features = [col for col in train.columns if col not in unused_feat + [target]]

cat_idxs = [i for i, f in enumerate(features) if f in categorical_columns]

cat_dims = [categorical_dims[f] for i, f in enumerate(features) if f in categorical_columns]

X_train = train[features].values[train_indices]
y_train = train[target].values[train_indices]

X_valid = train[features].values[valid_indices]
y_valid = train[target].values[valid_indices]

X_test = train[features].values[test_indices]
y_test = train[target].values[test_indices]

In [12]:
input_size = len(features)
# Initialize the MLPClassifier with your desired parameters
clf = MLPClassifier(hidden_layer_sizes=(4 * input_size, 2 * input_size), max_iter=10000, early_stopping=True)

# Fit the model on the training data and validate on the validation set
clf.fit(X_train, y_train)

# Predict the labels for the test set
y_pred = clf.predict(X_test)

# Calculate the accuracy score
test_acc = accuracy_score(y_test, y_pred)

print(f"FINAL TEST SCORE FOR {dataset_name} : {test_acc}")


FINAL TEST SCORE FOR blastchar : 0.751596877217885


### Diabetes

In [13]:
dataset = 'diabetes'
dataset_name = 'diabetes'
target = 'readmitted'

dataset_out = Path(os.getcwd()+'/data/'+dataset+'.csv')
train = pd.read_csv(dataset_out)

n_total = len(train)

train_val_indices, test_indices = train_test_split(
    range(n_total), test_size=0.2, random_state=0)
train_indices, valid_indices = train_test_split(
    train_val_indices, test_size=0.2 / 0.6, random_state=0)

categorical_columns = []
categorical_dims = {}

for col in train.columns:
    l_enc = LabelEncoder()
    train[col] = train[col].fillna("VV_likely")
    train[col] = l_enc.fit_transform(train[col].values)
    categorical_columns.append(col)
    categorical_dims[col] = len(l_enc.classes_)

unused_feat = []

features = [col for col in train.columns if col not in unused_feat + [target]]

cat_idxs = [i for i, f in enumerate(features) if f in categorical_columns]

cat_dims = [categorical_dims[f] for i, f in enumerate(features) if f in categorical_columns]

X_train = train[features].values[train_indices]
y_train = train[target].values[train_indices]

X_valid = train[features].values[valid_indices]
y_valid = train[target].values[valid_indices]

X_test = train[features].values[test_indices]
y_test = train[target].values[test_indices]

In [14]:
input_size = len(features)
# Initialize the MLPClassifier with your desired parameters
clf = MLPClassifier(hidden_layer_sizes=(4 * input_size, 2 * input_size), max_iter=10000, early_stopping=True)

# Fit the model on the training data and validate on the validation set
clf.fit(X_train, y_train)

# Predict the labels for the test set
y_pred = clf.predict(X_test)

# Calculate the accuracy score
test_acc = accuracy_score(y_test, y_pred)

print(f"FINAL TEST SCORE FOR {dataset_name} : {test_acc}")


FINAL TEST SCORE FOR diabetes : 0.5399430087452098


### Higgs

In [15]:
dataset_name = "higgs"
target = 'class_label'

features = [
    'jet_1_b-tag',
    'jet_1_eta',
    'jet_1_phi',
    'jet_1_pt',
    'jet_2_b-tag',
    'jet_2_eta',
    'jet_2_phi',
    'jet_2_pt',
    'jet_3_b-tag',
    'jet_3_eta',
    'jet_3_phi',
    'jet_3_pt',
    'jet_4_b-tag',
    'jet_4_eta',
    'jet_4_phi',
    'jet_4_pt',
    'lepton_eta',
    'lepton_pT',
    'lepton_phi',
    'm_bb',
    'm_jj',
    'm_jjj',
    'm_jlv',
    'm_lv',
    'm_wbb',
    'm_wwbb',
    'missing_energy_magnitude',
    'missing_energy_phi',
]

feature_columns = ([target] +
        features)

dataset = 'HIGGS'
dataset_out = Path(os.getcwd()+'/data/'+dataset+'.csv')
train = pd.read_csv(dataset_out, header=None, names=feature_columns)

# Sample 600k from the dataset for training, validation, and testing
data_sample = train.sample(n=700000, random_state=0)

# Indices for splitting
indices = range(len(data_sample))

# Split indices into 600k for training & validation and 100k for testing
train_val_indices, test_indices = train_test_split(indices, test_size=100000, random_state=0)

# Now split the 600k into 100k for training and 500k for validation
train_indices, valid_indices = train_test_split(train_val_indices, test_size=500000, random_state=0)

categorical_columns = []
categorical_dims = {}
for col in train.columns:
    # print(col, train[col].nunique())
    l_enc = LabelEncoder()
    train[col] = train[col].fillna("VV_likely")
    train[col] = l_enc.fit_transform(train[col].values)
    categorical_columns.append(col)
    categorical_dims[col] = len(l_enc.classes_)

unused_feat = []

features = [col for col in train.columns if col not in unused_feat + [target]]

cat_idxs = [i for i, f in enumerate(features) if f in categorical_columns]

cat_dims = [categorical_dims[f] for i, f in enumerate(features) if f in categorical_columns]

X_train = train[features].values[train_indices]
y_train = train[target].values[train_indices]

X_valid = train[features].values[valid_indices]
y_valid = train[target].values[valid_indices]

X_test = train[features].values[test_indices]
y_test = train[target].values[test_indices]

In [16]:
input_size = len(features)
# Initialize the MLPClassifier with your desired parameters
clf = MLPClassifier(hidden_layer_sizes=(4 * input_size, 2 * input_size), max_iter=10000, early_stopping=True)

# Fit the model on the training data and validate on the validation set
clf.fit(X_train, y_train)

# Predict the labels for the test set
y_pred = clf.predict(X_test)

# Calculate the accuracy score
test_acc = accuracy_score(y_test, y_pred)

print(f"FINAL TEST SCORE FOR {dataset_name} : {test_acc}")


FINAL TEST SCORE FOR higgs : 0.63165
