In [21]:
from typing import Any

import numpy
import polars
from datasets import Dataset, load_dataset
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler

In [22]:
dataset = load_dataset(
	"csv", data_dir="../data", data_files={"train": "train.csv", "test": "test.csv"}, keep_in_memory=True, num_proc=8
)

In [24]:
dataset = dataset.map(
	lambda samples: {
		"play years_0": [1 if sample == 0 else 0 for sample in samples],
		"play years_1": [1 if sample == 1 else 0 for sample in samples],
		"play years_2": [1 if sample == 2 else 0 for sample in samples],
	},
	input_columns=["play years"],
	remove_columns=["play years"],
	batched=True,
	num_proc=8,
)

Map (num_proc=8):   0%|          | 0/23868 [00:00<?, ? examples/s]

Map (num_proc=8):   0%|          | 0/7488 [00:00<?, ? examples/s]

Map (num_proc=8):   0%|          | 0/2652 [00:00<?, ? examples/s]

In [25]:
dataset = dataset.map(
	lambda samples: {
		"level_0": [1 if sample == 0 else 0 for sample in samples],
		"level_1": [1 if sample == 1 else 0 for sample in samples],
		"level_2": [1 if sample == 2 else 0 for sample in samples],
	},
	input_columns=["level"],
	remove_columns=["level"],
	batched=True,
	num_proc=8,
)

Map (num_proc=8):   0%|          | 0/23868 [00:00<?, ? examples/s]

Map (num_proc=8):   0%|          | 0/7488 [00:00<?, ? examples/s]

Map (num_proc=8):   0%|          | 0/2652 [00:00<?, ? examples/s]

In [26]:
index_columns: list = ["data_ID", "player_ID"]
label_columns: list = [
	"gender",
	"hold racket handed",
	"play years_0",
	"play years_1",
	"play years_2",
	"level_0",
	"level_1",
	"level_2",
]
data_columns: list = list(set(dataset["train"].column_names).difference(set(index_columns + label_columns)))

In [27]:
data_columns

['ax_var',
 'a_entropy',
 'gy_mean',
 'gx_mean',
 'a_mean',
 'az_var',
 'g_mean',
 'gx_rms',
 'gz_rms',
 'ay_rms',
 'gx_var',
 'a_fft',
 'a_min',
 'a_kurt',
 'a_skewn',
 'gy_rms',
 'g_max',
 'az_rms',
 'ay_mean',
 'g_min',
 'g_psd',
 'a_max',
 'g_kurt',
 'a_psd',
 'ax_mean',
 'gz_mean',
 'g_entropy',
 'g_skewn',
 'ay_var',
 'gy_var',
 'az_mean',
 'g_fft',
 'ax_rms',
 'gz_var']

In [None]:
scaler = StandardScaler()

In [28]:
train_dataset: polars.DataFrame = dataset["train"].to_polars()
train_data: numpy.ndarray = scaler.fit_transform(train_dataset[data_columns].to_numpy())
train_label: dict[str, numpy.ndarray] = {label: train_dataset[label].to_numpy() for label in label_columns}

In [None]:
train_data, validation_data, train_label, validation_label = train_test_split(
	train_data, train_label, test_size=0.1, random_state=37
)

In [30]:
hyperparameter_grid: dict[str, list[Any]] = {
	"hidden_layer_sizes": [(50,), (100,), (200,), (50, 50), (100, 50), (100, 100)],
	"activation": ["identity", "logistic", "tanh", "relu"],
	"solver": ["adam", "sgd", "lbfgs"],
	"learning_rate": ["constant", "adaptive", "invscaling"],
	"learning_rate_init": [0.001, 0.01, 0.1],
	"max_iter": [200, 300, 400, 500],
	"alpha": [0.0001, 0.001, 0.01],
	"batch_size": ["auto", 32, 64, 128],
}

In [31]:
best_classifiers: dict = {}
for label in label_columns:
	print(f"Tune classifier for {label}")
	classifier = MLPClassifier(random_state=37)

	tuner = GridSearchCV(classifier, hyperparameter_grid, cv=5, scoring="roc_auc_ovr", n_jobs=-1, verbose=1)

	tuner.fit(train_data, train_label[label])

	best_classifiers[label] = tuner.best_estimator_

	y_pred_proba = best_classifiers[label].predict_proba(validation_data)[:, 1]
	auc_score = roc_auc_score(validation_label[label], y_pred_proba, multi_class="ovr")
	print(f"AUC score of {label} classifier: {auc_score:.4f}")

Tune classifier for gender
Fitting 5 folds for each of 31104 candidates, totalling 155520 fits


  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scik

KeyboardInterrupt: 

In [None]:
test_data: polars.DataFrame = scaler.fit_transform(dataset["test"].select_columns(data_columns).to_polars().to_numpy())

In [None]:
predictions: dict = {"data_ID": dataset["test"]["data_ID"]}
for label in label_columns:
	predictions[label] = best_classifiers[label].predict(test_data).tolist()

In [None]:
test_result = Dataset.from_dict(predictions)

In [None]:
test_result

In [None]:
test_result.to_csv("../dist/result_mp.csv", num_proc=8)