In [1]:
from typing import Any

import numpy
import polars
from datasets import Dataset, load_dataset
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

In [2]:
dataset = load_dataset(
	"csv", data_dir="../data", data_files={"train": "train.csv", "test": "test.csv"}, keep_in_memory=True, num_proc=8
)

In [21]:
dataset_split = dataset["train"].train_test_split(test_size=0.1, keep_in_memory=True)
dataset["train"], dataset["validation"] = dataset_split["train"], dataset_split["test"]

In [7]:
index_columns: list = ["data_ID", "player_ID"]
label_columns: list = ["gender", "hold racket handed", "play years", "level"]
data_columns: list = list(set(dataset["train"].column_names).difference(set(index_columns + label_columns)))

In [19]:
correlations = dataset["train"].to_pandas().corr()[6:][label_columns]
correlations

Unnamed: 0,gender,hold racket handed,play years,level
ax_mean,-0.083687,0.037874,0.006176,0.22041
ay_mean,0.142736,-0.050927,0.049046,-0.043791
az_mean,0.449442,-0.286373,0.33522,-0.349032
gx_mean,0.144716,-0.198805,0.233342,-0.446217
gy_mean,-0.14961,-0.758158,0.05358,-0.103262
gz_mean,-0.219187,-0.004471,-0.014535,0.09005
ax_var,0.142442,-0.078595,0.159223,-0.196221
ay_var,0.013653,-0.065022,0.091239,-0.132474
az_var,-0.238862,-0.052082,-0.161755,0.103757
gx_var,-0.309226,-0.022403,-0.262048,0.290706


In [20]:
selected_features: dict = {
	label: [k for k, v in correlations.to_dict()[label].items() if v > 0] for label in label_columns
}
selected_features

{'gender': ['ay_mean',
  'az_mean',
  'gx_mean',
  'ax_var',
  'ay_var',
  'gy_var',
  'gz_var',
  'ax_rms',
  'gy_rms',
  'gz_rms',
  'g_max',
  'g_mean',
  'g_min',
  'a_kurt',
  'g_kurt',
  'a_skewn',
  'g_skewn',
  'a_entropy',
  'g_entropy'],
 'hold racket handed': ['ax_mean',
  'gz_var',
  'gy_rms',
  'gz_rms',
  'a_mean',
  'a_min',
  'g_max',
  'g_mean',
  'g_min',
  'a_fft',
  'g_fft',
  'a_psd',
  'g_psd',
  'a_entropy',
  'g_entropy'],
 'play years': ['ax_mean',
  'ay_mean',
  'az_mean',
  'gx_mean',
  'gy_mean',
  'ax_var',
  'ay_var',
  'gy_var',
  'gz_var',
  'ax_rms',
  'ay_rms',
  'gy_rms',
  'gz_rms',
  'a_max',
  'g_max',
  'g_mean',
  'a_kurt',
  'g_kurt',
  'a_skewn',
  'g_skewn',
  'a_entropy',
  'g_entropy'],
 'level': ['ax_mean',
  'gz_mean',
  'az_var',
  'gx_var',
  'az_rms',
  'gx_rms',
  'gy_rms',
  'a_min',
  'g_mean',
  'g_min',
  'a_fft',
  'g_fft',
  'a_entropy',
  'g_entropy']}

In [22]:
train_dataset: polars.DataFrame = dataset["train"].to_polars()
train_data: dict[str, numpy.ndarray] = {
	label: train_dataset[selected_features[label]].to_numpy() for label in label_columns
}
train_label: dict[str, numpy.ndarray] = {label: train_dataset[label].to_numpy() for label in label_columns}

In [23]:
validation_dataset: polars.DataFrame = dataset["validation"].to_polars()
validation_data: dict[str, numpy.ndarray] = {
	label: validation_dataset[selected_features[label]].to_numpy() for label in label_columns
}
validation_label: dict[str, numpy.ndarray] = {label: validation_dataset[label].to_numpy() for label in label_columns}

In [24]:
hyperparameter_grid: dict[str, list[Any]] = {
	"n_estimators": [50, 100, 200, 300],
	"max_depth": [None, 10, 20, 30, 40],
	"min_samples_split": [2, 5, 10, 15],
	"min_samples_leaf": [1, 2, 4, 6],
	"max_features": ["sqrt", "log2"],
	"bootstrap": [True, False],
}

In [25]:
best_classifiers: dict = {}
for label in label_columns:
	print(f"Tune classifier for {label}")
	classifier = RandomForestClassifier(random_state=37)

	tuner = RandomizedSearchCV(
		classifier, hyperparameter_grid, n_iter=50, cv=5, scoring="roc_auc_ovr", n_jobs=-1, verbose=10
	)

	tuner.fit(train_data[label], train_label[label])

	best_classifiers[label] = tuner.best_estimator_

Tune classifier for gender
Fitting 5 folds for each of 50 candidates, totalling 250 fits



KeyboardInterrupt



In [17]:
test_data: dict[str, numpy.ndarray] = {
	label: dataset["test"].select_columns(selected_features[label]).to_polars().to_numpy() for label in label_columns
}

In [18]:
predictions: dict = {"data_ID": dataset["test"]["data_ID"]}
for label in label_columns:
	predictions[label] = best_classifiers[label].predict(test_data[label]).tolist()

In [19]:
test_result = Dataset.from_dict(predictions)

In [20]:
test_result = test_result.map(
	lambda samples: {
		"play years_0": [1 if sample == 0 else 0 for sample in samples],
		"play years_1": [1 if sample == 1 else 0 for sample in samples],
		"play years_2": [1 if sample == 2 else 0 for sample in samples],
	},
	input_columns=["play years"],
	remove_columns=["play years"],
	batched=True,
	num_proc=8,
)

Map (num_proc=8):   0%|          | 0/7488 [00:00<?, ? examples/s]

In [21]:
test_result = test_result.map(
	lambda samples: {
		"level_0": [1 if sample == 0 else 0 for sample in samples],
		"level_1": [1 if sample == 1 else 0 for sample in samples],
		"level_2": [1 if sample == 2 else 0 for sample in samples],
	},
	input_columns=["level"],
	remove_columns=["level"],
	batched=True,
	num_proc=8,
)

Map (num_proc=8):   0%|          | 0/7488 [00:00<?, ? examples/s]

In [22]:
test_result

Dataset({
    features: ['data_ID', 'gender', 'hold racket handed', 'play years_0', 'play years_1', 'play years_2', 'level_0', 'level_1', 'level_2'],
    num_rows: 7488
})

In [23]:
test_result.to_csv("../dist/result_rf.csv", num_proc=8)