In [None]:
from pathlib import Path
from typing import Any

import numpy
import polars
from datasets import Dataset, load_dataset
from sklearn.model_selection import RandomizedSearchCV
from sklearn.svm import SVC

In [None]:
# Read train.csv and test.csv from ../data into in-memory splits keyed by split name.
dataset = load_dataset(
	"csv",
	data_dir="../data",
	data_files={"train": "train.csv", "test": "test.csv"},
	keep_in_memory=True,
	num_proc=8,
)

In [None]:
# Hold out 10% of the training rows as a validation split.
# The seed pins the shuffle so a fresh "Restart & Run All" reproduces the same split
# (37 matches the random_state used for the classifiers below).
dataset_split = dataset["train"].train_test_split(test_size=0.1, seed=37, keep_in_memory=True)
dataset["train"], dataset["validation"] = dataset_split["train"], dataset_split["test"]

In [None]:
# Identifier columns: never used as features or targets.
index_columns: list[str] = ["data_ID", "player_ID"]
# Target columns: one classifier is tuned per entry.
label_columns: list[str] = ["gender", "hold racket handed", "play years", "level"]
# Everything else is a feature. Filter in the dataset's own column order instead of
# going through a set difference, whose iteration order changes between kernel
# sessions (string hash randomization) and would silently reorder the feature matrix.
excluded_columns: set[str] = set(index_columns + label_columns)
data_columns: list[str] = [
	column for column in dataset["train"].column_names if column not in excluded_columns
]

In [None]:
# Materialize the training split as a polars frame, then as numpy arrays per target label.
train_dataset: polars.DataFrame = dataset["train"].to_polars()
# Every label is trained on the same feature matrix, so convert it once and let all
# keys share that single array instead of building one identical copy per label.
train_data: dict[str, numpy.ndarray] = dict.fromkeys(label_columns, train_dataset[data_columns].to_numpy())
train_label: dict[str, numpy.ndarray] = {label: train_dataset[label].to_numpy() for label in label_columns}

In [None]:
# Materialize the validation split the same way as the training split.
# NOTE(review): no later cell in this notebook reads the validation arrays — confirm
# whether an evaluation step is missing.
validation_dataset: polars.DataFrame = dataset["validation"].to_polars()
# The feature matrix is identical for every label: convert once and share the array.
validation_data: dict[str, numpy.ndarray] = dict.fromkeys(
	label_columns, validation_dataset[data_columns].to_numpy()
)
validation_label: dict[str, numpy.ndarray] = {label: validation_dataset[label].to_numpy() for label in label_columns}

In [None]:
# Search space sampled by the randomized search over SVC hyperparameters.
hyperparameter_grid: dict[str, list[Any]] = {
	"C": [0.1, 1, 10, 100],  # regularization parameter
	"kernel": ["linear", "poly", "rbf", "sigmoid"],  # kernel type
	"degree": [2, 3, 4, 5],  # polynomial degree (only used by the 'poly' kernel)
	"gamma": ["scale", "auto", 0.001, 0.01, 0.1, 1],  # kernel coefficient (only used by 'rbf', 'poly', 'sigmoid')
	"coef0": [0.0, 0.1, 0.5, 1.0],  # independent term of the kernel (only used by 'poly' and 'sigmoid')
	"shrinking": [True, False],  # whether to use the shrinking heuristic
	# Probability estimates stay enabled for every candidate: the "roc_auc_ovr" scorer
	# used for tuning calls predict_proba, which SVC only exposes when probability=True.
	# Sampling probability=False would just spend search iterations on unscorable candidates.
	"probability": [True],
	"class_weight": [None, "balanced"],  # class weighting
	"break_ties": [
		True,
		False,
	],  # break ties in predict by decision_function confidence (requires decision_function_shape='ovr' and more than two classes)
}

In [None]:
# Tune one SVC per target label with a randomized hyperparameter search and keep
# the best estimator found for each label.
best_classifiers: dict = {}
for label in label_columns:
	print(f"Tune classifier for {label}")
	classifier = SVC(random_state=37, decision_function_shape="ovr")

	# random_state pins which candidates are drawn from the grid, so re-running the
	# notebook evaluates the same 50 configurations (same seed as the classifier).
	tuner = RandomizedSearchCV(
		classifier,
		hyperparameter_grid,
		n_iter=50,
		cv=5,
		scoring="roc_auc_ovr",
		n_jobs=-1,
		verbose=10,
		random_state=37,
	)

	tuner.fit(train_data[label], train_label[label])

	best_classifiers[label] = tuner.best_estimator_

In [None]:
# Feature matrix of the test split, keyed by target label.
# All labels predict from the same features, so the split is converted to numpy once
# and the single array is shared by every key instead of being rebuilt per label.
test_data: dict[str, numpy.ndarray] = dict.fromkeys(
	label_columns, dataset["test"].select_columns(data_columns).to_polars().to_numpy()
)

In [None]:
# Gather the test identifiers together with each label's predicted classes,
# one list-valued column per target label.
predictions: dict = {
	"data_ID": dataset["test"]["data_ID"],
	**{target: best_classifiers[target].predict(test_data[target]).tolist() for target in label_columns},
}

In [None]:
# Wrap the prediction columns in a Dataset so later cells can post-process them with map().
test_result = Dataset.from_dict(predictions)

In [None]:
# Replace the predicted "play years" class with three 0/1 indicator columns.
# A row whose prediction matches none of 0, 1, 2 gets zeros in all three columns.
# NOTE(review): assumes the labels are encoded as 0/1/2 — confirm against the training data.
test_result = test_result.map(
	lambda batch: {
		column: [int(value == class_id) for value in batch]
		for class_id, column in enumerate(("play years_0", "play years_1", "play years_2"))
	},
	batched=True,
	input_columns=["play years"],
	remove_columns=["play years"],
	num_proc=8,
)

In [None]:
# Replace the predicted "level" class with three 0/1 indicator columns.
# A row whose prediction matches none of 0, 1, 2 gets zeros in all three columns.
# NOTE(review): assumes the labels are encoded as 0/1/2 — confirm against the training data.
test_result = test_result.map(
	lambda batch: {
		column: [int(value == class_id) for value in batch]
		for class_id, column in enumerate(("level_0", "level_1", "level_2"))
	},
	batched=True,
	input_columns=["level"],
	remove_columns=["level"],
	num_proc=8,
)

In [None]:
# Show the final result dataset as this cell's output.
test_result

In [None]:
# Create the output directory first so the export does not fail at the very end of
# a long tuning run just because ../dist has not been created yet.
Path("../dist").mkdir(parents=True, exist_ok=True)
test_result.to_csv("../dist/result_svm.csv", num_proc=8)