In [None]:
import multiprocessing

import polars
from datasets import Dataset

In [None]:
# One worker per CPU reported by the OS; passed as `n_threads` to the CSV readers below.
num_workers: int = multiprocessing.cpu_count()

In [None]:
# Reader options shared by both training CSVs; keeping them in one mapping guarantees the two
# files are parsed with identical settings.
# NOTE(review): polars may fall back to its own parser when `n_threads`/`low_memory` are passed
# together with `use_pyarrow=True` — confirm which parser is actually used.
train_csv_options = {"n_threads": num_workers, "low_memory": True, "rechunk": True, "use_pyarrow": True}

# Metadata table; joined to the windowed sensor data on "data_id" further down.
train_info_df = polars.read_csv("../../data/train_info.csv", **train_csv_options)
# Sensor table; later cells read "player_ID", "data_id", "time_order" and the feature channels from it.
train_data_df = polars.read_csv("../../data/train_data.csv", **train_csv_options)

In [None]:
# Sensor channels that are z-scored and windowed by the cells below.
# NOTE(review): presumably accelerometer (A*) and gyroscope (G*) readings per axis — confirm.
features = [
	"Ax", "Ay", "Az",
	"Gx", "Gy", "Gz",
]

In [None]:
# Z-score every feature channel per player: subtract that player's mean and divide by that
# player's sample standard deviation (ddof=1, the polars default for `std`).
#
# The statistics are window expressions over "player_ID", so each row is normalized in place and
# no aggregate-then-join round trip is needed. Consequences of avoiding the join: the column
# order of the input frame is preserved, and rows with a null "player_ID" are kept (as their own
# group) instead of being dropped by an inner join.
#
# The sort is applied to the *result* because the rolling cell that follows requires
# "time_order" to be ascending within each ("player_ID", "data_id") group. No sorted flag is set
# on "time_order": after a multi-key sort the column is only ordered within groups, not globally.
train_data_df = train_data_df.with_columns(
	[
		(polars.col(col) - polars.col(col).mean().over("player_ID")) / polars.col(col).std().over("player_ID")
		for col in features
	]
).sort("player_ID", "data_id", "time_order")

In [None]:
# Forward-looking windows per ("player_ID", "data_id") group: for each row with index t, collect
# the rows of the same group whose "time_order" lies in [t, t + 42] (both ends included because
# closed="both"). Aggregating with `polars.all()` gathers the remaining columns into one list
# per window.
# NOTE(review): if "time_order" is a consecutive integer index this yields up to 43 rows per
# window, and windows starting near the end of a group are shorter — confirm both are intended.
train_windows = train_data_df.rolling(
	index_column="time_order",
	group_by=["player_ID", "data_id"],
	period="42i",
	offset="0i",
	closed="both",
)
train_data_df = train_windows.agg(polars.all())

In [None]:
# Attach the `train_info_df` columns to every window via "data_id" (polars joins are inner by
# default), then discard the key/index columns listed below.
train_key_columns = ["player_ID", "data_id", "time_order"]
train_df = train_data_df.join(train_info_df, on="data_id").drop(train_key_columns)

In [None]:
# Convert the polars frame to an Arrow table and wrap it in a `datasets.Dataset`.
train_arrow_table = train_df.to_arrow()
train_dataset = Dataset(train_arrow_table)

In [None]:
# Bare expression: the notebook renders the dataset's repr as this cell's output.
train_dataset

In [None]:
# Load the test sensor table and order it so that "time_order" is ascending within each
# "data_id"; the grouped rolling cell further down relies on that per-group ordering.
#
# No sorted flag is set on "time_order": a sort on ("data_id", "time_order") only orders the
# column inside each "data_id" group, and flagging a column as sorted when it is not globally
# sorted can make polars return incorrect results.
test_df = polars.read_csv(
	"../../data/test_data.csv", n_threads=num_workers, low_memory=True, rechunk=True, use_pyarrow=True
).sort("data_id", "time_order")

In [None]:
# Z-score each feature channel using the mean and sample standard deviation (ddof=1) of the
# whole test frame.
# NOTE(review): the training frame is normalized per "player_ID", the test frame globally —
# confirm the difference is intentional.
test_df = test_df.with_columns(
	[(polars.col(channel) - polars.col(channel).mean()) / polars.col(channel).std() for channel in features]
)

In [None]:
# Forward-looking windows per "data_id": for each row with index t, collect the rows of the same
# group whose "time_order" lies in [t, t + 42] (both ends included because closed="both").
# Aggregating with `polars.all()` gathers the remaining columns into one list per window.
# NOTE(review): windows starting near the end of a group hold fewer rows — confirm downstream
# code tolerates variable-length lists.
test_windows = test_df.rolling(
	index_column="time_order",
	group_by="data_id",
	period="42i",
	offset="0i",
	closed="both",
)
test_df = test_windows.agg(polars.all())

In [None]:
# Remove the window index column. Unlike the training frame, "data_id" is not dropped here.
test_df = test_df.drop("time_order")

In [None]:
# Convert the polars frame to an Arrow table and wrap it in a `datasets.Dataset`.
test_arrow_table = test_df.to_arrow()
test_dataset = Dataset(test_arrow_table)

In [None]:
# Bare expression: the notebook renders the dataset's repr as this cell's output.
test_dataset