In [1]:
import multiprocessing

import polars
from datasets import Dataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# CPU count reported by multiprocessing; passed as `n_threads` to the polars CSV readers below.
num_workers: int = multiprocessing.cpu_count()

In [3]:
# Read both training CSVs with the same reader settings, in the same order as before
# (train_info first, then train_data).
# NOTE(review): polars may ignore use_pyarrow=True when n_threads/low_memory are also given and
# fall back to its native reader — confirm which parser is actually in use.
train_info_df, train_data_df = (
	polars.read_csv(csv_path, n_threads=num_workers, low_memory=True, rechunk=True, use_pyarrow=True)
	for csv_path in ("../data/train_info.csv", "../data/train_data.csv")
)

In [4]:
# Columns that are standardized and gathered into rolling windows in the cells below.
# presumably A* are accelerometer axes and G* are gyroscope axes — TODO confirm against the data description
features = ["Ax", "Ay", "Az", "Gx", "Gy", "Gz"]

In [5]:
# Standardize every feature column per player: (x - mean_player) / std_player.
#
# The per-player statistics are window expressions over "player_ID", so the frame keeps the
# (player_ID, data_id, time_order) ordering produced by the sort. The rolling step in the next
# cell needs time_order ascending inside each (player_ID, data_id) group; computing the stats
# with group_by + join back onto the unsorted frame would throw that ordering away.
# No set_sorted("time_order") here: a multi-key sort does not make time_order sorted across
# the whole frame, so that flag would be a false promise to polars.
# NOTE(review): a feature that is constant for a player has std == 0 and yields NaN/inf.
train_data_df = train_data_df.sort("player_ID", "data_id", "time_order").with_columns(
	((polars.col(col) - polars.col(col).mean().over("player_ID")) / polars.col(col).std().over("player_ID"))
	for col in features
)

In [6]:
# Build forward-looking windows inside each (player_ID, data_id) group: for a row at time_order t
# the window covers index values from t (offset "0i") to t + 42 (period "42i").
# agg(polars.all()) collects every remaining column into a list per window.
# NOTE(review): closed="both" includes both ends, i.e. up to 43 rows when time_order is made of
# consecutive integers, and windows near the end of a group come out shorter — confirm intended.
train_data_df = train_data_df.rolling(
	index_column="time_order",
	group_by=["player_ID", "data_id"],
	period="42i",
	offset="0i",
	closed="both",
).agg(polars.all())

In [7]:
# Attach the train_info columns to every window via data_id, then remove the key/index columns
# so only the windowed features and the info columns are left.
train_df = (
	train_data_df
	.join(train_info_df, on="data_id")
	.drop(["player_ID", "data_id", "time_order"])
)

In [8]:
# Wrap the frame's Arrow table in a Hugging Face `Dataset`.
dataset = Dataset(arrow_table=train_df.to_arrow())

In [9]:
# Display the dataset summary (column names and row count).
dataset

Dataset({
    features: ['Ax', 'Ay', 'Az', 'Gx', 'Gy', 'Gz', 'gender', 'hold racket handed', 'play years', 'level'],
    num_rows: 2605094
})

In [10]:
# Hold out 20% of the rows as the "test" split. train_test_split shuffles before splitting, so
# the seed is pinned to keep the split identical across kernel restarts.
# NOTE(review): rows are overlapping rolling windows, so a random row-level split puts
# near-identical windows on both sides — consider splitting by player_ID/data_id before
# windowing if the test split is meant to measure generalization.
dataset = dataset.train_test_split(test_size=0.2, seed=42)

In [11]:
# Display the resulting DatasetDict (train/test splits with their columns and row counts).
dataset

DatasetDict({
    train: Dataset({
        features: ['Ax', 'Ay', 'Az', 'Gx', 'Gy', 'Gz', 'gender', 'hold racket handed', 'play years', 'level'],
        num_rows: 2084075
    })
    test: Dataset({
        features: ['Ax', 'Ay', 'Az', 'Gx', 'Gy', 'Gz', 'gender', 'hold racket handed', 'play years', 'level'],
        num_rows: 521019
    })
})

In [12]:
# Read the test CSV with the same reader settings as the training files, then order it by
# data_id and time_order so the rolling step below sees time_order ascending within each data_id.
# NOTE(review): set_sorted marks time_order as sorted for the whole frame, which a
# (data_id, time_order) sort does not guarantee — confirm the flag is still required.
test_df = (
	polars.read_csv(
		"../data/test_data.csv",
		n_threads=num_workers,
		low_memory=True,
		rechunk=True,
		use_pyarrow=True,
	)
	.sort("data_id", "time_order")
	.set_sorted("time_order")
)

In [13]:
# Standardize every feature column with the mean and standard deviation of the whole test frame.
# NOTE(review): the training cell standardizes per player_ID, while this uses frame-wide
# statistics — confirm the mismatch is intended.
test_df = test_df.with_columns(
	((polars.col(name) - polars.col(name).mean()) / polars.col(name).std()) for name in features
)

In [14]:
# Same windowing as the training data, grouped by data_id only: for a row at time_order t the
# window covers index values t through t + 42, and every remaining column becomes a list.
# NOTE(review): closed="both" includes both ends (up to 43 rows for consecutive integer
# time_order values) — confirm intended.
test_df = test_df.rolling(
	index_column="time_order",
	group_by="data_id",
	period="42i",
	offset="0i",
	closed="both",
).agg(polars.all())

In [15]:
# The window index is no longer needed once the windows are built; data_id is kept.
test_df = test_df.drop("time_order")

In [16]:
# Wrap the test frame's Arrow table in a Hugging Face `Dataset`.
test_dataset = Dataset(arrow_table=test_df.to_arrow())

In [17]:
# Display the test dataset summary (column names and row count).
test_dataset

Dataset({
    features: ['data_id', 'Ax', 'Ay', 'Az', 'Gx', 'Gy', 'Gz'],
    num_rows: 1005894
})