In [2]:
from pathlib import Path

import jupyter_black
import numpy as np
import polars as pl
import seaborn as sns
from hydra import compose
from hydra import initialize
from hydra.core.global_hydra import GlobalHydra

jupyter_black.load()
# plt.style.use("ggplot")
sns.set()


def load_config(result_dir: Path):
    # clear previous initialization
    GlobalHydra.instance().clear()

    # initialize hydra
    config_path = result_dir / ".hydra"
    initialize(config_path=config_path.as_posix())
    # load the config
    cfg = compose(config_name="config")

    return cfg

In [3]:
EXP_NAME = "exp125"
RUN_NAMES = ["run0", "run1", "run2", "run3", "run4"]
all_preds = []
all_keys = []
all_labels = []
for run_name in RUN_NAMES:
    RESULT_DIR = Path("../output/train") / EXP_NAME / run_name
    cfg = load_config(RESULT_DIR)
    preds = np.load(RESULT_DIR / "preds.npy")
    labels = np.load(RESULT_DIR / "labels.npy")
    keys = np.load(RESULT_DIR / "keys.npy")
    all_preds.append(preds)
    all_labels.append(labels)
    all_keys.append(keys)

all_preds = np.concatenate(all_preds)
all_labels = np.concatenate(all_labels)
all_keys = np.concatenate(all_keys)
gt_df = pl.read_csv(Path(cfg.dir.data_dir) / "train_events.csv")

The version_base parameter is not specified.
Please specify a compatability version level, or None.
Will assume defaults for version 1.1
  initialize(config_path=config_path.as_posix())
The version_base parameter is not specified.
Please specify a compatability version level, or None.
Will assume defaults for version 1.1
  initialize(config_path=config_path.as_posix())


In [4]:
import polars as pl


In [5]:
all_preds.shape, all_labels.shape, all_keys.shape

((7525, 17280, 3), (7525, 17280, 3), (7525,))

In [6]:
target_series_ids = [
    # "0402a003dae9",
    # "280e08693c6d",
    # "2b8d87addea9",
    # "3452b878e596",
    # "4ac356361be9",
    # "4feda0596965",
    # "60d31b0bec3b",
    # "7504165f497d",
    # "91cb6c98201f",
    # "a9a2f7fac455",
    # "c535634d7dcd",
    # "c75b4b207bea",
    # "ca730dbf521d",
    # "cca14d1966c1",
    # "d150801f3145",
    # "db5e0ee1c0ab",
    # "df33ae359fb5",
    # "ece2561f07e9",
    # "eec197a4bdca",
    # "f56824b503a0",
    # "f7eb179216c2",
    # "703b5efa9bc1",
    # "05e1944c3818",
    "854206f602d0",
]

In [7]:
train = (
    pl.scan_parquet(Path(cfg.dir.data_dir) / "train_series.parquet")
    .filter(pl.col("series_id").is_in(target_series_ids))
    .collect()
)

In [23]:
series_ids = np.array(list(map(lambda x: x.split("_")[0], all_keys)))
# for series_id in target_series_ids:
#     break
idx = 0
series_id = target_series_ids[idx]

series_idx = np.where(series_ids == series_id)[0]
this_series_preds = all_preds[series_idx].reshape(-1, 3)
this_series_labels = all_labels[series_idx].reshape(-1, 3)

In [27]:
series_df = train.filter(pl.col("series_id") == series_id)
duration = series_df.shape[0]

series_df = series_df.with_columns(
    pl.lit(this_series_preds[:duration, 0]).alias("sleep_pred"),
    pl.lit(this_series_preds[:duration, 1]).alias("onset_pred"),
    pl.lit(this_series_preds[:duration, 2]).alias("wakeup_pred"),
    pl.lit(this_series_labels[:duration, 0]).alias("sleep_label"),
    pl.lit(this_series_labels[:duration, 1]).alias("onset_label"),
    pl.lit(this_series_labels[:duration, 2]).alias("wakeup_label"),
    ((pl.col("anglez") + 90) / 180).alias("anglez"),
    (pl.col("enmo") / 2).alias("enmo"),
)

In [28]:
# sampled_series_preds = this_series_preds[::10]
# sampled_series_labels = this_series_labels[::10]
# sampled_series = np.concatenate([sampled_series_preds, sampled_series_labels], axis=1)

In [39]:
import plotly.express as px

px.line(
    series_df.filter(pl.col("step") % 100 == 0).to_pandas(),
    x="step",
    y=[
        "sleep_pred",
        # "onset_pred",
        # "wakeup_pred",
        "sleep_label",
        # "onset_label",
        # "wakeup_label",
        "anglez",
        "enmo",
    ],
    title=series_id,
    # height=500,
    # width=1500,
    range_y=[-0.1, 1.1],
)

In [41]:
series_df

series_id,step,timestamp,anglez,enmo,sleep_pred,onset_pred,wakeup_pred,sleep_label,onset_label,wakeup_label
str,u32,str,f32,f32,f32,f32,f32,f32,f32,f32
"""05e1944c3818""",0,"""2018-11-16T18:…",0.01807,0.00765,0.005753,0.000064,0.002028,0.0,0.0,0.0
"""05e1944c3818""",1,"""2018-11-16T18:…",0.017353,0.00755,0.005753,0.000064,0.002028,0.0,0.0,0.0
"""05e1944c3818""",2,"""2018-11-16T18:…",0.017713,0.0075,0.005501,0.000061,0.001974,0.0,0.0,0.0
"""05e1944c3818""",3,"""2018-11-16T18:…",0.017422,0.0074,0.005001,0.000055,0.001869,0.0,0.0,0.0
"""05e1944c3818""",4,"""2018-11-16T18:…",0.017675,0.00765,0.004501,0.000049,0.001762,0.0,0.0,0.0
"""05e1944c3818""",5,"""2018-11-16T18:…",0.017939,0.0079,0.004002,0.000043,0.001657,0.0,0.0,0.0
"""05e1944c3818""",6,"""2018-11-16T18:…",0.017939,0.00835,0.003622,0.00004,0.001725,0.0,0.0,0.0
"""05e1944c3818""",7,"""2018-11-16T18:…",0.01807,0.00815,0.003366,0.000038,0.001966,0.0,0.0,0.0
"""05e1944c3818""",8,"""2018-11-16T18:…",0.017285,0.0079,0.003109,0.000037,0.002209,0.0,0.0,0.0
"""05e1944c3818""",9,"""2018-11-16T18:…",0.017874,0.0078,0.002853,0.000036,0.002451,0.0,0.0,0.0


In [19]:
gt_df.filter((pl.col("series_id") == series_id)).drop_nulls().to_pandas()

Unnamed: 0,series_id,night,event,step,timestamp
0,703b5efa9bc1,1,onset,6300,2018-06-19T22:30:00-0400
1,703b5efa9bc1,1,wakeup,14748,2018-06-20T10:14:00-0400
2,703b5efa9bc1,2,onset,25164,2018-06-21T00:42:00-0400
3,703b5efa9bc1,2,wakeup,31380,2018-06-21T09:20:00-0400
4,703b5efa9bc1,6,onset,94200,2018-06-25T00:35:00-0400
5,703b5efa9bc1,6,wakeup,99972,2018-06-25T08:36:00-0400
6,703b5efa9bc1,7,onset,111480,2018-06-26T00:35:00-0400
7,703b5efa9bc1,7,wakeup,117252,2018-06-26T08:36:00-0400
8,703b5efa9bc1,8,onset,128064,2018-06-26T23:37:00-0400
9,703b5efa9bc1,8,wakeup,135888,2018-06-27T10:29:00-0400


In [14]:
gt_df = gt_df.drop_nulls()

In [8]:
def add_noisy_event_flag(gt_df: pl.DataFrame) -> pl.DataFrame:
    """
    ソフトラベルにする
    """
    relabeled_events = pl.read_csv(
        "/home/kuto/kaggle/kaggle-sleep-v2/data/child-mind-institute-detect-sleep-states/relabeled_train_events.csv"
    )
    relabeled_gt_df = gt_df.join(relabeled_events, on=["series_id", "step", "event"], how="left")
    relabeled_gt_df = relabeled_gt_df.with_columns(
        pl.when(pl.col("relabeled_step").is_null())
        .then(pl.lit(0))
        .otherwise(pl.lit(1))
        .alias("is_noisy_event"),
    ).select(["series_id", "night", "event", "step", "timestamp", "is_noisy_event"])
    return relabeled_gt_df

In [9]:
gt_df2 = add_noisy_event_flag(gt_df)

In [10]:
gt_df2["is_noisy_event"].value_counts()

is_noisy_event,counts
i32,u32
0,14447
1,61


In [12]:
gt_df2.pivot(index=["series_id", "night"], columns="event", values=["step", 'is_nosity_event']).drop_nulls()

series_id,night,onset,wakeup
str,i64,i64,i64
"""038441c925bb""",1,4992,10932
"""038441c925bb""",2,20244,27492
"""038441c925bb""",3,39996,44400
"""038441c925bb""",4,57240,62856
"""038441c925bb""",6,91296,97860
"""038441c925bb""",7,109500,118524
"""038441c925bb""",8,127296,133332
"""038441c925bb""",10,159972,167400
"""038441c925bb""",11,177036,180804
"""038441c925bb""",12,194220,202272


In [15]:
gt_df2.pivot(index=["series_id", "night"], columns="event", values="is_noisy_event").drop_nulls()

series_id,night,onset,wakeup
str,i64,i32,i32
"""038441c925bb""",1,0,0
"""038441c925bb""",2,0,0
"""038441c925bb""",3,0,0
"""038441c925bb""",4,0,0
"""038441c925bb""",5,0,0
"""038441c925bb""",6,0,0
"""038441c925bb""",7,0,0
"""038441c925bb""",8,0,0
"""038441c925bb""",9,0,0
"""038441c925bb""",10,0,0


In [3]:
def add_hand_labeled_event(gt_df: pl.DataFrame) -> pl.DataFrame:
    """
    ソフトラベルにする
    """
    relabeled_events = (
        pl.read_csv(
            "/home/kuto/kaggle/kaggle-sleep-v2/data/child-mind-institute-detect-sleep-states/relabeled_train_events.csv"
        )
        .select(["series_id", "relabeled_step", "event"])
        .rename({"relabeled_step": "step"})
    )
    relabeled_gt_df = pl.concat([gt_df, relabeled_events]).sort(["series_id", "step"])
    return relabeled_gt_df

In [4]:
add_hand_labeled_event(gt_df)

ShapeError: unable to append to a dataframe of width 5 with a dataframe of width 3