# Processing of SCIP features

SCIP features have been computed on EhV data from the first large-scale experiment, on files: H1_T7, H1_T8, H1_T9, H2_T7, H2_T8, H2_T9

## Load raw split frame

In [None]:
# Load IPython's autoreload extension and switch it to mode 2.
# NOTE(review): mode 2 presumably re-imports all modules before each cell is
# executed, so edits to project modules are picked up -- confirm in IPython docs.
%load_ext autoreload
%autoreload 2

In [None]:
from scip_workflows.common import *


In [None]:
import pyarrow


## Load data

In [None]:
# Resolve the input parquet paths and the output path.
# If a `snakemake` object exists in the namespace it supplies both; otherwise
# the lookup raises NameError and hard-coded paths are used instead.
try:
    paths = snakemake.input
    output = snakemake.output[0]
except NameError:
    # An earlier assignment,
    #   Path("/data/gent/vo/000/gvo00070/vsc42015/datasets/wbc/"),
    # was immediately overwritten by the line below and never took effect.
    data_root = Path("/home/maximl/scratch/data/vsc/datasets/wbc/")
    data_dir = data_root / "scip" / "131020222139"
    output = data_dir / "features.parquet"
    # Sort the matches: glob yields entries in a filesystem-dependent order,
    # and anything built by iterating over `paths` inherits that order.
    paths = sorted(data_dir.glob("*.*.parquet"))


In [None]:
# Read every parquet file into its own DataFrame, then stack them row-wise.
frames = []
for parquet_path in paths:
    frames.append(pq.read_table(parquet_path).to_pandas())
df = pandas.concat(frames, axis=0)


In [None]:
# Cast meta_group and meta_part to ordered categoricals whose categories are
# the sorted integer values present in the column.
for meta_column in ("meta_group", "meta_part"):
    int_values = df[meta_column].astype(int)
    cat_type = CategoricalDtype(
        categories=sorted(int_values.unique()),
        ordered=True,
    )
    df[meta_column] = int_values.astype(cat_type)

# Identify each object by (group, part, fix, object number).
index_levels = ["meta_group", "meta_part", "meta_fix", "meta_object_number"]
df = df.set_index(index_levels)


In [None]:
def map_to_name(r):
    """Build the sample name for one row of metadata values.

    Args:
        r: Row-like object exposing ``meta_group``, ``meta_part`` and
            ``meta_fix`` attributes.

    Returns:
        ``"wbc{group}_{part}"`` where ``group`` is ``int(r.meta_group) + 2``,
        followed by ``"_" + r.meta_fix`` when ``meta_fix`` is not missing.
    """
    # Use pandas.isna rather than `is numpy.nan`: the identity test only
    # matches the one numpy.nan object, so any other missing marker
    # (float("nan"), numpy.float64 NaN, None) would reach the string
    # concatenation and raise TypeError.
    fix_suffix = "" if pandas.isna(r.meta_fix) else "_" + r.meta_fix
    return "wbc{group}_{part}{fix}".format(
        group=int(r.meta_group) + 2,
        part=r.meta_part,
        fix=fix_suffix,
    )


# Turn the index levels into columns so each row can be mapped to its sample
# name, then store the names as a regular column.
index_as_columns = df.index.to_frame()
df["meta_sample"] = index_as_columns.apply(map_to_name, axis=1)


In [None]:
# Display the (rows, columns) shape of the full frame.
df.shape


In [None]:
# Display the shape of the subset selected on the first three index levels:
# first level in [1, 2], any value on the second level, third level == "early".
df.loc[([1, 2], slice(None), "early"), :].shape


In [None]:
# Convert the frame to an Arrow table and write it to the output path.
features_table = pyarrow.Table.from_pandas(df)
pq.write_table(features_table, output)
