---
title: creating excluded list for outliers
description: 'While observing outliers, it was deemed prudent to create a list of runs to be excluded from the core dataset during development, while retaining them for possible later reclamation. To this end an `excluded` table was created, and `inc_chm` and `inc_img_stats` views were created as the subsets of the respective master tables without the excluded runs.'
---


In [None]:
%reload_ext autoreload
%autoreload 2

import duckdb as db
import polars as pl
from database_etl.definitions import DB_PATH, RAW_DATA_LIB

# Raise polars' display limits so long strings and large tables print in full.
pl.Config.set_fmt_str_lengths(9999)
pl.Config.set_tbl_rows(9999)
# Open the project database; `con` is the connection used by the cells below.
con = db.connect(DB_PATH)

# Collect every `data.parquet` matching `*.D/extract_*/` under RAW_DATA_LIB and
# display how many were found.
image_files = list(RAW_DATA_LIB.glob("*.D/extract_*/data.parquet"))
len(image_files)


In [None]:
# List the tables available in the connected database.
con.sql(
    """--sql
show tables
"""
).pl()


In [None]:
# Preview the first five rows of `image_stats`.
con.sql(
    """--sql
select * from image_stats limit 5
"""
).pl()


In [None]:
# Show the distinct `mins_max` values present in `image_stats`.
con.sql(
    """--sql
select distinct mins_max from image_stats
"""
).pl()


In [None]:
def find_time_outliers(con: db.DuckDBPyConnection, time_cutoff: float) -> pl.DataFrame:
    """
    Select the samples whose `mins_max` (the right side of the time dimension
    interval) is strictly less than `time_cutoff`.

    `image_stats` is joined to `chm` on `pk`, so every returned row carries the
    columns of both tables.

    :param con: open DuckDB connection holding the `image_stats` and `chm` tables.
    :param time_cutoff: exclusive upper bound applied to `mins_max`.
    :return: the matching rows as a polars DataFrame.
    """
    query = """--sql
    select
        *
    from
        image_stats
    join
        chm
    on
        image_stats.pk = chm.pk
    where
        mins_max < ?
    """
    # The cutoff is bound as a prepared-statement parameter rather than
    # interpolated into the SQL text.
    executed = con.execute(query, parameters=[time_cutoff])
    return executed.pl()


# Display the runs whose `mins_max` is below 20.
find_time_outliers(con=con, time_cutoff=20)


# Creating Excluded Table and Adding Sample `pk` = 61.


In [None]:
def create_excluded_tbl(con: db.DuckDBPyConnection) -> None:
    """
    (Re)create the `excluded` table, which lists the runs to leave out of the
    core dataset.

    Columns:
      - `pk`: integer primary key, foreign key to `chm(pk)`.
      - `runid`: varchar, unique and not null.
      - `reason`: varchar, not null.

    Because the statement is `create or replace`, any existing `excluded` table
    (and its rows) is replaced.

    :param con: open DuckDB connection containing the `chm` table.
    """
    # A DDL statement yields no result set, so there is nothing to convert to a
    # polars frame; `con.sql` is called purely for its side effect. Chaining
    # `.pl()` here would fail on the `None` that `con.sql` returns for DDL.
    con.sql(
        """--sql
    create or replace table excluded (
        pk integer primary key references chm(pk),
        runid varchar unique not null,
        reason varchar not null
    );
    """
    )


def add_61_to_excluded(con: db.DuckDBPyConnection) -> None:
    """
    Register the run with `pk` = 61 in the `excluded` table, with the reason
    'aborted run'.

    The row's `pk` and `runid` are copied from `chm`. Per the author's
    observation from `find_time_outliers`, this sample is an aborted run with a
    runtime of 14 seconds.

    :param con: open DuckDB connection containing the `chm` and `excluded` tables.
    """
    insert_stmt = """--sql
        insert into excluded
            select
                pk,
                runid,
                'aborted run' as reason
            from
                chm
            where
                pk = 61;
        """
    con.sql(insert_stmt)


def create_inc_chm_view(con: db.DuckDBPyConnection) -> None:
    """
    (Re)create the `inc_chm` view: the rows of `chm` that have no matching `pk`
    in `excluded` (an anti join), i.e. the runs deemed includable in downstream
    analyses.

    :param con: open DuckDB connection containing the `chm` and `excluded` tables.
    """
    view_ddl = """--sql
    create or replace view inc_chm as
        select
            *
        from
            chm
        anti join
            excluded
        on
            excluded.pk = chm.pk;
    """
    con.sql(view_ddl)


# Order matters: the table must exist before the insert, and the view reads
# from both `chm` and `excluded`.
create_excluded_tbl(con=con)
add_61_to_excluded(con=con)
create_inc_chm_view(con=con)


In [None]:
# Show the column definitions of the `excluded` table.
con.sql(
    """--sql
describe excluded
"""
).pl()


# Creating Included Image Stats (`inc_img_stats`)


In [None]:
def create_inc_img_stats(con: db.DuckDBPyConnection = con) -> None:
    """
    (Re)create the `inc_img_stats` view: the rows of `image_stats` that have no
    matching `pk` in `excluded` (an anti join), i.e. the included runs.

    :param con: open DuckDB connection containing the `image_stats` and
        `excluded` tables. Defaults to the module-level `con` as it was bound
        when this function was defined.
    """
    # A `create view` statement yields no result set, so there is nothing to
    # convert to a polars frame; `con.sql` is called purely for its side effect.
    # Chaining `.pl()` here would fail on the `None` that `con.sql` returns for DDL.
    con.sql(
        """--sql
    create or replace view inc_img_stats as
        select
            *
        from
            image_stats ist
        anti join
            excluded exc
        on
            ist.pk = exc.pk;
    """
    )


# Create the `inc_img_stats` view on the shared connection.
create_inc_img_stats(con=con)


In [None]:
# con.sql(
#     """--sql
# select
#     distinct mins_max
# from
#     inc_img_stats
# """
# ).pl().pipe(display)


In [None]:
# Show the column definitions of the `inc_img_stats` view.
con.sql(
    """--sql
describe inc_img_stats
"""
).pl()


So, after accounting for the aborted run (pk = 61), all runs share the same end time — 52 mins — differing only from the 3rd significant figure onward. Considering that there is one observation every 0.4 seconds, only the first two significant figures are relevant, and thus the end times are effectively the same. I presume that rounding to the second significant figure will make all time labels the same — this is easier than resampling.

In [None]:
# Fetch the `path` column of every included run from `inc_img_stats`.
# Each fetched row is itself indexable, so a path value is reached as `paths[i][0]`.
paths = con.sql(
    """--sql
select
    path
from
    inc_img_stats
"""
).fetchall()

# Display the first five rows as a sanity check.
paths[0:5]


In [None]:
# First column of the first fetched row: the path value itself.
paths[0][0]


In [None]:
import pandas as pd

# Load the first included run's parquet file with pandas and display it.
df = pd.read_parquet(paths[0][0])
df


In [None]:
# Load one specific extract with polars and display it.
# NOTE(review): this absolute path is machine-specific and will not resolve
# elsewhere; presumably it lives under RAW_DATA_LIB — confirm and derive it
# from there (or from `paths`) instead.
pl.read_parquet(
    "/Users/jonathan/mres_thesis/database_etl/database_etl/data/raw_uv/114.D/extract_2024-09-18T212559/data.parquet"
)


In [None]:
# Load the first included run's parquet file with polars and display it.
# This rebinds `df`, replacing any earlier value of that name.
df = pl.read_parquet(paths[0][0])
df
