# Statistical Summary/Analysis
Use this notebook to calculate summary statistics, correlation analyses,
and other useful metrics for training and evaluation data.

In [None]:
import os

import pandas as pd

# Location of the CSV that the notebook analyses, assembled from path segments.
DATA_FILE = os.path.join(
    "/",
    "fantasy-experiments",
    "df-hist",
    "data",
    "lol-draftkings-CLASSIC-GPP.csv",
)

# Columns removed from the loaded frame before it is summarized.
COLS_TO_DROP = [
    "slate_id",
    "link",
    "style",
    "type",
    "date",
]

# Row filter handed to DataFrame.query(): only rows with a non-null slate_id are kept.
# slate_id is also listed in COLS_TO_DROP; that works because load() applies the
# filter before it drops columns.
FILTER_QUERY = "slate_id.notna()"


In [None]:
def load(path: str, cols_to_drop: list[str] | None = None, filter_query: str | None = None):
    """
    filter_query: Rows not matching this query will be dropped
    """
    df = pd.read_csv(DATA_FILE)
    # display("Loaded data", df)
    file_len = len(df)
    print(f"Loaded n={file_len} from '{DATA_FILE}'")
    if filter_query:
        df = df.query(filter_query)
        print(f"Filter query dropped {file_len - len(df)} rows, {len(df)} remaining")
    if cols_to_drop is not None:
        print(f"Dropping columns: {cols_to_drop}")
        df = df.drop(columns=cols_to_drop)
    return df

def summarize(df: pd.DataFrame):
    """
    df: Frame to compute descriptive statistics for

    Returns the output of DataFrame.describe() called with its default arguments.
    """
    stats = df.describe()
    return stats

In [None]:
# Load the configured CSV, keeping only rows that match FILTER_QUERY and
# removing the columns listed in COLS_TO_DROP.
df = load(
    path=DATA_FILE,
    cols_to_drop=COLS_TO_DROP,
    filter_query=FILTER_QUERY,
)

# Show the row count followed by the whole table; Styler.hide() with no
# arguments renders the frame without its index (pandas >= 1.4).
display(f"data n={len(df)}", df.style.hide())

# Descriptive statistics for the loaded frame.
summary = summarize(df)
display(summary)