# Statistical Summary/Analysis
Use this notebook to calculate summary stats, correlation analyses
and other useful metrics for training and evaluation data.
## Error Analysis
Visualize the error between prediction and truth
- For error analysis create the error dataset using _dumpdata.sc_ with the _--model_ argument.
- In the notebook, include 'error' in the _ANALYSES_ list (e.g. `ANALYSES = ["error"]`)
- In the notebook set _ERROR_TRUTH_COL_ and _ERROR_PREDICTION_COL_ to the columns containing the true and predicted values
## Summary
Basic summary statistics and correlations for the loaded dataframe
## Histogram
Histograms showing distribution of statistics

In [None]:
import os
from typing import Literal
from glob import glob

import pandas as pd

# Names of the analyses the driver loop knows how to run.
AnalysisType = Literal["summary", "error", "histogram"]

# ---------------------------------------------------------------------------
# Defaults. Override them further down in this cell for a specific analysis.
# ---------------------------------------------------------------------------

# Analyses to run for every data file; the driver loop checks membership
# (e.g. `"summary" in ANALYSES`). Declared here, assigned below.
ANALYSES: list[AnalysisType]
# List of column names removed from the dataframe after loading.
# Cannot be combined with COLS_TO_ANALYZE (load() asserts on this).
COLS_TO_DROP = None
# Regex applied with re.match (anchored at the start of the column name);
# only matching columns are kept for analysis.
COLS_TO_ANALYZE = None
# pandas DataFrame.query expression; rows that do not match are dropped.
FILTER_QUERY = None
PREVIEW_DATA = False
"""show a preview of data before analysis"""
# Regex applied with re.match to column names; matching columns become the
# targets used for correlation in the summary and for outlier removal.
TARGET_PATTERN = None
# Histogram grid: number of plot columns, bins per histogram, and the figure
# size contributed by each grid row / column (in inches).
HISTOGRAM_COLS = 3
HISTOGRAM_BINS = 10
HISTOGRAM_INCHES_PER_ROW = 3
HISTOGRAM_INCHES_PER_COL = 5
# Only the first DATA_LIMIT rows of each file are analyzed; None disables the limit.
DATA_LIMIT = 100000
# Columns holding the true and predicted values for the 'error' analysis.
ERROR_TRUTH_COL = None
ERROR_PREDICTION_COL = None
# Optional row-wise function returning a dict of extra features; the results
# are appended to the dataframe as new columns during load().
ADDL_FEATURE_FUNC = None
# When True, rows whose target value (TARGET_PATTERN columns, else
# ERROR_TRUTH_COL) is an outlier are dropped before any analysis runs.
DROP_TARGET_OUTLIERS = True

# ---------------------------------------------------------------------------
# Alternative data sources (uncomment one to use it)
# ---------------------------------------------------------------------------
# DATA_FILES = glob(
#     os.path.join("/", "fantasy-isync", "fantasy-modeling", "2023.12", "nhl_skater.pq")
# )
# DATA_FILES = glob(os.path.join("/", "fantasy-isync", "fantasy-modeling", "2023.12", "nhl_goalie.pq"))
# DATA_FILES = glob(os.path.join("/", "fantasy-isync", "fantasy-modeling", "2023.12", "mlb_pitcher.pq"))
# DATA_FILES = glob(
#     os.path.join("/", "fantasy-isync", "fantasy-modeling", "2023.12", "mlb_hitter.pq")
# )
# DATA_FILES = glob(os.path.join("/", "fantasy-isync", "fantasy-modeling", "2023.12", "nba_player.pq"))

# ---------------------------------------------------------------------------
# Active configuration: error analysis of a training-data export
# ---------------------------------------------------------------------------
ANALYSES = ["error"]
# glob() yields an empty list if the file does not exist; this is checked
# by the assert at the end of this cell.
DATA_FILES = glob(os.path.join("/", "fantasy", "test-training-export.csv"))
ERROR_TRUTH_COL = "target:calc:dk_score"
ERROR_PREDICTION_COL = "calc:dk_score-std-mean" #  "prediction:calc:dk_score"
# ANALYSES = ["histogram", "summary"]
# PREVIEW_DATA = True
# TARGET_PATTERN = r"calc:.*_score"
# COLS_TO_ANALYZE = r"(calc:.*)|(((stat)|(extra)).*((recent-mean)|(std)).*)|"
# COLS_TO_ANALYZE = r"(calc:.*)|(.*-mean.*)"
# COLS_TO_ANALYZE = r"(stat|extra|calc):(?!venue).*"

# LOL dfs win score feature summary and correlation analysis
# DATA_FILES = [os.path.join(
#     "/", "fantasy-experiments", "df-hist", "data", "lol-draftkings-CLASSIC-GPP.csv"
# )]
# COLS_TO_DROP = ["slate_id", "link", "style", "type", "date"]
# FILTER_QUERY = "slate_id.notna()"
# ANALYSES = ["summary"]

# DFS win score prediction error analysis
# DATA_FILES = [
#     filepath
#     for filepath in glob(
#         os.path.join(
#             "/",
#             "fantasy-experiments",
#             "df-hist",
#             "eval_results",
#             "*.prediction.csv",
#         )
#     )
# ]
# ANALYSES = ["error"]

def mlb_addl_features_func(row: pd.Series):
    """
    Derive slugging, on-base and OPS features from a row of MLB hitter stats.

    For each of the "std" and "recent" windows the row's
    `stat:off_<name>:<window>-mean` values are combined into
    - addl:slug-<window>-mean : (1b + 2*2b + 3*3b + 4*hr) / ab
    - addl:obp-<window>-mean  : (hit + bb + hbp) / (ab + bb + sac + hbp)
    - addl:ops-<window>-mean  : obp + slug
    A ratio whose denominator is not greater than zero is reported as 0.

    returns: dict mapping the new feature names to their values
    """

    def stat(name, window):
        # look up one offensive stat for the requested window
        return row[f"stat:off_{name}:{window}-mean"]

    features = {}
    for window in ("std", "recent"):
        slug_key = f"addl:slug-{window}-mean"
        obp_key = f"addl:obp-{window}-mean"

        # slugging: total bases per at-bat
        if stat("ab", window) > 0:
            total_bases = (
                stat("1b", window)
                + 2 * stat("2b", window)
                + 3 * stat("3b", window)
                + 4 * stat("hr", window)
            )
            features[slug_key] = total_bases / stat("ab", window)
        else:
            features[slug_key] = 0

        # on-base: times on base per counted plate appearance
        appearances = (
            stat("ab", window) + stat("bb", window) + stat("sac", window) + stat("hbp", window)
        )
        if appearances > 0:
            times_on_base = stat("hit", window) + stat("bb", window) + stat("hbp", window)
            features[obp_key] = times_on_base / appearances
        else:
            features[obp_key] = 0

        features[f"addl:ops-{window}-mean"] = features[obp_key] + features[slug_key]
    return features


# ADDL_FEATURE_FUNC = mlb_addl_features_func

# glob() silently returns an empty list when nothing matches, so fail here
# with an explicit message instead of running the analysis on no files.
assert DATA_FILES, "DATA_FILES is empty: no input file matched the configured path(s)"

In [None]:
import re
from math import sqrt
from typing import Callable, Pattern

from matplotlib import pyplot as plt
from sklearn import metrics
from tqdm import tqdm


def load(
    path: str,
    cols_to_drop: list[str] | None = None,
    cols_to_analyze: Pattern[str] | str | None = None,
    filter_query: str | None = None,
    addl_features_func: Callable | None = None,
):
    """
    Load a '.csv' or '.pq' (parquet) file and prepare the dataframe for analysis.

    Steps, in order: read the file, keep the first DATA_LIMIT rows (if set),
    append additional features, apply the row filter, then restrict columns.

    path: file to load; the extension selects the reader
    cols_to_drop: list of cols to not analyze
    cols_to_analyze: regex (string or compiled); only columns whose name
        matches it from the start (re.match) are kept.
        Cannot be combined with cols_to_drop.
    filter_query: DataFrame.query expression. Rows not matching this query
        will be dropped
    addl_features_func: called with each row (axis=1); the dict it returns
        is expanded into new columns appended to the dataframe

    returns: the prepared dataframe
    raises: ValueError if the file extension is not supported
    """
    assert (cols_to_drop is None) or (
        cols_to_analyze is None
    ), "cols_to_drop and cols_to_analyze cannot both be not None"
    if path.endswith(".csv"):
        df = pd.read_csv(path)
    elif path.endswith(".pq"):
        df = pd.read_parquet(path)
    else:
        raise ValueError(f"Unknown file type for path '{path}'")
    print(f"Loaded {len(df)} rows")
    if DATA_LIMIT is not None:
        df = df.head(DATA_LIMIT)
    # NOTE: NA values are not filled here; later steps see them as-is.

    # Use the function handed to this call rather than the notebook-level
    # ADDL_FEATURE_FUNC setting, so the parameter is actually honored.
    if addl_features_func is not None:
        # informational: the distinct "<stat|calc>:<name>" prefixes available
        cols = sorted(
            {":".join(col.split(":", 2)[:2]) for col in df.columns if col[:4] in ("stat", "calc")}
        )
        print(f"{cols=}")

        tqdm.pandas(desc="addl_features")
        addl_df = df.progress_apply(addl_features_func, result_type="expand", axis=1)
        addl_cols = list(addl_df.columns)
        print(f"applied addl_func. addl_cols = {addl_cols}")
        df = pd.concat([df, addl_df], axis=1)

    rows_before_filter = len(df)
    if filter_query:
        df = df.query(filter_query)
        print(f"Filter query dropped {rows_before_filter - len(df)} rows, {len(df)} remaining")
    if cols_to_drop is not None:
        print(f"Dropping columns: {cols_to_drop}")
        df = df.drop(columns=cols_to_drop)
    if cols_to_analyze is not None:
        cols_to_keep = [col for col in df.columns if re.match(cols_to_analyze, col)]
        if (cols_dropped := len(df.columns) - len(cols_to_keep)) > 0:
            print(
                f"Dropping {cols_dropped} columns, {len(cols_to_keep)} columns remaining in analysis. "
                f"Dropped cols: {set(df.columns) - set(cols_to_keep)}"
            )
            df = df[cols_to_keep]
    print(f"Loaded features: {sorted(df.columns)}")
    return df


def summarize(df: pd.DataFrame, targets: list[str] | None):
    if targets is not None:
        no_target_df = df.drop(columns=targets)
        corr_df = pd.DataFrame(
            {target: no_target_df.corrwith(df[target]) for target in targets}
        ).sort_values(targets, ascending=False)
    else:
        print("Running full cross correlation")
        corr_df = df.corr()

    summary = {
        "summary": df.describe().transpose().drop(columns="count"),
        "correlation": corr_df.style.background_gradient(axis=None, cmap="RdYlGn"),
    }
    return summary


def drop_target_outliers(df: pd.DataFrame, targets: set[str], outlier_range: float = 2.0):
    """
    drop rows with target outliers

    A row is kept only if, for every target column, its value lies within
    mean +/- outlier_range * std of that column. Rows whose target value is
    NA fail the comparison and are dropped as well.

    The bounds of every target are computed on the unfiltered dataframe, so
    the result does not depend on the order in which the targets (a set) are
    iterated.

    df: dataframe to filter; it is not modified
    targets: names of the target columns to check
    outlier_range: number of standard deviations a value may deviate from
        the column mean

    returns: the filtered dataframe (df itself when targets is empty)
    """
    keep = None
    for target in targets:
        values = df[target]
        mean = values.mean()
        std = values.std()
        lower_bound = mean - outlier_range * std
        upper_bound = mean + outlier_range * std
        # positional boolean mask, so a non-unique index cannot cause misalignment
        in_range = ((values >= lower_bound) & (values <= upper_bound)).to_numpy()
        keep = in_range if keep is None else keep & in_range

    filtered = df if keep is None else df[keep]
    if len(filtered) < len(df):
        print(f"Dropped {len(df) - len(filtered)} rows due to outliers")
    return filtered


def error_analysis(df: pd.DataFrame, desc, truth_col, pred_col):
    """
    Plot prediction-vs-truth and error-vs-truth scatter plots with r2, rmse and mae.

    Rows where either the truth or the prediction is NA are excluded; the
    n shown in the figure title is the number of rows actually evaluated.

    df: dataframe containing truth_col and pred_col
    desc: label used in the figure title ('unknown model' if falsy)
    truth_col: name of the column with the true values
    pred_col: name of the column with the predicted values
    """
    assert truth_col in df.columns and pred_col in df.columns, "truth or prediction col not found"
    error_df = (
        df[[truth_col, pred_col]]
        .dropna()
        .rename(columns={truth_col: "truth", pred_col: "prediction"})
    )
    # error = truth - prediction (positive when the model under-predicts)
    error_df["error"] = error_df["truth"] - error_df["prediction"]

    r2 = round(float(metrics.r2_score(error_df.truth, error_df.prediction)), 4)
    rmse = round(sqrt(metrics.mean_squared_error(error_df.truth, error_df.prediction)), 4)
    # MAE is already in the units of the target; unlike MSE it must not be square-rooted
    mae = round(float(metrics.mean_absolute_error(error_df.truth, error_df.prediction)), 4)

    fig, axs = plt.subplots(1, 2, figsize=(10, 5))
    fig.suptitle(f"{desc or 'unknown model'} : n={len(error_df)} {r2=} {rmse=} {mae=}")
    for ax in axs:
        ax.axis("equal")

    # common value range so the reference lines span all plotted points
    min_v = min(error_df.truth.min(), error_df.prediction.min())
    max_v = max(error_df.truth.max(), error_df.prediction.max())

    # left: prediction vs truth, with the ideal prediction == truth diagonal
    axs[0].plot((min_v, max_v), (min_v, max_v), "-g", linewidth=1)
    error_df.plot(kind="scatter", x="truth", y="prediction", ax=axs[0])

    # right: error vs truth, with the zero-error line
    axs[1].yaxis.set_label_position("right")
    axs[1].plot((min_v, max_v), (0, 0), "-g", linewidth=1)
    error_df.plot(kind="scatter", x="truth", y="error", ax=axs[1])

In [None]:
# Run the configured analyses on every data file.
for filepath in DATA_FILES:
    print(f"Analyzing '{filepath}'")
    df = load(
        filepath,
        filter_query=FILTER_QUERY,
        cols_to_drop=COLS_TO_DROP,
        cols_to_analyze=COLS_TO_ANALYZE,
        addl_features_func=ADDL_FEATURE_FUNC,
    )

    # target columns are those whose name matches TARGET_PATTERN (None if no pattern is set)
    targets = (
        [col for col in df.columns if re.match(TARGET_PATTERN, col)] if TARGET_PATTERN else None
    )

    if DROP_TARGET_OUTLIERS:
        assert targets or ERROR_TRUTH_COL, (
            "must specify a TARGET_PATTERN matching at least one column "
            "or ERROR_TRUTH_COL to drop outliers"
        )
        # Same truthiness test as the assert above: an empty match list falls
        # back to ERROR_TRUTH_COL instead of silently skipping outlier removal.
        outlier_cols = set(targets) if targets else {ERROR_TRUTH_COL}
        assert outlier_cols <= set(df.columns), "outlier_cols must be a subset of df.columns"
        df = drop_target_outliers(df, outlier_cols)

    if PREVIEW_DATA:
        display(f"data preview total-n={len(df)}", df.head(5).style.hide())

    if "summary" in ANALYSES:
        print(f"{TARGET_PATTERN=} => {targets=}")
        # a configured pattern must match at least one column
        assert (targets is not None and len(targets) > 0) == (TARGET_PATTERN is not None)
        summary = summarize(df, targets)
        with pd.option_context(
            "display.max_rows", None, "display.max_columns", None, "display.max_colwidth", None
        ):
            for name, summary_df in summary.items():
                display(name)
                display(summary_df)

    if "error" in ANALYSES:
        assert (
            ERROR_TRUTH_COL in df.columns and ERROR_PREDICTION_COL in df.columns
        ), f"{ERROR_TRUTH_COL=} or {ERROR_PREDICTION_COL=} not found in df"
        error_analysis(df, os.path.basename(filepath), ERROR_TRUTH_COL, ERROR_PREDICTION_COL)

    if "histogram" in ANALYSES:
        # ceiling division: just enough grid rows for one plot per column,
        # without an empty extra row when the count is an exact multiple
        rows = max(1, -(-len(df.columns) // HISTOGRAM_COLS))
        layout = rows, HISTOGRAM_COLS
        figsize = (HISTOGRAM_COLS * HISTOGRAM_INCHES_PER_COL), (rows * HISTOGRAM_INCHES_PER_ROW)
        print(f"{layout=} {figsize=}")
        df.hist(bins=HISTOGRAM_BINS, layout=layout, figsize=figsize)
        plt.tight_layout()
        plt.show()