# Statistical Summary/Analysis
Use this notebook to calculate summary stats, correlation analyses,
and other useful metrics for training and evaluation data.

In [None]:
import os
from typing import Literal
from glob import glob

import pandas as pd

# Names of the analyses the driver loop knows how to run.
Analysis = Literal["summary", "error", "histogram"]

# --- Defaults; the experiment configuration that follows overrides them ---

# Which analyses to run. Declared only; it must be assigned by the
# experiment configuration before the driver loop runs.
ANALYSES: list[Analysis]
# Column names to drop before analysis (load() rejects setting both this and
# COLS_TO_ANALYZE).
COLS_TO_DROP = None
# Regex; only columns whose name matches are analyzed.
COLS_TO_ANALYZE = None
# DataFrame.query() expression; rows not matching it are dropped.
FILTER_QUERY = None
# Show a preview of data before analysis.
PREVIEW_DATA = False
# Regex selecting the target columns to correlate every column against.
# None means a full cross correlation.
TARGET_PATTERN = None
# Histogram grid: subplot columns per row, bins per histogram, and the figure
# size in inches contributed by each grid row / grid column.
HISTOGRAM_COLS = 3
HISTOGRAM_BINS = 10
HISTOGRAM_INCHES_PER_ROW = 3
HISTOGRAM_INCHES_PER_COL = 5

# NFL DFS score predict
DATA_FILES = glob(os.path.join("/", "fantasy-experiments", "models", "2023.12", "nfl_RB*csv"))
ANALYSES = ["histogram", "summary"]
PREVIEW_DATA = True
TARGET_PATTERN = r"calc:.*"
# Alternative column selections; exactly one should be active. The first one
# used to be assigned and then immediately overwritten, so it had no effect.
# COLS_TO_ANALYZE = r"(stat|extra|calc):(?!venue).*"
# COLS_TO_ANALYZE = r"(calc:.*)|(((stat)|(extra)).*((recent-mean)|(std)).*)|"
COLS_TO_ANALYZE = r"(calc:.*)|(.*-mean.*)"

# LOL dfs win score feature summary and correlation analysis
# DATA_FILES = [os.path.join(
#     "/", "fantasy-experiments", "df-hist", "data", "lol-draftkings-CLASSIC-GPP.csv"
# )]
# COLS_TO_DROP = ["slate_id", "link", "style", "type", "date"]
# FILTER_QUERY = "slate_id.notna()"
# ANALYSES = ["summary"]

# DFS win score prediction error analysis
# DATA_FILES = [
#     filepath
#     for filepath in glob(
#         os.path.join(
#             "/",
#             "fantasy-experiments",
#             "df-hist",
#             "eval_results",
#             "*.prediction.csv",
#         )
#     )
# ]
# ANALYSES = ["error"]

# Fail early, with an explanation, when the path/glob above matched nothing.
assert DATA_FILES, "DATA_FILES is empty: no input files matched the configured path/pattern"

In [None]:
import re
from math import sqrt
from typing import Pattern

from matplotlib import pyplot as plt
from sklearn import metrics


def load(
    path: str,
    cols_to_drop: list[str] | None = None,
    cols_to_analyze: Pattern[str] | None = None,
    filter_query: str | None = None,
):
    """
    filter_query: Rows not matching this query will be dropped
    cols_to_drop: list of cols to not analyze
    cols_to_analyze: list of cols to analyze. cols are names or regexs
    """
    assert (cols_to_drop is None) or (
        cols_to_analyze is None
    ), "cols_to_drop and cols_to_analyze cannot both be not None"
    df = pd.read_csv(path)
    file_len = len(df)
    if filter_query:
        df = df.query(filter_query)
        print(f"Filter query dropped {file_len - len(df)} rows, {len(df)} remaining")
    if cols_to_drop is not None:
        print(f"Dropping columns: {cols_to_drop}")
        df = df.drop(columns=cols_to_drop)
    if cols_to_analyze is not None:
        cols_to_keep = [col for col in df.columns if re.match(cols_to_analyze, col)]
        if (cols_dropped := len(df.columns) - len(cols_to_keep)) > 0:
            print(
                f"Dropping {cols_dropped} columns, {len(cols_to_keep)} columns remaining in analysis. "
                f"Dropped cols: {set(df.columns) - set(cols_to_keep)}"
            )
            df = df[cols_to_keep]
    print(f"Analyzing features: {sorted(list(df.columns))}")
    return df


def summarize(df: pd.DataFrame, targets: list[str] | None):
    if targets is not None:
        corr_df = pd.DataFrame({target: df.corrwith(df[target]) for target in targets})
    else:
        print("Running full cross correlation")
        corr_df = df.corr()

    summary = {
        "summary": df.describe().transpose().drop(columns="count"),
        "correlation": corr_df.style.background_gradient(axis=None, cmap="RdYlGn"),
    }
    return summary


def error_analysis(df: pd.DataFrame, desc: str | None):
    """
    Plot prediction quality for one set of model predictions.

    Draws a two-panel figure: truth vs. prediction (with the identity line as
    reference) and truth vs. error (with the zero line as reference). The
    figure title reports the sample count, R^2, RMSE and MAE, each rounded to
    4 decimals.

    df: must contain the columns "truth", "prediction" and "error". The
        "error" column is plotted as given
        (presumably prediction - truth; verify against the producer of the file)
    desc: label for the figure title; "unknown model" is used when falsy
    raises: AssertionError if a required column is missing
    """
    assert {"truth", "prediction", "error"} <= set(df.columns)

    r2 = round(metrics.r2_score(df.truth, df.prediction), 4)
    rmse = round(sqrt(metrics.mean_squared_error(df.truth, df.prediction)), 4)
    # MAE is already in the units of the target. Unlike MSE -> RMSE it must
    # not be square-rooted.
    mae = round(metrics.mean_absolute_error(df.truth, df.prediction), 4)

    fig, axs = plt.subplots(1, 2, figsize=(10, 5))
    fig.suptitle(f"{desc or 'unknown model'} : n={len(df)} {r2=} {rmse=} {mae=}")
    for ax in axs:
        ax.axis("equal")

    # Shared value range so the reference lines span both truth and prediction
    min_v = min(df.truth.min(), df.prediction.min())
    max_v = max(df.truth.max(), df.prediction.max())

    # Left panel: a perfect model would put every point on the green y=x line
    axs[0].plot((min_v, max_v), (min_v, max_v), "-g", linewidth=1)
    df.plot(kind="scatter", x="truth", y="prediction", ax=axs[0])

    # Right panel: error against truth, with a green zero-error reference line
    axs[1].yaxis.set_label_position("right")
    axs[1].plot((min_v, max_v), (0, 0), "-g", linewidth=1)
    df.plot(kind="scatter", x="truth", y="error", ax=axs[1])

In [None]:
# Run every configured analysis on each input file in turn.
for filepath in DATA_FILES:
    print(f"Analyzing '{filepath}'")
    df = load(
        filepath,
        filter_query=FILTER_QUERY,
        cols_to_drop=COLS_TO_DROP,
        cols_to_analyze=COLS_TO_ANALYZE,
    )
    if PREVIEW_DATA:
        # display() is provided by the IPython/Jupyter runtime
        display(f"data preview total-n={len(df)}", df.head(5).style.hide())

    if "summary" in ANALYSES:
        # Target columns are the ones whose name matches TARGET_PATTERN.
        # Without a pattern, summarize() does a full cross correlation.
        targets = (
            [col for col in df.columns if re.match(TARGET_PATTERN, col)] if TARGET_PATTERN else None
        )
        print(f"{TARGET_PATTERN=} => {targets=}")
        assert (targets is not None and len(targets) > 0) == (
            TARGET_PATTERN is not None
        ), "TARGET_PATTERN was set but matched no columns in the loaded data"
        summary = summarize(df, targets)
        # Lift pandas' display limits so the tables are shown in full
        with pd.option_context(
            "display.max_rows", None, "display.max_columns", None, "display.max_colwidth", None
        ):
            for name, summary_df in summary.items():
                display(name)
                display(summary_df)

    if "error" in ANALYSES:
        error_analysis(df, os.path.basename(filepath))

    if "histogram" in ANALYSES:
        # Ceiling division gives exactly enough grid rows for one histogram per
        # column. The previous `1 + n // cols` added an empty row whenever the
        # column count was a multiple of HISTOGRAM_COLS. Keep at least one row.
        rows = max(1, -(-len(df.columns) // HISTOGRAM_COLS))
        layout = rows, HISTOGRAM_COLS
        figsize = (HISTOGRAM_COLS * HISTOGRAM_INCHES_PER_COL), (rows * HISTOGRAM_INCHES_PER_ROW)
        print(f"{layout=} {figsize=}")
        df.hist(bins=HISTOGRAM_BINS, layout=layout, figsize=figsize)
        plt.tight_layout()
        plt.show()