# Sports book online historic odds data aggregation
Combine sports book archive CSV files and betiq data into a single CSV
file with columns that match fantasy odds retrieval.

CSV files will be retrieved from the `$FANTASY_DATA_ARCHIVE_DIR/odds-archive/SPORT` folder.

Output files will be written to the folder named by the `FANTASY_HOME` environment variable.
The filename will be "[SPORT].odds-archive.[SEASON_MIN]-[SEASON_MAX].[EXT]"

Output data will have one row per game, indexed by 'date', with columns: 'season', 'away-team', 'away-abbr', 'home-team', 'home-abbr', 'away-moneyline', 'home-moneyline', 'overunder', 'over-odds', 'under-odds', 'spread'.
All odds are American style. Spread is for a home win (so positive means home scores the spread more than away, negative means home scores the spread less).


In [None]:
from typing import Literal

TESTING_RUN = False
EXT: None | Literal["csv", "parquet"] = "parquet" if not TESTING_RUN else None
"""the output file extention and format"""

VALID_RANGE = {
    "mlb": {"spread": (1.5, 1.5), "overunder": (3, 18)},
    "nhl": {"spread": (1.5, 1.5), "overunder": (2.5, 15)},
    "nfl": {"spread": (0, 35), "overunder": (20, 85)},
    "nba": {"spread": (0, 25), "overunder": (150, 300)},
}
"""absolute min/max values for validating things by sport"""

# SPORT = "mlb"
# SPORT = "nba"
# SPORT = "nfl"
SPORT = "nhl"

In [None]:
import glob
import json
import os
from functools import cache

# Folder holding the archived sportsbook csv exports for the selected sport
path_to_csv_files = os.path.join(os.environ["FANTASY_DATA_ARCHIVE_DIR"], "odds-archive", SPORT)
assert os.path.isdir(path_to_csv_files), f"Ensure that csv data file path exists {path_to_csv_files=}"

# Only the per-sheet csv exports are processed
csv_files = glob.glob(os.path.join(path_to_csv_files, "*Sheet*.csv"))
print("CSV files that will be processed:")
print(*csv_files, sep="\n")

# Team name -> abbreviation remaps live in a json file in the working directory, keyed by sport
team_abbrs_filepath = os.path.join(os.getcwd(), "team_abbrs.json")
with open(team_abbrs_filepath) as f_:
    all_teamname_remaps = json.load(f_)
teamname_remap = all_teamname_remaps[SPORT]
print(f"Loaded {len(teamname_remap)} team abbr remaps for {SPORT}")


@cache
def get_team_abbr(archive_team_name: str, season: int):
    """
    Map a team name as written in the archive to a team abbreviation.

    archive_team_name: the team name found in the archive data
    season: the season the game belongs to, needed for nba teams whose
        abbreviation changed over time
    returns: the remapped abbreviation, or the archive name unchanged when
        no remap is defined for it
    """
    if SPORT == "nba":
        # archive name -> (last season using the old abbr, old abbr, new abbr)
        # NewOrleans: hornets up to 20122013, then the pelicans
        # Charlotte: bobcats till 20132014, then hornets
        nba_renames = {
            "NewOrleans": (20122013, "NOH", "NOP"),
            "Charlotte": (20132014, "CHB", "CHA"),
        }
        rename = nba_renames.get(archive_team_name)
        if rename is not None:
            last_old_season, old_abbr, new_abbr = rename
            return old_abbr if season <= last_old_season else new_abbr

    return teamname_remap.get(archive_team_name, archive_team_name)

In [None]:
import re
from typing import cast

import pandas as pd
import tqdm
from dateutil.parser import parse as du_parse
from fantasy_py import SPORT_DB_MANAGER_DOMAIN, CLSRegistry, GameScheduleEpoch, NotSeasonDateError
from fantasy_py.sport import SportDBManager

# DB manager for the selected sport, looked up from the fantasy_py class registry.
# It is used below to resolve dates and game numbers to season epochs.
db_manager = cast(SportDBManager, CLSRegistry.get_class(SPORT_DB_MANAGER_DOMAIN, SPORT))


def _date_str(
    date_str: "str | int",
    min_year_in_file: "int | None" = None,
    min_epoch: "GameScheduleEpoch | None" = None,
):
    """
    figure out the game date (as a string)

    date_str: the archive's month/day value in [M]MDD form (e.g. 103 or "1004")
    min_year_in_file: if not None then this year is used as the year of the date
    min_epoch: required if min_year_in_file is None. The date is placed in the
        year of this epoch, or the following year, whichever puts the date in
        the same season as the epoch
    returns: date string of the form YYYYMMDD
    raises: ValueError if date_str is not numeric, and whatever
        db_manager.epoch_for_date raises if the following year is not a season date
    """
    # normalize through int so that "103", 103, 103.0 and "0103" all become "0103"
    month_day_str = f"{int(date_str):04d}"
    if min_year_in_file is not None:
        return str(min_year_in_file) + month_day_str

    assert min_epoch is not None
    new_date_str = str(min_epoch.date.year) + month_day_str
    try:
        if db_manager.epoch_for_date(du_parse(new_date_str).date()).season == min_epoch.season:
            return new_date_str
    except (NotSeasonDateError, ValueError):
        # not a valid/in-season date in the epoch's year, try the following year
        pass

    new_date_str = str(min_epoch.date.year + 1) + month_day_str
    assert db_manager.epoch_for_date(du_parse(new_date_str).date()).season == min_epoch.season
    return new_date_str


def _inspect_file(filepath: str):
    """
    Inspect an archive csv file to find the season that it covers.

    For mlb the season year comes from the filename. For the other sports it
    comes from the first (info) row of the file, which should be of the form
    '{SPORT} {YEARS}'.

    returns (epoch, index-of-header-row), where epoch is the object returned by
        db_manager for a game/date in the file's season, and the index is the
        row of the csv that holds the column names
    """
    if SPORT == "mlb":
        start_year_matches = re.findall(r"mlb-odds-([0-9]+)\.xls.*", filepath)
        assert start_year_matches, f"Expected mlb filename to contain the season year. {filepath=}"
        season = int(start_year_matches[0])
        min_season_epoch = db_manager.epoch_for_game_number(season, 1)
        return min_season_epoch, 0

    # first line is the info row, second is the column header, third is the first game row
    with open(filepath, "r") as f_:
        info_row_str = f_.readline()
        f_.readline()
        first_row_str = f_.readline()

    first_date_str = first_row_str.split(",")[0]

    # line should be of the form '{SPORT} {YEARS}' where YEARS is YYYY | YYYY-YY
    file_info_str = info_row_str.split(",", 1)[0]
    assert file_info_str.lower().startswith(
        SPORT + " "
    ), f"Expected file info string to start with '{SPORT} '. {file_info_str=}"
    years_str = file_info_str.split(" ", 1)[1]
    assert years_str.startswith(
        "20"
    ), f"Expected years str to start with '20'. {file_info_str=} {years_str=}"

    years_split = years_str.split("-")

    if SPORT == "nfl":
        season = int(years_split[0])
        min_season_epoch = db_manager.epoch_for_game_number(season, 1)
        return min_season_epoch, 1

    # NBA|NHL years are of the form YYYY or YYYY-YY or YYYY-YYYY
    min_file_year_str = years_split[0]
    assert len(min_file_year_str) == 4
    min_file_year = int(min_file_year_str)
    if len(years_split) == 1:
        # if there is only 1 year then the first date must be in it
        first_date = du_parse(_date_str(first_date_str, min_file_year)).date()
        epoch = db_manager.epoch_for_date(first_date)
        return epoch, 1

    assert len(years_split) == 2, "expecting 2 years"
    if len(years_split[1]) == 2:
        max_file_year_str = "20" + years_split[1]
    else:
        assert len(years_split[1]) == 4
        max_file_year_str = years_split[1]
    assert int(min_file_year_str) + 1 == int(max_file_year_str), "years should be consecutive"
    # 2 year seasons are identified by the concatenated years, e.g. 20122013
    season = int(min_file_year_str + max_file_year_str)

    # date must be either in the first year or following year, try first year and test against epoch
    first_date = du_parse(_date_str(first_date_str, min_file_year)).date()
    epoch = db_manager.epoch_for_date(first_date)
    if epoch.season == season:
        return epoch, 1

    first_date = du_parse(_date_str(first_date_str, min_file_year + 1)).date()
    epoch = db_manager.epoch_for_date(first_date)
    assert epoch.season == season, "seasons should match"
    return epoch, 1


def parse_csv(filepath: str):
    """
    Load one archive csv file into a dataframe indexed by game date strings.

    returns (dataframe with the 'Date' column replaced by a 'date' index,
        season of the file)
    """
    ref_epoch, header_row = _inspect_file(filepath)
    print(f"'{filepath}' {ref_epoch=} {header_row=}")

    df = pd.read_csv(filepath, header=header_row)

    if SPORT in ("mlb", "nhl"):
        # the last 4 columns hold the open/close over-under data, give them usable names
        ou_columns = ["OpenOU", "OpenOU Odds", "CloseOU", "CloseOU Odds"]
        df.columns = [*df.columns[:-4], *ou_columns]

    def _to_full_date(raw_date):
        """expand the archive's month/day value to a full date string"""
        return _date_str(raw_date, min_epoch=ref_epoch)

    df.index = pd.Index(df.Date.map(_to_full_date), name="date")
    return df.drop(columns="Date"), ref_epoch.season


# season -> filepath, used to detect multiple files for the same season
seasons_to_file: dict[int, str] = {}
# filepath -> data loaded from that file
dfs: dict[str, pd.DataFrame] = {}

# load each archive file to a dataframe, files for unsupported seasons are skipped
for filepath in tqdm.tqdm(csv_files, desc="loading"):
    try:
        df, season = parse_csv(filepath)
    except NotSeasonDateError as ex:
        print(f"Skipping {filepath=} due to season not being supported. {ex=}")
        continue
    assert (
        season not in seasons_to_file
    ), f"Found multiple files for {season=}. '{filepath}' , '{seasons_to_file[season]}'"
    # every game is described by 2 rows (one per team)
    assert len(df) % 2 == 0, "there should be an even number of rows"
    dfs[filepath] = df
    seasons_to_file[season] = filepath

if dfs:
    first_filepath = next(iter(dfs))
    display(f"sample ... {first_filepath}", dfs[first_filepath])
else:
    # nothing matched or every file was skipped, say so instead of failing on StopIteration
    print("No archive csv files were loaded")

In [None]:
from functools import partial
from typing import Callable

import numpy as np
from fantasy_py import NotSeasonDateError, HomeAwayIter
from tqdm import tqdm


def _xform_rows_mlb_nhl(rows: pd.DataFrame, away_row: int, home_row: int):
    """
    get odds data for away/home rows for mlb or nhl

    returns (ml-list, spread, overunder, over-odds, under-odds), or all None
        if the over/under close is 'NL'
    """
    # money line is in the Close column
    assert len(rows.Close) == 2, "assuming that there is a favorite"
    money_lines = list(map(float, rows.Close.values))

    assert len(rows.CloseOU.unique()) == 1, "both OU close values should match"
    overunder = rows.CloseOU.iloc[0]
    if overunder == "NL":
        return None, None, None, None, None
    if isinstance(overunder, str):
        # half points are written with a fraction character in the archive
        overunder = float(overunder.replace("½", ".5"))

    ou_odds = rows["CloseOU Odds"]

    # first row holds the over odds, which can carry a leading 'a' that is dropped
    raw_over_odds = ou_odds.iloc[0]
    if isinstance(raw_over_odds, str) and raw_over_odds[0] == "a":
        raw_over_odds = raw_over_odds[1:]
    over_odds = None if pd.isna(raw_over_odds) else int(raw_over_odds)

    # second row holds the under odds, anything that is not an int is treated as missing
    try:
        under_odds = int(ou_odds.iloc[1])
    except ValueError:
        under_odds = None

    # the spread magnitude is always 1.5, its sign comes from comparing the money lines
    away_ml_is_higher = money_lines[away_row] > money_lines[home_row]
    spread = 1.5 if away_ml_is_higher else -1.5
    return money_lines, spread, overunder, over_odds, under_odds


def _xform_rows_nba_nfl(rows: pd.DataFrame, away_row: int, home_row: int):
    """
    get odds data for away/home rows for nba or nfl

    rows: the 2 rows of a game, with 'ML' and 'Close' columns
    away_row, home_row: positional index (0 or 1) of the away and home rows
    returns (ml-list, spread, overunder, None, None). ml-list is in row order,
        or None if a money line is 'NL'. spread is always returned as a
        non-negative value
    raises: ValueError if both Close values are the same or not numeric
    """
    # for nfl and nba in 'Close' the favored row has spread
    # the underdog has overunder

    ml = [int(ml_str) for ml_str in rows.ML.values] if "NL" not in rows.ML.values else None

    close_vals = list(rows.Close.values)
    # pick'em marker is matched ignoring case so that 'pk' and 'PK' are handled the same
    is_pick = [isinstance(val, str) and val.strip().lower() == "pk" for val in close_vals]
    if any(is_pick):
        # toss-up, so no spread. but there should be an overunder in the other row
        spread = 0
        overunder = float(close_vals[away_row if is_pick[home_row] else home_row])
    else:
        close = [float(close_str) for close_str in close_vals]
        # the larger of the 2 close values is the overunder, the smaller is the spread
        if close[away_row] > close[home_row]:
            overunder = close[away_row]
            spread = close[home_row]
        elif close[away_row] < close[home_row]:
            overunder = close[home_row]
            spread = close[away_row]
        else:
            raise ValueError("Close values should not be the same")
        if spread < 0:
            display(rows)
            print("Negative spread found, using abs")
            spread = abs(spread)
    return ml, spread, overunder, None, None


# Dispatch table used to pick the row transform for the selected SPORT.
# mlb and nhl share one transform, nfl and nba share the other.
ROWS_FUNCS = {
    "mlb": _xform_rows_mlb_nhl,
    "nhl": _xform_rows_mlb_nhl,
    "nfl": _xform_rows_nba_nfl,
    "nba": _xform_rows_nba_nfl,
}
"""mapping of sport to row processing func"""


def game_xform(
    rows_xformer: Callable,
    rows: pd.DataFrame,
):
    """
    Transform the 2 archive rows of a game into a single row of odds data.

    rows_xformer: function with args (game_rows, away_row_idx, home_row_idx) ->
        (ml-list, spread, overunder, over-odds, under-odds).
        Game_rows is a dataframe with 2 rows containing home and away odds data,
        and the other args are the indices for which row is home/away.
        ml-list is a list of 2 numbers with the money line for the game (or None),
        home/away money line will be at index matching [away|home]_row_idx.
    rows: the 2 rows for the game, indexed by date string, with a 'VH' column
        where V=visitor, H=home, N=neutral

    returns: a series with the odds data for the game, or None if the game is skipped
    raises: any failure is displayed along with the offending rows then re-raised
    """
    game_date = du_parse(rows.index[0]).date()
    try:
        assert len(rows) == 2, "expecting 2 rows"
        if set(rows.VH) not in ({"N"}, {"V", "H"}):
            raise ValueError(
                f"For {game_date}, expected rows to be order visitor, home, instead {rows.VH=}"
            )
        # neutral site games are treated as being in (away, home) order
        away_row, home_row = (
            (0, 1) if rows.VH.to_list() == ["V", "H"] or set(rows.VH) == {"N"} else (1, 0)
        )

        close_has_nan = "Close" in rows.columns and rows.Close.hasnans
        ml_has_nan = "ML" in rows and rows.ML.hasnans
        if close_has_nan or ml_has_nan:
            display(rows)
            print(f"Skipping game on {game_date} with NaN for close or ml")
            return None
        if len(rows.index.unique()) > 1:
            display(rows)
            print(
                f"WARNING: expected all rows on {game_date} to be for the same date, these do not!"
            )
        if "Rot" in rows.columns and (rows.Rot.iloc[0] + 1) != rows.Rot.iloc[1]:
            display(rows)
            print(f"WARNING: the rotation numbers on {game_date} were not consecutive")

        try:
            epoch = db_manager.epoch_for_date(game_date)
        except Exception:
            if SPORT == "nfl" and game_date.weekday() in (1, 2):
                print(f"Skipping NFL odds on {game_date} cause it is a TUES/WED")
                # for NFL skip tuesday and wednesday games
                return None
            raise

        ml, spread, overunder, over_odds, under_odds = rows_xformer(rows, away_row, home_row)
        if ml is None and overunder is None and spread is None:
            # the xformer found no usable odds for this game
            return None

        new_row = {
            "date": game_date,
            "season": epoch.season,
            "away-team": rows.iloc[away_row].Team,
            "away-abbr": get_team_abbr(rows.iloc[away_row].Team, epoch.season),
            "home-team": rows.iloc[home_row].Team,
            "home-abbr": get_team_abbr(rows.iloc[home_row].Team, epoch.season),
            "away-moneyline": ml[away_row] if ml is not None else None,
            "home-moneyline": ml[home_row] if ml is not None else None,
            "overunder": overunder,
            "over-odds": over_odds,
            "under-odds": under_odds,
            "spread": spread,
        }
        # pulled into locals so the message does not need nested quotes (a syntax error before 3.12)
        home_abbr, away_abbr = new_row["home-abbr"], new_row["away-abbr"]
        assert len(home_abbr) <= 3 and len(away_abbr) <= 3, (
            f"on {game_date} expected all team abbreviations to have length <= 3, "
            f"one of the following is too long: '{home_abbr}', '{away_abbr}'"
        )
    except Exception as ex:
        display(f"failed. {ex=} on the following rows:")
        display(rows)
        raise
    return pd.Series(new_row)


def valid_test(filepath, df: pd.DataFrame):
    """
    validation testing on odds data by setting to NA any value
    whose absolute value is outside of the valid range for a column

    filepath: the source of the data, only used for messages
    df: the odds data with 'spread', 'overunder' and moneyline columns
    returns: df if everything is in range, otherwise a copy of df with
        the out of range values replaced by NA. No rows are dropped
    """

    def _test_val(stat_name, valid_min, valid_max):
        """
        test that the absolute values of the stat_name column
        are within the valid range or are na. if all is well return None
        otherwise return a new series (same index as df) with na for invalid values
        """
        column = df[stat_name]
        try:
            # min/max of the absolute values, not abs of the min/max
            magnitudes = column.dropna().map(abs)
            min_val = magnitudes.min()
            max_val = magnitudes.max()
        except Exception:
            print(f"Failed to get min/max for {stat_name=}")
            raise

        assert pd.notna(
            min_val
        ), f"failed on {stat_name}, {min_val=}, {df[stat_name].dropna().min()=}"

        if min_val >= valid_min and max_val <= valid_max:
            return None

        out_of_range = column.map(
            lambda val: pd.notna(val) and not valid_min <= abs(val) <= valid_max
        ).astype(bool)
        nulled_vals = int(out_of_range.sum())
        print(
            f"In '{filepath}' {stat_name} has out of range values. "
            f"abs(min, max) = [{float(min_val)} : {float(max_val)}] "
            f"valid range = [{valid_min} : {valid_max}]. "
            f"{nulled_vals} out of range values nulled"
        )

        assert (
            nulled_vals > 0
        ), f"Found values out of range for {filepath=} {stat_name=} but nothing was nulled!"
        # mask keeps the index of df, so assigning the result back lines up row for row
        return column.mask(out_of_range)

    for stat in ["spread", "overunder"]:
        if (updates := _test_val(stat, *VALID_RANGE[SPORT][stat])) is not None:
            df = df.assign(**{stat: updates})
    for ha in HomeAwayIter:
        if (updates := _test_val(f"{ha}-moneyline", 100, 15000)) is not None:
            df = df.assign(**{f"{ha}-moneyline": updates})
    return df

In [None]:
# validated per-game odds, one dataframe per archive file
odds_dfs = []
apply_func = partial(game_xform, ROWS_FUNCS[SPORT])
progress = tqdm(dfs.items(), desc="files", total=len(dfs))
for filepath, df in progress:
    progress.set_postfix_str(filepath)
    tqdm.pandas(desc="games")
    try:
        # consecutive row pairs (0-1, 2-3, ...) each describe one game
        game_ids = np.arange(len(df)) // 2
        group_by = df.groupby(game_ids)
        per_game_df = group_by.progress_apply(apply_func).set_index("date")
        game_xformed_df = cast(pd.DataFrame, per_game_df).query("season.notna()")
        xformed_df = game_xformed_df.sort_index()
        validated_df = valid_test(filepath, xformed_df)
    except NotSeasonDateError as ex:
        display(f"Skipping '{filepath}' data not in a fantasy season: {ex}")
        continue
    except Exception as ex:
        display(f"Unhandled error for {filepath}... STOPPING EVERYTHING! {ex=}")
        raise
    else:
        odds_dfs.append(validated_df)

    if TESTING_RUN:
        # a testing run only processes the first file
        break

In [None]:
# handle nfl&mlb season 2022 data from betiq
def _xform_betiq_rows(rows: pd.DataFrame, away_row: int, home_row: int):
    """
    get odds data for away/home rows betiq

    rows: the 2 rows of a game with 'Money Line', 'Total (O/U)' and 'Spread' columns
    away_row, home_row: positional index (0 or 1) of the away and home rows
    returns (ml-list, spread, overunder, None, None) where ml-list is in row order
    """
    assert {away_row, home_row} == {0, 1}
    # game_xform reads the money lines as ml[away_row] and ml[home_row], so the
    # list must stay in row order no matter which row is the home team
    ml = list(rows["Money Line"])
    # NOTE(review): overunder and spread are read from the first row, whichever team
    # that is - confirm the spread does not need to be relative to the home team
    overunder, spread = rows.iloc[0][["Total (O/U)", "Spread"]]
    return ml, spread, overunder, None, None


def _betiq_sortable_col(row: pd.Series):
    """
    Build a sort key that is the same for both betiq rows of a game, so that
    sorting on it makes the home/away rows consecutive.

    The key is 'Date:team:score' using whichever team name sorts first, with
    the score flipped when needed so it reads from that team's side.
    """
    if row.Team < row.Opponent:
        key_team = row.Team
    else:
        key_team = row.Opponent

    if row.Team == key_team:
        key_score = row.Score
    else:
        # this row is from the other team's point of view, flip the score
        key_score = "-".join(reversed(row.Score.split("-")))

    return ":".join([row.Date, key_team, key_score])


# betiq data is only loaded for mlb and nfl, from a tab separated file that sits
# in the same odds-archive folder as the csv files
if SPORT in ("mlb", "nfl"):
    filepath = os.path.join(
        os.environ["FANTASY_DATA_ARCHIVE_DIR"], "odds-archive", SPORT, SPORT + "-odds.betiq.tsv"
    )
    betiq_df = pd.read_csv(filepath, sep="\t")
    # display("betiq data from file", betiq_df)

    # build the V/H/N column that game_xform expects from the betiq Location column,
    # anything other than Away/Home is treated as neutral
    vh = betiq_df.Location.map(lambda ha: "V" if ha == "Away" else "H" if ha == "Home" else "N")
    # sort so that the 2 rows of each game are consecutive, then index by date
    sortable_col = betiq_df.apply(_betiq_sortable_col, axis=1)
    sortable_df = (
        betiq_df.assign(VH=vh, sort_col=sortable_col).sort_values("sort_col").set_index("Date")
    )
    # display("sortable betiq data", sortable_df)

    # consecutive row pairs are treated as one game
    # NOTE(review): progress_apply presumably relies on tqdm.pandas(...) having been
    # called by an earlier cell - confirm when running this cell on its own
    group_by = sortable_df.groupby(np.arange(len(sortable_df)) // 2)
    apply_func = partial(game_xform, _xform_betiq_rows)
    xformed_df: pd.DataFrame = group_by.progress_apply(apply_func).set_index("date").sort_index()
    display("xformed df", xformed_df)

    validated_df = valid_test(filepath, xformed_df)
    # NOTE(review): valid_test only replaces column values (via assign) and does not
    # remove rows, so the 'dropped' count below looks like it is always 0 - confirm
    display(
        f"Validation dropped {len(xformed_df) - len(validated_df)} rows leaving {len(validated_df)} rows"
    )
    display("clean betiq data", validated_df)
    odds_dfs.append(validated_df)

In [None]:
# combine the validated per-file (and betiq, if any) dataframes into one dataframe
odds_df = pd.concat(odds_dfs)
display(odds_df)
print("------------------------ SUCCESS!!! -------------------------------")

In [None]:
# export the combined odds data, unless exporting is disabled (EXT is None)
if EXT is None:
    print("Not saving results...")
else:
    # the season range in the filename comes from the earliest and latest game dates
    sorted_df = odds_df.sort_index()
    min_season = int(sorted_df.iloc[0].season)
    max_season = int(sorted_df.iloc[-1].season)

    dest_filename = f"{SPORT}.odds-archive.{min_season}-{max_season}.{EXT}"
    dest_filepath = os.path.join(os.environ["FANTASY_HOME"], dest_filename)
    print(f"writing data to '{dest_filepath}'")

    # supported output formats and the dataframe method that writes each one
    writers = {"parquet": odds_df.to_parquet, "csv": odds_df.to_csv}
    if EXT not in writers:
        raise ValueError(f"Don't know how to export to '{EXT}'")
    writers[EXT](dest_filepath)
print("Done")