# Sports book online historic odds data aggregation
Combine sports book online CSV files into a single output file (parquet or CSV)
whose columns match fantasy odds retrieval.

Output files will be written to the directory named by the `FANTASY_HOME` environment variable.
The filename will be "[SPORT].odds-archive.[YEAR_MIN]-[YEAR_MAX].[EXT]", where the years are those of the earliest and latest game dates in the data.

Output data will have one row per game, indexed by 'date', with columns: 'season', 'away-team', 'away-abbr', 'home-team', 'home-abbr', 'away-moneyline', 'home-moneyline', 'overunder', 'spread'.
All odds are American style. Spread is for a home win (positive means home scores the spread more than away; negative means home scores the spread less).


In [None]:
from typing import Literal

# When True nothing is exported (EXT becomes None) and the transform loop
# stops after the first source file.
TESTING_RUN = False
EXT: None | Literal["csv", "parquet"] = "parquet" if not TESTING_RUN else None
"""the output file extention and format"""

# Per-sport sanity bounds, as (min, max) pairs, applied to the "spread" and
# "overunder" columns when the transformed odds are validated.
# mlb/nhl use a fixed 1.5 spread, so min == max there.
VALID_RANGE = {
    "mlb": {"spread": (1.5, 1.5), "overunder": (3, 15)},
    "nhl": {"spread": (1.5, 1.5), "overunder": (2.5, 15)},
    "nfl": {"spread": (0, 30), "overunder": (25, 85)},
    "nba": {"spread": (0, 25), "overunder": (150, 275)},
}
# Sport to process: leave exactly one of the assignments below uncommented.
# SPORT = "mlb"
# SPORT = "nba"
SPORT = "nfl"
# SPORT = "nhl"

In [None]:
import os
import glob
import json

# Source CSVs are expected under $FANTASY_ARCHIVE_BASE/<sport>/sportbook-odds
path_to_csv_files = os.path.join(os.environ["FANTASY_ARCHIVE_BASE"], SPORT, "sportbook-odds")
assert os.path.isdir(path_to_csv_files)
csv_files = glob.glob(os.path.join(path_to_csv_files, "*Sheet*.csv"))
print("CSV files that will be processed:")
print(*csv_files, sep="\n")

# The team-name remap file is read from the current working directory and is
# keyed by sport; only the entry for the active sport is kept.
team_abbrs_filepath = os.path.join(os.getcwd(), "team_abbrs.json")
with open(team_abbrs_filepath, "r") as abbrs_file:
    remaps_by_sport = json.load(abbrs_file)
teamname_remap = remaps_by_sport[SPORT]
print(f"Loaded {len(teamname_remap)} team abbr remaps for {SPORT}")

In [None]:
import re

import pandas as pd
import tqdm


def _find_start_year(filepath: str):
    """
    Figure out the year a source file's season starts in and which CSV row
    holds the column header.

    For mlb the year is taken from the file name and the header is row 0.
    For every other sport the year is taken from the first line of the file
    (which must contain "<SPORT> 20xx") and the header is row 1.

    returns: tuple of (start_year, header_row)
    raises: AssertionError if no year can be found
    """
    if SPORT == "mlb":
        # the text that was searched, kept so the failure message below can
        # always be built (previously this was undefined for mlb and a failed
        # match surfaced as a NameError)
        searched_text = filepath
        start_year_matches = re.findall("mlb-odds-([0-9]+).xls.*", filepath)
        start_row = 0
    else:
        with open(filepath, "r") as f_:
            searched_text = f_.readline()
        start_year_matches = re.findall(f"{SPORT.upper()} (20..).*", searched_text)
        start_row = 1
    assert len(start_year_matches) in (
        1,
        2,
    ), f"Find start-year for '{filepath}' in '{searched_text}'"
    return int(start_year_matches[0]), start_row


def parse_csv(filepath: str):
    """
    Load one source CSV into a dataframe indexed by full game date.

    The file's 'Date' column holds the date without a year. It is replaced by
    an index named 'date' of strings made of the year followed by the
    zero-padded source value.
    """
    start_year, header_row = _find_start_year(filepath)
    df = pd.read_csv(filepath, header=header_row)

    if SPORT in ("mlb", "nhl"):
        # give the last four columns explicit over/under names
        leading_columns = list(df.columns[:-4])
        df.columns = leading_columns + ["OpenOU", "OpenOU Odds", "CloseOU", "CloseOU Odds"]

    def _date(orig_date: str, first_date: str | None = None):
        """figure out the game date"""
        padding = "0" if int(orig_date) < 1000 else ""
        month_day = padding + str(orig_date)
        same_year_date = str(start_year) + month_day
        if first_date is not None and same_year_date < first_date:
            # sorts before the first game of the file, so the season calendar
            # has rolled into a new year: use the next year instead of start_year
            return str(start_year + 1) + month_day
        return same_year_date

    season_first_date = _date(df.iloc[0].Date)
    full_dates = df.Date.map(lambda raw_date: _date(raw_date, first_date=season_first_date))
    df.index = pd.Index(full_dates, name="date")
    return df.drop(columns="Date")


# Parsed source data, keyed by the path of the CSV file it was read from.
dfs: dict[str, pd.DataFrame] = {}
# Read each file into a dataframe; parse_csv picks the header row and builds the date index.
for filepath in tqdm.tqdm(csv_files, desc="loading"):
    df = parse_csv(filepath)
    # every game occupies two consecutive rows (one per team)
    assert len(df) % 2 == 0, "there should be an even number of rows"

    # keep the parsed dataframe for the transform step
    dfs[filepath] = df

# show the first file's parsed data as a visual sanity check
display(f"sample ... {csv_files[0]}", dfs[csv_files[0]])

In [None]:
from functools import partial
from typing import Callable, cast

import numpy as np
from dateutil.parser import parse as du_parse
from fantasy_py import SPORT_DB_MANAGER_DOMAIN, CLSRegistry, NotSeasonDateError
from fantasy_py.sport import SportDBManager
from tqdm import tqdm

# Fetch the CLSRegistry entry for this sport's DB manager; game_xform calls its
# epoch_for_date() to resolve the season of each game date.
# `cast` only informs type checkers, it performs no runtime conversion.
db_manager = cast(SportDBManager, CLSRegistry.get_class(SPORT_DB_MANAGER_DOMAIN, SPORT))


def _xform_rows_mlb_nhl(rows: pd.DataFrame, away_row: int, home_row: int):
    """
    get odds data for away/home rows for mlb or nhl

    returns (moneylines in row order, overunder, spread). The spread is always
    1.5 in magnitude: positive when the away money line is greater than the
    home money line, negative otherwise.
    """
    # for mlb and nhl the 'Close' column carries each team's money line
    assert len(rows.Close) == 2, "assuming that there is a favorite"
    moneylines = list(map(float, rows.Close.values))

    assert len(rows.CloseOU.unique()) == 1, "both OU close values should match"
    total = rows.CloseOU.iloc[0]
    if isinstance(total, str):
        # halves may be written with the '½' glyph, e.g. "8½"
        total = float(total.replace("½", ".5"))

    if moneylines[away_row] > moneylines[home_row]:
        line = 1.5
    else:
        line = -1.5
    return moneylines, total, line


def _xform_rows_nba_nfl(rows: pd.DataFrame, away_row: int, home_row: int):
    """
    get odds data for away/home rows for nba or nfl

    For nfl and nba the 'Close' column holds the spread on one row and the
    over/under on the other; the larger of the two values is taken as the
    over/under. A pick'em game has "pk" (any letter case) instead of a spread.

    rows: the two source rows of one game
    away_row, home_row: positional index (0 or 1) of the away / home row
    returns: (ml, overunder, spread). ml is a list of int money lines in row
        order, or None when the source has "NL" (no line). spread is returned
        as a non-negative magnitude, 0 for a pick'em.
    raises: ValueError if both Close values are equal or cannot be parsed
    """
    ml_values = rows.ML.tolist()
    ml = None if "NL" in ml_values else [int(ml_str) for ml_str in ml_values]

    close_values = rows.Close.tolist()
    # case-insensitive so the value that is detected as a pick'em is also the
    # one that is skipped when reading the over/under
    is_pickem = [str(close_val).strip().lower() == "pk" for close_val in close_values]

    if any(is_pickem):
        # toss-up, so no spread. but there should be an overunder on the other row
        spread = 0
        overunder = float(
            close_values[away_row] if is_pickem[home_row] else close_values[home_row]
        )
    else:
        close = [float(close_str) for close_str in close_values]
        if close[away_row] > close[home_row]:
            overunder = close[away_row]
            spread = close[home_row]
        elif close[away_row] < close[home_row]:
            overunder = close[home_row]
            spread = close[away_row]
        else:
            raise ValueError("Close values should not be the same")
        if spread < 0:
            display(rows)
            print("Negative spread found, using abs")
            spread = abs(spread)
    return ml, overunder, spread


# Dispatch table from sport name to the function that converts a game's two
# source rows into (moneylines, overunder, spread). mlb and nhl share one
# function, nfl and nba share the other.
ROWS_FUNCS = {
    "mlb": _xform_rows_mlb_nhl,
    "nhl": _xform_rows_mlb_nhl,
    "nfl": _xform_rows_nba_nfl,
    "nba": _xform_rows_nba_nfl,
}
"""mapping of sport to row processing func"""


def game_xform(
    rows_xformer: Callable,
    rows: pd.DataFrame,
):
    """
    Convert the two source rows describing one game into a single output row.

    rows_xformer: function with args (game_rows, away_row_idx, home_row_idx) -> (ml-list, ou, spread).
        game_rows is a dataframe with 2 rows containing home and away odds data,
        and the other args are the positional indices of the away and home rows.
        The returned ml-list is a list of 2 floats with the money lines for the game
        (or None if there is no money line); the away/home money line must be at the
        index matching [away|home]_row_idx. ou and spread are floats.
    rows: the 2 rows of one game. The index holds the game date and the 'VH' column
        marks each row as visitor ('V'), home ('H') or neutral site ('N').
    returns: a pd.Series with the game's date, season, team names/abbreviations,
        money lines, overunder and spread, or None if the game is skipped (NaN in
        'Close'/'ML', or an NFL date on a Tuesday/Wednesday that has no epoch).
    raises: any failure is displayed together with the offending rows and re-raised.
    """
    try:
        assert len(rows) == 2, "expecting 2 rows"
        venue_flags = rows.VH.to_list()
        if set(venue_flags) not in ({"N"}, {"V", "H"}):
            raise ValueError(f"expected rows to be order visitor, home, instead {rows.VH=}")
        # neutral-site games are treated as listed away-then-home
        away_row, home_row = (
            (0, 1) if venue_flags == ["V", "H"] or set(venue_flags) == {"N"} else (1, 0)
        )

        close_has_nan = "Close" in rows.columns and rows.Close.hasnans
        ml_has_nan = "ML" in rows and rows.ML.hasnans
        if close_has_nan or ml_has_nan:
            display(rows)
            print("Skipping game with NaN for close or ml")
            return None
        if len(rows.index.unique()) > 1:
            display(rows)
            print("WARNING: expected all rows to be for the same date, these do not!")
        if "Rot" in rows.columns and (rows.Rot.iloc[0] + 1) != rows.Rot.iloc[1]:
            display(rows)
            print("WARNING: expecting the rotation number to be ascending by 1, these do not")

        game_date = du_parse(rows.index[0]).date()

        try:
            epoch = db_manager.epoch_for_date(game_date)
        except Exception:
            # only real errors are considered here; KeyboardInterrupt and
            # SystemExit must never be turned into a silent skip
            if SPORT == "nfl" and game_date.weekday() in (1, 2):
                print(f"Skipping NFL odds on {game_date} cause it is a TUES/WED")
                # for NFL skip tuesday and wednesday games
                return None
            raise

        ml, overunder, spread = rows_xformer(rows, away_row, home_row)

        away_team = rows.iloc[away_row].Team
        home_team = rows.iloc[home_row].Team
        # team names without a remap entry are kept as-is
        away_abbr = teamname_remap.get(away_team, away_team)
        home_abbr = teamname_remap.get(home_team, home_team)

        new_row = {
            "date": game_date,
            "season": epoch.season,
            "away-team": away_team,
            "away-abbr": away_abbr,
            "home-team": home_team,
            "home-abbr": home_abbr,
            "away-moneyline": ml[away_row] if ml is not None else None,
            "home-moneyline": ml[home_row] if ml is not None else None,
            "overunder": overunder,
            "spread": spread,
        }
        # the abbreviations are interpolated from locals: reusing double quotes
        # inside a double-quoted f-string is a syntax error before python 3.12
        assert len(home_abbr) <= 3 and len(away_abbr) <= 3, (
            "expecting all team abbreviations to have lenght <= 3, "
            f"one of the following is too long: '{home_abbr}', '{away_abbr}'"
        )
    except Exception as ex:
        display(f"failed. {ex=} on the following rows:")
        display(rows)
        raise
    return pd.Series(new_row)


def valid_test(filepath, df: pd.DataFrame):
    """validation testing on odds data"""

    valid_df = df

    def _test_val(stat_name, valid_min, valid_max):
        """
        test that the min and max of the stat_name columne
        are within the valid range
        return a validated/filtered version of the DF
        """
        min_val = abs(df[stat_name].min())
        max_val = abs(df[stat_name].max())

        if not (valid_min <= min_val <= valid_max and valid_min <= max_val <= valid_max):
            print(
                f"In '{filepath}' {stat_name} is out of range in. abs(min) or abs(max) value was "
                f"outside range. abs(min, max) = {sorted([min_val, max_val])} "
                f"valid range = [{valid_min} : {valid_max}]"
            )
            return f"{valid_min} <= `{stat_name}` <= {valid_max}"
        return None

    if filter_query := _test_val("spread", *VALID_RANGE[SPORT]["spread"]):
        valid_df = valid_df.query(filter_query)
    if filter_query := _test_val("overunder", *VALID_RANGE[SPORT]["overunder"]):
        valid_df = valid_df.query(filter_query)
    if filter_query := _test_val("home-moneyline", 100, 15000):
        valid_df = valid_df.query(filter_query)
    if filter_query := _test_val("away-moneyline", 100, 15000):
        valid_df = valid_df.query(filter_query)
    if (dropped_rows := len(df) - len(valid_df)) > 0:
        print(f"Validation dropped {dropped_rows} games from '{filepath}'")
    return valid_df

In [None]:
# Transform every loaded source file into validated one-row-per-game odds
odds_dfs = []
apply_func = partial(game_xform, ROWS_FUNCS[SPORT])
progress = tqdm(dfs.items(), desc="files", total=len(dfs))
for filepath, df in progress:
    progress.set_postfix_str(filepath)
    tqdm.pandas(desc="games")
    try:
        # consecutive row pairs (0-1, 2-3, ...) each describe one game
        game_numbers = np.arange(len(df)) // 2
        per_game = df.groupby(game_numbers).progress_apply(apply_func)
        xformed_df: pd.DataFrame = per_game.set_index("date").dropna().sort_index()

        validated_df = valid_test(filepath, xformed_df)
        odds_dfs.append(validated_df)
    except NotSeasonDateError as ex:
        display(f"Skipping '{filepath}' data not in a fantasy season: {ex}")
        continue
    except Exception as ex:
        display(f"Unhandled error for {filepath}... STOPPING EVERYTHING! {ex=}")
        raise

    # a testing run only processes the first file that is not skipped
    if TESTING_RUN:
        break

In [None]:
# handle nfl season 2022 data from betiq
def _xform_betiq_nfl_rows(rows: pd.DataFrame, away_row: int, home_row: int):
    """
    get odds data for away/home rows betiq nfl

    rows: the two betiq rows of one game, with 'Money Line', 'Total (O/U)'
        and 'Spread' columns
    away_row, home_row: positional index (0 or 1) of the away / home row
    returns: (ml, overunder, spread). ml is the list of money lines in row
        order, so that ml[away_row] / ml[home_row] are the away / home money
        lines. overunder and spread are read from the first row.
    """
    assert {away_row, home_row} == {0, 1}
    # The caller indexes the money lines by row position, so they must stay in
    # row order. Reordering them to [away, home] swaps the two values whenever
    # the home team is listed first.
    ml = rows["Money Line"].tolist()
    # NOTE(review): the spread is taken from the first row regardless of which
    # team that row is for - confirm this matches the intended sign convention
    overunder, spread = rows.iloc[0][["Total (O/U)", "Spread"]]
    return ml, overunder, spread


if SPORT == "nfl":
    # Add the 2022 NFL season, which comes from a separate betiq TSV export.
    betiq_filepath = os.path.join(
        os.environ["FANTASY_ARCHIVE_BASE"], "nfl", "nfl-odds.betiq.2022.tsv"
    )
    betiq_df = pd.read_csv(betiq_filepath, sep="\t")
    # game_xform expects a 'VH' column (V=visitor, H=home, N=neutral) and the
    # game date in the index
    betiq_df = betiq_df.assign(
        VH=betiq_df.Location.map(lambda ha: "V" if ha == "Away" else "H" if ha == "Home" else "N")
    ).set_index("Date")
    display("betiq data", betiq_df)
    # register progress_apply here too, so this cell does not depend on an
    # earlier loop iteration having done it
    tqdm.pandas(desc="games")
    group_by = betiq_df.groupby(np.arange(len(betiq_df)) // 2)
    apply_func = partial(game_xform, _xform_betiq_nfl_rows)
    xformed_df: pd.DataFrame = (group_by.progress_apply(apply_func).set_index("date")).sort_index()
    # report validation results against the betiq file, not against whatever
    # file a previous loop happened to process last
    validated_df = valid_test(betiq_filepath, xformed_df)
    display("clean betiq data", validated_df)
    odds_dfs.append(validated_df)

In [None]:
# Stack every validated odds dataframe collected in odds_dfs into one dataframe.
# NOTE(review): pd.concat raises ValueError if odds_dfs is empty (e.g. every
# source file was skipped).
odds_df = pd.concat(odds_dfs)
display(odds_df)
print("------------------------ SUCCESS!!! -------------------------------")

In [None]:
# export the combined odds, unless exporting is disabled (EXT is None)
if EXT is None:
    print("Not saving results...")
else:
    # the year range in the file name comes from the earliest and latest game dates
    sorted_df = odds_df.sort_index()
    min_year = sorted_df.index[0].year
    max_year = sorted_df.index[-1].year
    dest_filepath = os.path.join(
        os.environ["FANTASY_HOME"], f"{SPORT}.odds-archive.{min_year}-{max_year}.{EXT}"
    )
    print(f"writing data to '{dest_filepath}'")
    # one writer per supported output format
    writers = {"parquet": odds_df.to_parquet, "csv": odds_df.to_csv}
    if EXT not in writers:
        raise ValueError(f"Don't know how to export to '{EXT}'")
    writers[EXT](dest_filepath)
print("Done")