# Sports book online historic odds data aggregation
Combine sports book online CSV files into a single CSV
file with columns that match fantasy odds retrieval.

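Output files are intended to be written to the same location as the
source data, with the filename "sportsbook-odds-[SPORT].[SEASON_MIN]-[SEASON_MAX].[EXT]".
Note: the export cell at the end of this notebook currently builds a different
path ("$FANTASY_HOME/[SPORT].odds-archive.[MIN_YEAR]-[MAX_YEAR].parquet") and its
write call is commented out.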


In [1]:
from typing import Literal
# Sport whose odds files are processed; exactly one SPORT assignment should be active.
# SPORT = "mlb"
# SPORT = "nba"  # WORKS!
SPORT = "nfl"
# SPORT = "nhl"
# Desired output format.
# NOTE(review): none of the visible cells read EXT -- confirm whether the export should honour it.
EXT: Literal["csv", "parquet"] = "csv"

In [None]:
import os
import glob

# Directory holding the raw sportsbook CSV exports for the selected sport.
path_to_csv_files = os.path.join(os.environ["FANTASY_ARCHIVE_BASE"], SPORT, "sportbook-odds")
assert os.path.isdir(path_to_csv_files), f"'{path_to_csv_files}' is not a directory"
# glob returns matches in arbitrary (filesystem-dependent) order; sort so the
# processing order, the previewed first file and the output row order are reproducible.
csv_files = sorted(glob.glob(os.path.join(path_to_csv_files, "*Sheet*.csv")))
print("CSV files that will be processed:")
print("\n".join(csv_files))

In [None]:
import re

import pandas as pd
import tqdm


def _find_start_year(filepath: str, sport: str):
    if sport == "mlb":
        start_year_matches = re.findall("mlb-odds-([0-9]+).xls.*", filepath)
        start_row = 0
    else:
        with open(filepath, "r") as f_:
            l1 = f_.readline()
        start_year_matches = re.findall(f"{sport.upper()} (20..)-.*", l1)
        start_row = 1
    assert len(start_year_matches) == 1, f"Find start-year for '{filepath}' in '{l1}'"
    return int(start_year_matches[0]), start_row


def parse_csv(filepath: str, sport: str):
    """Load one odds CSV and index its rows by ``YYYYMMDD`` date strings.

    The header row and the season start year come from ``_find_start_year``.
    The file's ``Date`` column holds month/day values only; each is prefixed
    with the start year, or with the following year when the resulting date
    would sort before the date of the first row.

    Returns:
        The dataframe with a string index named ``"date"`` and the original
        ``Date`` column removed.
    """
    season_start, header_row = _find_start_year(filepath, sport)
    frame = pd.read_csv(filepath, header=header_row)

    def _date(orig_date: str, first_date: str | None = None):
        """Expand a month/day value into a full ``YYYYMMDD`` string."""
        month_day = str(orig_date)
        if int(orig_date) < 1000:
            # three-digit values are missing the leading zero of the month
            month_day = "0" + month_day
        same_year = str(season_start) + month_day
        if first_date is not None and same_year < first_date:
            # the season calendar is in a new year, use the next year instead
            return str(season_start + 1) + month_day
        return same_year

    # the first row anchors the season: later rows are compared against it
    season_first_date = _date(frame.iloc[0].Date)
    full_dates = frame.Date.map(lambda raw: _date(raw, first_date=season_first_date))
    frame.index = pd.Index(full_dates, name="date")
    return frame.drop(columns="Date")


# Parsed dataframes keyed by the path of the CSV file they were loaded from.
dfs: dict[str, pd.DataFrame] = {}
# read each file to a date-indexed dataframe (parse_csv picks the header row per sport)
for filepath in tqdm.tqdm(csv_files, desc="loading"):
    df = parse_csv(filepath, SPORT)
    # games are transformed later in consecutive row pairs, so an odd row count
    # means the file cannot be paired up cleanly
    assert len(df) % 2 == 0, "there should be an even number of rows"

    # keep the dataframe for the transform step
    dfs[filepath] = df

# preview the first parsed file
display(csv_files[0], dfs[csv_files[0]])

In [None]:
import dateutil
import numpy as np
from tqdm import tqdm
from typing import cast

from fantasy_py import SPORT_DB_MANAGER_DOMAIN, CLSRegistry, NotSeasonDateError
from fantasy_py.sport import SportDBManager

# DB manager for the selected sport, looked up in the class registry; used by
# game_xform to map a game date to its season epoch.
db_manager = cast(SportDBManager, CLSRegistry.get_class(SPORT_DB_MANAGER_DOMAIN, SPORT))


def game_xform(rows: pd.DataFrame):
    """Collapse the two sportsbook rows of one game into a single odds record.

    Args:
        rows: exactly two rows (visitor first, home second, or both neutral
            "N") indexed by a date string, with at least the columns ``Rot``,
            ``VH``, ``Team``, ``Close`` and ``ML``.

    Returns:
        A ``pd.Series`` with the keys ``date``, ``season``, ``away-team``,
        ``home-team``, ``away-moneyline``, ``home-moneyline``, ``overunder``
        and ``spread``; or ``None`` when ``Close`` or ``ML`` contains NaN and
        the game is skipped.

    Raises:
        AssertionError: if ``rows`` does not contain exactly two rows.
        ValueError: if the rotation numbers are not 1 apart, the rows are not
            in visitor/home order, or both ``Close`` values are equal.
        Any exception raised by ``db_manager.epoch_for_date`` propagates
        unchanged after the offending rows have been displayed.
    """
    try:
        assert len(rows) == 2, "expecting 2 rows"
        if rows.Rot.diff().iloc[1] != 1:
            raise ValueError("expecting the rotation number to by 1 apart")
        if list(rows.VH) != ["V", "H"] and set(rows.VH) != {"N"}:
            raise ValueError(f"expected rows to be order visitor, home, instead {rows.VH=}")
        if rows.Close.hasnans or rows.ML.hasnans:
            display(rows)
            print("Skipping game with NaN for close or ml")
            return None

        if len(rows.index.unique()) > 1:
            # reported but not fatal: the first row's date is used below
            display(rows)
            print("expected all rows to be for the same date, these do not!")
        # "NL" marks a game without a moneyline
        ml = [int(ml_str) for ml_str in rows.ML.values] if "NL" not in rows.ML.values else None

        # in 'Close' the favored row has spread, the underdog has overunder
        close_values = list(rows.Close.values)
        # pick'em marker, matched case-insensitively so "pk" and "PK" are treated alike
        is_pick = [
            isinstance(value, str) and value.strip().lower() == "pk" for value in close_values
        ]
        if any(is_pick):
            # toss-up, so no spread. but there should be an overunder in the other row
            spread = 0
            overunder = float(close_values[0] if is_pick[1] else close_values[1])
        else:
            close = [float(close_str) for close_str in close_values]
            if close[0] > close[1]:
                overunder = close[0]
                spread = close[1]
            elif close[0] < close[1]:
                overunder = close[1]
                spread = close[0]
            else:
                raise ValueError("Close values should not be the same")
            if spread < 0:
                display(rows)
                print("Negative spread found, using abs")
                spread = abs(spread)

        game_date = dateutil.parser.parse(rows.index[0]).date()
        epoch = db_manager.epoch_for_date(game_date)
        new_row = {
            "date": game_date,
            "season": epoch.season,
            "away-team": rows.iloc[0].Team,
            "home-team": rows.iloc[1].Team,
            "away-moneyline": ml[0] if ml is not None else None,
            "home-moneyline": ml[1] if ml is not None else None,
            "overunder": overunder,
            "spread": spread,
        }
    except Exception:
        # show the offending rows for debugging, then let the error propagate
        print("failed on the following rows:")
        display(rows)
        raise
    return pd.Series(new_row)


# Turn every parsed file into a one-row-per-game odds dataframe, then stack them.
odds_dfs = []
progress = tqdm(dfs.items(), desc="files", total=len(dfs))
for filepath, df in progress:
    progress.set_postfix_str(filepath)
    # register pandas' progress_apply with a "games" progress bar
    tqdm.pandas(desc="games")
    try:
        # consecutive row pairs (0-1, 2-3, ...) are transformed as one game each
        group_by = df.groupby(np.arange(len(df)) // 2)
        games_df = group_by.progress_apply(game_xform)
        # NOTE(review): dropna() also discards games whose moneyline is None
        # (the "NL" case in game_xform) -- confirm that is intended.
        xformed_df: pd.DataFrame = games_df.set_index("date").dropna().sort_index()
    except NotSeasonDateError as ex:
        print(f"Skipping '{filepath}' data not in a fantasy season: {ex}")
    else:
        odds_dfs.append(xformed_df)

odds_df = pd.concat(odds_dfs)
display(odds_df)

In [None]:
# export
# Build the destination path from the concatenation of ALL files (odds_df);
# xformed_df only holds the last file handled by the transform loop, so using
# it would report the year range of a single season file.
sorted_df = odds_df.sort_index()
if sorted_df.empty:
    raise ValueError("no odds data available to export")
# the index holds date objects, so the first/last entries give the year range
min_year = sorted_df.index[0].year
max_year = sorted_df.index[-1].year
dest_filepath = os.path.join(
    os.environ['FANTASY_HOME'], f"{SPORT}.odds-archive.{min_year}-{max_year}.parquet"
)
print(f"writing data to '{dest_filepath}'")

# the write itself is currently disabled
# sorted_df.to_parquet(dest_filepath)