In [None]:
import os

# Show where the kernel is running so the relative paths below make sense.
print("📂 Verifying current path:", os.getcwd())

# Prefer the path relative to the current directory; otherwise look one level up.
daily_path = "data/daily" if os.path.exists("data/daily") else "../data/daily"

print(f"\n📁 Scanning path: {daily_path}")
if not os.path.exists(daily_path):
    print("❌ Path does not exist.")
else:
    # List the directory contents in alphabetical order, one entry per line.
    print("📄 Files found:")
    for entry in sorted(os.listdir(daily_path)):
        print("  -", entry)



In [None]:
import os
from datetime import datetime, timedelta

# === Config ===
start_date = datetime(2025, 5, 7)
end_date = datetime.today()


def resolve_daily_folder(candidates=("data/daily", "../data/daily")):
    """Return the first candidate that is an existing directory.

    The notebook may be started from different working directories, so the
    daily folder can sit either below the current directory or next to it.
    When none of the candidates exists the first one is returned, so the
    caller still has a path to report.
    """
    for candidate in candidates:
        if os.path.isdir(candidate):
            return candidate
    return candidates[0]


def find_missing_daily_files(folder, start_date, end_date):
    """List the expected daily CSV names that are not present in ``folder``.

    One file named ``MLB_Combined_Odds_Results_YYYY-MM-DD.csv`` is expected
    for every day from ``start_date`` through ``end_date`` (inclusive, counted
    in whole days). Returns the missing file names in chronological order; an
    empty list means every expected file exists.
    """
    n_days = (end_date - start_date).days + 1
    expected = (
        f"MLB_Combined_Odds_Results_{start_date + timedelta(days=offset):%Y-%m-%d}.csv"
        for offset in range(n_days)
    )
    return [name for name in expected if not os.path.exists(os.path.join(folder, name))]


folder = resolve_daily_folder()
missing = find_missing_daily_files(folder, start_date, end_date)

# Make a wrong working directory obvious instead of silently listing every day.
if not os.path.isdir(folder):
    print(f"⚠️ Folder not found: {folder}")

if missing:
    print("❌ Missing daily files:")
    for name in missing:
        print("  -", name)
else:
    print("✅ All expected files are present!")


❌ Missing daily files:
  - MLB_Combined_Odds_Results_2025-05-02.csv
  - MLB_Combined_Odds_Results_2025-05-03.csv
  - MLB_Combined_Odds_Results_2025-05-04.csv
  - MLB_Combined_Odds_Results_2025-05-05.csv
  - MLB_Combined_Odds_Results_2025-05-06.csv
  - MLB_Combined_Odds_Results_2025-05-10.csv


In [None]:
import pandas as pd
import os
from datetime import datetime, timedelta
import pytz # Import pytz for timezone awareness (as used in pull_transform_daily)

# === Helpers ===
def build_team_rows(games, abbrev_map):
    """Expand game-level rows into team-level records (two per finished game).

    Parameters
    ----------
    games : pandas.DataFrame
        One row per game. Columns read: ``game_id``, ``game_date``,
        ``start_time_et``, ``home_team``/``away_team``,
        ``home_score``/``away_score``, ``total_line``,
        ``moneyline_home``/``moneyline_away``, ``over_odds``, ``under_odds``.
        ``home_1``..``home_9`` and ``away_1``..``away_9`` are optional and
        come back as ``None`` when the column is absent.
    abbrev_map : dict
        Full team name -> abbreviation. Unknown names yield ``None``.

    Returns
    -------
    list of dict
        A home record followed by an away record for every game that has
        both scores. Games with a missing score are skipped.
    """
    records = []
    for _, game in games.iterrows():
        # Only games with both scores present are turned into team rows.
        if pd.isna(game["home_score"]) or pd.isna(game["away_score"]):
            continue

        total_score = game["home_score"] + game["away_score"]
        hit_over = total_score > game["total_line"] if pd.notna(game["total_line"]) else None

        for side, other in (("home", "away"), ("away", "home")):
            is_home = side == "home"
            team = game[f"{side}_team"]
            opponent = game[f"{other}_team"]
            team_score = game[f"{side}_score"]
            opp_score = game[f"{other}_score"]

            record = {
                "game_id": game["game_id"],
                "game_date_et": pd.to_datetime(game["game_date"]),
                "start_time_et": pd.to_datetime(game["start_time_et"]),
                "team": team,
                "team_abbr": abbrev_map.get(team),
                "opponent": opponent,
                "opponent_abbr": abbrev_map.get(opponent),
                "is_home": is_home,
                "home_score": game["home_score"],
                "away_score": game["away_score"],
                "run_diff": team_score - opp_score,
                "won_game": team_score > opp_score,
                "hit_over": hit_over,
                "h2h_own": game[f"moneyline_{side}"],
                "h2h_opp": game[f"moneyline_{other}"],
                "is_home_odds": is_home,
                "Run_Line": None, "Spread_Price": None, "Opp_Spread_Price": None,
                "Total": game["total_line"],
                "Over_Price": game["over_odds"], "Under_Price": game["under_odds"],
                "team_abbr_odds": abbrev_map.get(team),
                "opponent_abbr_odds": abbrev_map.get(opponent),
            }

            # Per-inning columns are copied through when the daily file has them.
            for inning in range(1, 10):
                record[f"home_{inning}"] = game.get(f"home_{inning}")
                record[f"away_{inning}"] = game.get(f"away_{inning}")

            records.append(record)
    return records


def dedupe_key(frame):
    """Build the ``"<game_id>_<team_abbr>"`` key used to replace existing rows.

    A missing ``team_abbr`` is treated as an empty string. Without that the
    key would be NaN, and because ``isin`` matches NaN with NaN, a single
    unmapped team in the new data would delete every unmapped row in master.
    """
    return frame["game_id"].astype(str) + "_" + frame["team_abbr"].fillna("").astype(str)


def normalize_bool_flag(values):
    """Coerce a flag column to pandas' nullable ``boolean`` dtype.

    Numeric 0/1 (including numeric strings) and real booleans map to
    False/True; anything else becomes ``<NA>``. The float cast keeps genuine
    booleans intact: recent pandas versions may not match boolean values
    against the numeric 0/1 keys of the mapping, which would blank out a
    clean bool column.
    """
    numeric = pd.to_numeric(values, errors="coerce").astype("float64")
    return numeric.map({1: True, 0: False}).astype("boolean")


# === Run the daily append ===
# Guarded so the helpers above can be imported (from a test or an exported
# script) without touching the filesystem. Inside Jupyter ``__name__`` is
# "__main__", so the cell still runs like a plain top-level script.
if __name__ == "__main__":
    # === Config ===
    # Use US/Eastern timezone for consistency with your pull script
    eastern = pytz.timezone("US/Eastern")
    yesterday = (datetime.now(eastern) - timedelta(days=1)).strftime("%Y-%m-%d")

    daily_file = f"../data/daily/MLB_Combined_Odds_Results_{yesterday}.csv"
    abbrev_file = "../data/lookups/MLB_Teams_Template_2025.xlsx"
    master_file = "../data/master/master_template.parquet"

    # Ensure master directory exists (daily/lookups are assumed to exist already)
    os.makedirs("../data/master", exist_ok=True)

    # === Guard Clause ===
    # A plain if/else is used instead of exit(): in a Jupyter kernel exit()
    # asks the kernel to shut down rather than just ending this cell.
    if not os.path.exists(daily_file):
        print(f"⚠️ No daily file found: {daily_file} — skipping append.")
    else:
        # === Load game-level data and team abbreviations ===
        df = pd.read_csv(daily_file)
        abbrev_df = pd.read_excel(abbrev_file).rename(
            columns={"City and Team": "team_name", "Abbreviation": "team_abbr"}
        )
        abbrev_map = dict(zip(abbrev_df["team_name"], abbrev_df["team_abbr"]))

        # === Normalize game_date to 'YYYY-MM-DD' format ===
        df['game_date'] = pd.to_datetime(
            df['game_date'], format='%Y-%m-%d', errors='coerce'
        ).dt.strftime('%Y-%m-%d')

        # === Transform to team-level rows ===
        team_rows = build_team_rows(df, abbrev_map)

        if not team_rows:
            # An empty record list would give a DataFrame without columns and
            # the numeric coercion below would fail with a KeyError.
            print(f"⚠️ No completed games in: {daily_file} — skipping append.")
        else:
            team_df = pd.DataFrame(team_rows)

            # === Ensure numeric consistency for odds fields ===
            for col in ["h2h_own", "h2h_opp", "Over_Price", "Under_Price", "Total", "run_diff"]:
                team_df[col] = pd.to_numeric(team_df[col], errors="coerce")

            # === Append to master Parquet ===
            if os.path.exists(master_file):
                master_df = pd.read_parquet(master_file)
                # Explicit formats keep parsing deterministic and avoid the
                # format-inference UserWarning.
                master_df["game_date_et"] = pd.to_datetime(
                    master_df["game_date_et"], format='%Y-%m-%d', errors="coerce"
                )
                master_df["start_time_et"] = pd.to_datetime(
                    master_df["start_time_et"], format='%Y-%m-%d %H:%M:%S', errors="coerce"
                )

                # Historical rows may hold is_home_odds as numbers or mixed objects.
                master_df['is_home_odds'] = normalize_bool_flag(master_df['is_home_odds'])

                # Drop master rows that the new daily data replaces.
                master_df = master_df[~dedupe_key(master_df).isin(dedupe_key(team_df))]

                combined_df = pd.concat([master_df, team_df], ignore_index=True)
            else:
                combined_df = team_df

            # === Add season and sort ===
            combined_df["season"] = pd.to_datetime(combined_df["game_date_et"], errors='coerce').dt.year

            # 🔁 Sort by season > team_abbr > game_date_et to preserve correct row order.
            # (A preliminary sort by date/team was dropped: this sort overrides it.)
            combined_df = combined_df.sort_values(
                by=["season", "team_abbr", "game_date_et"]
            ).reset_index(drop=True)

            # === Save updated master ===
            combined_df.to_parquet(master_file, index=False)
            print(f"✅ Appended {len(team_df)} team-level rows to: {master_file}")

✅ Appended 30 team-level rows to: master_template.parquet


  combined_df = pd.concat([master_df, team_df], ignore_index=True)


In [None]:
# Show the kernel's working directory (shell `pwd`); relative paths such as
# "../data/daily" used elsewhere in this notebook are resolved against it.
!pwd


/content
