In [None]:
import os

# Show the kernel's working directory so the relative paths below can be interpreted.
print("📂 Verifying current path:", os.getcwd())

# Use the folder under the working directory when it exists; otherwise
# point one level up (the notebook may be run from a subfolder).
daily_path = "data/daily" if os.path.exists("data/daily") else "../data/daily"

print(f"\n📁 Scanning path: {daily_path}")
if not os.path.exists(daily_path):
    # Neither candidate location exists.
    print("❌ Path does not exist.")
else:
    print("📄 Files found:")
    # Alphabetical listing of everything directly inside the folder.
    for f in sorted(os.listdir(daily_path)):
        print("  -", f)



In [None]:
import os
from datetime import datetime, timedelta

# === Config ===
# Resolve the daily folder relative to the working directory. The notebook may
# be run from the project root ("data/daily") or from a subfolder
# ("../data/daily"), so try both instead of assuming one location.
folder = "data/daily"
if not os.path.isdir(folder):
    folder = "../data/daily"
start_date = datetime(2025, 5, 7)
end_date = datetime.today()

# Expected filenames between start_date and end_date (inclusive) that are absent.
missing = []

if not os.path.isdir(folder):
    # Without this guard every expected file would be listed as missing,
    # hiding the real problem (wrong working directory / folder not created).
    print(f"❌ Daily folder not found: {folder}")
else:
    # One file is expected per calendar day, named by its ISO date.
    for i in range((end_date - start_date).days + 1):
        day = start_date + timedelta(days=i)
        filename = f"MLB_Combined_Odds_Results_{day.strftime('%Y-%m-%d')}.csv"
        full_path = os.path.join(folder, filename)

        if not os.path.exists(full_path):
            missing.append(filename)

    if missing:
        print("❌ Missing daily files:")
        for f in missing:
            print("  -", f)
    else:
        print("✅ All expected files are present!")


❌ Missing daily files:
  - MLB_Combined_Odds_Results_2025-05-02.csv
  - MLB_Combined_Odds_Results_2025-05-03.csv
  - MLB_Combined_Odds_Results_2025-05-04.csv
  - MLB_Combined_Odds_Results_2025-05-05.csv
  - MLB_Combined_Odds_Results_2025-05-06.csv
  - MLB_Combined_Odds_Results_2025-05-10.csv


In [None]:
import pandas as pd
import os
from datetime import datetime, timedelta

# === Config ===
yesterday = (datetime.today() - timedelta(days=1)).strftime("%Y-%m-%d")
daily_file = f"../data/daily/MLB_Combined_Odds_Results_{yesterday}.csv"
abbrev_file = "../data/lookups/MLB_Teams_Template_2025.xlsx"
master_file = "../data/master/master_template.parquet"


def build_team_rows(games, abbrev_map):
    """Expand game-level rows into two team-level records per game.

    Parameters
    ----------
    games : pandas.DataFrame
        One row per game. Columns read: game_id, game_date, start_time_et,
        home_team, away_team, home_score, away_score, total_line,
        moneyline_home, moneyline_away, over_odds, under_odds. Columns
        home_1..home_9 and away_1..away_9 are copied when present (None
        otherwise) -- presumably per-inning scores; confirm against the feed.
    abbrev_map : dict
        Maps a full team name to its abbreviation; unknown names yield None.

    Returns
    -------
    list[dict]
        One dict for the home side and one for the away side of every game
        that has both scores. Games with a missing score are skipped.
    """
    team_rows = []
    for _, row in games.iterrows():
        # Skip games that have not been played / scored yet.
        if pd.isna(row["home_score"]) or pd.isna(row["away_score"]):
            continue
        total_score = row["home_score"] + row["away_score"]
        # None (not False) when there is no total line to compare against.
        hit_over = total_score > row["total_line"] if pd.notna(row["total_line"]) else None

        for team_type in ["home", "away"]:
            is_home = team_type == "home"
            other_type = "away" if is_home else "home"
            team = row[f"{team_type}_team"]
            opponent = row[f"{other_type}_team"]
            team_score = row[f"{team_type}_score"]
            opp_score = row[f"{other_type}_score"]

            row_data = {
                "game_id": row["game_id"],
                "game_date_et": pd.to_datetime(row["game_date"]),
                "start_time_et": pd.to_datetime(row["start_time_et"]),
                "team": team,
                "team_abbr": abbrev_map.get(team),
                "opponent": opponent,
                "opponent_abbr": abbrev_map.get(opponent),
                "is_home": is_home,
                "home_score": row["home_score"],
                "away_score": row["away_score"],
                "run_diff": team_score - opp_score,
                "won_game": team_score > opp_score,
                "hit_over": hit_over,
                "merge_key": f"{team}_{row['game_date']}",
                # Moneyline column names follow the master file's schema.
                "h2h_own": row[f"moneyline_{team_type}"],
                "h2h_opp": row[f"moneyline_{other_type}"],
                "is_home_odds": is_home,
                # Run-line / spread prices are not read from the daily file.
                "Run_Line": None, "Spread_Price": None, "Opp_Spread_Price": None, "Total": row["total_line"],
                "Over_Price": row["over_odds"], "Under_Price": row["under_odds"],
                "team_abbr_odds": abbrev_map.get(team), "opponent_abbr_odds": abbrev_map.get(opponent)
            }
            for i in range(1, 10):
                # .get() so a daily file without these columns still loads.
                row_data[f"home_{i}"] = row.get(f"home_{i}")
                row_data[f"away_{i}"] = row.get(f"away_{i}")
            team_rows.append(row_data)
    return team_rows


# === Verify paths ===
print("📂 Verifying current path:", os.getcwd())
# Walk the folder the daily file is actually read from, so the listing
# reflects what the guard below will see.
for root, dirs, files in os.walk(os.path.dirname(daily_file)):
    print(f"\n📁 {root}")
    for f in files:
        print(f"  └── {f}")

# === Guard Clause ===
# Use if/else rather than exit(): exit() terminates the interpreter/kernel
# instead of merely skipping the rest of this cell.
if not os.path.exists(daily_file):
    print(f"⚠️ No daily file found: {daily_file} — skipping append.")
else:
    # === Load game-level data and team abbreviations ===
    df = pd.read_csv(daily_file)
    abbrev_df = pd.read_excel(abbrev_file).rename(columns={"City and Team": "team_name", "Abbreviation": "team_abbr"})
    abbrev_map = dict(zip(abbrev_df["team_name"], abbrev_df["team_abbr"]))

    # Normalize game_date to 'YYYY-MM-DD' format to match master
    df['game_date'] = pd.to_datetime(df['game_date'], errors='coerce').dt.strftime('%Y-%m-%d')

    # === Transform to team-level rows ===
    team_df = pd.DataFrame(build_team_rows(df, abbrev_map))

    if team_df.empty:
        # With no scored games the frame has no columns, so the numeric
        # coercion and game_id upsert below would raise KeyError.
        print(f"⚠️ No completed games in: {daily_file} — nothing to append.")
    else:
        # === Ensure numeric consistency for odds fields ===
        for col in ["h2h_own", "h2h_opp", "Over_Price", "Under_Price", "Total", "run_diff"]:
            team_df[col] = pd.to_numeric(team_df[col], errors="coerce")

        # === Append to master Parquet (safe for upsert logic) ===
        if os.path.exists(master_file):
            master_df = pd.read_parquet(master_file)
            master_df["game_date_et"] = pd.to_datetime(master_df["game_date_et"], errors="coerce")
            master_df["start_time_et"] = pd.to_datetime(master_df["start_time_et"], errors="coerce")
            # Upsert: drop master rows for games present in today's batch so
            # re-running the cell does not duplicate a game_id.
            master_df = master_df[~master_df['game_id'].isin(team_df['game_id'])]
            combined_df = pd.concat([master_df, team_df], ignore_index=True)
        else:
            combined_df = team_df

        # Season is the calendar year of the game date.
        combined_df["season"] = pd.to_datetime(combined_df["game_date_et"], errors='coerce').dt.year

        # Sort by date and team for a stable on-disk order.
        combined_df = combined_df.sort_values(by=["game_date_et", "team_abbr"]).reset_index(drop=True)

        # === Save updated master ===
        # First run: the master folder may not exist yet.
        os.makedirs(os.path.dirname(master_file), exist_ok=True)
        combined_df.to_parquet(master_file, index=False)
        print(f"✅ Appended {len(team_df)} team-level rows to: {master_file}")


✅ Appended 30 team-level rows to: master_template.parquet


  combined_df = pd.concat([master_df, team_df], ignore_index=True)


In [None]:
# Print the current working directory by running the shell's `pwd` (IPython `!` escape).
!pwd


/content
