                            Parse box score function and fetcha and save function with regex                                                                                                

Todo:
- add weather data
- optimize storage output of data/cell class
- create input for all players/pitchers
- plot out running averages per team
- develop metric for offense and defense scoring prob
- plot offensive/defensive metric to id good/bad teams
- create individ player and pitcher cell class with relevant data
- create design matrix over time for input into Logistic GLM
- run logistical GLM across all teams to test latent overlying factors
- run per team

In [29]:
import pandas as pd
import requests
import statsapi

class TeamBoxScores:
    def __init__(self, file_path):
        self.file_path = file_path
        self.data = pd.read_csv(file_path) if os.path.exists(file_path) else pd.DataFrame(columns=['Team', 'Game ID', 'Type', 'Inning', 'Score', 'Precipitation %', 'Wind Speed', 'Temperature'])

    def fetch_weather_data(self, date, location):
        api_key = 'YOUR_API_KEY'
        base_url = 'https://api.weatherapi.com/v1/history.json'
        query_params = {
            'key': api_key,
            'q': location,
            'dt': date
        }
        response = requests.get(base_url, params=query_params)
        if response.status_code == 200:
            data = response.json()
            weather = data['forecast']['forecastday'][0]['day']
            return {
                'precipitation': weather['totalprecip_mm'],
                'wind_speed': weather['maxwind_kph'],
                'temperature': weather['avgtemp_c']
            }
        else:
            print(f"Failed to fetch weather data for {location} on {date}.")
            return None

    def get_game_date(self, game_id):
        game = statsapi.get('game', {'gamePk': game_id})
        game_date = game['dates'][0]['date']
        return game_date

    def update_data(self, start_date, end_date):
        sched = statsapi.schedule(start_date=start_date, end_date=end_date)
        new_data = []
        for game in sched:
            game_id = game['game_id']
            game_date = self.get_game_date(game_id)
            venue_id = game.get('venue_id', 'Unknown')
            location = venue_location_map.get(str(venue_id), 'Unknown Location')
            if location == 'Unknown Location':
                print(f"Warning: No mapping found for venue ID {venue_id}. Skipping weather data.")
                weather_data = None
            else:
                weather_data = self.fetch_weather_data(game_date, location)

            linescore = statsapi.linescore(game_id)
            if isinstance(linescore, str):
                team_scores = self.parse_box_score(linescore)
                if len(team_scores) < 2:
                    print("Warning: Less than two teams found in the game data. Skipping game ID:", game_id)
                    continue
                for team, scores in team_scores.items():
                    for inning, score in enumerate(scores, start=1):
                        new_row = {
                            'Team': team, 'Game ID': game_id, 'Type': 'Offensive', 'Inning': inning, 'Score': score,
                            'Precipitation %': weather_data['precipitation'] if weather_data else 'N/A',
                            'Wind Speed': weather_data['wind_speed'] if weather_data else 'N/A',
                            'Temperature': weather_data['temperature'] if weather_data else 'N/A'
                        }
                        new_data.append(new_row)
                        # Calculate defensive scores for the other team
                        opposing_team = next(t for t in team_scores if t != team)
                        opposing_score = team_scores[opposing_team][inning-1] if inning <= len(team_scores[opposing_team]) else 0
                        new_row.update({'Type': 'Defensive', 'Team': opposing_team, 'Score': opposing_score})
                        new_data.append(new_row.copy())

        if new_data:
            new_df = pd.DataFrame(new_data)
            self.data = pd.concat([self.data, new_df], ignore_index=True)
            self.data.to_csv(self.file_path, index=False)
            print(f"Data saved to {self.file_path} with {len(self.data)} records.")

    def parse_box_score(self, box_score):
        import re
        lines = box_score.strip().split('\n')
        headers = re.findall(r'\d+', lines[0])
        team_scores = {}
        for line in lines[1:]:
            team_name_match = re.match(r'^([A-Za-z\s\-]+?)\s+(\d[\d\s]*)', line)
            if team_name_match:
                team_name = team_name_match.group(1).strip()
                matched_team = next((team for team in mlb_teams if team_name.endswith(team)), None)
                if matched_team:
                    score_line = [int(num) for num in team_name_match.group(2).split()]
                    team_scores[matched_team] = score_line[:len(headers)]
        return team_scores

mlb_teams = [
    "D-backs", "Braves", "Orioles", "Red Sox", "Cubs", "White Sox", "Reds", "Guardians", "Rockies", "Tigers",
    "Astros", "Royals", "Angels", "Dodgers", "Marlins", "Brewers", "Twins", "Mets", "Yankees", "Athletics",
    "Phillies", "Pirates", "Padres", "Giants", "Mariners", "Cardinals", "Rays", "Rangers", "Blue Jays", "Nationals"
]

# Assuming each venue_id maps directly to a city/state location
venue_location_map = {
    '101': 'Phoenix, AZ',  # Chase Field, Phoenix
    '102': 'Atlanta, GA',  # Truist Park, Atlanta
    '103': 'Baltimore, MD',  # Oriole Park at Camden Yards, Baltimore
    '104': 'Boston, MA',  # Fenway Park, Boston
    '105': 'Chicago, IL',  # Wrigley Field, Chicago (Cubs)
    '106': 'Chicago, IL',  # Guaranteed Rate Field, Chicago (White Sox)
    '107': 'Cincinnati, OH',  # Great American Ball Park, Cincinnati
    '108': 'Cleveland, OH',  # Progressive Field, Cleveland
    '109': 'Denver, CO',  # Coors Field, Denver
    '110': 'Detroit, MI',  # Comerica Park, Detroit
    '111': 'Houston, TX',  # Minute Maid Park, Houston
    '112': 'Kansas City, MO',  # Kauffman Stadium, Kansas City
    '113': 'Anaheim, CA',  # Angel Stadium, Anaheim
    '114': 'Los Angeles, CA',  # Dodger Stadium, Los Angeles
    '115': 'Miami, FL',  # Marlins Park, Miami
    '116': 'Milwaukee, WI',  # American Family Field, Milwaukee
    '117': 'Minneapolis, MN',  # Target Field, Minneapolis
    '118': 'New York, NY',  # Citi Field, New York (Mets)
    '119': 'New York, NY',  # Yankee Stadium, New York (Yankees)
    '120': 'Oakland, CA',  # Oakland Coliseum, Oakland
    '121': 'Philadelphia, PA',  # Citizens Bank Park, Philadelphia
    '122': 'Pittsburgh, PA',  # PNC Park, Pittsburgh
    '123': 'San Diego, CA',  # Petco Park, San Diego
    '124': 'San Francisco, CA',  # Oracle Park, San Francisco
    '125': 'Seattle, WA',  # T-Mobile Park, Seattle
    '126': 'St. Louis, MO',  # Busch Stadium, St. Louis
    '127': 'Tampa, FL',  # Tropicana Field, Tampa
    '128': 'Arlington, TX',  # Globe Life Field, Arlington
    '129': 'Toronto, ON',  # Rogers Centre, Toronto
    '130': 'Washington, DC',  # Nationals Park, Washington D.C.
}

# Usage of the class
file_path = 'game_data.csv'
boxscores = TeamBoxScores(file_path)
start_date = '2024-03-01'
end_date = '2024-05-10'
boxscores.update_data(start_date, end_date)



KeyError: 'dates'

                                    Offensive 1st Inning Plot                                                                                                           

In [None]:
# Creating the DataFrame
columns = ['Team', 'Game ID', 'Type', 'Inning', 'Score']
new_df = pd.DataFrame(data, columns=columns)

# Combine with existing data
combined_df = pd.concat([existing_df, new_df])

# Save combined DataFrame to CSV
combined_df.to_csv(file_path, index=False)

# Assuming `combined_df` has been defined and contains all the data

# Filter for defensive scores in the first inning
first_inning_defensive_df = combined_df[(combined_df['Type'] == 'Defensive') & (combined_df['Inning'] == 1)]

# Group by team and game, and calculate average first inning defensive score
avg_first_inning_defensive_scores = first_inning_defensive_df.groupby(['Team', 'Game ID']).agg({'Score': 'mean'}).reset_index()

# Sort by game ID for plotting
avg_first_inning_defensive_scores.sort_values(by='Game ID', inplace=True)

# Plotting Defensive 1st Inning Moving Average
plt.figure(figsize=(14, 8))
teams = avg_first_inning_defensive_scores['Team'].unique()
for team in teams:
    team_data = avg_first_inning_defensive_scores[avg_first_inning_defensive_scores['Team'] == team]
    team_data['MA Score'] = team_data['Score'].rolling(window=15).mean()  # Calculate moving average
    plt.plot(team_data['Game ID'], team_data['MA Score'], marker='', linestyle='-', label=f'{team}')

plt.title('Moving Average of First Inning Defensive Scores for All MLB Teams')
plt.xlabel('Game ID')
plt.ylabel('Moving Average Score')
plt.legend()
plt.grid(True)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

                                    Defensive 1st Inning Plot                                                                                                           

In [None]:


# Filter for defensive scores in the first inning
first_inning_defensive_df = combined_df[(combined_df['Type'] == 'Offensive') & (combined_df['Inning'] == 1)]

# Group by team and game, and calculate average first inning defensive score
avg_first_inning_defensive_scores = first_inning_defensive_df.groupby(['Team', 'Game ID']).agg({'Score': 'mean'}).reset_index()

# Sort by game ID for plotting
avg_first_inning_defensive_scores.sort_values(by='Game ID', inplace=True)

# Plotting Defensive 1st Inning Moving Average
plt.figure(figsize=(14, 8))
teams = avg_first_inning_defensive_scores['Team'].unique()
for team in teams:
    team_data = avg_first_inning_defensive_scores[avg_first_inning_defensive_scores['Team'] == team]
    team_data['MA Score'] = team_data['Score'].rolling(window=15).mean()  # Calculate moving average
    plt.plot(team_data['Game ID'], team_data['MA Score'], marker='', linestyle='-', label=f'{team}')

plt.title('Moving Average of First Inning Offensive Scores for All MLB Teams')
plt.xlabel('Game ID')
plt.ylabel('Moving Average Score')
plt.legend()
plt.grid(True)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()