                            Parse box score function and fetcha and save function with regex                                                                                                

In [7]:
import statsapi
import pandas as pd
import re
import os

def parse_box_score(box_score):
    lines = box_score.strip().split('\n')
    headers = re.findall(r'\d+', lines[0])  # Extract inning numbers from the header
    team_scores = {}
    for line in lines[1:]:
        # Improved regex with better whitespace handling
        team_name_match = re.match(r'^([A-Za-z\s\-]+?)\s+(\d[\d\s]*)', line)
        if team_name_match:
            team_name = team_name_match.group(1).strip()
            # Check if the extracted team name ends with any team from the list
            matched_team = next((team for team in mlb_teams if team_name.endswith(team)), None)
            if matched_team:
                # Extract scores using whitespace separation after confirming team name
                score_line = [int(num) for num in team_name_match.group(2).split()]
                team_scores[matched_team] = score_line[:len(headers)]
            else:
                print(f"Team name not matched or not in list: '{team_name}' from line: {line}")
        else:
            print(f"No team name match found in line: {line}")
    return team_scores

mlb_teams = [
    "D-backs", "Braves", "Orioles", "Red Sox", "Cubs", "White Sox", "Reds", "Guardians", "Rockies", "Tigers",
    "Astros", "Royals", "Angels", "Dodgers", "Marlins", "Brewers", "Twins", "Mets", "Yankees", "Athletics",
    "Phillies", "Pirates", "Padres", "Giants", "Mariners", "Cardinals", "Rays", "Rangers", "Blue Jays", "Nationals"
]

def fetch_and_save_mlb_data(start_date, end_date, file_path):
    if os.path.exists(file_path):
        df = pd.read_csv(file_path)
    else:
        df = pd.DataFrame(columns=['Team', 'Game ID', 'Type', 'Inning', 'Score'])

    data = []
    sched = statsapi.schedule(start_date=start_date, end_date=end_date)
    for game in sched:
        game_id = game['game_id']
        linescore = statsapi.linescore(game_id)
        if isinstance(linescore, str):
            team_data = parse_box_score(linescore)
            if team_data:
                for team, scores in team_data.items():
                    for inning, score in enumerate(scores, start=1):
                        data.append({'Team': team, 'Game ID': game_id, 'Type': 'Offensive', 'Inning': inning, 'Score': score})
            else:
                print(f"No valid MLB team data parsed for game ID {game_id}.")
        else:
            print(f"Failed to fetch linescore for game ID {game_id} or received non-string data.")

    if data:
        new_df = pd.DataFrame(data)
        df = pd.concat([df, new_df], ignore_index=True)
        df.to_csv(file_path, index=False)
        print(f"Data saved to {file_path} with {len(df)} records.")
    else:
        print("No new data to add.")

start_date = '2024-03-01'
end_date = '2024-05-10'
file_path = 'game_data.csv'
fetch_and_save_mlb_data(start_date, end_date, file_path)


#code could be finished to parse the inning data into proper output structure - Stopping now to make class strucutre for mlb teams for efficiency HLR 5/13/24

Team name not matched or not in list: 'Kiwoom Heroes' from line: Kiwoom Heroes 0 0 0 1 0 0 2 0 0  3   6   1
Team name not matched or not in list: 'Korea' from line: Korea  0 0 0 0 0 0 0 0 0  0   5   0  
Team name not matched or not in list: 'Korea' from line: Korea   0 0 2 0 0 0 0 0 0  2   5   0  
Team name not matched or not in list: 'Diablos Rojos' from line: Diablos Rojos 0 0 0 1 0 2 1 0 0  4   10  0
Team name not matched or not in list: 'River Cats' from line: River Cats 2 0 0 0 3 0 0 0 3  8   7   0  
Team name not matched or not in list: 'Naturals' from line: Naturals 0 0 0 0 0 0 0 1 0  1   7   0
Team name not matched or not in list: 'Diablos Rojos' from line: Diablos Rojos 0 0 8 0 0 0 0 0 0  8   9   0
Team name not matched or not in list: 'Space Cowboys' from line: Space Cowboys 0 0 0 0 1 0 1 1 0  3   5   0  
Team name not matched or not in list: 'Nationals Prospects' from line: Nationals Prospects 0 0 0 0 0 0 1 0 0  1   10  1  
Team name not matched or not in list: 'Space Cowboy

                                    Offensive 1st Inning Plot                                                                                                           

In [None]:
# Creating the DataFrame
columns = ['Team', 'Game ID', 'Type', 'Inning', 'Score']
new_df = pd.DataFrame(data, columns=columns)

# Combine with existing data
combined_df = pd.concat([existing_df, new_df])

# Save combined DataFrame to CSV
combined_df.to_csv(file_path, index=False)

# Assuming `combined_df` has been defined and contains all the data

# Filter for defensive scores in the first inning
first_inning_defensive_df = combined_df[(combined_df['Type'] == 'Defensive') & (combined_df['Inning'] == 1)]

# Group by team and game, and calculate average first inning defensive score
avg_first_inning_defensive_scores = first_inning_defensive_df.groupby(['Team', 'Game ID']).agg({'Score': 'mean'}).reset_index()

# Sort by game ID for plotting
avg_first_inning_defensive_scores.sort_values(by='Game ID', inplace=True)

# Plotting Defensive 1st Inning Moving Average
plt.figure(figsize=(14, 8))
teams = avg_first_inning_defensive_scores['Team'].unique()
for team in teams:
    team_data = avg_first_inning_defensive_scores[avg_first_inning_defensive_scores['Team'] == team]
    team_data['MA Score'] = team_data['Score'].rolling(window=15).mean()  # Calculate moving average
    plt.plot(team_data['Game ID'], team_data['MA Score'], marker='', linestyle='-', label=f'{team}')

plt.title('Moving Average of First Inning Defensive Scores for All MLB Teams')
plt.xlabel('Game ID')
plt.ylabel('Moving Average Score')
plt.legend()
plt.grid(True)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

                                    Defensive 1st Inning Plot                                                                                                           

In [None]:


# Filter for defensive scores in the first inning
first_inning_defensive_df = combined_df[(combined_df['Type'] == 'Offensive') & (combined_df['Inning'] == 1)]

# Group by team and game, and calculate average first inning defensive score
avg_first_inning_defensive_scores = first_inning_defensive_df.groupby(['Team', 'Game ID']).agg({'Score': 'mean'}).reset_index()

# Sort by game ID for plotting
avg_first_inning_defensive_scores.sort_values(by='Game ID', inplace=True)

# Plotting Defensive 1st Inning Moving Average
plt.figure(figsize=(14, 8))
teams = avg_first_inning_defensive_scores['Team'].unique()
for team in teams:
    team_data = avg_first_inning_defensive_scores[avg_first_inning_defensive_scores['Team'] == team]
    team_data['MA Score'] = team_data['Score'].rolling(window=15).mean()  # Calculate moving average
    plt.plot(team_data['Game ID'], team_data['MA Score'], marker='', linestyle='-', label=f'{team}')

plt.title('Moving Average of First Inning Offensive Scores for All MLB Teams')
plt.xlabel('Game ID')
plt.ylabel('Moving Average Score')
plt.legend()
plt.grid(True)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

                        Setting up MLB team class function                                                                                                                                                                                                                          

In [9]:
import statsapi
import pandas as pd
import re
import os

class MLBStats:
    def __init__(self):
        self.all_teams = self.load_teams()
        self.al_teams = self.load_teams('AL')
        self.nl_teams = self.load_teams('NL')

    def load_teams(self, league=None):
        """Load teams optionally filtered by league (AL or NL)."""
        teams = statsapi.get('teams', {'sportId': 1})
        if league:
            return [team for team in teams['teams'] if team['league']['name'] == league]
        return [team for team in teams['teams']]

    def update_mlb_data(self, start_date, end_date, file_path):
        """Update MLB data by fetching new data and merging with existing data if present."""
        existing_df = pd.read_csv(file_path) if os.path.exists(file_path) else pd.DataFrame(columns=['Team', 'Game ID', 'Type', 'Inning', 'Score'])
        existing_game_ids = set(existing_df['Game ID'].unique())
        sched = statsapi.schedule(start_date=start_date, end_date=end_date)
        new_games = [game for game in sched if game['game_id'] not in existing_game_ids]

        data = []
        for game in new_games:
            game_data = self.get_game_data(game['game_id'])
            if game_data:
                data.extend(game_data)
        
        if data:
            new_df = pd.DataFrame(data)
            updated_df = pd.concat([existing_df, new_df], ignore_index=True)
            updated_df.to_csv(file_path, index=False)
            print(f"Data updated and saved to {file_path} with {len(updated_df)} records.")
        else:
            print("No new data to add.")

    def get_game_data(self, game_id):
        """Retrieve detailed game data including box scores."""
        boxscore = statsapi.boxscore_data(game_id)
        return boxscore

    def get_player_stats(self, player_id, group='hitting', type='career'):
        """Retrieve player stats; group can be 'hitting', 'pitching', or 'fielding'."""
        try:
            player_stats = statsapi.player_stat_data(player_id, group=group, type=type)
            return player_stats

    def compare_al_nl(self):
        """Compare stats or other criteria between AL and NL teams."""
        # Placeholder for comparison logic
        pass

# Usage of the class
mlb_stats = MLBStats()
start_date = '2024-03-01'
end_date = '2024-05-10'
file_path = 'game_data.csv'
mlb_stats.update_mlb_data(start_date, end_date, file_path)

# Fetching player stats
player_stats = mlb_stats.get_player_stats(12345)  # Example player ID
print(player_stats)

# Fetching game data
game_data = mlb_stats.get_game_data(56789)  # Example game ID
print(game_data)


IndentationError: unexpected unindent (2290447774.py, line 51)

In [10]:
import statsapi
import pandas as pd

class MLBStats:
    def __init__(self):
        self.all_teams = self.load_teams()

    def load_teams(self):
        """Load all MLB teams."""
        teams = statsapi.get('teams', {'sportId': 1})
        return {team['id']: team['name'] for team in teams['teams']}

    def get_all_players(self):
        """Retrieve all players from all teams."""
        player_data = []
        for team_id in self.all_teams:
            roster = statsapi.get('team_roster', {'teamId': team_id})
            for player in roster['roster']:
                player_details = statsapi.lookup_player(player['person']['id'])
                if player_details:
                    player_stats = self.get_player_stats(player['person']['id'])
                    player_data.append({
                        'Player ID': player['person']['id'],
                        'Name': player['person']['fullName'],
                        'Team': self.all_teams[team_id],
                        'Stats': player_stats
                    })
        return player_data

    def get_player_stats(self, player_id, group='hitting', type='career'):
        """Retrieve player stats; group can be 'hitting', 'pitching', or 'fielding'."""
        try:
            return statsapi.player_stat_data(player_id, group=group, type=type)
        except Exception as e:
            print(f"Failed to retrieve data for player ID {player_id}: {e}")
            return None

# Usage
mlb_stats = MLBStats()
all_players_data = mlb_stats.get_all_players()


KeyboardInterrupt: 

In [15]:
import statsapi
import pandas as pd
import re
import os

class MLBStats:
    def __init__(self, file_path):
        self.file_path = file_path
        self.mlb_teams = self.load_mlb_teams()

    def load_mlb_teams(self):
        """Hardcoded MLB teams for simplicity and reliability."""
        return [
            "D-backs", "Braves", "Orioles", "Red Sox", "Cubs", "White Sox", "Reds", "Guardians",
            "Rockies", "Tigers", "Astros", "Royals", "Angels", "Dodgers", "Marlins", "Brewers",
            "Twins", "Mets", "Yankees", "Athletics", "Phillies", "Pirates", "Padres", "Giants",
            "Mariners", "Cardinals", "Rays", "Rangers", "Blue Jays", "Nationals"
        ]

    def parse_box_score(self, box_score):
        """Parse the box score from a game to extract team scores."""
        lines = box_score.strip().split('\n')
        headers = re.findall(r'\d+', lines[0])  # Extract inning numbers from the header
        team_scores = {}
        for line in lines[1:]:
            team_name_match = re.match(r'^([A-Za-z\s\-]+?)\s+(\d[\d\s]*)', line)
            if team_name_match:
                team_name = team_name_match.group(1).strip()
                matched_team = next((team for team in self.mlb_teams if team_name.endswith(team)), None)
                if matched_team:
                    score_line = [int(num) for num in team_name_match.group(2).split()]
                    team_scores[matched_team] = score_line[:len(headers)]
                else:
                    print(f"Team name not matched or not in list: '{team_name}' from line: {line}")
            else:
                print(f"No team name match found in line: {line}")
        return team_scores

    def update_mlb_data(self, start_date, end_date):
        """Update the MLB data by only fetching new games and adding them to the existing dataset."""
        existing_df = pd.read_csv(self.file_path) if os.path.exists(self.file_path) else pd.DataFrame(columns=['Team', 'Game ID', 'Type', 'Inning', 'Score'])
        sched = statsapi.schedule(start_date=start_date, end_date=end_date)
        new_games = [game for game in sched if game['game_id'] not in existing_df['Game ID'].tolist()]

        data = []
        for game in new_games:
            game_id = game['game_id']
            linescore = statsapi.linescore(game_id)
            if isinstance(linescore, str):
                team_data = self.parse_box_score(linescore)
                for team, scores in team_data.items():
                    for inning, score in enumerate(scores, start=1):
                        data.append({'Team': team, 'Game ID': game_id, 'Type': 'Offensive', 'Inning': inning, 'Score': score})
        
        if data:
            new_df = pd.DataFrame(data)
            updated_df = pd.concat([existing_df, new_df], ignore_index=True)
            updated_df.to_csv(self.file_path, index=False)
            print(f"Data updated and saved to {self.file_path} with {len(updated_df)} records.")
        else:
            print("No new data to add.")


# Usage of the class
mlb_stats = MLBStats('game_data.csv')
start_date = '2024-03-01'
end_date = '2024-05-10'
mlb_stats.update_mlb_data(start_date, end_date)


No new data to add.


In [16]:
print(MLB.mlb_teams.Astros)

NameError: name 'MLB' is not defined