                            Parse box score function and fetch-and-save function with regex

In [7]:
import statsapi
import pandas as pd
import re
import os

# Team nicknames accepted by parse_box_score; lines for any other team are reported and skipped.
mlb_teams = [
    "D-backs", "Braves", "Orioles", "Red Sox", "Cubs", "White Sox", "Reds", "Guardians", "Rockies", "Tigers",
    "Astros", "Royals", "Angels", "Dodgers", "Marlins", "Brewers", "Twins", "Mets", "Yankees", "Athletics",
    "Phillies", "Pirates", "Padres", "Giants", "Mariners", "Cardinals", "Rays", "Rangers", "Blue Jays", "Nationals"
]

# A team label (letters, whitespace, hyphens) followed by the run of numeric columns.
# Compiled once instead of on every line of every box score.
_TEAM_LINE_RE = re.compile(r'^([A-Za-z\s\-]+?)\s+(\d[\d\s]*)')

# Matches a header row whose last three columns are the R / H / E totals.
_TOTALS_HEADER_RE = re.compile(r'\bR\s+H\s+E\s*$')
_TOTALS_COLUMN_COUNT = 3


def parse_box_score(box_score):
    """Extract per-inning runs for each MLB team from a text line score.

    The first line is treated as the header; every run of digits in it is
    counted as one inning column. Each following line is expected to be a
    team label followed by whitespace-separated integers.

    When the header ends with ``R H E`` columns, the last three integers of a
    team line are the game totals and are removed before the inning values are
    taken. Without this, a line with fewer inning values than header columns
    (unplayed innings left blank) would have its totals recorded as innings.
    Assumes the layout seen in the sample output (innings first, then R/H/E).

    Args:
        box_score: Multi-line string holding the header row and team rows.

    Returns:
        dict mapping the matched entry of ``mlb_teams`` to a list of ints,
        one per inning, holding at most as many values as the header has
        inning columns. Teams not in ``mlb_teams`` are omitted.

    Side effects:
        Prints a diagnostic for every line that cannot be parsed or whose
        team label does not end with a known MLB nickname.
    """
    lines = box_score.strip().split('\n')
    headers = re.findall(r'\d+', lines[0])  # Inning numbers from the header
    has_totals = _TOTALS_HEADER_RE.search(lines[0]) is not None

    team_scores = {}
    for line in lines[1:]:
        team_name_match = _TEAM_LINE_RE.match(line)
        if team_name_match is None:
            print(f"No team name match found in line: {line}")
            continue

        team_name = team_name_match.group(1).strip()
        # endswith (not ==) so a label carrying a prefix before the nickname still matches.
        matched_team = next((team for team in mlb_teams if team_name.endswith(team)), None)
        if matched_team is None:
            print(f"Team name not matched or not in list: '{team_name}' from line: {line}")
            continue

        score_line = [int(num) for num in team_name_match.group(2).split()]
        if has_totals:
            # Drop R/H/E so they can never slide into an inning slot.
            score_line = score_line[:-_TOTALS_COLUMN_COUNT]
        team_scores[matched_team] = score_line[:len(headers)]
    return team_scores

def fetch_and_save_mlb_data(start_date, end_date, file_path):
    """Download line scores for a date range and store per-inning runs in a CSV.

    Every game returned by ``statsapi.schedule`` for the range is fetched with
    ``statsapi.linescore`` and parsed by ``parse_box_score``. Each parsed
    inning becomes one row with columns Team, Game ID, Type ('Offensive'),
    Inning (1-based) and Score. Rows are merged with whatever ``file_path``
    already holds and written back.

    Rows are de-duplicated on (Team, Game ID, Type, Inning), keeping the most
    recently fetched value, so re-running over an overlapping date range does
    not grow the file with repeated records.

    Args:
        start_date: First date passed to ``statsapi.schedule``.
        end_date: Last date passed to ``statsapi.schedule``.
        file_path: CSV path that is read if it exists and overwritten when
            new rows were parsed.

    Returns:
        None. Progress and diagnostics are printed.
    """
    columns = ['Team', 'Game ID', 'Type', 'Inning', 'Score']
    key_columns = ['Team', 'Game ID', 'Type', 'Inning']

    if os.path.exists(file_path):
        df = pd.read_csv(file_path)
    else:
        df = pd.DataFrame(columns=columns)

    data = []
    for game in statsapi.schedule(start_date=start_date, end_date=end_date):
        game_id = game['game_id']
        linescore = statsapi.linescore(game_id)
        if not isinstance(linescore, str):
            print(f"Failed to fetch linescore for game ID {game_id} or received non-string data.")
            continue

        team_data = parse_box_score(linescore)
        if not team_data:
            print(f"No valid MLB team data parsed for game ID {game_id}.")
            continue

        data.extend(
            {'Team': team, 'Game ID': game_id, 'Type': 'Offensive', 'Inning': inning, 'Score': score}
            for team, scores in team_data.items()
            for inning, score in enumerate(scores, start=1)
        )

    if not data:
        print("No new data to add.")
        return

    new_df = pd.DataFrame(data)
    # Leave empty frames out of the concat: including them is deprecated in
    # recent pandas and would later change the resulting column dtypes.
    frames = [frame for frame in (df, new_df) if not frame.empty]
    df = pd.concat(frames, ignore_index=True)
    # keep='last' lets a fresh fetch replace an older row for the same inning.
    df = df.drop_duplicates(subset=key_columns, keep='last', ignore_index=True)
    df.to_csv(file_path, index=False)
    print(f"Data saved to {file_path} with {len(df)} records.")

# Output CSV and the date window handed to the fetcher below.
file_path = 'game_data.csv'
start_date, end_date = '2024-03-01', '2024-05-10'

# Keyword arguments keep the three string values from being swapped by accident.
fetch_and_save_mlb_data(start_date=start_date, end_date=end_date, file_path=file_path)


# Code could be finished to parse the inning data into a proper output structure - stopping now to make a class structure for MLB teams for efficiency. HLR 5/13/24

Team name not matched or not in list: 'Kiwoom Heroes' from line: Kiwoom Heroes 0 0 0 1 0 0 2 0 0  3   6   1
Team name not matched or not in list: 'Korea' from line: Korea  0 0 0 0 0 0 0 0 0  0   5   0  
Team name not matched or not in list: 'Korea' from line: Korea   0 0 2 0 0 0 0 0 0  2   5   0  
Team name not matched or not in list: 'Diablos Rojos' from line: Diablos Rojos 0 0 0 1 0 2 1 0 0  4   10  0
Team name not matched or not in list: 'River Cats' from line: River Cats 2 0 0 0 3 0 0 0 3  8   7   0  
Team name not matched or not in list: 'Naturals' from line: Naturals 0 0 0 0 0 0 0 1 0  1   7   0
Team name not matched or not in list: 'Diablos Rojos' from line: Diablos Rojos 0 0 8 0 0 0 0 0 0  8   9   0
Team name not matched or not in list: 'Space Cowboys' from line: Space Cowboys 0 0 0 0 1 0 1 1 0  3   5   0  
Team name not matched or not in list: 'Nationals Prospects' from line: Nationals Prospects 0 0 0 0 0 0 1 0 0  1   10  1  
Team name not matched or not in list: 'Space Cowboy

                                    Defensive 1st Inning Plot

In [None]:
# Creating the DataFrame
# NOTE(review): `data` is not defined at top level in the visible cells (the
# only `data` shown is a local inside fetch_and_save_mlb_data) — confirm it is
# populated before this cell runs.
columns = ['Team', 'Game ID', 'Type', 'Inning', 'Score']
new_df = pd.DataFrame(data, columns=columns)

# Combine with existing data
# NOTE(review): `existing_df` is not defined in the visible cells — presumably
# it is the CSV at `file_path` loaded elsewhere; verify.
combined_df = pd.concat([existing_df, new_df])

# Save combined DataFrame to CSV
# Overwrites `file_path` with the combined rows; the index is not written.
combined_df.to_csv(file_path, index=False)

# Assumes `combined_df` has been defined and contains all the data.
# NOTE(review): `plt` is not imported in the visible cells — presumably
# `import matplotlib.pyplot as plt` runs elsewhere; confirm.
# NOTE(review): the visible fetch code only writes rows with Type 'Offensive',
# so this filter selects nothing unless 'Defensive' rows are produced elsewhere.

# Keep only first-inning rows tagged as defensive
first_inning_defensive_df = combined_df[(combined_df['Type'] == 'Defensive') & (combined_df['Inning'] == 1)]

# One row per (team, game) holding the mean first-inning defensive score
avg_first_inning_defensive_scores = first_inning_defensive_df.groupby(['Team', 'Game ID']).agg({'Score': 'mean'}).reset_index()

# Order by game ID so each team's rolling window follows game order
avg_first_inning_defensive_scores.sort_values(by='Game ID', inplace=True)

# Plotting Defensive 1st Inning Moving Average, one line per team
plt.figure(figsize=(14, 8))
teams = avg_first_inning_defensive_scores['Team'].unique()
for team in teams:
    # .copy() gives an independent frame, so adding the 'MA Score' column does not
    # write into a slice of the grouped frame (avoids SettingWithCopyWarning).
    team_data = avg_first_inning_defensive_scores[avg_first_inning_defensive_scores['Team'] == team].copy()
    # 15-game moving average; the first 14 points of each team are NaN and are not drawn.
    team_data['MA Score'] = team_data['Score'].rolling(window=15).mean()
    plt.plot(team_data['Game ID'], team_data['MA Score'], marker='', linestyle='-', label=f'{team}')

plt.title('Moving Average of First Inning Defensive Scores for All MLB Teams')
plt.xlabel('Game ID')
plt.ylabel('Moving Average Score')
plt.legend()
plt.grid(True)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

                                    Offensive 1st Inning Plot

In [None]:


# Assumes `combined_df` has been defined and contains all the data.
# NOTE(review): `plt` is not imported in the visible cells — presumably
# `import matplotlib.pyplot as plt` runs elsewhere; confirm.

# Filter for offensive scores in the first inning.
# Named *_offensive_* so this cell no longer overwrites the defensive results
# under names that say "defensive".
first_inning_offensive_df = combined_df[(combined_df['Type'] == 'Offensive') & (combined_df['Inning'] == 1)]

# Group by team and game, and calculate the average first-inning offensive score
avg_first_inning_offensive_scores = first_inning_offensive_df.groupby(['Team', 'Game ID']).agg({'Score': 'mean'}).reset_index()

# Sort by game ID so each team's rolling window follows game order
avg_first_inning_offensive_scores.sort_values(by='Game ID', inplace=True)

# Plotting Offensive 1st Inning Moving Average, one line per team
plt.figure(figsize=(14, 8))
teams = avg_first_inning_offensive_scores['Team'].unique()
for team in teams:
    # .copy() gives an independent frame, so adding the 'MA Score' column does not
    # write into a slice of the grouped frame (avoids SettingWithCopyWarning).
    team_data = avg_first_inning_offensive_scores[avg_first_inning_offensive_scores['Team'] == team].copy()
    # 15-game moving average; the first 14 points of each team are NaN and are not drawn.
    team_data['MA Score'] = team_data['Score'].rolling(window=15).mean()
    plt.plot(team_data['Game ID'], team_data['MA Score'], marker='', linestyle='-', label=f'{team}')

plt.title('Moving Average of First Inning Offensive Scores for All MLB Teams')
plt.xlabel('Game ID')
plt.ylabel('Moving Average Score')
plt.legend()
plt.grid(True)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()