# MLB Season Data Fetcher

This Jupyter Notebook fetches historical MLB season data for use in a d3.js data visualization. The resulting `.csv` file will look something like this:

| date       | division                | games_played | isWinner | losses | pct   | team_name            | team_short_name | wins | 
|------------|-------------------------|--------------|----------|--------|-------|----------------------|-----------------|------| 
| 2018-03-29 | National League Central | 1            | True     | 0      | 1.000 | Chicago Cubs         | Chi Cubs        | 1    | 
| 2018-03-30 | National League Central | 1            | False    | 1      | .000  | Cincinnati Reds      | Cincinnati      | 0    | 
| 2018-03-30 | National League East    | 1            | True     | 0      | 1.000 | Washington Nationals | Washington      | 1    | 
| 2018-03-30 | American League Central | 1            | False    | 1      | .000  | Detroit Tigers       | Detroit         | 0    | 
| 2018-03-30 | National League Central | 1            | True     | 0      | 1.000 | Pittsburgh Pirates   | Pittsburgh      | 1    | 

## How to use it

In the cell below, set `SEASON_YEAR` to the season you want to download.

Click **Cell** -> **Run All**

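When the run finishes, you should have a file named `<SEASON_YEAR>-cumulative-season-games.csv` (for example, `2018-cumulative-season-games.csv`) in the notebook's working directory.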

In [37]:
SEASON_YEAR = "2018"

In [38]:
from pathlib import Path
import pandas as pd
import json
import requests
from datetime import timedelta, date
import datetime
import time

In [39]:
def save_response(response, path):
    """Save a JSON-serializable response to a file. Used for caching.

    The response is serialized to a string *before* the file is opened, so a
    serialization error cannot leave an empty or truncated cache file behind
    for a later run to load.

    Args:
        response: JSON-serializable object (e.g. a decoded API response).
        path (str): Destination file path; overwritten if it already exists.

    Raises:
        TypeError: If response contains a value that json cannot serialize.
    """
    serialized = json.dumps(response)
    with open(path, "w") as f:
        f.write(serialized)

In [40]:
def load_response(path):
    """Read a previously saved JSON file back into Python objects. Used for caching.

    Args:
        path (str): Path of a file that contains JSON text.

    Returns:
        The decoded JSON content of the file.
    """
    with open(path, "r") as cache_file:
        contents = cache_file.read()
    return json.loads(contents)

In [41]:
# Get Season data (used later to get the start and end dates)
def get_season_with_cache(year):
    """Get season data for the given year, loading from a disk cache if available.

    The first time you call this function for a year, it downloads that
    season and saves the decoded JSON to ``<year>_season.json`` (a relative
    path). Subsequent calls load that file instead of downloading again.

    Args:
        year (str): Season year, e.g. "2018". An int is accepted as well.

    Returns:
        The decoded JSON season response.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status.
    """
    year = str(year)
    seasons_url = "https://statsapi.mlb.com/api/v1/seasons/" + year
    params = {"sportId": 1}
    season_save_path = year + "_season.json"

    if Path(season_save_path).is_file():
        print("Cached season found for " + year)
        return load_response(season_save_path)

    print("Cached season not found for " + year)
    season_result = requests.get(seasons_url, params=params)
    # Fail loudly on an error status so that an error payload is never
    # written to the cache, where every later run would pick it up.
    season_result.raise_for_status()
    season_result_data = season_result.json()
    save_response(season_result_data, season_save_path)
    return season_result_data

In [42]:
# Get Games data (the schedule entries for every date between the start and end dates)
def get_games_with_cache(start_date, end_date, *, max_retries=10, retry_delay=3):
    """Get schedule data for a date range, loading from a disk cache if available.

    The first time you call this function for a date range, it downloads the
    schedule for that range and saves the decoded JSON to
    ``<start_date>--<end_date>_games.json`` (a relative path). Subsequent
    calls load that file instead of downloading again.

    A response with a status other than 200 is retried after a pause, but
    only up to ``max_retries`` times, so a permanent error (e.g. a malformed
    date) cannot keep the notebook retrying forever.

    Args:
        start_date (str): Date formatted like YYYY-MM-DD, e.g. 2017-05-30
        end_date (str): Date formatted like YYYY-MM-DD, e.g. 2017-05-30
        max_retries (int): How many times to retry after a non-200 response.
        retry_delay (int | float): Seconds to wait between attempts.

    Returns:
        The decoded JSON schedule response.

    Raises:
        requests.HTTPError: If the last attempt returned a 4xx/5xx status.
        RuntimeError: If the last attempt returned any other non-200 status.
    """
    games_url = f"https://statsapi.mlb.com/api/v1/schedule?sportId=1&startDate={start_date}&endDate={end_date}&language=en"
    print(start_date)
    games_save_path = start_date + "--" + end_date + "_games.json"
    if Path(games_save_path).is_file():
        return load_response(games_save_path)

    games_result = requests.get(games_url)
    retries = 0
    while games_result.status_code != 200:
        print("Status code: ", games_result.status_code)
        if retries >= max_retries:
            # Raises HTTPError for 4xx/5xx; anything else falls through to
            # the generic error below.
            games_result.raise_for_status()
            raise RuntimeError(
                f"Unexpected status {games_result.status_code} from {games_url}"
            )
        print(f"Retrying in {retry_delay} secs...")
        time.sleep(retry_delay)
        games_result = requests.get(games_url)
        retries += 1

    # Only a successful response is ever written to the cache.
    games_result_data = games_result.json()
    save_response(games_result_data, games_save_path)
    return games_result_data

In [43]:
def get_season_start_and_end_dates(season):
    """Pull the regular-season date range out of a season response.

    Args:
        season (dict): Mapping with a ``seasons`` list whose first entry has
            ``regularSeasonStartDate`` and ``regularSeasonEndDate`` keys.

    Returns:
        tuple: ``(start_date, end_date)`` exactly as stored in the first entry.
    """
    first_season = season['seasons'][0]
    opening_day = first_season['regularSeasonStartDate']
    closing_day = first_season['regularSeasonEndDate']
    return opening_day, closing_day

In [49]:
def get_team_data(team_id, year, *, max_retries=10, retry_delay=3):
    """Get the data of one team for one season, loading from a disk cache if available.

    We need this data on a per-year basis so that the teams and their divisions
    line up properly. The games data doesn't include the divisions of each team
    so we need to look that up. Teams have moved divisions throughout history
    which is why we need to look it up.

    The first call for a team/year pair downloads the data and saves the
    decoded JSON to ``<year>-<team_id>_team.json`` (a relative path); later
    calls load that file instead. A response with a status other than 200 is
    retried after a pause, but only up to ``max_retries`` times, so a
    permanent error (e.g. an unknown team id) cannot keep the notebook
    retrying forever.

    Args:
        team_id (str | int): Team identifier as used by the API.
        year (str | int): Season year, e.g. "2018".
        max_retries (int): How many times to retry after a non-200 response.
        retry_delay (int | float): Seconds to wait between attempts.

    Returns:
        The decoded JSON team response.

    Raises:
        requests.HTTPError: If the last attempt returned a 4xx/5xx status.
        RuntimeError: If the last attempt returned any other non-200 status.
    """
    team_url = f"https://statsapi.mlb.com/api/v1/teams/{team_id}?season={year}"
    team_save_path = f"{year}-{str(team_id)}_team.json"
    if Path(team_save_path).is_file():
        return load_response(team_save_path)

    team_result = requests.get(team_url)
    retries = 0
    while team_result.status_code != 200:
        print("Status code: ", team_result.status_code)
        if retries >= max_retries:
            # Raises HTTPError for 4xx/5xx; anything else falls through to
            # the generic error below.
            team_result.raise_for_status()
            raise RuntimeError(
                f"Unexpected status {team_result.status_code} from {team_url}"
            )
        print(f"Retrying in {retry_delay} secs...")
        time.sleep(retry_delay)
        team_result = requests.get(team_url)
        retries += 1

    # Only a successful response is ever written to the cache.
    team_result_data = team_result.json()
    save_response(team_result_data, team_save_path)
    return team_result_data

In [45]:
# Fetch (or load from the local cache) the season record for SEASON_YEAR and
# extract its (regular season start date, regular season end date) pair.
season = get_season_with_cache(SEASON_YEAR)
season_date_range = get_season_start_and_end_dates(season)

Cached season found for 2017


In [46]:
# Yes, you see that correctly: three nested for loops (dates -> games -> teams).
# Each team appearance is visited only once, though, so the number of
# iterations is linear in the data and fast enough for this small data set.

start_date = season_date_range[0]
end_date = season_date_range[1]
results = []
# get_team_data depends only on the team id and the season, so each team is
# looked up once and reused instead of re-reading its cache file for every
# appearance.
team_data_by_id = {}

games_data = get_games_with_cache(start_date, end_date)
dates = games_data['dates']
# NOTE: the loop variable is deliberately not named `date`; that would
# overwrite the `date` class imported from datetime.
for game_day in dates:
    for game in game_day['games']:
        if game['seriesDescription'] != "Regular Season":
            continue
        for team in (game['teams']['away'], game['teams']['home']):
            # If isWinner doesn't exist, then the game was postponed
            if "isWinner" not in team:
                continue
            team_id = str(team['team']['id'])
            if team_id not in team_data_by_id:
                team_data_by_id[team_id] = get_team_data(team_id, SEASON_YEAR)
            team_info = team_data_by_id[team_id]['teams'][0]
            # A row needs a division name; entries without one are left out.
            if "division" not in team_info:
                continue
            record = team['leagueRecord']
            wins = int(record['wins'])
            losses = int(record['losses'])
            results.append({
                "team_name": team['team']['name'],
                "wins": wins,
                "losses": losses,
                "division": team_info['division']['name'],
                "team_short_name": team_info['shortName'],
                "games_played": wins + losses,
                "pct": record['pct'],
                "date": game_day['date'],
                "isWinner": bool(team["isWinner"]),
            })


2017-04-02


In [47]:
# One row per team per completed game, ordered by how many games the team
# had played at that point.
df = pd.DataFrame(results).sort_values(by=['games_played'])

In [48]:
# Write the table to "<SEASON_YEAR>-cumulative-season-games.csv" (a relative
# path); index=False leaves the DataFrame index out of the file.
df.to_csv(f"{SEASON_YEAR}-cumulative-season-games.csv", index=False)