In [1]:
from bs4 import BeautifulSoup
import pandas as pd
import requests, os, datetime
import time as tm

In [2]:
# Different files

# games.csv - general game info - DONE
# hittersByGame.csv - how each player did in each game - DONE
# pitchersByGame.csv - how each player did in each game - DONE

# plays.csv - batter events - batter singled, batter struck out, etc. - DONE
# events.csv - general events - Have event id (per game) to join with next - DONE
# pitches.csv - one row per pitch per game - DONE
# inningScore.csv - score per inning - DONE
# inningHighlights.csv - # of runs, hits, and errors per inning - DONE

# hittingNotes.csv - DONE
# pitchingNotes.csv - DONE
# baserunningNotes.csv - DONE
# fieldingNotes.csv - DONE
# letterNotes.csv - for notes attached to batters (and maybe pitchers) - DONE

# can always add TEAM row to csvs later


In [3]:
def saveDF(df, file):
    global year, iii, all_df_dict
    
    # update all_df_dict
    if file in all_df_dict.keys():
        old_df = all_df_dict[file]
        new_df = pd.concat([old_df, df], ignore_index=True)
    else:
        new_df = df
    all_df_dict[file] = new_df
    
    # want to save in chunks of 100 games, so return otherwise
    if iii % 100 != 0:
        return
    
    print('writing with iii:', iii)
    
    path = os.path.join('data', year)
    total_path = os.path.join(path, file)
    if not os.path.exists(path):
        os.makedirs(path)
    
    if os.path.exists(total_path): # if it already exists, get the old one & combine
        old_df = pd.read_csv(total_path)
        to_write_df = pd.concat([old_df, new_df], ignore_index=True)
    else:
        to_write_df = new_df
    
    to_write_df.to_csv(total_path, index=False)
    del all_df_dict[file] # delete dataframe from dict since we just wrote it to a file
    return


def saveDF2(df, file):
    global year, iii
    path = os.path.join('data', year)
    total_path = os.path.join(path, file)
    
    if not os.path.exists(path):
        os.makedirs(path)
        new_df = df # ???
    
    if os.path.exists(total_path): # if it already exists, get the old one & combine
        old_df = pd.read_csv(total_path)
        new_df = pd.concat([old_df, df], ignore_index=True)
    else:
        new_df = df
    
    new_df.to_csv(total_path, index=False)
    return

def getName(x):
    if x.text == 'TEAM':
        return x.text
    
    if len(x.contents) == 3:
        return x.contents[1].text
    else:
        return x.contents[0].text

def getPos(x):
    if x.text == 'TEAM':
        return x.text
    
    if len(x.contents) == 3:
        return x.contents[2]
    else:
        return x.contents[1]
    
def loadPage(url):
    while True:
        try:
            r = requests.get(url)
            r.raise_for_status()
            break
        except:
            print('AT:', url)
            tm.sleep(2)
    return BeautifulSoup(r.text, "html.parser")

# plays
def scrapePitchByPitchPage(gameid):
    url = 'https://www.espn.com/mlb/playbyplay/_/gameId/{}'.format(gameid)
    soup = loadPage(url)

    s = soup.find_all('div', attrs={'data-module':'playbyplay'})[0]
    
    all_pitches = []
    events = []
    inning_highlights = []
    header = []

    event_id = 0
    # each half-inning
    for inning in s.find_all('section'):
        if 'allPlays' not in inning.get('id'):
            continue

        current_pitcher = ''
        current_pitching_team = ''
        current_hitting_team = ''
        info = inning.find('h1').text.split(' - ')
        # each hitter in half-inning
        for batter in inning.find_all('li'):
            if 'accordion-item' in batter.get('class'): # get pitches
                t = batter.find_all('table')[0]

                # get the header - can rip this out of the loop
                header = ['Num']
                for row in t.find('thead').find_all('th'):
                    header.append(row.text if row.text != '' else row.get('class')[0])

                # get all the pitches to this batter
                pitches = []
                for row in t.find('tbody').find_all('tr'):
                    pitch = []
                    tds = row.find_all('td')

                    pitch.append(tds[0].find('span').text) # pitch number
                    pitch.append(tds[0].contents[1]) # call
                    pitch.append(tds[1].text) # type
                    pitch.append(row.find('td', attrs={'class':'mph'}).text) # mph

                    # pitch location
                    try:
                        pitch.append(row.find('span', attrs={'class':'pitch-location'}).get('style')) # pitch location (top - 0 seems to be the top)
                    except:
                        pitch.append('NA')

                    # bases
                    bases = ''
                    for x in row.find('td', attrs={'class':'play-bases'}).find_all('span'):
                        if 'active' in str(x):
                            bases += x.get('class')[1].split('--')[1]
                    pitch.append(bases) # bases

                    # hit location
                    try:
                        pitch.append(row.find('span', attrs={'class':'hit-location'}).get('style')) # hit location
                    except:
                        pitch.append('')

                    pitch.append(current_pitcher)
                    pitch.append(current_pitching_team)
                    pitch.append(current_hitting_team) # team
                    pitch.append(info[1]) # inning
                    pitch.append(event_id)
                    pitches.append(pitch)
                all_pitches.extend(pitches)
            # get inning summary - # of runs, hits, and errors
            elif 'info-row--footer' in batter.get('class'):
                cats = batter.text.split(', ')
                inning_highlights.append([batter.get('id').replace('allPlays', '').replace('Linescore', '').replace('op', '').replace('ottom', ''), 
                                          int(cats[0].split(' ')[0]), int(cats[1].split(' ')[0]), int(cats[2].split(' ')[0])])
                continue

            hl = batter.find('span', attrs={'class':'headline'}).text
            home = batter.find('span', attrs={'class':'home'}).text
            away = batter.find('span', attrs={'class':'away'}).text
            
            # beginning of inning or change of pitchers
            if 'pitching for' in hl:
                current_pitcher = hl.split(' pitching for ')[0]
                current_pitching_team = hl.split(' pitching for ')[1]
                current_hitting_team = home if current_pitching_team == away else away 
            
            events.append([gameid, current_pitching_team, current_hitting_team, info[1], event_id, hl, home, away])
            event_id += 1

    # no events listed - didn't make it into loop
    if header == []:
        return False
    
    # inningHighlights.csv
    inning_highlights_df = pd.DataFrame(inning_highlights, columns=['Inning', 'Runs', 'Hits', 'Errors'])
    inning_highlights_df['Game'] = gameid

    saveDF(inning_highlights_df, 'inningHighlights.csv') # inningHighlights.csv -------------------------------

    # events.csv
    events_df = pd.DataFrame(events, columns=['Game', 'Pitching Team', 'Batting Team', 
                                              'Inning', 'Event Id', 'Events', 'Away', 'Home']) # events.csv
    saveDF(events_df, 'events.csv') # events.csv ------------------------------------------------------
    
    # pitches.csv
    header.extend(['Pitcher', 'Pitching Team', 'Batting Team', 'Inning', 'Event Id'])
    pitches_df = pd.DataFrame(all_pitches, columns=header)
    pitches_df['Game'] = gameid
    saveDF(pitches_df, 'pitches.csv') # pitches.csv ---------------------------------------------------
    return True

# stats
def scrapeBoxScorePage(gameid, game_stats):
    url = 'https://www.espn.com/mlb/boxscore/_/gameId/{}'.format(gameid) # https://www.espn.com/mlb/boxscore/_/gameId/401227647
    soup1 = loadPage(url)

    # Inning Score - inningScore.csv
    inning_score = pd.read_html(str(soup1.find('table', attrs={'class':'linescore__table'})))[0] # new csv (put H, E in games.csv?)
    inning_score = inning_score.rename(columns={'Unnamed: 0': 'Team'})
    inning_score['Game'] = gameid
    inning_score['Team'] = inning_score['Team'].apply(lambda x: x.split(' ')[-1])

    saveDF(inning_score, 'inningScore.csv') # inningScore.csv --------------------------------------

    # win/loss/save pitchers
    pitchers = soup1.find('div', attrs={'class':'linescore__situation-container'}).find_all('div', attrs={'class':'linescore__player_stats'})

    for pitcher in pitchers:
        cond = pitcher.contents[0].text # win/lose/save
        game_stats[cond+' - Pitcher - Stats'] = pitcher.contents[2].text # ip/k/etc
        game_stats[cond+' - Pitcher - Id'] = pitcher.find('a').get('href').split('/')[-1]
        game_stats[cond+' - Pitcher - Name'] = pitcher.contents[1].find('span', attrs={'class':'fullName'}).text
        game_stats[cond+' - Pitcher - AbbrName'] = pitcher.contents[1].find('span', attrs={'class':'abbrName'}).text
        try:
            game_stats[cond+' - Pitcher - Record'] = pitcher.contents[1].find('span', attrs={'class':'stat'}).text
        except:
            game_stats[cond+' - Pitcher - Record'] = 'NA'


    # hitters/pitchers
    player_stats = soup1.find('div', attrs={'data-module':'boxscore'})
    stat_groups = player_stats.find_all('article', attrs={'class':'sub-module boxscore-2017'})

    # letterNotes.csv
    notes = soup1.find_all('div', attrs={'data-note-id':True}) # https://stackoverflow.com/a/45365599
    final_notes = []

    for x in notes:
        note_id = x.get('data-note-id')
        final_notes.append([gameid, note_id.split('-')[0], note_id.split('-')[1], x.text])

    letter_notes_df = pd.DataFrame(final_notes, columns=['Game', 'Player Id', 'Player-Note Id', 'Note'])
    saveDF(letter_notes_df, 'letterNotes.csv') # letterNotes.csv --------------------------------------


    # batting
    team = 'away'
    batting_notes = []
    for hitting in stat_groups[0], stat_groups[2]:
        # pitcher stats
        hitting_df = pd.read_html(str(hitting))[0]
        hitting_df['Game'] = gameid
        hitting_df['Team'] = game_stats[team]

        # to get names and positions in separate columns
        snames = hitting.find_all('td', attrs={'class':'name'})
        names = [getName(x) for x in snames]
        pos = [getPos(x) for x in snames]

        hitting_df['Hitters'] = names
        hitting_df['Position'] = pos
        hitting_df['Hitter Id'] = [x.get('data-athlete-id') for x in hitting.find_all('tbody', class_='athletes')] + ['-']

        # extra batting info
        p_stats = hitting.find('div', attrs={'data-type':'battingDetails'}).find_all('li')[1:]
        for stat in p_stats:
            batting_notes.append([gameid, game_stats[team], stat.contents[0].text.replace(':', ''), stat.contents[1]])

        team = 'home'

    batting_notes_df = pd.DataFrame(batting_notes, columns=['Game', 'Team', 'Stat', 'Data'])
    saveDF(batting_notes_df, 'hittingNotes.csv') # hittingNotes.csv ---------------------------------
    saveDF(hitting_df, 'hittersByGame.csv') # hittersByGame.csv -------------------------------------

    # pitching
    team = 'away'
    pitching_notes = []
    for pitching in stat_groups[1], stat_groups[3]:
        # pitcher stats
        pitching_df = pd.read_html(str(pitching))[0] # rip team row into games.csv?
        pitching_df['Game'] = gameid
        pitching_df['Team'] = game_stats[team]

        # split record off
        pitching_df['Extra'] = pitching_df['Pitchers'].apply(lambda x: '' if '(' not in x else '(' + x.split(' (')[1])
        pitching_df['Pitchers'] = pitching_df['Pitchers'].apply(lambda x: x if '(' not in x else x.split(' (')[0])
        pitching_df['Pitcher Id'] = [x.get('data-athlete-id') for x in pitching.find_all('tbody', class_='athletes')] + ['-']

        # extra pitching info
        p_stats = pitching.find('div', attrs={'data-type':'pitchingDetails'}).find_all('li')[1:]
        for stat in p_stats:
            pitching_notes.append([gameid, game_stats[team], stat.contents[0].text.replace(':', ''), stat.contents[1]])

        team = 'home'

    pitching_notes_df = pd.DataFrame(pitching_notes, columns=['Game', 'Team', 'Stat', 'Data'])
    saveDF(pitching_notes_df, 'pitchingNotes.csv') # pitchingNotes.csv -------------------------------
    saveDF(pitching_df, 'pitchersByGame.csv') # pitchersByGame.csv -----------------------------------


    # could put the next part in a fcn
    # baserunning
    baserunning_notes = []
    team = 'away'
    for ba in soup1.find_all('div', attrs={'data-type':'baserunningDetails'}):
        ba_stats = ba.find_all('li')[1:]

        for stat in ba_stats:
            baserunning_notes.append([gameid, game_stats[team], stat.contents[0].text.replace(':', ''), stat.contents[1]])

        team = 'home'

    baserunning_notes_df = pd.DataFrame(baserunning_notes, columns=['Game', 'Team', 'Stat', 'Data'])
    saveDF(baserunning_notes_df, 'baserunningNotes.csv') # baserunningNotes.csv -----------------------


    # fielding
    fielding_notes = []
    team = 'away'
    for fi in soup1.find_all('div', attrs={'data-type':'fieldingDetails'}):
        fi_stats = fi.find_all('li')[1:]

        for stat in fi_stats:
            fielding_notes.append([gameid, game_stats[team], stat.contents[0].text.replace(':', ''), stat.contents[1]])

        team = 'home'

    fielding_notes_df = pd.DataFrame(fielding_notes, columns=['Game', 'Team', 'Stat', 'Data'])
    saveDF(fielding_notes_df, 'fieldingNotes.csv') # fieldingNotes.csv -----------------------------------
    return game_stats

# main page
def scrapeMainPage(gameid, canceled=False):
    url = 'https://www.espn.com/mlb/game/_/gameId/{}'.format(gameid)
    soup2 = loadPage(url)
    
    try:
        if soup2.find('div', attrs={'class':'game-details header'}).text.upper() == 'SPRING TRAINING':
            print('Spring Training:', gameid)
            return {}, True
    except:
        pass
    
    game_stats = {'Game': gameid}

    # Team Info & Final Score
    top = soup2.find('div', attrs={'id':'gamepackage-matchup-wrap'})

    place = 'away'
    for team in top.find_all('div', attrs={'class':'team-container'}):
        record = team.find('div', attrs={'class':'record'}).text.split(', ')
        try:
            name = team.find('a', attrs={'class':'team-name'}).contents # also has city and team name
        except:
            name = team.find('div', attrs={'class':'team-name'}).contents # for All-Star games

        game_stats[place] = name[2].text
        game_stats[place+'-record'] = record[0]
        if len(record) > 1:
            game_stats[place+place+'-record'] = record[1]

        place = 'home' # switch

    scores = top.find_all('div', attrs={'class':'score-container'})
    game_stats['away-score'] = scores[0].text
    game_stats['home-score'] = scores[1].text

    if soup2.find('div', attrs={'class':'game-details header'}) is not None:
        game_stats['postseason info'] = soup2.find('div', attrs={'class':'game-details header'}).text
    else:
        game_stats['postseason info'] = ''
    
    # Statistics (from bar graphs)
    stats = soup2.find('div', attrs={'class': 'sub-module team-statistics'})
    if not canceled: # is none for canceled games
        for bars in stats.find_all('li', attrs={'class': 'stat-box'}):
            stat = bars.find('h3').text

            values = bars.find_all('span', attrs={'class':'chartValue'})
            game_stats[stat + ' - Away'] = values[0].text # away
            game_stats[stat + ' - Home'] = values[1].text # home
    
    # Game Info *********************************
    gs = soup2.find('article', attrs={'class':'sub-module game-information'})
    try:
        game_stats['Stadium'] = gs.find('div', attrs={'class':'game-location'}).text # stadium
    except:
        game_stats['Stadium'] = gs.find('div', attrs={'class':'game-field'}).text # stadium
    
    game_stats['Date'] = gs.find('span', attrs={'data-behavior':'date_time'}).get('data-date') # date
    
    try:
        game_stats['Location'] = gs.find('li', attrs={'class':'icon-font-before icon-location-solid-before'}).text.strip() # city, state
    except:
        game_stats['Location'] = 'NA'
    
    # odds
    try:
        odds = gs.find('div', attrs={'class':'odds-lines-plus-logo'}).text.strip().split('\n') # odds
        game_stats['Odds'] = odds[0]
        game_stats['O/U'] = odds[1]
    except:
        game_stats['Odds'] = 'NA'
        game_stats['O/U'] = 'NA'

    # Attendance/Capacity
    attcap = gs.find_all('div', attrs={'class':'game-info-note capacity'})
    for ac in attcap:
        if 'Capacity' in ac.text:
            game_stats['Capacity'] = ac.text.split(': ')[1] # capacity
        elif 'Attendance' in ac.text:
            game_stats['Attendance'] = ac.text.split(': ')[1] # attendance
    
    # Duration and Umps
    extra_info = gs.find_all('div', attrs={'class':'game-info-note__container'})
    for ei in extra_info:
        if 'Game Time' in ei.text:
            game_stats['Duration'] = ei.text.split(': ')[1] # game time
        elif 'Umpires' in ei.text:
            game_stats['Umpires'] = ei.text.split(': ')[1] # umpires

    if canceled:
        game_stats['Duration'] = 'Canceled'
    
    # How each batter did in each at bat (or mention if they had no at-bats)
    if not canceled:
        rolling = pd.DataFrame()
        team = game_stats['away']
        for j in [0, 3]: # 0 is away, 3 is home
            # find all of the rows
            for i, row in enumerate(soup2.find_all('table')[j].find_all('tr')[1:]):
                # alternates between name and at-bats
                if i % 2 == 0:
                    name = row.find('span', attrs={'class':'name'}).text
                    player_id = row.find('div', attrs={'class':'accordion-item'}).get('data-key').split('-')[1]
                else:
                    bats = row.find_all('li')
                    bats = [(player_id, name, x.text) for x in bats]

                    temp = pd.DataFrame(bats)
                    temp['Team'] = team
                    rolling = pd.concat([rolling, temp])

            team = game_stats['home'] # switch to home team for second iteration

        rolling.columns = ['Batter Id', 'Batter', 'Event', 'Team']
        rolling['Game'] = gameid
        rolling = rolling[['Game', 'Team', 'Batter Id', 'Batter', 'Event']]
        rolling = rolling.reset_index(drop=True)
        saveDF(rolling, 'plays.csv') # plays.csv -------------------------------------------
    return game_stats, False

In [4]:
# ---------------- NEED TO BE CHANGED ----------------------
url_date =  "20210401" # start date
stop_date = "20210816" # end date

#url_date =  "20200723" # start date
#stop_date = "20201028" # end date

#url_date =  "20190320" # start date
#stop_date = "20191030" # end date

#url_date =  "20180329" # start date
#stop_date = "20181028" # end date

#url_date =  "20170402" # start date
#stop_date = "20171101" # end date

#url_date =  "20160403" # start date
#stop_date = "20161102" # end date

# ----------------------------------------------------------

all_df_dict = {}

three_days = datetime.timedelta(3)
year = url_date[:4] # used by saveDF
url = "https://www.espn.com/mlb/schedule/_/date/{}"
no_events = []

# get list of games data is present for
try:
    games_df = pd.read_csv(os.path.join('data', year, 'games.csv'))
    list_of_games = games_df['Game'].unique()
except:
    list_of_games = []

# Main Loop
iii = 0
while True: 
    # stopping condition
    if url_date > stop_date:
        break
    
    tm.sleep(1)
        
    # print('Looking at...', url_date)
    soup = loadPage(url.format(url_date)) # Get page for 3 days of games
    
    # get inner page container
    stuff = soup.find_all('div', attrs={'id':'sched-container'})[0]
    
    current_date = datetime.datetime.strptime(url_date, "%Y%m%d")
    
    # if there are no games, keep going
    if stuff.text == 'No games scheduled':
        url_date = (current_date + three_days).strftime("%Y%m%d")
        continue
    
    # zip each chunk of games with it's header (the date)
    for h2, table in zip(stuff.find_all('h2'), stuff.find_all('table')):
        games = table.find_all('tr', class_=['even', 'odd'])
        print(10* '*' + ' ' + h2.text + ' ' + 10 * '*')
        
        # some problems have arisen, so if day is past stop_date, just break
        if datetime.datetime.strptime(h2.text+" {}".format(url_date[:4]), "%A, %B %d %Y").strftime("%Y%m%d") > stop_date:
            break
        
        # go through each game
        for game in games:
            if game.find('td', attrs={'class':'tickets'}) is not None: # game didn't start
                # print('Tickets {} - future game, skipped'.format(gameid))
                continue
            elif game.find('td', attrs={'class':'live'}) is not None: # game is live
                print('Live {}, skipped'.format(gameid))
                continue
            elif len(game.find_all('td')) == 1: # result row during postseason
                continue
            
            gameid = game.find_all('td')[2].find('a').get('href').split('/')[-1]
            # all star game is different
            result = game.find_all('a')[4].text if len(game.find_all('a')) != 4 else game.find_all('a')[0].text 
            canceled = False
            
            # skip if game was postponed
            if result == 'Postponed':
                print('Postponed {}, skipped'.format(gameid))
                continue
            elif result == 'Canceled':
                canceled = True
            
            # if data has already been collected for game, continue
            if int(gameid) in list_of_games:
                continue
            
            # print(gameid)
            iii += 1
            ttt = tm.time()
            
            # scrape each of three pages
            game_stats, spring_training = scrapeMainPage(gameid, canceled)
            if spring_training:
                continue
            if not canceled:
                game_stats = scrapeBoxScorePage(gameid, game_stats)
                events_flag = scrapePitchByPitchPage(gameid)
                if not events_flag:
                    no_events.append(gameid)
            
            # extra innings flag
            if 'F/' in result:
                game_stats['Extra Innings'] = True
            else:
                False
            
            games_df = pd.DataFrame.from_dict(game_stats, orient='index').T 
            saveDF(games_df, 'games.csv') # games.csv --------------------------------------------
            print(round((tm.time() - ttt), 2))
    
    # move url_date forward
    url_date = (current_date + three_days).strftime("%Y%m%d")
    
# flush all_df_dict
path = os.path.join('data', year)
if not os.path.exists(path):
    os.makedirs(path)

for key in all_df_dict:
    total_path = os.path.join(path, key) 
    df = all_df_dict[key] # get df
    
    if os.path.exists(total_path): # if it already exists, get the old one & combine
        old_df = pd.read_csv(total_path)
        to_write_df = pd.concat([old_df, df], ignore_index=True)
    else:
        to_write_df = df
    
    to_write_df.to_csv(total_path, index=False)

# no inningHighlights.csv, events.csv, pitches.csv
with open(os.path.join('data', year, 'no_events.txt'), 'a') as f:
    for gameid in no_events:
        f.write(gameid+'\n')

********** Thursday, April 1 **********
Postponed 401227054, skipped
Postponed 401227060, skipped
********** Friday, April 2 **********
********** Saturday, April 3 **********
Postponed 401227081, skipped
********** Sunday, April 4 **********
Postponed 401227096, skipped
********** Monday, April 5 **********
Postponed 401227101, skipped
********** Tuesday, April 6 **********
********** Wednesday, April 7 **********
********** Thursday, April 8 **********
********** Friday, April 9 **********
********** Saturday, April 10 **********
Postponed 401227167, skipped
********** Sunday, April 11 **********
Postponed 401227176, skipped
********** Monday, April 12 **********
Postponed 401227191, skipped
Postponed 401227201, skipped
Postponed 401227199, skipped
********** Tuesday, April 13 **********
********** Wednesday, April 14 **********
Postponed 401227230, skipped
********** Thursday, April 15 **********
Postponed 401227239, skipped
********** Friday, April 16 **********
Postponed 401227247

In [5]:
games_df = pd.read_csv(os.path.join('data', year, 'games.csv'))
games_df.tail()

Unnamed: 0,Game,away,away-record,awayaway-record,home,home-record,homehome-record,away-score,home-score,postseason info,...,LOSS - Pitcher - Id,LOSS - Pitcher - Name,LOSS - Pitcher - AbbrName,LOSS - Pitcher - Record,SAVE - Pitcher - Stats,SAVE - Pitcher - Id,SAVE - Pitcher - Name,SAVE - Pitcher - AbbrName,SAVE - Pitcher - Record,Extra Innings
1785,401228837,CLE,57-60,28-33 Away,MIN,53-66,29-32 Home,4,5,,...,33267.0,Nick Wittgren,N. Wittgren,(2-6),,,,,,True
1786,401228838,HOU,70-48,33-25 Away,KC,50-67,30-30 Home,6,7,,...,32888.0,Yimi Garcia,Y. Garcia,(3-8),,,,,,
1787,401289753,SD,67-54,27-30 Away,COL,53-66,39-21 Home,5,6,,...,30376.0,Daniel Hudson,D. Hudson,(4-2),,,,,,
1788,401228839,NYM,59-59,23-36 Away,SF,77-42,41-18 Home,5,7,,...,33820.0,Miguel Castro,M. Castro,(3-4),"1.0 IP, 0 ER, 1 K, 0 BB",28959.0,Jake McGee,J. McGee,(26),
1789,401228841,PIT,42-77,18-42 Away,LAD,73-46,37-20 Home,1,2,,...,33014.0,Chasen Shreve,C. Shreve,(1-1),"1.0 IP, 0 ER, 1 K, 0 BB",29630.0,Kenley Jansen,K. Jansen,(24),


In [6]:
pitches_df = pd.read_csv(os.path.join('data', year, 'pitches.csv'))
pitches_df.tail()

Unnamed: 0,Num,Pitch,Type,MPH,play-hitzone,play-bases,play-field,Pitcher,Pitching Team,Batting Team,Inning,Event Id,Game
522434,4,Double,Sinker,95,top: 9.5px; right: 17.22px;,2.0,top: 11.38px; right: 28.12px;,Jansen,LAD,PIT,Top 9th,97,401228841
522435,1,Strike Looking,Cutter,93,top: 15.14px; right: 19.67px;,2.0,,Jansen,LAD,PIT,Top 9th,98,401228841
522436,2,Ground Out,Cutter,93,top: 12.75px; right: 27.2px;,3.0,top: 16.95px; right: 16.52px;,Jansen,LAD,PIT,Top 9th,98,401228841
522437,1,Strike Looking,Cutter,93,top: 14.27px; right: 19.06px;,3.0,,Jansen,LAD,PIT,Top 9th,99,401228841
522438,2,Ground Out,Sinker,95,top: 14.71px; right: 20.89px;,3.0,top: 15.33px; right: 16.4px;,Jansen,LAD,PIT,Top 9th,99,401228841


In [7]:
# Combines the seasons into a 'total' folder
first_year = 2016
last_year = 2021

# make sure the path exists
path = os.path.join('data', 'total')
if not os.path.exists(path):
    os.makedirs(path)

# go through each file
for file in os.listdir(os.path.join('data', '2020')):
    if '.csv' not in file:
        continue
    # go through each year
    for year in range(first_year, last_year+1): 
        new_df = pd.read_csv(os.path.join('data', str(year), file))
        
        if year == first_year:
            rolling_df = new_df
        else:
            rolling_df = rolling_df.append(new_df, ignore_index=True)
    
    # write it
    rolling_df.to_csv(os.path.join(path, file), index=False)
rolling_df = None
new_df = None

  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)


In [8]:
strike_calls = ['Strike Looking', 'Strike Swinging', 'Foul Ball', 
                'Ground Out', 'Fly Out', 'Line Out', 'Pop Out', 
                'Foul Out', 'Bunted Foul', 'Sacrifice Fly', 'Sacrifice', 
                'Batters Fielders Choice (runner Out)', 'Bunt Ground Out', 
                'Bunt Pop Out', 'Strikeout Batter Safe, Passed Ball']
hit_calls = ['Single', 'Double', 'Triple', 'Home Run', 'Ground Rule Double', 
             'Bunt Single', 'Bunt Double', 'Inside The Park Home Run']
ball_calls = ['Ball', 'Hit By Pitch', 'Wild Pitch; Runner Reached', 'Intentional Ball']
error_calls = ['Batter Reached On Error (batter To First)', 'Catchers Interference (batter To First/error)']
other_calls = ['Batters Fielders Choice (all Runners Safe)', 'Batters Interference (batter Out)', 
               'Official Ruling Pending'] # ?

pitches_df['Pitch'].value_counts()

Ball                                             187540
Foul Ball                                         91817
Strike Looking                                    86676
Strike Swinging                                   64661
Ground Out                                        25145
Single                                            18098
Fly Out                                           15338
Line Out                                           7459
Double                                             5440
Home Run                                           4328
Pop Out                                            3881
Batters Fielders Choice (runner Out)               2976
Foul Out                                           2417
Hit By Pitch                                       1560
Bunted Foul                                        1201
Batter Reached On Error (batter To First)           946
Sacrifice Fly                                       819
Sacrifice                                       