In [16]:
import pandas as pd
import time

Retrosheet Game Logs

Columns of interest:

- (Field number)
- (0) Date in the form "yyyymmdd"
- (1) Number of game : '0' indicates only one game played that day
- (3) Visiting team
- (5) Visiting team game number
- (6) Home team
- (8) Home team game number
- (9-10) Visiting and home team score
- (13) completion information, field should be empty for games that were completed same day
- (21-37) Visiting team offensive stats:
    - at-bats
    - hits
    - doubles
    - triples
    - homeruns
    - RBI
    - sacrifice hits.  This may include sacrifice flies for years prior to 1954 when sacrifice flies were allowed.
    - sacrifice flies (since 1954)
    - hit-by-pitch
    - walks
    - intentional walks
    - strikeouts
    - stolen bases
    - caught stealing
    - grounded into double plays
    - awarded first on catcher's interference
    - left on base
- (49-65) Home team offensive stats
- (101-102) Visiting starting pitcher ID and name
- (103-104) Home starting pitcher ID and name
- (105-131) Visiting starting players ID, name and defensive position, listed in the order (1-9) they appeared in the batting order
- (132-158) Home starting players ID, name and defensive position listed in the order (1-9) they appeared in the batting order

In [2]:
bs_2018 = pd.read_csv('data/GL2018.TXT', header=None)
bs_2018.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,151,152,153,154,155,156,157,158,159,160
0,20180329,0,Thu,COL,NL,1,ARI,NL,1,2,...,Nick Ahmed,6,dysoj001,Jarrod Dyson,9,corbp001,Patrick Corbin,1,,Y
1,20180329,0,Thu,PHI,NL,1,ATL,NL,1,5,...,Dansby Swanson,6,flahr001,Ryan Flaherty,5,tehej001,Julio Teheran,1,,Y
2,20180329,0,Thu,SFN,NL,1,LAN,NL,1,1,...,Yasmani Grandal,2,forsl001,Logan Forsythe,5,kersc001,Clayton Kershaw,1,,Y
3,20180329,0,Thu,CHN,NL,1,MIA,NL,1,8,...,Miguel Rojas,6,wallc001,Chad Wallach,2,urenj001,Jose Urena,1,,Y
4,20180329,0,Thu,SLN,NL,1,NYN,NL,1,4,...,Kevin Plawecki,2,syndn001,Noah Syndergaard,1,rosaa003,Amed Rosario,6,,Y


In [8]:
field_nos = [0,1,3,5,6,8,9,10,13]+list(range(21,38))+list(range(49,66))+[101,102,103,104]
bs_2018_cut = bs_2018.iloc[:,field_nos] # only columns of interest
bs_2018_cut = bs_2018_cut.loc[bs_2018_cut[13].isna(),:] # keep games that were completed same day

In [2]:
# rename columns
col_rename_dict = {0 : 'date',
                   1 : 'game_num',
                   3 : 'v_team',
                   5 : 'v_game_num',
                   6 : 'h_team',
                   8 : 'h_game_num',
                   9 : 'v_score',
                   10 : 'h_score',
                   13 : 'completion_info',
                   21 : 'v_ab',
                   22 : 'v_h',
                   23 : 'v_2b',
                   24 : 'v_3b',
                   25 : 'v_hr',
                   26 : 'v_rbi',
                   27 : 'v_sac_bunt',
                   28 : 'v_sac_fly',
                   29 : 'v_hbp',
                   30 : 'v_bb',
                   31 : 'v_ibb',
                   32 : 'v_k',
                   33 : 'v_sb',
                   34 : 'v_cs',
                   35 : 'v_gidp',
                   36 : 'v_catch_int',
                   37 : 'v_lob',
                   49 : 'h_ab',
                   50 : 'h_h',
                   51 : 'h_2b',
                   52 : 'h_3b',
                   53 : 'h_hr',
                   54 : 'h_rbi',
                   55 : 'h_sac_bunt',
                   56 : 'h_sac_fly',
                   57 : 'h_hbp',
                   58 : 'h_bb',
                   59 : 'h_ibb',
                   60 : 'h_k',
                   61 : 'h_sb',
                   62 : 'h_cs',
                   63 : 'h_gidp',
                   64 : 'h_catch_int',
                   65 : 'h_lob',
                   101 : 'v_sp_id',
                   102 : 'v_sp_name',
                   103 : 'h_sp_id',
                   104 : 'h_sp_name'}

bs_2018_cut.rename(columns=col_rename_dict, inplace=True)

NameError: name 'bs_2018_cut' is not defined

team hitting stats over last 10 games

In [10]:
cutoff_day = 20180501 # keep games at least one month into season
look_back = 10 # how many games to look back

keep_2018 = bs_2018_cut.loc[bs_2018_cut['date']>=cutoff_day]

# columns for hitting dataframe
hitting_col = ['date','h_team','h_obp','h_slg','h_k_rate','h_bb_rate','v_team','v_obp','v_slg','v_k_rate','v_bb_rate']
y_col = ['home_win']
hitting_2018 = pd.DataFrame(columns=(hitting_col+y_col))

# list of offensive stats (removing v_ from column names)
hit_stats = list(keep_2018.columns.values[9:26])
for i in range(len(hit_stats)):
    hit_stats[i] = hit_stats[i][2:]

for i in range(keep_2018.shape[0]):
    game = keep_2018.iloc[i,:]
    teams = ['v_', 'h_']
    
    # to be used to add new row to hitting dataframe
    new_row_dict = {}
    new_row_dict['date'] = game['date']
    new_row_dict['home_win'] = int(game['h_score']>game['v_score'])
    new_row_dict['h_team'] = game['h_team']
    new_row_dict['v_team'] = game['v_team']
    
    for t in teams:
        t_name = game[t+'team']
        t_game_num = game[t+'game_num']
        
        # boolean for filtering last n games
        # games when team was away
        away = (bs_2018_cut['v_team']==t_name) & (bs_2018_cut['v_game_num']>=(t_game_num-look_back)) & (bs_2018_cut['v_game_num']<t_game_num)
        # when team was home
        home = (bs_2018_cut['h_team']==t_name) & (bs_2018_cut['h_game_num']>=(t_game_num-look_back)) & (bs_2018_cut['h_game_num']<t_game_num)
         
        last_away = bs_2018_cut.loc[away,:]
        last_home = bs_2018_cut.loc[home,:]
        
        last_n_dict = {}
        for stat in hit_stats:
            last_n_dict[stat] = list(last_away['v_'+stat].values)+list(last_home['h_'+stat].values)
        
        # last n games for given team
        last_n = pd.DataFrame.from_dict(last_n_dict)
        totals = last_n.sum()
        
        # team on base percentage
        obp = (totals['h']+totals['bb']+totals['ibb']+totals['hbp'])/(totals['ab']+totals['bb']+totals['ibb']+totals['hbp']+totals['sac_fly'])
        new_row_dict[t+'obp'] = round(obp,3)
        
        # team slugging percentage
        n_1b = totals['h']-(totals['2b']+totals['3b']+totals['hr'])
        slg = (n_1b + 2*totals['2b'] + 3*totals['3b'] + 4*totals['hr'])/totals['ab']
        new_row_dict[t+'slg'] = round(slg,3)
        
        # team strikeout rate
        # NOTE : PA calculation excludes reaching on fielding error
        pa = totals['ab']+totals['bb']+totals['ibb']+totals['sac_bunt']+totals['sac_fly']+totals['hbp']+totals['catch_int']
        k_rate = totals['k']/pa
        new_row_dict[t+'k_rate'] = round(k_rate,3)
        
        # team walk rate
        bb_rate = (totals['bb']+totals['ibb'])/pa
        new_row_dict[t+'bb_rate'] = round(bb_rate,3)
    
    hitting_2018 = hitting_2018.append(new_row_dict, ignore_index=True)

In [44]:
hitting_2018

Unnamed: 0,date,h_team,h_obp,h_slg,h_k_rate,h_bb_rate,v_team,v_obp,v_slg,v_k_rate,v_bb_rate,home_win
0,20180501,ARI,0.334,0.443,0.236,0.119,LAN,0.335,0.428,0.228,0.103,1
1,20180501,CHN,0.322,0.411,0.207,0.065,COL,0.336,0.366,0.249,0.110,0
2,20180501,CIN,0.369,0.434,0.181,0.117,MIL,0.291,0.330,0.209,0.106,0
3,20180501,MIA,0.308,0.366,0.252,0.087,PHI,0.320,0.373,0.261,0.102,1
4,20180501,NYN,0.326,0.357,0.206,0.108,ATL,0.341,0.464,0.215,0.088,0
5,20180501,SFN,0.332,0.414,0.237,0.096,SDN,0.303,0.348,0.277,0.083,0
6,20180501,SLN,0.308,0.359,0.211,0.096,CHA,0.318,0.472,0.217,0.056,1
7,20180501,WAS,0.336,0.378,0.198,0.112,PIT,0.342,0.387,0.197,0.102,1
8,20180501,ANA,0.246,0.323,0.255,0.080,BAL,0.312,0.447,0.207,0.077,1
9,20180501,BOS,0.295,0.363,0.252,0.067,KCA,0.337,0.396,0.156,0.090,0


checkpoint : basic team hitting stats done

next : starting pitchers recent performance

In [3]:
from pybaseball import playerid_lookup
from pybaseball import pitching_stats_range
from datetime import datetime, timedelta

In [17]:
data = pitching_stats_range('2018-03-29', '2018-04-30')

In [18]:
data.columns

Index(['Name', 'Age', '#days', 'Lev', 'Tm', 'G', 'GS', 'W', 'L', 'SV', 'IP',
       'H', 'R', 'ER', 'BB', 'SO', 'HR', 'HBP', 'ERA', 'AB', '2B', '3B', 'IBB',
       'GDP', 'SF', 'SB', 'CS', 'PO', 'BF', 'Pit', 'Str', 'StL', 'StS',
       'GB/FB', 'LD', 'PU', 'WHIP', 'BAbip', 'SO9', 'SO/W'],
      dtype='object')

In [16]:
sp_look_back = 18 # days
pitch_pull_cols = ['H','BB','SO','HR','HBP','AB','2B','3B','IBB','SF','SB','GB/FB','LD','SO/W']

# columns for pitching dataframe
pitching_col = ['date','h_team','h_sp_name','h_sp_obp','h_sp_slg','h_sp_gb/fb','h_sp_ld','h_sp_k/bb','v_team','v_sp_name','v_sp_obp','v_sp_slg','v_sp_gb/fb','v_sp_ld','v_sp_k/bb']
pitching_2018 = pd.DataFrame(columns=pitching_col)

dates = keep_2018['date'].unique()
for date in dates:
    day = keep_2018.loc[keep_2018['date']==date,:]
    start_dt = (datetime.strptime(str(date), '%Y%m%d') - timedelta(days=sp_look_back)).strftime('%Y-%m-%d')
    end_dt = (datetime.strptime(str(date), '%Y%m%d') - timedelta(days=1)).strftime('%Y-%m-%d')
    
    # league-wide pitching stats during look back window
    try:
        pitch_stats = pitching_stats_range(start_dt, end_dt)

        for i in range(day.shape[0]):
            new_row_dict = {}
            new_row_dict['date'] = date
            
            game = day.iloc[i,:]
            teams = ['v_', 'h_']
            for t in teams:
                t_team = game[t+'team']
                t_sp_name = game[t+'sp_name']

                if t_sp_name in list(pitch_stats['Name']):
                    new_row_dict[t+'team'] = t_team
                    new_row_dict[t+'sp_name'] = t_sp_name
                    
                    sp_stats = pitch_stats.loc[pitch_stats['Name']==t_sp_name,:][pitch_pull_cols].reset_index(drop=True).iloc[0,:]
                    
                    # opposing on base percentage
                    obp = (sp_stats['H']+sp_stats['BB']+sp_stats['IBB']+sp_stats['HBP'])/(sp_stats['AB']+sp_stats['BB']+sp_stats['IBB']+sp_stats['HBP']+sp_stats['SF'])
                    new_row_dict[t+'sp_obp'] = round(obp, 3)

                    # opposing slugging
                    n_1b = sp_stats['H']-(sp_stats['2B']+sp_stats['3B']+sp_stats['HR'])
                    slg = (sp_stats['H'] + 2*sp_stats['2B'] + 3*sp_stats['3B'] + 4*sp_stats['HR'])/sp_stats['AB']
                    new_row_dict[t+'sp_slg'] = round(slg, 3)

                    # line drive rate
                    new_row_dict[t+'sp_ld'] = sp_stats['LD']
                    # groundball to flyball ratio
                    new_row_dict[t+'sp_gb/fb'] = sp_stats['GB/FB']
                    # strikeout to walk ratio
                    new_row_dict[t+'sp_k/bb'] = sp_stats['SO/W']
                else:
                    new_row_dict[t+'team'] = t_team
                    new_row_dict[t+'sp_name'] = t_sp_name

                    for attr in ['sp_obp', 'sp_slg', 'sp_ld', 'sp_gb/fb', 'sp_k/bb']:
                        new_row_dict[t+attr] = 'Debut!'
                        
            pitching_2018 = pitching_2018.append(new_row_dict, ignore_index=True)
    
    except IndexError:
        print('No pitching stats within range!')    


day done
day done
day done
day done
day done
day done
day done
day done
day done


KeyboardInterrupt: 

In [41]:
pitching_2018

Unnamed: 0,date,h_team,h_sp_name,h_sp_obp,h_sp_slg,h_sp_gb/fb,h_sp_ld,h_sp_k/bb,v_team,v_sp_name,v_sp_obp,v_sp_slg,v_sp_gb/fb,v_sp_ld,v_sp_k/bb
0,20180501,ARI,Matt Koch,0.261,0.429,0.62,0.24,2,LAN,Clayton Kershaw,0.278,0.466,0.54,0.21,3.83
1,20180501,CHN,Kyle Hendricks,0.222,0.432,0.67,0.18,,COL,Jon Gray,0.338,0.545,0.33,0.45,3.6
2,20180501,CIN,Homer Bailey,0.333,0.706,0.42,0.22,1.8,MIL,Chase Anderson,0.271,0.381,0.5,0.08,2
3,20180501,MIA,Jarlin Garcia,0.256,0.351,0.32,0.21,1.5,PHI,Zach Eflin,Debute,Debute,Debute,Debute,Debute
4,20180501,NYN,Noah Syndergaard,0.219,0.38,0.42,0.27,24,ATL,Mike Soroka,Debute,Debute,Debute,Debute,Debute
5,20180501,SFN,Andrew Suarez,Debute,Debute,Debute,Debute,Debute,SDN,Tyson Ross,0.28,0.273,0.41,0.34,3.14
6,20180501,SLN,Michael Wacha,0.265,0.239,0.44,0.36,5.5,CHA,James Shields,0.426,0.549,0.27,0.36,0.8
7,20180501,WAS,Max Scherzer,0.243,0.424,0.37,0.2,5,PIT,Chad Kuhl,0.37,0.561,0.36,0.26,2.8
8,20180501,ANA,Nick Tropeano,0.378,0.744,0.39,0.23,1.8,BAL,Alex Cobb,0.507,0.935,0.47,0.25,1.33
9,20180501,BOS,Chris Sale,0.25,0.569,0.36,0.18,4.4,KCA,Jake Junis,Debute,Debute,Debute,Debute,Debute


left to handle

- K/BB is nan when pitcher issued no walks during time period
- no data for pitcher making season debut

### Output Data

In [45]:
hitting_2018.to_csv(path_or_buf = 'data/hitting_2018.csv', index=False)
pitching_2018.to_csv(path_or_buf = 'data/pitching_2018.csv', index=False)

## Turning this into a function

- set cutoff date to 6/1, we can look back up to two months worth of games

In [4]:
bs_2017 = pd.read_csv('data/GL2017.TXT', header=None)
bs_2016 = pd.read_csv('data/GL2016.TXT', header=None)

In [23]:
def rename_col(to_rename, col_rename_dict):
    field_nos = [0,1,3,5,6,8,9,10,13]+list(range(21,38))+list(range(49,66))+[101,102,103,104]
    to_rename = to_rename.iloc[:,field_nos] # only columns of interest
    to_rename = to_rename.loc[to_rename[13].isna(),:] # keep games that were completed same day
    return to_rename.rename(columns=col_rename_dict, inplace=False)

def hitting_stats(full_year, start_day, look_back):
    keep = full_year.loc[full_year['date']>=start_day]

    # columns for hitting dataframe
    hitting_col = ['date','h_team','h_obp','h_slg','h_k_rate','h_bb_rate','v_team','v_obp','v_slg','v_k_rate','v_bb_rate']
    y_col = ['home_win']
    hitting = pd.DataFrame(columns=(hitting_col+y_col))

    # list of offensive stats (removing v_ from column names)
    hit_stats = list(keep.columns.values[9:26])
    for i in range(len(hit_stats)):
        hit_stats[i] = hit_stats[i][2:]

    for i in range(keep.shape[0]):
        game = keep.iloc[i,:]
        teams = ['v_', 'h_']

        # to be used to add new row to hitting dataframe
        new_row_dict = {}
        new_row_dict['date'] = game['date']
        new_row_dict['home_win'] = int(game['h_score']>game['v_score'])
        new_row_dict['h_team'] = game['h_team']
        new_row_dict['v_team'] = game['v_team']

        for t in teams:
            t_name = game[t+'team']
            t_game_num = game[t+'game_num']

            # boolean for filtering last n games
            # games when team was away
            away = (full_year['v_team']==t_name) & (full_year['v_game_num']>=(t_game_num-look_back)) & (full_year['v_game_num']<t_game_num)
            # when team was home
            home = (full_year['h_team']==t_name) & (full_year['h_game_num']>=(t_game_num-look_back)) & (full_year['h_game_num']<t_game_num)

            last_away = full_year.loc[away,:]
            last_home = full_year.loc[home,:]

            last_n_dict = {}
            for stat in hit_stats:
                last_n_dict[stat] = list(last_away['v_'+stat].values)+list(last_home['h_'+stat].values)

            # last n games for given team
            last_n = pd.DataFrame.from_dict(last_n_dict)
            totals = last_n.sum()

            # team on base percentage
            obp = (totals['h']+totals['bb']+totals['ibb']+totals['hbp'])/(totals['ab']+totals['bb']+totals['ibb']+totals['hbp']+totals['sac_fly'])
            new_row_dict[t+'obp'] = round(obp,3)

            # team slugging percentage
            n_1b = totals['h']-(totals['2b']+totals['3b']+totals['hr'])
            slg = (n_1b + 2*totals['2b'] + 3*totals['3b'] + 4*totals['hr'])/totals['ab']
            new_row_dict[t+'slg'] = round(slg,3)

            # team strikeout rate
            # NOTE : PA calculation excludes reaching on fielding error
            pa = totals['ab']+totals['bb']+totals['ibb']+totals['sac_bunt']+totals['sac_fly']+totals['hbp']+totals['catch_int']
            k_rate = totals['k']/pa
            new_row_dict[t+'k_rate'] = round(k_rate,3)

            # team walk rate
            bb_rate = (totals['bb']+totals['ibb'])/pa
            new_row_dict[t+'bb_rate'] = round(bb_rate,3)

        hitting = hitting.append(new_row_dict, ignore_index=True)
        
    return (hitting, keep)

def pitching_stats(keep, look_back):
    pitch_pull_cols = ['H','BB','SO','HR','HBP','AB','2B','3B','IBB','SF','SB','GB/FB','LD','SO/W']

    # columns for pitching dataframe
    pitching_col = ['date','h_team','h_sp_name','h_sp_obp','h_sp_slg','h_sp_gb/fb','h_sp_ld','h_sp_k/bb','v_team','v_sp_name','v_sp_obp','v_sp_slg','v_sp_gb/fb','v_sp_ld','v_sp_k/bb']
    pitching = pd.DataFrame(columns=pitching_col)

    dates = keep['date'].unique()
    days_done = 0
    for date in dates:
        day = keep.loc[keep['date']==date,:]
        start_dt = (datetime.strptime(str(date), '%Y%m%d') - timedelta(days=look_back)).strftime('%Y-%m-%d')
        end_dt = (datetime.strptime(str(date), '%Y%m%d') - timedelta(days=1)).strftime('%Y-%m-%d')

        # league-wide pitching stats during look back window
        try:
            pitch_stats = pitching_stats_range(start_dt, end_dt)

            for i in range(day.shape[0]):
                new_row_dict = {}
                new_row_dict['date'] = date

                game = day.iloc[i,:]
                teams = ['v_', 'h_']
                for t in teams:
                    t_team = game[t+'team']
                    t_sp_name = game[t+'sp_name']

                    if t_sp_name in list(pitch_stats['Name']):
                        new_row_dict[t+'team'] = t_team
                        new_row_dict[t+'sp_name'] = t_sp_name

                        sp_stats = pitch_stats.loc[pitch_stats['Name']==t_sp_name,:][pitch_pull_cols].reset_index(drop=True).iloc[0,:]

                        # opposing on base percentage
                        obp = (sp_stats['H']+sp_stats['BB']+sp_stats['IBB']+sp_stats['HBP'])/(sp_stats['AB']+sp_stats['BB']+sp_stats['IBB']+sp_stats['HBP']+sp_stats['SF'])
                        new_row_dict[t+'sp_obp'] = round(obp, 3)

                        # opposing slugging
                        n_1b = sp_stats['H']-(sp_stats['2B']+sp_stats['3B']+sp_stats['HR'])
                        slg = (sp_stats['H'] + 2*sp_stats['2B'] + 3*sp_stats['3B'] + 4*sp_stats['HR'])/sp_stats['AB']
                        new_row_dict[t+'sp_slg'] = round(slg, 3)

                        # line drive rate
                        new_row_dict[t+'sp_ld'] = sp_stats['LD']
                        # groundball to flyball ratio
                        new_row_dict[t+'sp_gb/fb'] = sp_stats['GB/FB']
                        # strikeout to walk ratio
                        new_row_dict[t+'sp_k/bb'] = sp_stats['SO/W']
                    else:
                        new_row_dict[t+'team'] = t_team
                        new_row_dict[t+'sp_name'] = t_sp_name

                        for attr in ['sp_obp', 'sp_slg', 'sp_ld', 'sp_gb/fb', 'sp_k/bb']:
                            new_row_dict[t+attr] = 'Debut!'

                pitching = pitching.append(new_row_dict, ignore_index=True)
            days_done += 1
            print(str(days_done)+' done')
            time.sleep(1)
            if (days_done%5)==0:
                pitching.to_csv('data/pitching_2017.csv', index=False)
        except IndexError:
            print('No pitching stats within range!')
    return pitching

In [24]:
messy_data = {#2016:bs_2016,
              2017:bs_2017
              #2018:bs_2018}
}
clean_hitting = {}
clean_pitching = {}
clean_seasons = {}
for year in [2017]:
    full_year = rename_col(messy_data[year], col_rename_dict)
    day = 501
    start_date = year*10000+day
    hitting, keep = hitting_stats(full_year, start_day=start_date, look_back=10)
    clean_hitting[year] = hitting
    pitching = pitching_stats(keep, look_back=30)
    pitching.replace('Debut!', float('nan'), inplace=True)
    col_to_change = ['h_sp_obp','h_sp_slg','h_sp_gb/fb','h_sp_ld','h_sp_k/bb','v_sp_obp','v_sp_slg',
                     'v_sp_gb/fb','v_sp_ld','v_sp_k/bb']
    pitching[col_to_change] = pitching[col_to_change].apply(pd.to_numeric)
    clean_pitching[year] = pitching
    clean_year = pitching.merge(hitting.drop(columns=['date','h_team','v_team']), how='outer', left_index=True, right_index=True).dropna()
    clean_seasons[year] = clean_year
    print('year done')

1 done
2 done
3 done
4 done
5 done
6 done
7 done
8 done
9 done
10 done
11 done
12 done
13 done
14 done
15 done
16 done
17 done
18 done
19 done
20 done
21 done
22 done
23 done
24 done
25 done
26 done
27 done
28 done
29 done
30 done
31 done
32 done
33 done
34 done
35 done
36 done
37 done
38 done
39 done
40 done
41 done
42 done
43 done
44 done
45 done
46 done
47 done
48 done
49 done
50 done
51 done
52 done
53 done
54 done
55 done
56 done
57 done
58 done
59 done
60 done
61 done
62 done
63 done
64 done
65 done
66 done
67 done
68 done
69 done
70 done
71 done
72 done
73 done
74 done
75 done
76 done
77 done
78 done
79 done
80 done
81 done
82 done
83 done
84 done
85 done
86 done
87 done
88 done
89 done
90 done
91 done
92 done
93 done
94 done
95 done
96 done
97 done
98 done
99 done
100 done
101 done
102 done
103 done
104 done
105 done
106 done
107 done
108 done
109 done
110 done
111 done
112 done
113 done
114 done
115 done
116 done
117 done
118 done
119 done
120 done
121 done
122 done
123 done
1

In [26]:
clean_seasons[2017].to_csv('data/data_2017.csv', index=False)

In [29]:
hi = pitching_stats_range('2017-05-01', '2017-06-01')
hi.head()

Unnamed: 0,Name,Age,#days,Lev,Tm,G,GS,W,L,SV,...,Str,StL,StS,GB/FB,LD,PU,WHIP,BAbip,SO9,SO/W
1,Fernando Abad,31,918,MLB-AL,Boston,10,0,1.0,,,...,0.61,0.15,0.07,0.27,0.19,0.15,0.857,0.2,6.8,3.5
2,Tim Adleman,29,919,MLB-NL,Cincinnati,6,6,3.0,1.0,,...,0.61,0.16,0.1,0.42,0.25,0.07,1.337,0.287,7.0,1.85
3,Matt Albers,34,920,MLB-NL,Washington,13,0,1.0,,2.0,...,0.59,0.19,0.1,0.44,0.26,0.15,1.114,0.241,11.6,3.75
4,Al Alburquerque,31,931,MLB-AL,Kansas City,5,0,,1.0,,...,0.66,0.17,0.24,0.86,0.14,0.0,1.25,0.286,13.5,2.0
5,Scott Alexander,27,943,MLB-AL,Kansas City,3,0,,,,...,0.66,0.21,0.03,0.88,0.0,0.0,0.333,0.0,3.0,1.0


In [30]:
hi.columns

Index(['Name', 'Age', '#days', 'Lev', 'Tm', 'G', 'GS', 'W', 'L', 'SV', 'IP',
       'H', 'R', 'ER', 'BB', 'SO', 'HR', 'HBP', 'ERA', 'AB', '2B', '3B', 'IBB',
       'GDP', 'SF', 'SB', 'CS', 'PO', 'BF', 'Pit', 'Str', 'StL', 'StS',
       'GB/FB', 'LD', 'PU', 'WHIP', 'BAbip', 'SO9', 'SO/W'],
      dtype='object')