In [1]:
import os
import pandas as pd
import numpy as np
from pybaseball import team_game_logs, cache

In [2]:
TEAMS = [
    "ARI", "ATL", "BAL", "BOS", "CHC", "CHW", "CIN", "CLE", "COL", "DET",
    "MIA", "HOU", "KCR", "LAA", "LAD", "MIL", "MIN", "NYM", "NYY", "OAK",
    "PHI", "PIT", "SDP", "SFG", "SEA", "STL", "TBR", "TEX", "TOR", "WSN"
]

START = 2010 
END = 2023

In [3]:
cache.enable()
test = team_game_logs(2012, 'MIA')

In [5]:
pd.set_option("display.max_columns", 100)
test = team_game_logs(2024, 'MIA', 'pitching')
test

Unnamed: 0,Game,Date,Home,Opp,Rslt,IP,H,R,ER,UER,BB,SO,HR,HBP,ERA,BF,Pit,Str,IR,IS,SB,CS,AB,2B,3B,IBB,SH,SF,ROE,GDP,NumPlayers,Umpire,PitchersUsed
0,1,Mar 28,True,PIT,"L,5-6",12.0,9,6,5,1,8,17,3,0,3.75,52,213,127,4.0,1.0,0,0,44,2,0,2,0,0,1,0,7,Chris Guccione,"J.Luzardo (99-61), G.Soriano (99-H), A.Nardi (..."
1,2,Mar 29,True,PIT,"L,2-7",9.0,10,7,7,0,7,9,0,0,5.14,43,172,104,2.0,0.0,0,0,34,3,0,0,0,2,0,0,4,Brian Knight,"A.Puk (99-29-L), B.Hoeing (99), B.Smith (99), ..."
2,3,Mar 30,True,PIT,"L,3-9",9.0,16,9,8,1,4,9,0,1,6.00,48,197,126,0.0,0.0,0,0,43,5,0,0,0,0,0,1,5,Gabe Morales,"R.Weathers (99-39-L), S.Sánchez (1), A.Nardi (..."
3,4,Mar 31,True,PIT,"L,7-9",10.0,11,9,7,2,7,13,1,0,6.08,50,204,125,0.0,0.0,2,0,41,0,1,0,2,0,0,0,3,Ryan Additon,"T.Rogers (99-39), V.Gutierrez (99-BSv), T.Scot..."
4,5,Apr 1,True,LAA,"L,4-7",9.0,7,7,7,0,6,7,3,0,6.24,39,170,100,1.0,1.0,2,1,33,1,1,0,0,0,0,0,6,Jordan Baker,"M.Meyer (99-57), G.Soriano (2-BSv), A.Nardi (1..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59,60,Jun 2,True,TEX,"L,0-6",9.0,10,6,6,0,4,5,1,0,4.56,40,150,93,0.0,0.0,0,0,36,1,0,0,0,0,0,1,4,Will Little,"T.Rogers (5-45-L), A.Puk (1), D.Cronin (1), B...."
60,61,Jun 4,True,TBR,"L,5-9",9.0,12,9,9,0,4,9,1,2,4.63,44,183,120,1.0,1.0,0,1,38,3,0,0,0,0,0,0,5,Nestor Ceja,"J.Luzardo (6-9-L), D.Cronin (1), H.Brazobán (2..."
61,62,Jun 5,True,TBR,"L,3-5",9.0,8,5,5,0,3,9,0,1,4.64,37,139,82,2.0,0.0,1,0,33,5,0,0,0,0,0,1,6,Manny Gonzalez,"B.Garrett (6-28-L), D.Cronin (0), A.Puk (2), A..."
62,63,Jun 7,True,CLE,"W,3-2",9.0,6,2,2,0,2,8,1,0,4.60,32,114,74,2.0,0.0,0,2,30,2,1,0,0,0,0,0,6,Larry Vanover,"R.Weathers (5-44), E.Ramirez (2), A.Nardi (1),..."


In [18]:
def collect_logs(start, end):
    df = pd.DataFrame()
    for year in range(start, end):
        for team in TEAMS:
            if team == 'MIA' and year < 2012:
                team = 'FLA'
            print(year, team)
            batting_df = team_game_logs(year, team)
            pitching_df = team_game_logs(year, team, "pitching")
            print(pitching_df)
            print(batting_df)
            new_df = batting_df.merge(pitching_df, left_on=['Game'], right_on=['Game'], suffixes=('', '_P'))
            print(new_df)
            new_df['team'] = team
            new_df['year'] = year
            if not df.empty:
                df = pd.concat([df, new_df])
                print(df)
            else:
                df = new_df
    return df
            
    

In [19]:
if os.path.exists("game_logs.csv"):
    games = pd.read_csv("game_logs.csv", index_col=0)
else:
    games = collect_logs(START, END)
    games.to_csv("game_logs.csv")

In [20]:
import datetime
from time import strptime

In [21]:
def edit_date(game):
    og_date = game['Date'].strip().replace('susp', '')
    og_date = og_date.split(' ')

    first = 0
    if len(og_date) > 2 and og_date[2] == '(2)':
        first = 1


    game['Date'] = datetime.datetime(game['year'], strptime(og_date[0], '%b').tm_mon, int(og_date[1]), hour=first)
    
    return game

games = games.apply(edit_date, axis=1)

In [22]:
def game_results(game):
    result = game['Rslt'].split(',')
    game['WIN'] = True if result[0] == 'W' else False

    result = result[1].split('-')
    game['runs_for'] = int(result[0])
    game['runs_ag'] = int(result[1])
    
    
    return game

games = games.apply(game_results, axis=1)

In [23]:
def xbh(game):
    xbh = game['2B'] + game['3B']
    game['XBH'] = xbh
    return game

games = games.apply(xbh, axis=1)

In [24]:
def add_target(team):
    team['TARGET'] = team['WIN'].shift(-1)
    team['Opp_Next'] = team['Opp'].shift(-1)
    team['Date_Next'] = team['Date'].shift(-1)
    return team

games = games.groupby(['team', 'year'], group_keys=False).apply(add_target)

  games = games.groupby(['team', 'year'], group_keys=False).apply(add_target)


In [25]:
def edit_pitcher_name(row):
    row['OppStart'] = row['OppStart'].split('(')[0]
    return row
games = games.apply(edit_pitcher_name, axis=1)

In [26]:
pd.set_option("display.max_columns", 100)
games

Unnamed: 0,Game,Date,Home,Opp,Rslt,PA,AB,R,H,2B,3B,HR,RBI,BB,IBB,SO,HBP,SH,SF,ROE,GDP,SB,CS,BA,OBP,SLG,OPS,LOB,NumPlayers,Thr,OppStart,Date_P,Home_P,Opp_P,Rslt_P,IP,H_P,R_P,ER,UER,BB_P,SO_P,HR_P,HBP_P,ERA,BF,Pit,Str,IR,IS,SB_P,CS_P,AB_P,2B_P,3B_P,IBB_P,SH_P,SF_P,ROE_P,GDP_P,NumPlayers_P,Umpire,PitchersUsed,team,year,WIN,runs_for,runs_ag,XBH,TARGET,Opp_Next,Date_Next
0,1,2010-04-05,True,SDP,"W,6-3",36,33,6,8,0,0,2,6,3,0,8,0,0,0,2,1,1,0,0.242,0.306,0.424,0.730,6,12,R,J.Garland,Apr 5,True,SDP,"W,6-3",9.0,5,3,3,0,0,6,2,0,3.00,32,126.0,87.0,0.0,0.0,0,0,32,2,0,0,0,0,0,0,3,Brian Gorman,"D.Haren (99-71-W), A.Heilman (99), B.Howry (99)",ARI,2010,True,6,3,0,False,SDP,2010-04-06
0,1,2011-04-01,False,COL,"W,7-6",49,44,7,15,6,0,2,6,3,1,7,1,1,0,1,3,1,0,0.341,0.396,0.614,1.009,9,16,R,U.Jiménez,Apr 1,False,COL,"W,7-6",11.0,13,6,3,3,4,11,1,0,2.45,51,204.0,123.0,0.0,0.0,2,0,44,3,0,0,2,1,1,1,6,Tim Tschida,"I.Kennedy (99-46), J.Gutiérrez (99-BSv), A.Hei...",ARI,2011,True,7,6,6,False,COL,2011-04-02
0,1,2012-04-06,True,SFG,"W,5-4",36,32,5,7,2,0,2,5,2,0,9,1,1,0,3,0,0,0,0.219,0.286,0.469,0.754,7,14,R,T.Lincecum,Apr 6,True,SFG,"W,5-4",9.0,11,4,4,0,2,6,1,0,4.00,40,130.0,94.0,2.0,0.0,0,0,37,3,0,0,1,0,0,0,4,Jim Joyce,"I.Kennedy (99-45-W), J.Paterson (99-H), D.Hern...",ARI,2012,True,5,4,2,True,SFG,2012-04-07
0,1,2013-04-01,True,STL,"W,6-2",38,36,6,15,7,0,0,5,1,1,8,0,0,1,0,1,0,1,0.417,0.421,0.611,1.032,8,12,R,A.Wainwright,Apr 1,True,STL,"W,6-2",9.0,5,2,2,0,1,10,0,0,2.00,31,119.0,85.0,0.0,0.0,0,0,30,3,0,0,0,0,0,1,3,Gerry Davis,"I.Kennedy (99-66-W), D.Hernandez (99), B.Ziegl...",ARI,2013,True,6,2,7,False,STL,2013-04-02
0,1,2014-03-22,True,LAD,"L,1-3",35,33,1,5,1,0,0,1,2,0,10,0,0,0,1,0,0,0,0.152,0.200,0.182,0.382,7,15,L,C.Kershaw,Mar 22,True,LAD,"L,1-3",9.0,5,3,3,0,3,11,1,1,3.00,37,136.0,85.0,1.0,0.0,0,0,33,2,0,0,0,0,1,0,5,Tim Welke,"W.Miley (99-55-L), W.Harris (99), B.Ziegler (9...",ARI,2014,False,1,3,1,False,LAD,2014-03-23
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
162,163,2018-10-01,False,LAD,"L,2-5",35,29,2,4,0,0,2,2,4,0,6,1,1,0,0,0,0,1,0.256,0.322,0.435,0.757,6,16,R,W.Buehler,Oct 1,False,LAD,"L,2-5",8.0,9,5,3,2,2,10,2,0,4.33,35,,,1.0,0.0,0,0,33,3,0,1,0,0,0,1,5,Bill Welke,"G.Márquez (4-49-L), H.Musgrave (2), S.Oberg (2...",COL,2018,False,2,5,0,,,NaT
162,163,2018-10-01,True,COL,"W,5-2",35,33,5,9,3,0,2,5,2,1,10,0,0,0,0,1,0,0,0.250,0.333,0.442,0.774,6,14,R,G.Márquez,Oct 1,True,COL,"W,5-2",9.0,4,2,2,0,4,6,2,1,3.38,35,,,2.0,0.0,0,1,29,0,0,0,1,0,0,0,5,Bill Welke,"W.Buehler (5-72-W), P.Báez (2), S.Alexander (2...",LAD,2018,True,5,2,3,,,NaT
162,163,2018-10-01,False,CHC,"W,3-1",36,34,3,12,1,0,0,3,0,0,6,0,2,0,0,2,1,1,0.252,0.323,0.424,0.747,6,16,L,J.Quintana,Oct 1,False,CHC,"W,3-1",9.0,3,1,1,0,3,8,1,0,3.73,31,134.0,84.0,2.0,0.0,0,0,28,0,0,0,0,0,0,2,5,Fieldin Culbreth,"J.Chacín (4-64), X.Cedeño (1), J.Soria (1), C....",MIL,2018,True,3,1,1,,,NaT
162,163,2013-09-30,False,TEX,"W,5-2",40,32,5,7,2,0,1,4,7,1,8,0,0,1,0,0,1,0,0.257,0.329,0.408,0.737,8,12,L,M.Pérez,Sep 30,False,TEX,"W,5-2",9.0,7,2,2,0,1,4,0,0,3.74,33,118.0,81.0,,,1,1,32,3,0,0,0,0,0,0,1,Jeff Kellogg,D.Price (4-68-W),TBR,2013,True,5,2,2,,,NaT


In [27]:
game_copy = games.copy()

In [28]:
games = games.reset_index(drop=True)
games

Unnamed: 0,Game,Date,Home,Opp,Rslt,PA,AB,R,H,2B,3B,HR,RBI,BB,IBB,SO,HBP,SH,SF,ROE,GDP,SB,CS,BA,OBP,SLG,OPS,LOB,NumPlayers,Thr,OppStart,Date_P,Home_P,Opp_P,Rslt_P,IP,H_P,R_P,ER,UER,BB_P,SO_P,HR_P,HBP_P,ERA,BF,Pit,Str,IR,IS,SB_P,CS_P,AB_P,2B_P,3B_P,IBB_P,SH_P,SF_P,ROE_P,GDP_P,NumPlayers_P,Umpire,PitchersUsed,team,year,WIN,runs_for,runs_ag,XBH,TARGET,Opp_Next,Date_Next
0,1,2010-04-05,True,SDP,"W,6-3",36,33,6,8,0,0,2,6,3,0,8,0,0,0,2,1,1,0,0.242,0.306,0.424,0.730,6,12,R,J.Garland,Apr 5,True,SDP,"W,6-3",9.0,5,3,3,0,0,6,2,0,3.00,32,126.0,87.0,0.0,0.0,0,0,32,2,0,0,0,0,0,0,3,Brian Gorman,"D.Haren (99-71-W), A.Heilman (99), B.Howry (99)",ARI,2010,True,6,3,0,False,SDP,2010-04-06
1,1,2011-04-01,False,COL,"W,7-6",49,44,7,15,6,0,2,6,3,1,7,1,1,0,1,3,1,0,0.341,0.396,0.614,1.009,9,16,R,U.Jiménez,Apr 1,False,COL,"W,7-6",11.0,13,6,3,3,4,11,1,0,2.45,51,204.0,123.0,0.0,0.0,2,0,44,3,0,0,2,1,1,1,6,Tim Tschida,"I.Kennedy (99-46), J.Gutiérrez (99-BSv), A.Hei...",ARI,2011,True,7,6,6,False,COL,2011-04-02
2,1,2012-04-06,True,SFG,"W,5-4",36,32,5,7,2,0,2,5,2,0,9,1,1,0,3,0,0,0,0.219,0.286,0.469,0.754,7,14,R,T.Lincecum,Apr 6,True,SFG,"W,5-4",9.0,11,4,4,0,2,6,1,0,4.00,40,130.0,94.0,2.0,0.0,0,0,37,3,0,0,1,0,0,0,4,Jim Joyce,"I.Kennedy (99-45-W), J.Paterson (99-H), D.Hern...",ARI,2012,True,5,4,2,True,SFG,2012-04-07
3,1,2013-04-01,True,STL,"W,6-2",38,36,6,15,7,0,0,5,1,1,8,0,0,1,0,1,0,1,0.417,0.421,0.611,1.032,8,12,R,A.Wainwright,Apr 1,True,STL,"W,6-2",9.0,5,2,2,0,1,10,0,0,2.00,31,119.0,85.0,0.0,0.0,0,0,30,3,0,0,0,0,0,1,3,Gerry Davis,"I.Kennedy (99-66-W), D.Hernandez (99), B.Ziegl...",ARI,2013,True,6,2,7,False,STL,2013-04-02
4,1,2014-03-22,True,LAD,"L,1-3",35,33,1,5,1,0,0,1,2,0,10,0,0,0,1,0,0,0,0.152,0.200,0.182,0.382,7,15,L,C.Kershaw,Mar 22,True,LAD,"L,1-3",9.0,5,3,3,0,3,11,1,1,3.00,37,136.0,85.0,1.0,0.0,0,0,33,2,0,0,0,0,1,0,5,Tim Welke,"W.Miley (99-55-L), W.Harris (99), B.Ziegler (9...",ARI,2014,False,1,3,1,False,LAD,2014-03-23
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60103,163,2018-10-01,False,LAD,"L,2-5",35,29,2,4,0,0,2,2,4,0,6,1,1,0,0,0,0,1,0.256,0.322,0.435,0.757,6,16,R,W.Buehler,Oct 1,False,LAD,"L,2-5",8.0,9,5,3,2,2,10,2,0,4.33,35,,,1.0,0.0,0,0,33,3,0,1,0,0,0,1,5,Bill Welke,"G.Márquez (4-49-L), H.Musgrave (2), S.Oberg (2...",COL,2018,False,2,5,0,,,NaT
60104,163,2018-10-01,True,COL,"W,5-2",35,33,5,9,3,0,2,5,2,1,10,0,0,0,0,1,0,0,0.250,0.333,0.442,0.774,6,14,R,G.Márquez,Oct 1,True,COL,"W,5-2",9.0,4,2,2,0,4,6,2,1,3.38,35,,,2.0,0.0,0,1,29,0,0,0,1,0,0,0,5,Bill Welke,"W.Buehler (5-72-W), P.Báez (2), S.Alexander (2...",LAD,2018,True,5,2,3,,,NaT
60105,163,2018-10-01,False,CHC,"W,3-1",36,34,3,12,1,0,0,3,0,0,6,0,2,0,0,2,1,1,0.252,0.323,0.424,0.747,6,16,L,J.Quintana,Oct 1,False,CHC,"W,3-1",9.0,3,1,1,0,3,8,1,0,3.73,31,134.0,84.0,2.0,0.0,0,0,28,0,0,0,0,0,0,2,5,Fieldin Culbreth,"J.Chacín (4-64), X.Cedeño (1), J.Soria (1), C....",MIL,2018,True,3,1,1,,,NaT
60106,163,2013-09-30,False,TEX,"W,5-2",40,32,5,7,2,0,1,4,7,1,8,0,0,1,0,0,1,0,0.257,0.329,0.408,0.737,8,12,L,M.Pérez,Sep 30,False,TEX,"W,5-2",9.0,7,2,2,0,1,4,0,0,3.74,33,118.0,81.0,,,1,1,32,3,0,0,0,0,0,0,1,Jeff Kellogg,D.Price (4-68-W),TBR,2013,True,5,2,2,,,NaT


In [34]:
games_copy2 = games.copy()
games = games_copy2.copy()

In [35]:
# del games['R']
# del games['RBI']
# del games['NumPlayers']
# del games['ROE']
# del games['HBP']
# del games['CS']
# del games['IBB']
# del games['BB']
# del games['PA']
# del games['2B']
# del games['3B']
# del games['SF']
# del games['SH']

columns_to_delete = [
    'Date_P', 'Home_P', 'Opp_P', 'Rslt_P', 'H_P', 'R_P', 'HBP_P', 
    'IR', 'IS', 'CS_P', 'AB_P', '2B_P', '3B_P', 'IBB_P', 'SH_P', 
    'SF_P', 'ROE_P', 'GDP_P', 'Umpire', 'PitchersUsed',
    'R', 'RBI', 'NumPlayers', 'ROE', 'HBP', 'CS', 'IBB', 'BB', 
    'PA', '2B', '3B', 'SF', 'SH'
]
# Deleting the columns
games.drop(columns=columns_to_delete, inplace=True)

In [36]:
games

Unnamed: 0,Game,Date,Home,Opp,Rslt,AB,H,HR,SO,GDP,SB,BA,OBP,SLG,OPS,LOB,Thr,OppStart,IP,ER,UER,BB_P,SO_P,HR_P,ERA,BF,Pit,Str,SB_P,NumPlayers_P,team,year,WIN,runs_for,runs_ag,XBH,TARGET,Opp_Next,Date_Next
0,1,2010-04-05,True,SDP,"W,6-3",33,8,2,8,1,1,0.242,0.306,0.424,0.730,6,R,J.Garland,9.0,3,0,0,6,2,3.00,32,126.0,87.0,0,3,ARI,2010,True,6,3,0,False,SDP,2010-04-06
1,1,2011-04-01,False,COL,"W,7-6",44,15,2,7,3,1,0.341,0.396,0.614,1.009,9,R,U.Jiménez,11.0,3,3,4,11,1,2.45,51,204.0,123.0,2,6,ARI,2011,True,7,6,6,False,COL,2011-04-02
2,1,2012-04-06,True,SFG,"W,5-4",32,7,2,9,0,0,0.219,0.286,0.469,0.754,7,R,T.Lincecum,9.0,4,0,2,6,1,4.00,40,130.0,94.0,0,4,ARI,2012,True,5,4,2,True,SFG,2012-04-07
3,1,2013-04-01,True,STL,"W,6-2",36,15,0,8,1,0,0.417,0.421,0.611,1.032,8,R,A.Wainwright,9.0,2,0,1,10,0,2.00,31,119.0,85.0,0,3,ARI,2013,True,6,2,7,False,STL,2013-04-02
4,1,2014-03-22,True,LAD,"L,1-3",33,5,0,10,0,0,0.152,0.200,0.182,0.382,7,L,C.Kershaw,9.0,3,0,3,11,1,3.00,37,136.0,85.0,0,5,ARI,2014,False,1,3,1,False,LAD,2014-03-23
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60103,163,2018-10-01,False,LAD,"L,2-5",29,4,2,6,0,0,0.256,0.322,0.435,0.757,6,R,W.Buehler,8.0,3,2,2,10,2,4.33,35,,,0,5,COL,2018,False,2,5,0,,,NaT
60104,163,2018-10-01,True,COL,"W,5-2",33,9,2,10,1,0,0.250,0.333,0.442,0.774,6,R,G.Márquez,9.0,2,0,4,6,2,3.38,35,,,0,5,LAD,2018,True,5,2,3,,,NaT
60105,163,2018-10-01,False,CHC,"W,3-1",34,12,0,6,2,1,0.252,0.323,0.424,0.747,6,L,J.Quintana,9.0,1,0,3,8,1,3.73,31,134.0,84.0,0,5,MIL,2018,True,3,1,1,,,NaT
60106,163,2013-09-30,False,TEX,"W,5-2",32,7,1,8,0,1,0.257,0.329,0.408,0.737,8,L,M.Pérez,9.0,2,0,1,4,0,3.74,33,118.0,81.0,1,1,TBR,2013,True,5,2,2,,,NaT


In [37]:
# Statistics to calculate running averages for after removing specified columns
stats = ['AB', 'H', 'XBH', 'HR', 'SO', 'GDP', 'SB', 'SO_P', 'HR_P', 'ERA', 'Str']

# Function to calculate running averages
def running_avg(group):
    return group.expanding().mean()

def last_10_average(group):
    return group.rolling(10,  min_periods=1).mean()

for stat in stats:
    games[f'avg_{stat}'] = games.groupby(['team', 'year'], group_keys=False)[stat].apply(running_avg)
    games[f'rolling_{stat}'] = games.groupby(['team', 'year'], group_keys=False)[stat].apply(last_10_average)



In [38]:
df = games.copy()

In [39]:
df['TARGET'][pd.isnull(df['TARGET'])] = 2

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  df['TARGET'][pd.isnull(df['TARGET'])] = 2
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['TARGET'][pd.isnul

In [40]:
df['TARGET'] = df['TARGET'].astype(int, errors='ignore') # go make change earlier in code

In [41]:
df = df.merge(df, left_on=['Date_Next', 'Opp_Next'], right_on=['Date_Next', 'team'], suffixes=('', '_opp'))
cols_drop = ['Date_opp', 'Opp_Next_opp', 'team_opp', 'year_opp', 'WIN_opp', 'runs_for_opp', 'runs_ag_opp', 'TARGET_opp', 'Game_opp', 'Home_opp', 'Opp_opp', 'Rslt_opp']

# Drop the specified columns
df = df.drop(columns=cols_drop)

df = df.rename(columns={'OppStart_opp': 'Starter', 'Thr_opp':'Thr_Starter'})

In [88]:
from sklearn.model_selection import TimeSeriesSplit
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import RidgeClassifier

rr = RidgeClassifier(alpha=1)
split = TimeSeriesSplit(n_splits=3)

sfs = SequentialFeatureSelector(rr, n_features_to_select=20, direction='backward', cv=split)

In [89]:
df

Unnamed: 0,Game,Date,Home,Opp,Rslt,AB,H,HR,SO,GDP,SB,BA,OBP,SLG,OPS,LOB,Thr,OppStart,IP,ER,UER,BB_P,SO_P,HR_P,ERA,BF,Pit,Str,SB_P,NumPlayers_P,team,year,WIN,runs_for,runs_ag,XBH,TARGET,Opp_Next,Date_Next,avg_AB,rolling_AB,avg_H,rolling_H,avg_XBH,rolling_XBH,avg_HR,rolling_HR,avg_SO,rolling_SO,avg_GDP,...,avg_Str,rolling_Str,AB_opp,H_opp,HR_opp,SO_opp,GDP_opp,SB_opp,BA_opp,OBP_opp,SLG_opp,OPS_opp,LOB_opp,Thr_Starter,Starter,IP_opp,ER_opp,UER_opp,BB_P_opp,SO_P_opp,HR_P_opp,ERA_opp,BF_opp,Pit_opp,Str_opp,SB_P_opp,NumPlayers_P_opp,XBH_opp,avg_AB_opp,rolling_AB_opp,avg_H_opp,rolling_H_opp,avg_XBH_opp,rolling_XBH_opp,avg_HR_opp,rolling_HR_opp,avg_SO_opp,rolling_SO_opp,avg_GDP_opp,rolling_GDP_opp,avg_SB_opp,rolling_SB_opp,avg_SO_P_opp,rolling_SO_P_opp,avg_HR_P_opp,rolling_HR_P_opp,avg_ERA_opp,rolling_ERA_opp,avg_Str_opp,rolling_Str_opp
0,1,2010-04-05,True,SDP,"W,6-3",0.283333,0.275862,0.250,0.307692,0.166667,0.111111,0.529563,0.498840,0.380020,0.451368,0.24,R,J.Garland,0.3125,0.111111,0.000000,0.000000,0.230769,0.250,0.190476,0.205479,0.204142,0.256831,0.000000,0.166667,ARI,2010,True,0.206897,0.103448,0.000000,0,SDP,2010-04-06,0.425000,0.4250,0.411765,0.411765,0.000000,0.0000,0.250000,0.2500,0.444444,0.444444,0.250000,...,0.356522,0.356522,0.266667,0.172414,0.250,0.230769,0.000000,0.000000,0.308483,0.150812,0.362390,0.324468,0.08,R,D.Haren,0.2500,0.074074,0.363636,0.166667,0.307692,0.250,0.142857,0.260274,0.266272,0.267760,0.111111,0.250000,0.181818,0.400000,0.4000,0.235294,0.235294,0.250000,0.2500,0.250000,0.2500,0.333333,0.333333,0.000000,0.000,0.000000,0.000,0.444444,0.444444,0.250000,0.2500,0.142857,0.142857,0.373913,0.373913
1,1,2011-04-01,False,COL,"W,7-6",0.466667,0.517241,0.250,0.269231,0.500000,0.111111,0.784062,0.707657,0.566112,0.663374,0.36,R,U.Jiménez,0.4375,0.111111,0.272727,0.222222,0.423077,0.125,0.155556,0.465753,0.434911,0.453552,0.222222,0.416667,ARI,2011,True,0.241379,0.206897,0.545455,0,COL,2011-04-02,0.700000,0.7000,0.823529,0.823529,0.750000,0.7500,0.250000,0.2500,0.388889,0.388889,0.750000,...,0.669565,0.669565,0.466667,0.448276,0.125,0.423077,0.166667,0.222222,0.665810,0.593968,0.387855,0.488602,0.48,R,I.Kennedy,0.4375,0.222222,0.090909,0.166667,0.269231,0.250,0.311746,0.438356,0.337278,0.387978,0.111111,0.500000,0.272727,0.700000,0.7000,0.705882,0.705882,0.375000,0.3750,0.125000,0.1250,0.611111,0.611111,0.250000,0.250,0.500000,0.500,0.388889,0.388889,0.250000,0.2500,0.311746,0.311746,0.565217,0.565217
2,1,2012-04-06,True,SFG,"W,5-4",0.266667,0.241379,0.250,0.346154,0.000000,0.000000,0.470437,0.452436,0.424094,0.469605,0.28,R,T.Lincecum,0.3125,0.148148,0.000000,0.111111,0.230769,0.125,0.253968,0.315068,0.215976,0.295082,0.000000,0.250000,ARI,2012,True,0.172414,0.137931,0.181818,1,SFG,2012-04-07,0.400000,0.4000,0.352941,0.352941,0.250000,0.2500,0.250000,0.2500,0.500000,0.500000,0.000000,...,0.417391,0.417391,0.350000,0.379310,0.125,0.230769,0.000000,0.000000,0.670951,0.561485,0.414300,0.499240,0.36,R,I.Kennedy,0.2500,0.185185,0.000000,0.111111,0.346154,0.250,0.356825,0.260274,0.233728,0.240437,0.000000,0.250000,0.272727,0.525000,0.5250,0.588235,0.588235,0.375000,0.3750,0.125000,0.1250,0.333333,0.333333,0.000000,0.000,0.000000,0.000,0.500000,0.500000,0.250000,0.2500,0.356825,0.356825,0.330435,0.330435
3,1,2013-04-01,True,STL,"W,6-2",0.333333,0.517241,0.000,0.307692,0.166667,0.000000,0.979434,0.765661,0.563173,0.680851,0.32,R,A.Wainwright,0.3125,0.074074,0.000000,0.055556,0.384615,0.000,0.126984,0.191781,0.183432,0.245902,0.000000,0.166667,ARI,2013,True,0.206897,0.068966,0.636364,0,STL,2013-04-02,0.500000,0.5000,0.823529,0.823529,0.875000,0.8750,0.000000,0.0000,0.444444,0.444444,0.250000,...,0.339130,0.339130,0.233333,0.172414,0.000,0.384615,0.166667,0.000000,0.336761,0.238979,0.226249,0.246201,0.08,R,I.Kennedy,0.2500,0.185185,0.090909,0.055556,0.307692,0.000,0.356825,0.287671,0.286982,0.344262,0.000000,0.250000,0.272727,0.350000,0.3500,0.235294,0.235294,0.375000,0.3750,0.000000,0.0000,0.555556,0.555556,0.250000,0.250,0.000000,0.000,0.444444,0.444444,0.000000,0.0000,0.356825,0.356825,0.495652,0.495652
4,1,2014-03-22,True,LAD,"L,1-3",0.283333,0.172414,0.000,0.384615,0.000000,0.000000,0.298201,0.252900,0.142997,0.186930,0.28,L,C.Kershaw,0.3125,0.111111,0.000000,0.166667,0.423077,0.125,0.190476,0.273973,0.233728,0.245902,0.000000,0.333333,ARI,2014,False,0.034483,0.103448,0.090909,0,LAD,2014-03-23,0.425000,0.4250,0.235294,0.235294,0.125000,0.1250,0.000000,0.0000,0.555556,0.555556,0.000000,...,0.339130,0.339130,0.283333,0.172414,0.125,0.423077,0.000000,0.000000,0.298201,0.352668,0.261508,0.311550,0.28,L,W.Miley,0.3125,0.037037,0.000000,0.111111,0.384615,0.000,0.063492,0.246575,0.227811,0.289617,0.000000,0.250000,0.181818,0.425000,0.4250,0.235294,0.235294,0.250000,0.2500,0.125000,0.1250,0.611111,0.611111,0.000000,0.000,0.000000,0.000,0.555556,0.555556,0.000000,0.0000,0.063492,0.063492,0.408696,0.408696
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59709,162,2018-09-30,True,WSN,"W,12-0",0.316667,0.448276,0.625,0.269231,0.000000,0.111111,0.568123,0.535963,0.391773,0.472644,0.24,R,E.Fedde,0.3125,0.000000,0.000000,0.166667,0.230769,0.000,0.275556,0.205479,0.177515,0.207650,0.000000,0.166667,COL,2018,True,0.413793,0.000000,0.272727,0,LAD,2018-10-01,0.450617,0.4550,0.454611,0.535294,0.248457,0.3000,0.160494,0.2500,0.477023,0.427778,0.175926,...,0.410039,0.395031,0.400000,0.551724,0.375,0.230769,0.333333,0.000000,0.550129,0.561485,0.396670,0.484802,0.20,L,A.Suárez,0.3125,0.000000,0.000000,0.000000,0.423077,0.000,0.215238,0.164384,0.142012,0.207650,0.000000,0.166667,0.545455,0.454784,0.5075,0.444081,0.588235,0.251543,0.3500,0.179784,0.2125,0.489026,0.466667,0.182099,0.225,0.115741,0.150,0.534636,0.550000,0.136574,0.1250,0.221105,0.215365,0.424412,0.381159
59710,162,2018-09-30,False,SFG,"W,15-0",0.400000,0.551724,0.375,0.230769,0.333333,0.000000,0.550129,0.561485,0.396670,0.484802,0.20,L,A.Suárez,0.3125,0.000000,0.000000,0.000000,0.423077,0.000,0.215238,0.164384,0.142012,0.207650,0.000000,0.166667,LAD,2018,True,0.517241,0.000000,0.545455,1,COL,2018-10-01,0.454784,0.5075,0.444081,0.588235,0.251543,0.3500,0.179784,0.2125,0.489026,0.466667,0.182099,...,0.424412,0.381159,0.316667,0.448276,0.625,0.269231,0.000000,0.111111,0.568123,0.535963,0.391773,0.472644,0.24,R,E.Fedde,0.3125,0.000000,0.000000,0.166667,0.230769,0.000,0.275556,0.205479,0.177515,0.207650,0.000000,0.166667,0.272727,0.450617,0.4550,0.454611,0.535294,0.248457,0.3000,0.160494,0.2500,0.477023,0.427778,0.175926,0.200,0.146605,0.025,0.479767,0.516667,0.140432,0.0875,0.298052,0.277651,0.410039,0.395031
59711,162,2018-09-30,True,DET,"W,11-0",0.333333,0.413793,0.250,0.346154,0.000000,0.222222,0.555270,0.535963,0.380999,0.464286,0.24,R,S.Turnbull,0.3125,0.000000,0.000000,0.055556,0.384615,0.000,0.238095,0.232877,0.263156,0.288651,0.000000,0.166667,MIL,2018,True,0.379310,0.000000,0.272727,1,CHC,2018-10-01,0.450000,0.4650,0.444444,0.494118,0.212191,0.2250,0.168210,0.2625,0.497942,0.461111,0.194444,...,0.411060,0.394783,0.316667,0.413793,0.125,0.307692,0.166667,0.000000,0.573265,0.563805,0.367287,0.462766,0.28,R,J.Flaherty,0.3125,0.148148,0.090909,0.277778,0.384615,0.000,0.231746,0.369863,0.263156,0.288651,0.000000,0.666667,0.545455,0.463580,0.4650,0.467683,0.482353,0.246914,0.2500,0.128086,0.0875,0.473251,0.427778,0.162037,0.100,0.101852,0.075,0.455075,0.444444,0.121142,0.1000,0.223680,0.233333,0.404931,0.437681
59712,162,2013-09-29,False,TOR,"W,7-6",0.316667,0.310345,0.000,0.384615,0.000000,0.000000,0.568123,0.552204,0.364349,0.456687,0.12,R,T.Redmond,0.3125,0.222222,0.000000,0.388889,0.269231,0.000,0.238095,0.369863,0.346154,0.349727,0.000000,0.250000,TBR,2013,True,0.241379,0.206897,0.363636,1,TEX,2013-09-30,0.449691,0.5125,0.454611,0.488235,0.244599,0.3625,0.126543,0.1250,0.398834,0.438889,0.216049,...,0.414385,0.497391,0.266667,0.344828,0.250,0.230769,0.166667,0.222222,0.583548,0.540603,0.369246,0.455927,0.20,L,J.Vargas,0.3125,0.074074,0.000000,0.111111,0.423077,0.125,0.229841,0.219178,0.218935,0.256831,0.111111,0.333333,0.272727,0.456944,0.4000,0.470588,0.452941,0.217593,0.2375,0.135802,0.0750,0.364540,0.316667,0.191358,0.125,0.228395,0.400,0.446159,0.522222,0.120370,0.0625,0.223990,0.231873,0.401879,0.402609


In [90]:
removed_cols = ['Game', 'Date', 'Home', 'Opp', 'Rslt', 'Thr', 'OppStart', 'team', 'year', 'WIN', 'TARGET', 'Thr_Starter', 'Starter', 'Opp_Next', 'Date_Next']
selected_cols = df.columns[~df.columns.isin(removed_cols)]

In [91]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
df[selected_cols] = scaler.fit_transform(df[selected_cols])

In [92]:
copy_df = df.copy()
df = df.fillna(df[df[selected_cols].isna().columns.tolist()].mean(skipna=True))


In [93]:
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 10)
df[(df['team'] == 'ATL') & (df['year'] == 2020)]

Unnamed: 0,Game,Date,Home,Opp,Rslt,AB,H,HR,SO,GDP,SB,BA,OBP,SLG,OPS,LOB,Thr,OppStart,IP,ER,UER,BB_P,SO_P,HR_P,ERA,BF,Pit,Str,SB_P,NumPlayers_P,team,year,WIN,runs_for,runs_ag,XBH,TARGET,Opp_Next,Date_Next,avg_AB,rolling_AB,avg_H,rolling_H,avg_XBH,rolling_XBH,avg_HR,rolling_HR,avg_SO,rolling_SO,avg_GDP,...,avg_Str,rolling_Str,AB_opp,H_opp,HR_opp,SO_opp,GDP_opp,SB_opp,BA_opp,OBP_opp,SLG_opp,OPS_opp,LOB_opp,Thr_Starter,Starter,IP_opp,ER_opp,UER_opp,BB_P_opp,SO_P_opp,HR_P_opp,ERA_opp,BF_opp,Pit_opp,Str_opp,SB_P_opp,NumPlayers_P_opp,XBH_opp,avg_AB_opp,rolling_AB_opp,avg_H_opp,rolling_H_opp,avg_XBH_opp,rolling_XBH_opp,avg_HR_opp,rolling_HR_opp,avg_SO_opp,rolling_SO_opp,avg_GDP_opp,rolling_GDP_opp,avg_SB_opp,rolling_SB_opp,avg_SO_P_opp,rolling_SO_P_opp,avg_HR_P_opp,rolling_HR_P_opp,avg_ERA_opp,rolling_ERA_opp,avg_Str_opp,rolling_Str_opp
23,1,2020-07-24,False,NYM,"L,0-1",0.233333,0.103448,0.000,0.576923,0.000000,0.000000,0.164524,0.150812,0.095005,0.117021,0.20,R,J.deGrom,0.2500,0.037037,0.000000,0.000000,0.153846,0.125,0.071111,0.164384,0.106509,0.136612,0.000000,0.166667,ATL,2020,False,0.000000,0.034483,0.090909,1,NYM,2020-07-25,0.350000,0.350000,0.117647,0.117647,0.125000,0.12500,0.000000,0.0000,0.833333,0.833333,0.000000,...,0.165217,0.165217,0.200000,0.206897,0.125,0.153846,0.166667,0.000000,0.457584,0.348028,0.314398,0.351824,0.16,R,M.Soroka,0.3125,0.000000,0.000000,0.111111,0.576923,0.000,0.000000,0.205479,0.221893,0.256831,0.000000,0.250000,0.090909,0.300000,0.30000,0.294118,0.294118,0.125000,0.125000,0.125000,0.125000,0.222222,0.222222,0.250000,0.250000,0.000000,0.0000,0.833333,0.833333,0.000000,0.000000,0.000000,0.000000,0.356522,0.356522
410,2,2020-07-25,False,NYM,"W,5-3",0.350000,0.275862,0.250,0.461538,0.333333,0.000000,0.329049,0.252900,0.242899,0.264438,0.16,L,S.Matz,0.3750,0.074074,0.090909,0.111111,0.423077,0.000,0.095238,0.342466,0.269231,0.300546,0.111111,0.333333,ATL,2020,True,0.172414,0.103448,0.090909,1,NYM,2020-07-26,0.437500,0.437500,0.264706,0.264706,0.125000,0.12500,0.125000,0.1250,0.750000,0.750000,0.250000,...,0.295652,0.295652,0.350000,0.310345,0.000,0.423077,0.000000,0.111111,0.501285,0.410673,0.295788,0.357143,0.40,L,M.Fried,0.3750,0.148148,0.090909,0.055556,0.461538,0.250,0.120000,0.287671,0.322485,0.366120,0.000000,0.500000,0.181818,0.412500,0.41250,0.382353,0.382353,0.187500,0.187500,0.062500,0.062500,0.416667,0.416667,0.125000,0.125000,0.125000,0.1250,0.750000,0.750000,0.125000,0.125000,0.060000,0.060000,0.443478,0.443478
797,3,2020-07-26,False,NYM,"W,14-1",0.450000,0.586207,0.500,0.230769,0.166667,0.000000,0.562982,0.496520,0.454456,0.508359,0.28,R,R.Porcello,0.3125,0.037037,0.000000,0.166667,0.230769,0.000,0.084444,0.287671,0.328402,0.333333,0.000000,0.250000,ATL,2020,True,0.482759,0.034483,0.636364,0,TBR,2020-07-27,0.516667,0.516667,0.490196,0.490196,0.375000,0.37500,0.250000,0.2500,0.611111,0.611111,0.250000,...,0.356522,0.356522,0.316667,0.275862,0.125,0.307692,0.000000,0.000000,0.390746,0.535963,0.332027,0.426292,0.36,R,T.Hatch,0.3750,0.148148,0.090909,0.222222,0.423077,0.000,0.224762,0.397260,0.349112,0.404372,0.222222,0.416667,0.454545,0.400000,0.40000,0.294118,0.294118,0.416667,0.416667,0.083333,0.083333,0.425926,0.425926,0.083333,0.083333,0.000000,0.0000,0.500000,0.500000,0.083333,0.083333,0.275979,0.275979,0.492754,0.492754
1185,4,2020-07-27,False,TBR,"L,5-14",0.300000,0.137931,0.250,0.730769,0.000000,0.000000,0.478149,0.422274,0.406464,0.446809,0.16,R,T.Glasnow,0.2500,0.518519,0.000000,0.333333,0.461538,0.625,0.293968,0.342466,0.369822,0.387978,0.111111,0.250000,ATL,2020,False,0.172414,0.482759,0.000000,0,TBR,2020-07-28,0.500000,0.500000,0.411765,0.411765,0.281250,0.28125,0.250000,0.2500,0.722222,0.722222,0.187500,...,0.408696,0.408696,0.316667,0.448276,0.625,0.461538,0.333333,0.111111,0.516710,0.631090,0.473066,0.566869,0.16,R,M.Foltynewicz,0.3125,0.148148,0.090909,0.111111,0.730769,0.250,0.231746,0.260274,0.260355,0.333333,0.000000,0.250000,0.272727,0.418750,0.41875,0.397059,0.397059,0.406250,0.406250,0.218750,0.218750,0.486111,0.486111,0.187500,0.187500,0.062500,0.0625,0.638889,0.638889,0.125000,0.125000,0.264921,0.264921,0.489130,0.489130
1575,5,2020-07-28,False,TBR,"L,2-5",0.266667,0.206897,0.000,0.461538,0.333333,0.000000,0.462725,0.424594,0.365328,0.415653,0.28,R,Y.Chirinos,0.2500,0.185185,0.000000,0.166667,0.307692,0.000,0.305397,0.219178,0.186391,0.218579,0.111111,0.333333,ATL,2020,False,0.068966,0.172414,0.090909,1,TBR,2020-07-29,0.480000,0.480000,0.388235,0.388235,0.250000,0.25000,0.200000,0.2000,0.711111,0.711111,0.250000,...,0.386087,0.386087,0.233333,0.241379,0.000,0.307692,0.166667,0.111111,0.514139,0.607889,0.421156,0.518237,0.16,R,K.Wright,0.3125,0.000000,0.181818,0.222222,0.461538,0.000,0.186032,0.260274,0.295858,0.316940,0.000000,0.416667,0.000000,0.405000,0.40500,0.388235,0.388235,0.325000,0.325000,0.175000,0.175000,0.477778,0.477778,0.200000,0.200000,0.100000,0.1000,0.644444,0.644444,0.100000,0.100000,0.249143,0.249143,0.481739,0.481739
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21075,55,2020-09-22,True,MIA,"W,11-1",0.383333,0.517241,0.625,0.230769,0.166667,0.000000,0.606684,0.598608,0.448580,0.537234,0.24,R,J.Ureña,0.3125,0.037037,0.000000,0.111111,0.384615,0.125,0.282540,0.260274,0.230769,0.251366,0.000000,0.250000,ATL,2020,True,0.379310,0.034483,0.272727,1,MIA,2020-09-23,0.463636,0.477500,0.493048,0.505882,0.286364,0.32500,0.222727,0.2250,0.527273,0.577778,0.168182,...,0.382609,0.415459,0.300000,0.344828,0.125,0.384615,0.333333,0.000000,0.542416,0.538283,0.345739,0.437690,0.32,R,B.Wilson,0.2500,0.333333,0.181818,0.111111,0.230769,0.625,0.307937,0.328767,0.227811,0.256831,0.000000,0.166667,0.000000,0.399545,0.36500,0.405348,0.382353,0.188636,0.137500,0.122727,0.125000,0.495960,0.461111,0.159091,0.125000,0.222727,0.2750,0.415152,0.405556,0.177273,0.150000,0.267786,0.295937,0.376618,0.424638
21465,56,2020-09-23,True,MIA,"W,9-4",0.283333,0.344828,0.125,0.192308,0.166667,0.000000,0.606684,0.600928,0.447600,0.537234,0.24,R,S.Sánchez,0.3125,0.148148,0.000000,0.055556,0.384615,0.250,0.281905,0.232877,0.233728,0.234973,0.000000,0.333333,ATL,2020,True,0.310345,0.137931,0.090909,0,MIA,2020-09-24,0.462946,0.470000,0.493697,0.511765,0.283482,0.32500,0.220982,0.2375,0.522817,0.538889,0.169643,...,0.381193,0.410628,0.283333,0.241379,0.250,0.384615,0.166667,0.000000,0.539846,0.535963,0.346719,0.436930,0.12,L,M.Fried,0.2500,0.333333,0.000000,0.333333,0.192308,0.125,0.313651,0.301370,0.286982,0.289617,0.000000,0.333333,0.090909,0.400000,0.37250,0.404412,0.400000,0.187500,0.137500,0.125000,0.150000,0.497024,0.455556,0.160714,0.150000,0.218750,0.2750,0.412698,0.383333,0.176339,0.162500,0.268605,0.297905,0.377316,0.422360
21855,57,2020-09-24,True,MIA,"L,2-4",0.283333,0.206897,0.000,0.500000,0.000000,0.222222,0.604113,0.600928,0.442703,0.533435,0.44,R,P.López,0.3125,0.037037,0.272727,0.111111,0.384615,0.125,0.278095,0.315068,0.284024,0.316940,0.111111,0.250000,ATL,2020,False,0.068966,0.137931,0.090909,1,BOS,2020-09-25,0.462281,0.460000,0.490196,0.488235,0.280702,0.30000,0.217105,0.2125,0.526316,0.533333,0.166667,...,0.382806,0.414783,0.233333,0.241379,0.000,0.269231,0.666667,0.000000,0.588689,0.552204,0.399608,0.484043,0.20,R,A.Cobb,0.3125,0.481481,0.000000,0.111111,0.115385,0.375,0.366349,0.410959,0.352071,0.420765,0.111111,0.333333,0.000000,0.462719,0.49250,0.479876,0.517647,0.267544,0.287500,0.162281,0.125000,0.501949,0.500000,0.223684,0.300000,0.131579,0.2250,0.491228,0.566667,0.206140,0.150000,0.363264,0.373841,0.444331,0.485024
22245,58,2020-09-25,True,BOS,"W,8-7",0.433333,0.379310,0.250,0.423077,0.000000,0.111111,0.601542,0.603248,0.442703,0.534195,0.68,R,C.Mazza,0.4375,0.185185,0.181818,0.388889,0.269231,0.125,0.277460,0.410959,0.263156,0.288651,0.000000,0.416667,ATL,2020,True,0.275862,0.241379,0.272727,0,BOS,2020-09-26,0.465517,0.490000,0.491886,0.523529,0.282328,0.33750,0.217672,0.2250,0.527778,0.566667,0.163793,...,0.382806,0.401932,0.366667,0.241379,0.125,0.269231,0.000000,0.000000,0.586118,0.549884,0.395690,0.481003,0.36,R,K.Wright,0.3750,0.185185,0.272727,0.388889,0.423077,0.250,0.365079,0.493151,0.263156,0.288651,0.111111,0.416667,0.000000,0.464224,0.49500,0.477688,0.494118,0.262931,0.262500,0.161638,0.112500,0.500000,0.505556,0.219828,0.275000,0.129310,0.2000,0.493295,0.561111,0.206897,0.162500,0.363295,0.371492,0.444331,0.480435


In [94]:
sfs.fit(df[selected_cols], df['TARGET'])

In [95]:
predictors = list(selected_cols[sfs.get_support()])

In [96]:
predictors

['AB',
 'BA',
 'OBP',
 'IP',
 'ERA',
 'avg_HR',
 'rolling_SB',
 'avg_SO_P',
 'rolling_HR_P',
 'rolling_Str',
 'H_opp',
 'SB_opp',
 'OBP_opp',
 'IP_opp',
 'ERA_opp',
 'Pit_opp',
 'rolling_H_opp',
 'avg_SB_opp',
 'avg_SO_P_opp',
 'rolling_Str_opp']

In [97]:
def backtest(data, model, predictors, start=2, step=1):
    all_predictions = []

    seasons = sorted(data['year'].unique())

    for i in range(start, len(seasons), step):
        season = seasons[i]

        train = data[data['year'] < season]
        test = data[data['year'] == season]

        model.fit(train[predictors], train['TARGET'])

        preds = model.predict(test[predictors])
        preds = pd.Series(preds, index=test.index)

        combined = pd.concat([test['TARGET'], preds], axis=1)
        combined.columns = ['actual', 'prediction']

        all_predictions.append(combined)
    
    return pd.concat(all_predictions)

In [98]:
predictions = backtest(df, rr, predictors)

In [99]:
predictions

Unnamed: 0,actual,prediction
2,1,1
15,0,0
28,1,1
41,0,0
54,0,0
...,...,...
59660,0,1
59672,0,1
59684,1,0
59696,1,1


0.5538796547866389
0.5552181556656545

In [100]:
from sklearn.metrics import accuracy_score

predictions = predictions[predictions['actual'] != 2]
accuracy_score(predictions['actual'], predictions['prediction'])

0.555357999041074

In [101]:
df.groupby('Home').apply(lambda x: x[x['WIN']].shape[0] / x.shape[0])

  df.groupby('Home').apply(lambda x: x[x['WIN']].shape[0] / x.shape[0])


Home
False    0.464094
True     0.535970
dtype: float64