In [1]:
import pandas as pd
from pybaseball import schedule_and_record

In [2]:
# Seasons to download: 2015 through 2019 inclusive.
years = list(range(2015, 2020))
# Team abbreviations whose schedules are pulled.
teams = [
    "ATL", "BAL", "CHW", "CIN", "CLE", "DET",
    "MIL", "MIN", "SDP", "STL", "TEX",
]

In [3]:
# Download one schedule per (team, year) pair and stack them a single time.
# The pieces are gathered in a list and concatenated once because
# DataFrame.append re-copied the whole frame on every iteration and was
# removed in pandas 2.0.
season_frames = []
for team in teams:
    for year in years:
        season = schedule_and_record(year, team)
        # Stored as text: it is later concatenated onto the 'Date' string.
        season['Year'] = str(year)
        season_frames.append(season)

# pd.concat raises on an empty list, so fall back to an empty frame
# (what the original loop produced when there was nothing to download).
data = pd.concat(season_frames, ignore_index=True) if season_frames else pd.DataFrame()

In [4]:
# Peek at positional rows 160 and 161 of the raw download, all columns.
data.iloc[160:162]

Unnamed: 0,Date,Tm,Home_Away,Opp,W/L,R,RA,Inn,W-L,Rank,GB,Win,Loss,Save,Time,D/N,Attendance,Streak,Orig. Scheduled,Year
160,"Sunday, Oct 4 (1)",ATL,Home,STL,W,6.0,0.0,9.0,66-95,4.0,23.0,Miller,Lackey,,2:14,D,,2.0,,2015
161,"Sunday, Oct 4 (2)",ATL,Home,STL,W,2.0,0.0,9.0,67-95,4.0,23.0,Wisler,Lynn,Jackson,2:19,D,31441.0,3.0,2015-10-03 (Rain),2015


In [5]:
# Work on an independent copy so the raw download in `data` stays untouched
# (DataFrame.copy is deep by default).
adj = data.copy()

In [6]:
# A date string whose third-from-last character is '(' carries a
# "(n)" suffix, e.g. "Sunday, Oct 4 (1)".
has_game_suffix = adj['Date'].str[-3] == '('
# Game number inside the suffix (the character before the closing paren), 0 otherwise.
adj['DblHdrGm#'] = adj['Date'].str[-2].where(has_game_suffix, 0).astype(int)
# 1 when the suffix is present, 0 otherwise.
adj['DblHdr'] = has_game_suffix.astype(int)
# Drop the trailing " (n)" (four characters) so only the calendar text remains.
adj['Date'] = adj['Date'].str[:-4].where(has_game_suffix, adj['Date'])

In [7]:
# Re-inspect positional rows 160 and 161 after the suffix was split out of 'Date'.
adj.iloc[160:162]

Unnamed: 0,Date,Tm,Home_Away,Opp,W/L,R,RA,Inn,W-L,Rank,...,Loss,Save,Time,D/N,Attendance,Streak,Orig. Scheduled,Year,DblHdrGm#,DblHdr
160,"Sunday, Oct 4",ATL,Home,STL,W,6.0,0.0,9.0,66-95,4.0,...,Lackey,,2:14,D,,2.0,,2015,1,1
161,"Sunday, Oct 4",ATL,Home,STL,W,2.0,0.0,9.0,67-95,4.0,...,Lynn,Jackson,2:19,D,31441.0,3.0,2015-10-03 (Rain),2015,2,1


In [8]:
# Join the "Weekday, Mon D" text with the season year and parse it into a
# real timestamp. pd.to_datetime with an explicit format replaces
# .astype('datetime64'): casting to the unit-less dtype is rejected by
# pandas >= 2.0, and the explicit format makes a malformed date fail loudly.
# NOTE: 'Year' must still be the string column set at download time, so this
# cell cannot be re-run once 'Year' has been replaced by an integer.
adj['Date'] = pd.to_datetime(adj['Date'] + ' ' + adj['Year'], format='%A, %b %d %Y')

In [9]:
# Calendar features derived from the parsed game date.
adj['Year'] = adj['Date'].dt.year    # replaces the string year with an integer
adj['Month'] = adj['Date'].dt.month
# Series.dt.week was deprecated in pandas 1.1 and removed in 2.0;
# isocalendar().week is the same ISO-8601 week number. It comes back as
# UInt32, so cast to int64 to keep the column dtype as before.
adj['Week'] = adj['Date'].dt.isocalendar().week.astype('int64')
adj['Day_of_Year'] = adj['Date'].dt.dayofyear
adj['Day_of_Month'] = adj['Date'].dt.day
adj['Day_of_Week'] = adj['Date'].dt.dayofweek    # Monday=0 ... Sunday=6
# Saturday (5) and Sunday (6) count as weekend.
adj['Weekend'] = adj['Day_of_Week'].isin([5, 6]).astype('int64')

In [10]:
# Encode the venue flag numerically: 'Home' -> 1, '@' -> 0.
adj['Home_Away'] = adj['Home_Away'].map({'Home': 1, '@': 0})

In [11]:
# Only rows with a recorded result are converted; the others become NaN on assignment.
result_text = adj.loc[adj['W/L'].notna(), 'W/L']
# 1 when the result text is longer than one character
# (presumably a walk-off suffix; confirm against the raw data).
adj['Walkoff'] = result_text.str.len().gt(1).astype(int)
# 1 when the result text starts with 'W', 0 otherwise. This must run after
# 'Walkoff', because it overwrites the text that 'Walkoff' is derived from.
adj['W/L'] = result_text.str[0].eq('W').astype(int)

In [12]:
# Split the "wins-losses" record text (e.g. "66-95") into its two counts;
# rows without a record stay NaN.
record_parts = adj['W-L'].dropna().str.split('-')
adj['Wins'] = record_parts.str[0].astype(int)
adj['Losses'] = record_parts.str[1].astype(int)
# Replace the record text with the winning percentage, rounded to 3 decimals.
adj['W-L'] = (adj['Wins'] / (adj['Wins'] + adj['Losses'])).round(3)

In [13]:
# Flag games by innings count, using only rows where 'Inn' is recorded.
innings_played = adj['Inn'].dropna()
adj['Shortened'] = innings_played.lt(9).astype(int)   # fewer than 9 innings
adj['Extra'] = innings_played.gt(9).astype(int)       # more than 9 innings

In [14]:
def _signed_gb(gb_text):
    """Turn a games-back string into signed text.

    Text that begins with 'up' keeps whatever follows 'up';
    any other text is prefixed with a minus sign.
    """
    pieces = gb_text.split('up')
    if pieces[0] == '':
        return pieces[1]
    return '-' + pieces[0]

# Only non-null entries are converted; the rest remain NaN.
adj['GB'] = adj['GB'].dropna().apply(_signed_gb)
# 'Tied' picked up a '-' prefix above; map it to 0 before the float cast.
adj['GB'] = adj['GB'].replace({'-Tied': 0}).astype(float)

In [15]:
# Restrict to rows with a result; 'W/L' is already encoded as 1 (win) / 0 (loss).
decided = adj[adj['W/L'].notna()]
# The team's pitcher of record comes from 'Win' when the team won, else from 'Loss'.
adj['Team_Pitcher'] = decided['Win'].where(decided['W/L'] == 1, decided['Loss'])
# The opponent's pitcher of record is the mirror image.
adj['Opp_Pitcher'] = decided['Win'].where(decided['W/L'] == 0, decided['Loss'])

In [16]:
# Convert the "H:MM" game time text into total minutes; rows without a time stay NaN.
clock_parts = adj['Time'].dropna().str.split(':')
adj['Duration'] = clock_parts.apply(lambda hm: int(hm[0]) * 60 + int(hm[1]))

In [17]:
# Encode the day/night flag numerically: 'D' -> 1, 'N' -> 0.
adj['D/N'] = adj['D/N'].map({'D': 1, 'N': 0})

In [18]:
# Among rows with a recorded result, flag those whose 'Orig. Scheduled'
# field is filled in (1) versus empty (0). Rows without a result stay NaN.
has_result = adj['W/L'].notna()
adj['Rescheduled'] = adj.loc[has_result, 'Orig. Scheduled'].notna().astype(int)

# Raw text of the rescheduled games, e.g. "2015-10-03 (Rain)".
moved_text = adj.loc[adj['Rescheduled'] == 1, 'Orig. Scheduled']
# The reason is whatever sits between the first '(' and the final character.
adj['Reschedule_Reason'] = moved_text.apply(lambda text: text[text.find('(') + 1:-1])
# The original date is the part before " (". pd.to_datetime replaces
# .astype('datetime64'), whose unit-less form is rejected by pandas >= 2.0.
adj['Orig. Scheduled'] = pd.to_datetime(moved_text.apply(lambda text: text.split(' (')[0]))

In [19]:
# League membership by team abbreviation.
AL = [
    "BAL", "BOS", "NYY", "TBR", "TOR",
    "CHW", "CLE", "DET", "KCR", "MIN",
    "HOU", "LAA", "OAK", "SEA", "TEX",
]
NL = [
    "ATL", "MIA", "NYM", "PHI", "WSN",
    "CHC", "CIN", "MIL", "PIT", "STL",
    "ARI", "COL", "LAD", "SDP", "SFG",
]

def interleague(adj):
    """Return 0 when 'Tm' and 'Opp' belong to the same league, else 1.

    `adj` is a single game record (anything indexable by 'Tm' and 'Opp').
    A team that appears in neither league list is treated as interleague.
    """
    team, opponent = adj['Tm'], adj['Opp']
    same_league = any(team in league and opponent in league for league in (AL, NL))
    return 0 if same_league else 1
    
# Row-wise flag; only the two columns that interleague() reads are passed along.
adj['Interleague'] = adj[['Tm', 'Opp']].apply(interleague, axis=1)

In [20]:
# Division membership by team abbreviation.
ALE = ["BAL", "BOS", "NYY", "TBR", "TOR"]
ALC = ["CHW", "CLE", "DET", "KCR", "MIN"]
ALW = ["HOU", "LAA", "OAK", "SEA", "TEX"]
NLE = ["ATL", "MIA", "NYM", "PHI", "WSN"]
NLC = ["CHC", "CIN", "MIL", "PIT", "STL"]
NLW = ["ARI", "COL", "LAD", "SDP", "SFG"]

def rival(adj):
    """Return 1 when 'Tm' and 'Opp' share a division, else 0.

    `adj` is a single game record (anything indexable by 'Tm' and 'Opp').
    """
    team, opponent = adj['Tm'], adj['Opp']
    for division in (ALE, ALC, ALW, NLE, NLC, NLW):
        if team in division and opponent in division:
            return 1
    return 0
    
# Row-wise flag; only the two columns that rival() reads are passed along.
adj['Rival'] = adj[['Tm', 'Opp']].apply(rival, axis=1)

In [24]:
# Running game number within each (season, team), starting at 1, in row order.
season_team = ['Year', 'Tm']
adj['Gm#'] = adj.groupby(season_team).cumcount() + 1
# Same running count restricted to home games; away rows are left as NaN.
home_rows = adj.loc[adj['Home_Away'] == 1]
adj['HmGm#'] = home_rows.groupby(season_team).cumcount() + 1

In [28]:
# Display the engineered frame to eyeball the derived columns.
adj

Unnamed: 0,Date,Tm,Home_Away,Opp,W/L,R,RA,Inn,W-L,Rank,...,Extra,Team_Pitcher,Opp_Pitcher,Duration,Rescheduled,Reschedule_Reason,Interleague,Rival,Gm#,HmGm#
0,2015-04-06,ATL,0,MIA,1.0,2.0,1.0,9.0,1.000,1.0,...,0.0,Teheran,Alvarez,156.0,0.0,,0,1,1,
1,2015-04-07,ATL,0,MIA,1.0,12.0,2.0,9.0,1.000,1.0,...,0.0,Wood,Latos,194.0,0.0,,0,1,2,
2,2015-04-08,ATL,0,MIA,1.0,2.0,0.0,9.0,1.000,1.0,...,0.0,Cunniff,Koehler,165.0,0.0,,0,1,3,
3,2015-04-10,ATL,1,NYM,1.0,5.0,3.0,9.0,1.000,1.0,...,0.0,Johnson,Montero,181.0,0.0,,0,1,4,1.0
4,2015-04-11,ATL,1,NYM,1.0,5.0,3.0,9.0,1.000,1.0,...,0.0,Teheran,Gee,145.0,0.0,,0,1,5,2.0
5,2015-04-12,ATL,1,NYM,0.0,3.0,4.0,9.0,0.833,1.0,...,0.0,Jaime,Colon,157.0,0.0,,0,1,6,3.0
6,2015-04-13,ATL,1,MIA,1.0,3.0,2.0,9.0,0.857,1.0,...,0.0,Miller,Latos,182.0,0.0,,0,1,7,4.0
7,2015-04-14,ATL,1,MIA,0.0,2.0,8.0,9.0,0.750,1.0,...,0.0,Cahill,Koehler,180.0,0.0,,0,1,8,5.0
8,2015-04-15,ATL,1,MIA,0.0,2.0,6.0,9.0,0.667,1.0,...,0.0,Stults,Haren,145.0,0.0,,0,1,9,6.0
9,2015-04-17,ATL,0,TOR,1.0,8.0,7.0,9.0,0.700,2.0,...,0.0,Martin,Cecil,189.0,0.0,,1,0,10,


In [27]:
# List the dtype of every column to confirm the conversions above took effect.
adj.dtypes

Date                 datetime64[ns]
Tm                           object
Home_Away                     int64
Opp                          object
W/L                         float64
R                           float64
RA                          float64
Inn                         float64
W-L                         float64
Rank                        float64
GB                          float64
Win                          object
Loss                         object
Save                         object
Time                         object
D/N                         float64
Attendance                  float64
Streak                      float64
Orig. Scheduled      datetime64[ns]
Year                          int64
DblHdrGm#                     int64
DblHdr                        int64
Month                         int64
Week                          int64
Day_of_Year                   int64
Day_of_Month                  int64
Day_of_Week                   int64
Weekend                     

In [None]:
# adj.to_csv('MLBGameStats.csv', index=False)

In [None]:
# import saspy

In [None]:
# mysas = saspy.SASsession(cfgname='iomwin')

In [None]:
# mysas.df2sd(adj,table='MLB_GAME_STATS',libref="lcssan")