In [6]:
import pandas as pd
from typing import Tuple

# Cleaning Game Info
This notebook provides a clean version of the raw `data/gameinfo.csv` file.
It does the following:
1. Filtering for competitive games.
2. Adding a 'homewon' column, that is 1 if the home team won and 0 otherwise.
3. Adding a 'timestamp' column, which provides a `pd.Timestamp` for the game start time.
4. Saving the result to a new csv, `./gameinfo_cleaned.csv`.

In [None]:
# Load the raw game-info table into a DataFrame and preview its first rows.
# NOTE(review): this path is relative to the working directory, while the notebook
# description refers to `data/gameinfo.csv` - confirm where the raw file lives.
# NOTE(review): the preview shows an 'Unnamed: 0' column, which looks like a
# previously saved row index; consider index_col=0 if nothing downstream needs it.
all_games = pd.read_csv('./gameinfo.csv')
all_games.head()

  all_games = pd.read_csv('../data/gameinfo.csv')


Unnamed: 0,gid,visteam,hometeam,site,date,number,starttime,daynight,innings,tiebreaker,...,vruns,hruns,wteam,lteam,line,batteries,lineups,box,pbp,season
0,LS3189904140,CHN,LS3,LOU03,18990414,0.0,0:00PM,day,,,...,15,1,CHN,LS3,y,both,y,y,,1899
1,PHI189904140,WSN,PHI,PHI09,18990414,0.0,0:00PM,day,,,...,5,6,PHI,WSN,y,both,y,y,,1899
2,BLN189904150,NY1,BLN,BAL07,18990415,0.0,0:00PM,day,,,...,3,5,BLN,NY1,y,both,y,y,,1899
3,BRO189904150,BSN,BRO,NYC12,18990415,0.0,0:00PM,day,,,...,1,0,BSN,BRO,y,both,y,y,,1899
4,CIN189904150,PIT,CIN,CIN05,18990415,0.0,0:00PM,day,,,...,5,2,PIT,CIN,y,both,y,y,,1899


## 1. Filter for Competitive Games

In [None]:
# Game types kept as "competitive"; rows with any other 'gametype' value are dropped.
valid_games = ['regular', 'championship', 'worldseries', 'lcs',
               'playoff', 'divisionseries', 'wildcard']

# .copy() makes the filtered result an independent DataFrame, so adding columns to
# it afterwards does not raise pandas' SettingWithCopyWarning.
all_games = all_games[all_games['gametype'].isin(valid_games)].copy()

## 2. Add 'homewon' Column

In [None]:
# 1 when the home team scored more runs than the visitors, 0 otherwise (a tie is 0).
# The boolean comparison is cast to integers in one vectorized step instead of
# calling int() on every element and rebuilding the column from a Python list.
all_games['homewon'] = (all_games['hruns'] > all_games['vruns']).astype('int64')
all_games.head()

Unnamed: 0,gid,visteam,hometeam,site,date,number,starttime,daynight,innings,tiebreaker,...,hruns,wteam,lteam,line,batteries,lineups,box,pbp,season,homewon
0,LS3189904140,CHN,LS3,LOU03,18990414,0.0,0:00PM,day,,,...,1,CHN,LS3,y,both,y,y,,1899,0
1,PHI189904140,WSN,PHI,PHI09,18990414,0.0,0:00PM,day,,,...,6,PHI,WSN,y,both,y,y,,1899,1
2,BLN189904150,NY1,BLN,BAL07,18990415,0.0,0:00PM,day,,,...,5,BLN,NY1,y,both,y,y,,1899,1
3,BRO189904150,BSN,BRO,NYC12,18990415,0.0,0:00PM,day,,,...,0,BSN,BRO,y,both,y,y,,1899,0
4,CIN189904150,PIT,CIN,CIN05,18990415,0.0,0:00PM,day,,,...,2,PIT,CIN,y,both,y,y,,1899,0


## 3. Add 'timestamp' Column

In [None]:
def get_hms(raw_starttime: str) -> Tuple[int, int]:
    """Parse a raw start time (e.g. '5:30PM') into a 24-hour (hour, minute) pair.

    Despite the name, only the hour and the minute are returned (no seconds).

    Parameters
    ----------
    raw_starttime
        Start time of the form 'H:MM', optionally followed by 'AM' or 'PM'
        (case-insensitive). May also be a missing value (NaN / None).

    Returns
    -------
    Tuple[int, int]
        ``(hour, minute)`` on a 24-hour clock. A missing or blank value
        returns ``(0, 0)``.

    Raises
    ------
    ValueError
        If the value is not blank and contains no ':' separator, or if the
        hour / minute parts are not integers.

    Notes
    -----
    * If the text does not end in 'AM' or 'PM' (e.g. '14:20' or '1:30?M'),
      the hour is taken as-is, i.e. a 24-hour clock is assumed.
    * '0:00PM' is read as 12:00 (noon).
    * '12:xxAM' is read as hour 0 (midnight).
    """
    if pd.isna(raw_starttime):
        return 0, 0

    # Surrounding whitespace would otherwise hide the AM/PM suffix check below.
    time_text = raw_starttime.strip()
    if not time_text:
        # A blank string carries no more information than a missing value.
        return 0, 0

    colon_idx = time_text.find(':')
    if colon_idx == -1:
        raise ValueError(
            f"Cannot parse start time {raw_starttime!r}: expected 'H:MM' "
            "optionally followed by AM/PM"
        )

    h = int(time_text[:colon_idx])
    m = int(time_text[colon_idx + 1:colon_idx + 3])

    meridiem = time_text[-2:].upper()
    if meridiem == 'PM' and h < 12:
        # Afternoon hours move to 13-23; '0:xxPM' becomes 12:xx. Hours that are
        # already 12 or above are left alone so they never exceed 23.
        h += 12
    elif meridiem == 'AM' and h == 12:
        # Edge case: 12:xxAM is midnight, i.e. hour 0.
        h = 0

    return h, m

def get_timestamp(game: pd.Series) -> pd.Timestamp:
    """Build the start timestamp for a single game row.

    Parameters
    ----------
    game
        One row of the games table, as passed by ``DataFrame.apply(..., axis=1)``.
        It must provide a 'date' value in YYYYMMDD form and a 'starttime'
        value that ``get_hms`` can parse.

    Returns
    -------
    pd.Timestamp
        The game date combined with the hour and minute from 'starttime'.
    """
    hour, minute = get_hms(game['starttime'])

    # Go through int() first so that a float-typed date (18990414.0) does not
    # leave a trailing '.0' in the day slice.
    raw_date = str(int(game['date']))

    year = int(raw_date[:4])
    month = int(raw_date[4:6])
    day = int(raw_date[6:])
    return pd.Timestamp(year=year, month=month, day=day, hour=hour, minute=minute)

Unnamed: 0,gid,visteam,hometeam,site,date,number,starttime,daynight,innings,tiebreaker,...,wteam,lteam,line,batteries,lineups,box,pbp,season,homewon,timestamp
0,LS3189904140,CHN,LS3,LOU03,18990414,0.0,0:00PM,day,,,...,CHN,LS3,y,both,y,y,,1899,0,1899-04-14 12:00:00
1,PHI189904140,WSN,PHI,PHI09,18990414,0.0,0:00PM,day,,,...,PHI,WSN,y,both,y,y,,1899,1,1899-04-14 12:00:00
2,BLN189904150,NY1,BLN,BAL07,18990415,0.0,0:00PM,day,,,...,BLN,NY1,y,both,y,y,,1899,1,1899-04-15 12:00:00
3,BRO189904150,BSN,BRO,NYC12,18990415,0.0,0:00PM,day,,,...,BSN,BRO,y,both,y,y,,1899,0,1899-04-15 12:00:00
4,CIN189904150,PIT,CIN,CIN05,18990415,0.0,0:00PM,day,,,...,PIT,CIN,y,both,y,y,,1899,0,1899-04-15 12:00:00


In [9]:
# Add the timestamp column

# axis=1 passes get_timestamp one row at a time; the returned pd.Timestamp values
# are stored in the new 'timestamp' column. The last line previews the result.
all_games['timestamp'] = all_games.apply(get_timestamp, axis=1)
all_games.head()

Unnamed: 0,gid,visteam,hometeam,site,date,number,starttime,daynight,innings,tiebreaker,...,wteam,lteam,line,batteries,lineups,box,pbp,season,homewon,timestamp
0,LS3189904140,CHN,LS3,LOU03,18990414,0.0,0:00PM,day,,,...,CHN,LS3,y,both,y,y,,1899,0,1899-04-14 12:00:00
1,PHI189904140,WSN,PHI,PHI09,18990414,0.0,0:00PM,day,,,...,PHI,WSN,y,both,y,y,,1899,1,1899-04-14 12:00:00
2,BLN189904150,NY1,BLN,BAL07,18990415,0.0,0:00PM,day,,,...,BLN,NY1,y,both,y,y,,1899,1,1899-04-15 12:00:00
3,BRO189904150,BSN,BRO,NYC12,18990415,0.0,0:00PM,day,,,...,BSN,BRO,y,both,y,y,,1899,0,1899-04-15 12:00:00
4,CIN189904150,PIT,CIN,CIN05,18990415,0.0,0:00PM,day,,,...,PIT,CIN,y,both,y,y,,1899,0,1899-04-15 12:00:00


## 4. Save to `.csv`

In [None]:
# Write the cleaned table to the current working directory; index=False keeps the
# DataFrame's row index out of the CSV.
all_games.to_csv('./gameinfo_cleaned.csv', index=False)