In [1]:
import pickle
import json
import pandas as pd
from pandas.io.json import json_normalize

# Never truncate columns or rows when a DataFrame is displayed.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

# Load up the 2018 data.
# NOTE: pickle.load can execute arbitrary code, so only open pickle files that
# were produced by a trusted source.
with open('outfile1', 'rb') as season_file:
    baseball_data = pickle.load(season_file)

# Entries to discard, removed strictly in this order. Every deletion shifts the
# entries behind it one position to the left, so each index refers to the list
# as it stands at that moment (which is why 108 is deleted three times in a row).
stale_positions = (
    -1,   # Last day, no games
    108,  # All star break, no games
    108,  # All star break, no games
    108,  # All star break, no games
    30,   # April 31 does not exist
    91,   # June 31 does not exist
)
for position in stale_positions:
    del baseball_data[position]


In [2]:
# Decode each day's raw JSON string into a Python object.
baseball_json = [json.loads(day) for day in baseball_data]

# API pull had a weird last key/value pair ('_comment') on every day; delete it.
for day in baseball_json:
    del day['_comment']

# Flatten the per-day records, then discard the league metadata columns.
baseball_json_norm = json_normalize(baseball_json).drop(
    columns=['league.alias', 'league.date', 'league.id', 'league.name'])

# Column 0 of every remaining row (presumably that day's list of games -- TODO
# confirm) is normalized into its own frame. The frames are collected and
# concatenated once instead of growing `data` with DataFrame.append inside a
# loop: repeated appends copy the whole frame each time, and DataFrame.append
# no longer exists in pandas 2.0. Iterating over len(baseball_json_norm) also
# replaces the hard-coded row count of 180, so a shorter or longer pull neither
# raises IndexError nor gets silently truncated.
daily_frames = [
    json_normalize(baseball_json_norm.iloc[row, 0])
    for row in range(len(baseball_json_norm))
]
data = pd.concat(daily_frames, ignore_index=True, sort=False)

# Win percentage of each side: wins / (wins + losses).
home_games = data['game.home.win'] + data['game.home.loss']
away_games = data['game.away.win'] + data['game.away.loss']
data['home.win.pct'] = data['game.home.win'] / home_games
data['away.win.pct'] = data['game.away.win'] / away_games

# Response variable: combined runs scored by both teams.
data['total.runs'] = data['game.home.runs'] + data['game.away.runs']

In [3]:
# Load the object pickled in 'outfile_april_12_pre'. Its `.columns` attribute is
# read by the next cell to decide which columns of `data` to keep.
# NOTE: only unpickle files from a trusted source.
with open('outfile_april_12_pre', 'rb') as pregame_file:
    baseball_data1 = pickle.load(pregame_file)

In [4]:
# Every column currently in `data`, and the columns to keep: those found in
# baseball_data1 plus the derived win-percentage and total-runs columns.
data_all_cols = data.columns.tolist()
data_good_cols = baseball_data1.columns.tolist() + [
    'home.win.pct',
    'away.win.pct',
    'total.runs',
]

# Drop whatever is in `data` but not on the keep list.
surplus_cols = set(data_all_cols).difference(data_good_cols)
data = data.drop(columns=list(surplus_cols))
# data contains only the 67 variables available before today's MLB games start

# Must also drop variables that will not be known before the game starts.
postgame_cols = [
    'game.away.errors',
    'game.away.hits',
    'game.away.runs',
    'game.home.errors',
    'game.home.hits',
    'game.home.runs',
]

# Must drop unnecessary variables.
unneeded_cols = [
    'game.away.id',
    'game.away.market',
    'game.away.name',
    'game.away.probable_pitcher.first_name',
    'game.away.probable_pitcher.jersey_number',
    'game.away.probable_pitcher.last_name',
    'game.away.probable_pitcher.preferred_name',
    'game.away_team',
    'game.coverage',
    'game.game_number',
    'game.home.id',
    'game.home.market',
    'game.home.name',
    'game.home.probable_pitcher.first_name',
    'game.home.probable_pitcher.jersey_number',
    'game.home.probable_pitcher.last_name',
    'game.home.probable_pitcher.preferred_name',
    'game.home_team',
    'game.rescheduled',
    'game.scheduled',
    'game.status',
    'game.venue.address',
    'game.venue.country',
    'game.venue.market',
    'game.venue.state',
    'game.venue.zip',
    'game.weather.forecast.obs_time',
    'game.id',
    'game.venue.city',
    'game.venue.id',
    'game.double_header',
    'game.venue.location.lat',
    'game.venue.location.lng',
]

# Two drops, in the original order; a missing column raises KeyError as before.
data = data.drop(columns=postgame_cols).drop(columns=unneeded_cols)

# Remove exact duplicate rows, then the row whose index label is 373 (pesky row).
# NOTE(review): why row 373 is removed is not recorded here -- confirm the label
# still points at the intended row whenever the input data changes.
data = data.drop_duplicates().drop(index=[373])

# Fill missing values by assigning the filled columns back to `data`.
# Calling fillna(inplace=True) on a column selection (data[col].fillna(...)) is
# chained assignment through an inplace method: recent pandas versions warn
# about it, and with Copy-on-Write enabled it modifies a temporary object and
# leaves `data` unchanged.

# Pitcher records set to 0 if NaN
pitcher_record_cols = [
    'game.away.probable_pitcher.loss',
    'game.away.probable_pitcher.win',
    'game.home.probable_pitcher.loss',
    'game.home.probable_pitcher.win',
]
data[pitcher_record_cols] = data[pitcher_record_cols].fillna(0)

# Missing values replaced with "unknown" for text values
text_cols = [
    'game.away.probable_pitcher.id',
    'game.home.probable_pitcher.id',
    'game.weather.forecast.condition',
    'game.weather.forecast.wind.direction',
    'game.venue.field_orientation',
    'game.venue.stadium_type',
]
data[text_cols] = data[text_cols].fillna('unknown')


# Weather data simplified
# Ordered (pattern, replacement) rules, each applied as a case-insensitive
# substring replacement. The order is significant: every rule runs on the
# output of the rules before it, and some patterns only match text produced by
# an earlier rule (e.g. "moderate or snow showers" matches what the "heavy snow"
# rule leaves behind). Do not reorder or deduplicate without re-checking results.
weather_rules = [
    ("moderate snow", "Snow"),
    ("heavy snow", "Snow"),
    ("moderate or heavy snow showers", "Snow"),
    ("moderate or snow showers", "Snow"),
    ("patchy snow", "Snow"),
    ("patchy light snow", "Snow"),
    ("light snow", "Snow"),
    ("light snow, mist", "Snow"),
    ("squalls", "Snow"),
    ("snow, mist", "Snow"),
    ("light sleet", "Snow"),
    ("light drizzle", "Rain"),
    ("patchy rain possible", "Rain"),
    ("Light rain", "Rain"),
    ("patchy light rain", "Rain"),
    ("moderate rain", "Rain"),
    ("light rain shower", "Rain"),
    ("patchy light drizzle", "Rain"),
    ("light rain, mist", "Rain"),
    ("light drizzle, mist", "Rain"),
    ("Shower in Vicinity", "Rain"),
    ("overcast", "Cloudy"),
    ("Thundery outbreaks possible", "Storm"),
    ("rain shower", "Rain"),
    ("thunderstorm in vicinity", "Storm"),
    ("heavy rain", "Storm"),
    ("torrential rain shower", "Storm"),
    ("rain with thunderstorm, mist", "Storm"),
    ("thunderstorm", "Storm"),
    ("thunderstorm, haze", "Storm"),
    ("heavy rain, mist", "Rain"),
    ("Partly cloudy", "Cloudy"),
    ("Patchy rain", "Rain"),
    ("Rain, Mist", "Mist"),
    ("Torrential Rain", "Storm"),
    ("Moderate or Storm", "Storm"),
    ("Patchy rain with thunder", "Storm"),
    ("rain with storm", "Storm"),
    ("storm, rain", "Storm"),
    ("storm with storm", "Storm"),
    ("Storm, Mist", "Mist"),
    ("Rain with thunder", "Storm"),
    ("Blowing Widespread Dust", "Haze"),
    ("Smoke", "Haze"),
    ("Storm, Haze", "Storm"),
    ("Haze, Smoke", "Haze"),
    ("Storm, Fog", "Storm"),
    ("Haze, Haze", "Haze"),
    ("Fog", "Mist"),
]

condition = data['game.weather.forecast.condition']
for pattern, simplified in weather_rules:
    condition = condition.str.replace(pattern, simplified, case = False)
data['game.weather.forecast.condition'] = condition

In [5]:
# Discard every row that still contains a missing value, then preview the
# last seven remaining rows.
data = data.dropna(axis='index', how='any')
data.tail(7)

Unnamed: 0,game.away.abbr,game.away.loss,game.away.probable_pitcher.era,game.away.probable_pitcher.id,game.away.probable_pitcher.loss,game.away.probable_pitcher.win,game.away.win,game.broadcast.network,game.day_night,game.home.abbr,game.home.loss,game.home.probable_pitcher.id,game.home.win,game.venue.capacity,game.venue.field_orientation,game.venue.name,game.venue.stadium_type,game.venue.surface,game.weather.forecast.cloud_cover,game.weather.forecast.condition,game.weather.forecast.dew_point_f,game.weather.forecast.humidity,game.weather.forecast.temp_f,game.weather.forecast.wind.direction,game.weather.forecast.wind.speed_mph,game.home.probable_pitcher.era,game.home.probable_pitcher.loss,game.home.probable_pitcher.win,home.win.pct,away.win.pct,total.runs
2438,NYY,61,4.669,1fdf3f8d-43ea-4430-b377-8af7f0ac7322,3.0,1.0,100,TBS,D,BOS,54,571e8056-f01b-430a-8fc8-f8c74402d1c0,107,37731,NE,Fenway Park,outdoor,grass,75.0,Cloudy,43.0,38.0,69.0,WSW,9.0,4.326,7.0,17.0,0.664596,0.621118,12
2439,STL,73,3.155,9c506364-e0fa-45b9-9313-16f6e87e0327,8.0,8.0,88,NBCS-CHI,D,CHC,67,816fd431-265c-4104-9339-1fc8c2385fc5,94,41649,NE,Wrigley Field,outdoor,grass,100.0,Mist,56.0,86.0,60.0,N,0.0,3.995,6.0,5.0,0.583851,0.546584,15
2440,PIT,79,7.254,9a62f791-fe7a-4e1f-9823-82a361f546bd,3.0,1.0,81,FS-CIN,D,CIN,94,53b0311e-bafa-400e-a76e-8ee6c7a37be4,67,42319,SE,Great American Ball Park,outdoor,grass,0.0,Sunny,59.0,52.0,79.0,SSW,9.0,5.374,11.0,8.0,0.416149,0.50625,11
2441,CLE,71,3.417,683ac24d-7777-478b-8417-d0eb3cb04e83,10.0,16.0,90,FS-KC,D,KC,103,4ca919f7-7727-4849-b8a4-ec7f992ea67b,58,37903,NE,Kauffman Stadium,outdoor,grass,100.0,Cloudy,64.0,69.0,76.0,S,12.0,5.4,5.0,1.0,0.360248,0.559006,3
2442,CWS,99,5.058,13b09db2-4de1-4452-b533-874d4f595121,13.0,5.0,62,FS-N,D,MIN,84,fe0fc487-45d3-4178-9fd7-6c00025aff7e,77,38649,E,Target Field,outdoor,grass,0.0,Sunny,42.0,66.0,53.0,N,6.0,6.612,2.0,0.0,0.478261,0.385093,9
2443,ARI,79,3.911,9deda06b-1928-4c58-acf0-717538f576f4,2.0,6.0,82,FS-SD,D,SD,96,c045bfe8-924c-4078-89e0-1d959c39a088,65,42445,N,PETCO Park,outdoor,grass,75.0,Cloudy,63.0,67.0,76.0,NW,12.0,4.137,9.0,8.0,0.403727,0.509317,7
2444,HOU,58,3.183,ff772241-8fdd-488c-a81e-49b44ce600fc,3.0,15.0,103,MASN,D,BAL,115,9046ffbf-da7f-4296-9e8c-b231d8c01bff,46,45971,NE,Oriole Park at Camden Yards,outdoor,grass,50.0,Cloudy,56.0,52.0,75.0,SSE,0.0,6.0,2.0,0.0,0.285714,0.639752,4


In [6]:
# Write the cleaned frame to the file 'model_data' as a pickle.
with open('model_data', 'wb') as model_file:
    pickle.dump(data, model_file)