# Objective:

The goal of this project is to determine the performance of a playoff team against their opponent and teams similar to their opponent, then use that performance data to simulate the winner of a game and series. Using this data that was tested on the 2018 playoffs, we will "simulate" the World Series to see who the more probable winner is according to the model. All data was scraped from baseball-reference.com.

In [1]:
## Import libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
import random
random.seed(145)
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import MinMaxScaler

In [2]:
## Read in team statistics

df_pitching = pd.read_csv(r'C:\Users\jbean\Dropbox\Other\Python\WS_simulation\pitching_stats.csv')
df_batting = pd.read_csv(r'C:\Users\jbean\Dropbox\Other\Python\WS_simulation\batting_stats.csv')

# Define functions

In [3]:
def read_in_data(lower_team_abb):
    
    """
    Takes in the team abbreviation and reads in the data set.
    """
    
    df = pd.read_csv(r'C:\Users\jbean\Dropbox\Other\Python\WS_simulation\%s_data.csv' %lower_team_abb)
    
    return df

In [4]:
def standardize_dfs(year, df_pitching = df_pitching, df_batting = df_batting):
    
    """
    Filters the data sets for only the desired year, then standardizes performance so that all units are on the same
    scale using MinMaxScaler. 
    """
    
    df_pitching = df_pitching[df_pitching.year == year]
    df_batting = df_batting[df_batting.year == year]
    
    df_pitching_numeric = df_pitching.drop(['team','year'], axis=1)
    df_batting_numeric = df_batting.drop(['team','year', 'games'], axis=1)
    
    teams = list(df_batting[df_batting.year == year]['team'])

    def standardize_cols(df):
    
        scaler = MinMaxScaler()
    
        df = df.astype(float)
    
        scaled_df = scaler.fit_transform(df)
        scaled_df = pd.DataFrame(scaled_df, columns = df.columns, index=teams)
        
        return scaled_df

    df_pitching_s = standardize_cols(df_pitching_numeric)
    df_batting_s = standardize_cols(df_batting_numeric)
    
    return df_pitching_s, df_batting_s

In [5]:
def neighbors_df(opp_league, df, similar_team_abb, opponent_abb, num_similar = 6, n_neighbors=30):
    
    """
    Use NearestNeighbors from sklearn to determine the closest teams in performance between an opponent and other
    teams in their league. The function starts by identifying the teams in the opponent's league, followed by 
    fitting a NearestNeighbors model to the full data set. Then, the index and distances are extracted
    from the model, the data is filtered for the team's league, and the opponent plus the 5 teams that
    are "closest" to them in their league are returned.
    """
    
    league = {'American':['BAL','BOS','CHW','CLE','DET','HOU','KCR','LAA','MIN','NYY','OAK','SEA','TBR','TEX','TOR'],
             'National':['ARI','ATL','CHC','CIN','COL','LAD','MIA','MIL','NYM','PHI','PIT','SDP','SFG','STL','WSN']}
    
    league_teams = [v for k, v in league.items() if opp_league == k]
    league_teams = [item for sublist in league_teams for item in sublist]
    
    neighbors = NearestNeighbors(n_neighbors = n_neighbors)
    
    neighbors.fit(df)
    
    distance, idx = neighbors.kneighbors(df[df.index == similar_team_abb], n_neighbors=n_neighbors)

    diff_df = pd.DataFrame()

    diff_df['idx'] = [idx.item(i) for i in range(idx.shape[1])]
    diff_df['distance'] = [distance.item(i) for i in range(distance.shape[1])]

    diff_df = diff_df.sort_values(by='idx')

    df_neighbor = df.copy()
    df_neighbor['distances'] = diff_df['distance'].values

    leagues = []

    for i in df_neighbor.index:
    
        for k, v in league.items():
        
            if i in v:
            
                leagues.append(k)

    df_neighbor['league'] = leagues

    df_neighbor = df_neighbor[(df_neighbor.league == opp_league) & (df_neighbor.index != opponent_abb)]
    
    df_neighbor = df_neighbor.sort_values(by='distances')

    df_neighbor = df_neighbor.iloc[:num_similar,:]
    
    return df_neighbor

In [6]:
def series_simulation_batting(runs_for_team1, runs_for_team2, series_length = 5, num_games_to_win = 3, num_simulations=20000):
    
    """
    Using the performance of a team against their opponent and most similar teams, we estimate the outcome of 
    how a team will perform with batting by randomly picking a score from each team's batting performance, 
    then choosing the team with the higher score as the winner. We continue to run game simulations until
    the team hits the number of games needed to win the series, then repeat the process for as many simulations
    as we desire.
    """
    
    winner = []
    total_games = []
    
    for i in range(num_simulations):
        
        team1_wins = 0
        team2_wins = 0
    
        for i in range(series_length):
            
            team1_score = np.random.choice(runs_for_team1)
            team2_score = np.random.choice(runs_for_team2)
    
            while team1_score == team2_score:
        
                team1_score = np.random.choice(runs_for_team1)
                team2_score = np.random.choice(runs_for_team2)
                
            if team1_score > team2_score:
                
                team1_wins += 1
            
            elif team2_score > team1_score:
                
                team2_wins += 1

            if (team1_wins == num_games_to_win) | (team2_wins == num_games_to_win):
                
                winner.append([1 if team1_wins == num_games_to_win else 0])
                total_games.append(team1_wins + team2_wins)
                
                break
                
    winner = [item for sublist in winner for item in sublist]
        
    return winner, total_games

In [7]:
def series_simulation_pitching(runs_against_team1, runs_against_team2, series_length = 5, num_games_to_win = 3, 
                               num_simulations=20000):
    
    """
    Using the performance of a team against their opponent and most similar teams, we estimate the outcome of 
    how a team will perform with pitching by randomly picking a score from each team's pitching performance, 
    then choosing the team with the lower score as the winner (i.e. less runs given up). We continue to run game 
    simulations until the team hits the number of games needed to win the series, then repeat the process for as 
    many simulations as we desire.
    """
    
    winner = []
    total_games = []
    
    for i in range(num_simulations):
        
        team1_wins = 0
        team2_wins = 0
    
        for i in range(series_length):
            
            team1_runs_allowed = np.random.choice(runs_against_team1)
            team2_runs_allowed = np.random.choice(runs_against_team2)
            
            while team1_runs_allowed == team2_runs_allowed:
        
                team1_runs_allowed = np.random.choice(runs_against_team1)
                team2_runs_allowed = np.random.choice(runs_against_team2)
                
            if team1_runs_allowed < team2_runs_allowed:
                
                team1_wins += 1
            
            elif team2_runs_allowed < team1_runs_allowed:
                
                team2_wins += 1
                
            else:
                continue
            
            if (team1_wins == num_games_to_win) | (team2_wins == num_games_to_win):
                
                winner.append([1 if team1_wins == num_games_to_win else 0])
                total_games.append(team1_wins + team2_wins)
                
                break
                
    winner = [item for sublist in winner for item in sublist]
        
    return winner, total_games

In [9]:
def win_probability(pitching_simulation_data, batting_simulation_data):
    
    """
    Using the simulated data, we determine the total number of wins for a team by adding across the columns
    so that for each simulation the result will be 0, 1, or 2. The total possible number of wins is the length
    of the data frame times 2, so we divide by that number and return the probability.
    """
    
    winner_df = pd.DataFrame({'pitching_winner':pitching_simulation_data, 'batting_winner':batting_simulation_data})
    
    winner_df['combined'] = winner_df['pitching_winner'] + winner_df['batting_winner']
    
    total_pred = winner_df.combined.sum() / (len(winner_df) * 2)
    
    return total_pred

In [8]:
df_pitching_s, df_batting_s = standardize_dfs(2018)

# Play in Game: Yankees vs. Oakland

In [10]:
df_nyy = read_in_data('nyy')
df_oak = read_in_data('oak')

nyy_pitching_neighbors = neighbors_df('American', df_pitching_s, 'NYY', 'OAK')
oak_pitching_neighbors = neighbors_df('American', df_pitching_s, 'OAK', 'NYY')

nyy_batting_neighbors = neighbors_df('American', df_batting_s, 'NYY', 'OAK')
oak_batting_neighbors = neighbors_df('American', df_batting_s, 'OAK', 'NYY')

nyy_batting_performance = df_nyy[df_nyy.opponent.isin(oak_pitching_neighbors.index)]
oak_batting_performance = df_oak[df_oak.opponent.isin(nyy_pitching_neighbors.index)]

nyy_pitching_performance = df_nyy[df_nyy.opponent.isin(oak_batting_neighbors.index)]
oak_pitching_performance = df_oak[df_oak.opponent.isin(nyy_batting_neighbors.index)]

nyy_oak_winner_b, nyy_oak_totalgames_b = series_simulation_batting(nyy_batting_performance['runs_for'], 
                                                                   oak_batting_performance['runs_for'],
                                                                  series_length=1, num_games_to_win = 1)

nyy_oak_winner_p, nyy_oak_totalgames_p = series_simulation_pitching(nyy_pitching_performance['runs_against'], 
                                                                   oak_pitching_performance['runs_against'],
                                                                    series_length= 1, num_games_to_win = 1)

nyy_win_prob_playin = win_probability(nyy_oak_winner_b, nyy_oak_winner_p)
oak_win_prob_playin = 1-nyy_win_prob_playin

In [11]:
print("Yankees's win probability: {:.0f}%".format(nyy_win_prob_playin * 100))
print("Oakland's win probability: {:.0f}%".format(oak_win_prob_playin * 100))

Yankees's win probability: 58%
Oakland's win probability: 42%


# Play in Game: Rockies vs. Cubs

In [12]:
df_col = read_in_data('col')
df_chc = read_in_data('chc')

col_pitching_neighbors = neighbors_df('National', df_pitching_s, 'COL', 'CHC')
chc_pitching_neighbors = neighbors_df('National', df_pitching_s, 'CHC', 'COL')

col_batting_neighbors = neighbors_df('National', df_batting_s, 'CHC', 'LAD')
chc_batting_neighbors = neighbors_df('National', df_batting_s, 'LAD', 'CHC')

col_batting_performance = df_col[df_col.opponent.isin(chc_pitching_neighbors.index)]
chc_batting_performance = df_chc[df_chc.opponent.isin(col_pitching_neighbors.index)]

col_pitching_performance = df_col[df_col.opponent.isin(chc_batting_neighbors.index)]
chc_pitching_performance = df_chc[df_chc.opponent.isin(col_batting_neighbors.index)]

col_chc_winner_b, col_chc_totalgames_b = series_simulation_batting(col_batting_performance['runs_for'], 
                                                                   chc_batting_performance['runs_for'])

col_chc_winner_p, col_chc_totalgames_p = series_simulation_pitching(col_pitching_performance['runs_against'], 
                                                                   chc_pitching_performance['runs_against'])

col_win_prob_playin = win_probability(col_chc_winner_p, col_chc_winner_b)
chc_win_prob_playin = 1-col_win_prob_playin

In [13]:
print("Rockie's win probability: {:.0f}%".format(col_win_prob_playin * 100))
print("Cubs's win probability: {:.0f}%".format(chc_win_prob_playin * 100))

Rockie's win probability: 53%
Cubs's win probability: 47%


# ALCS: Red Sox vs. Yankees

In [14]:
df_bos = read_in_data('bos')

bos_pitching_neighbors = neighbors_df('American', df_pitching_s, 'BOS', 'NYY')
nyy_pitching_neighbors = neighbors_df('American', df_pitching_s, 'NYY', 'BOS')

bos_batting_neighbors = neighbors_df('American', df_batting_s, 'BOS', 'NYY')
nyy_batting_neighbors = neighbors_df('American', df_batting_s, 'NYY', 'BOS')

nyy_batting_performance = df_nyy[df_nyy.opponent.isin(bos_pitching_neighbors.index)]
bos_batting_performance = df_bos[df_bos.opponent.isin(nyy_pitching_neighbors.index)]

bos_pitching_performance = df_bos[df_bos.opponent.isin(nyy_batting_neighbors.index)]
nyy_pitching_performance = df_nyy[df_nyy.opponent.isin(bos_batting_neighbors.index)]

nyy_bos_winner_b, nyy_bos_totalgames_b = series_simulation_batting(nyy_batting_performance['runs_for'], 
                                                                   bos_batting_performance['runs_for'])

nyy_bos_winner_p, nyy_bos_totalgames_p = series_simulation_pitching(nyy_pitching_performance['runs_against'], 
                                                                   bos_pitching_performance['runs_against'])
nyy_win_prob = win_probability(nyy_bos_winner_p, nyy_bos_winner_b)
bos_win_prob = 1-nyy_win_prob

In [17]:
print("Yankees's win probability: {:.0f}%".format(nyy_win_prob * 100))
print("Red Sox's win probability: {:.0f}%".format(bos_win_prob * 100))

Yankees's win probability: 52%
Red Sox's win probability: 48%


# ALDS: Cleveland vs. Houston simulation

In [21]:
df_cle = read_in_data('cle')
df_hou = read_in_data('hou')

cle_pitching_neighbors = neighbors_df('American', df_pitching_s, 'CLE', 'HOU')
hou_pitching_neighbors = neighbors_df('American', df_pitching_s, 'HOU', 'CLE')

cle_batting_neighbors = neighbors_df('American', df_batting_s, 'CLE', 'HOU')
hou_batting_neighbors = neighbors_df('American', df_batting_s, 'HOU', 'CLE')

cle_batting_performance = df_cle[df_cle.opponent.isin(hou_pitching_neighbors.index)]
hou_batting_performance = df_hou[df_hou.opponent.isin(cle_pitching_neighbors.index)]

cle_pitching_performance = df_cle[df_cle.opponent.isin(hou_batting_neighbors.index)]
hou_pitching_performance = df_hou[df_hou.opponent.isin(cle_batting_neighbors.index)]

cle_hou_winner_b, cle_hou_totalgames_b = series_simulation_batting(cle_batting_performance['runs_for'], 
                                                                   hou_batting_performance['runs_for'])

cle_hou_winner_p, cle_hou_totalgames_p = series_simulation_pitching(cle_pitching_performance['runs_against'], 
                                                                   hou_pitching_performance['runs_against'])

cle_win_prob = win_probability(cle_hou_winner_p, cle_hou_winner_b)
hou_win_prob = 1-cle_win_prob

In [22]:
print("Cleveland's win probability: {:.0f}%".format(cle_win_prob * 100))
print("Houston's win probability: {:.0f}%".format(hou_win_prob * 100))

Cleveland's win probability: 33%
Houston's win probability: 67%


# NLDS: Los Angeles Dodgers vs. Atlanta Braves

In [23]:
df_atl = read_in_data('atl')
df_lad = read_in_data('lad')

atl_pitching_neighbors = neighbors_df('National', df_pitching_s, 'ATL', 'LAD')
lad_pitching_neighbors = neighbors_df('National', df_pitching_s, 'LAD', 'ATL')

atl_batting_neighbors = neighbors_df('National', df_batting_s, 'ATL', 'LAD')
lad_batting_neighbors = neighbors_df('National', df_batting_s, 'LAD', 'ATL')

atl_batting_performance = df_atl[df_atl.opponent.isin(lad_pitching_neighbors.index)]
lad_batting_performance = df_lad[df_lad.opponent.isin(atl_pitching_neighbors.index)]

atl_pitching_performance = df_atl[df_atl.opponent.isin(lad_batting_neighbors.index)]
lad_pitching_performance = df_lad[df_lad.opponent.isin(atl_batting_neighbors.index)]

lad_atl_winner_b, lad_atl_totalgames_b = series_simulation_batting(lad_batting_performance['runs_for'], 
                                                                   atl_batting_performance['runs_for'])

lad_atl_winner_p, lad_atl_totalgames_p = series_simulation_pitching(lad_pitching_performance['runs_against'], 
                                                                   atl_pitching_performance['runs_against'])

lad_win_prob = win_probability(lad_atl_winner_p, lad_atl_winner_b)
atl_win_prob = 1-lad_win_prob

In [24]:
print("Dodger's win probability: {:.0f}%".format(lad_win_prob * 100))
print("Atlanta's win probability: {:.0f}%".format(atl_win_prob * 100))

Dodger's win probability: 60%
Atlanta's win probability: 40%


# NLDS: Colorado vs. Milwaukee

In [25]:
df_mil = read_in_data('mil')

col_pitching_neighbors = neighbors_df('National', df_pitching_s, 'COL', 'MIL')
mil_pitching_neighbors = neighbors_df('National', df_pitching_s, 'MIL', 'COL')

col_batting_neighbors = neighbors_df('National', df_batting_s, 'COL', 'MIL')
mil_batting_neighbors = neighbors_df('National', df_batting_s, 'MIL', 'COL')

col_batting_performance = df_col[df_col.opponent.isin(mil_pitching_neighbors.index)]
mil_batting_performance = df_mil[df_mil.opponent.isin(col_pitching_neighbors.index)]

col_pitching_performance = df_col[df_col.opponent.isin(mil_batting_neighbors.index)]
mil_pitching_performance = df_mil[df_mil.opponent.isin(col_batting_neighbors.index)]

col_mil_winner_b, col_mil_totalgames_b = series_simulation_batting(col_batting_performance['runs_for'], 
                                                                   mil_batting_performance['runs_for'])

col_mil_winner_p, col_mil_totalgames_p = series_simulation_pitching(col_pitching_performance['runs_against'], 
                                                                   mil_pitching_performance['runs_against'])

col_win_prob = win_probability(col_mil_winner_p, col_mil_winner_b)
mil_win_prob = 1-col_win_prob

In [26]:
print("Colorado's win probability: {:.0f}%".format(col_win_prob * 100))
print("Milwaukee's win probability: {:.0f}%".format(mil_win_prob * 100))

Colorado's win probability: 37%
Milwaukee's win probability: 63%


# ALCS: Red Sox vs. Houston

In [30]:
bos_pitching_neighbors = neighbors_df('American', df_pitching_s, 'BOS', 'HOU')
hou_pitching_neighbors = neighbors_df('American', df_pitching_s, 'HOU', 'BOS')

bos_batting_neighbors = neighbors_df('American', df_batting_s, 'BOS', 'HOU')
hou_batting_neighbors = neighbors_df('American', df_batting_s, 'HOU', 'BOS')

bos_batting_performance = df_bos[df_bos.opponent.isin(hou_pitching_neighbors.index)]
hou_batting_performance = df_hou[df_hou.opponent.isin(bos_pitching_neighbors.index)]

bos_pitching_performance = df_bos[df_bos.opponent.isin(hou_batting_neighbors.index)]
hou_pitching_performance = df_hou[df_hou.opponent.isin(bos_batting_neighbors.index)]

bos_hou_winner_b, bos_hou_totalgames_b = series_simulation_batting(bos_batting_performance['runs_for'], 
                                                                   hou_batting_performance['runs_for'], 
                                                                   series_length = 7, num_games_to_win = 4)

bos_hou_winner_p, bos_hou_totalgames_p = series_simulation_pitching(bos_pitching_performance['runs_against'], 
                                                                   hou_pitching_performance['runs_against'],
                                                                   series_length = 7, num_games_to_win = 4)

bos_win_prob_alcs = win_probability(bos_hou_winner_p, bos_hou_winner_b)
hou_win_prob_alcs = 1-bos_win_prob_alcs

In [31]:
print("Boston's win probability: {:.0f}%".format(bos_win_prob_alcs * 100))
print("Houston's win probability: {:.0f}%".format(hou_win_prob_alcs * 100))

Boston's win probability: 45%
Houston's win probability: 55%


# NLCS: Brewers vs. Dodgers

In [32]:
mil_pitching_neighbors = neighbors_df('National', df_pitching_s, 'MIL', 'LAD')
lad_pitching_neighbors = neighbors_df('National', df_pitching_s, 'LAD', 'MIL')

mil_batting_neighbors = neighbors_df('National', df_batting_s, 'MIL', 'LAD')
lad_batting_neighbors = neighbors_df('National', df_batting_s, 'LAD', 'MIL')

mil_batting_performance = df_mil[df_mil.opponent.isin(lad_pitching_neighbors.index)]
lad_batting_performance = df_lad[df_lad.opponent.isin(mil_pitching_neighbors.index)]

mil_pitching_performance = df_mil[df_mil.opponent.isin(lad_batting_neighbors.index)]
lad_pitching_performance = df_lad[df_lad.opponent.isin(mil_batting_neighbors.index)]

lad_mil_winner_b, lad_mil_totalgames_b = series_simulation_batting(lad_batting_performance['runs_for'], 
                                                                   atl_batting_performance['runs_for'],
                                                                  series_length=7, num_games_to_win = 4)

lad_mil_winner_p, lad_mil_totalgames_p = series_simulation_pitching(lad_pitching_performance['runs_against'], 
                                                                   atl_pitching_performance['runs_against'],
                                                                    series_length=7, num_games_to_win = 4)

lad_win_prob_nlcs = win_probability(lad_mil_winner_p, lad_mil_winner_b)
mil_win_prob_nlcs = 1-lad_win_prob_nlcs

In [33]:
print("Dodger's win probability: {:.0f}%".format(lad_win_prob_nlcs * 100))
print("Milwaukee's win probability: {:.0f}%".format(mil_win_prob_nlcs * 100))

Dodger's win probability: 69%
Milwaukee's win probability: 31%


# World Series: Dodgers vs Red Sox

In [34]:
bos_pitching_neighbors = neighbors_df('National', df_pitching_s, 'BOS', 'LAD')
lad_pitching_neighbors = neighbors_df('American', df_pitching_s, 'LAD', 'BOS')

bos_batting_neighbors = neighbors_df('National', df_batting_s, 'BOS', 'LAD')
lad_batting_neighbors = neighbors_df('American', df_batting_s, 'LAD', 'BOS')

bos_batting_performance = df_bos[df_bos.opponent.isin(lad_pitching_neighbors.index)]
lad_batting_performance = df_lad[df_lad.opponent.isin(bos_pitching_neighbors.index)]

bos_pitching_performance = df_bos[df_bos.opponent.isin(lad_batting_neighbors.index)]
lad_pitching_performance = df_lad[df_lad.opponent.isin(bos_batting_neighbors.index)]

lad_bos_winner_b, lad_bos_totalgames_b = series_simulation_batting(lad_batting_performance['runs_for'], 
                                                                   bos_batting_performance['runs_for'],
                                                                  series_length=7, num_games_to_win = 4)

lad_bos_winner_p, lad_bos_totalgames_p = series_simulation_pitching(lad_pitching_performance['runs_against'], 
                                                                   bos_pitching_performance['runs_against'],
                                                                    series_length=7, num_games_to_win = 4)

lad_win_prob_ws = win_probability(lad_bos_winner_p, lad_bos_winner_b)
bos_win_prob_ws = 1-lad_win_prob_ws

In [36]:
print("Dodger's win probability: {:.0f}%".format(lad_win_prob_ws * 100))
print("Boston's win probability: {:.0f}%".format(bos_win_prob_ws * 100))

Dodger's win probability: 67%
Boston's win probability: 33%


In [35]:
avg_games = np.mean(lad_bos_totalgames_b + lad_bos_totalgames_p)

print('The average number of games to finish the series is: %.1f games' %avg_games)

The average number of games to finish the series is: 5.7 games
