# Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import requests
from lxml import html
from bs4 import BeautifulSoup
import re
from urllib.parse import urlparse, parse_qs

import pprint

from openpyxl import load_workbook

from scipy.stats import truncnorm

from datetime import datetime

# Get data from retrosheet

In [2]:
session_a = requests.session()

In [3]:
url_a = 'https://www.retrosheet.org/boxesetc/1995/Y_1995.htm'

In [4]:
a = session_a.get(url_a)

In [5]:
soup_a = BeautifulSoup(a.text, 'html.parser')
soup_a

<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0//EN" "https://www.w3.org/TR/REC-html40/strict.dtd">

<html dir="LTR" lang="EN">
<pre><a href="../MISC/Y_descr.htm">Read Me</a></pre>
<head>
<title>The 1995 Season</title>
<link href="https://www.retrosheet.org/menubar/menubar.css" rel="stylesheet" type="text/css"/>
<script src="https://www.retrosheet.org/menubar/menubar.js" type="text/javascript"></script>
</head>
<body>
<p class="nopad"><a href="https://www.retrosheet.org"><img alt="Retrosheet" class="bancenter" height="46" src="https://www.retrosheet.org/menubar/retro-logo.gif" width="400"/></a></p>
<div class="mbcenter">
<ul class="nav">
<li><a href="https://www.retrosheet.org/">Home</a>
<li><a href="https://www.retrosheet.org/searches/search.html">Search</a></li>
<li><a href="#">Games/People/Parks ↓</a>
<ul>
<li><a href="#">People →</a>
<ul>
<li><a href="https://www.retrosheet.org/boxesetc/index.html#Players">Players</a>
<li><a href="https://www.retrosheet.org/boxesetc/index.html#Managers

In [6]:
tags = soup_a.find_all('a', {'href': re.compile(r'../1995/T\w.*')})
tags


[<a href="../1995/TATL01995.htm">Atlanta Braves</a>,
 <a href="../1995/TNYN01995.htm">New York Mets</a>,
 <a href="../1995/TPHI01995.htm">Philadelphia Phillies</a>,
 <a href="../1995/TFLO01995.htm">Florida Marlins</a>,
 <a href="../1995/TMON01995.htm">Montreal Expos</a>,
 <a href="../1995/TCIN01995.htm">Cincinnati Reds</a>,
 <a href="../1995/THOU01995.htm">Houston Astros</a>,
 <a href="../1995/TCHN01995.htm">Chicago Cubs</a>,
 <a href="../1995/TSLN01995.htm">St. Louis Cardinals</a>,
 <a href="../1995/TPIT01995.htm">Pittsburgh Pirates</a>,
 <a href="../1995/TLAN01995.htm">Los Angeles Dodgers</a>,
 <a href="../1995/TCOL01995.htm">Colorado Rockies</a>,
 <a href="../1995/TSDN01995.htm">San Diego Padres</a>,
 <a href="../1995/TSFN01995.htm">San Francisco Giants</a>,
 <a href="../1995/TBOS01995.htm">Boston Red Sox</a>,
 <a href="../1995/TNYA01995.htm">New York Yankees</a>,
 <a href="../1995/TBAL01995.htm">Baltimore Orioles</a>,
 <a href="../1995/TDET01995.htm">Detroit Tigers</a>,
 <a href=".

In [7]:
my_hrefs = []
for t in tags:
    link = t.get('href')
    p = urlparse(link).path.rsplit("/", 1)[-1]
    my_hrefs.append(p)
my_hrefs

['TATL01995.htm',
 'TNYN01995.htm',
 'TPHI01995.htm',
 'TFLO01995.htm',
 'TMON01995.htm',
 'TCIN01995.htm',
 'THOU01995.htm',
 'TCHN01995.htm',
 'TSLN01995.htm',
 'TPIT01995.htm',
 'TLAN01995.htm',
 'TCOL01995.htm',
 'TSDN01995.htm',
 'TSFN01995.htm',
 'TBOS01995.htm',
 'TNYA01995.htm',
 'TBAL01995.htm',
 'TDET01995.htm',
 'TTOR01995.htm',
 'TCLE01995.htm',
 'TKCA01995.htm',
 'TCHA01995.htm',
 'TMIL01995.htm',
 'TMIN01995.htm',
 'TSEA01995.htm',
 'TCAL01995.htm',
 'TTEX01995.htm',
 'TOAK01995.htm']

In [8]:
my_teams = [char[1:4] for char in my_hrefs]

In [9]:
my_teams

['ATL',
 'NYN',
 'PHI',
 'FLO',
 'MON',
 'CIN',
 'HOU',
 'CHN',
 'SLN',
 'PIT',
 'LAN',
 'COL',
 'SDN',
 'SFN',
 'BOS',
 'NYA',
 'BAL',
 'DET',
 'TOR',
 'CLE',
 'KCA',
 'CHA',
 'MIL',
 'MIN',
 'SEA',
 'CAL',
 'TEX',
 'OAK']

In [10]:
session = requests.session()

In [11]:
url = "https://www.retrosheet.org/boxesetc/1995/TCLE01995.htm"

In [12]:
s = session.get(url)

In [13]:
soup = BeautifulSoup(s.text, 'html.parser')

In [14]:
soup

<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0//EN" "https://www.w3.org/TR/REC-html40/strict.dtd">

<html dir="LTR" lang="EN">
<pre><a href="../MISC/Tdescr.htm">Read Me</a></pre>
<head>
<title>The 1995 Cleveland Indians</title>
<link href="https://www.retrosheet.org/menubar/menubar.css" rel="stylesheet" type="text/css"/>
<script src="https://www.retrosheet.org/menubar/menubar.js" type="text/javascript"></script>
</head>
<body>
<p class="nopad"><a href="https://www.retrosheet.org"><img alt="Retrosheet" class="bancenter" height="46" src="https://www.retrosheet.org/menubar/retro-logo.gif" width="400"/></a></p>
<div class="mbcenter">
<ul class="nav">
<li><a href="https://www.retrosheet.org/">Home</a>
<li><a href="https://www.retrosheet.org/searches/search.html">Search</a></li>
<li><a href="#">Games/People/Parks ↓</a>
<ul>
<li><a href="#">People →</a>
<ul>
<li><a href="https://www.retrosheet.org/boxesetc/index.html#Players">Players</a>
<li><a href="https://www.retrosheet.org/boxesetc/index.htm

In [15]:
first_pre = soup.find('h4')
first_pre

<h4>Home and Away Record</h4>

In [16]:
first_pre.find_next_siblings('pre')[0].text

'                   Total                      Home                       Away\nTeam       G    W    L   RS   RA      G    W    L   RS   RA      G    W    L   RS   RA\nTotal    144  100   44  840  607     72   54   18  400  272     72   46   26  440  335\nBAL A     12   10    2   57   39      6    6    0   30   11      6    4    2   27   28\nBOS A     13    7    6   71   69      7    4    3   34   31      6    3    3   37   38\nCAL A      5    2    3   27   32      2    1    1   10   13      3    1    2   17   19\nCHI A     13    8    5   79   59      7    6    1   47   29      6    2    4   32   30\nDET A     13   10    3   91   44      6    6    0   36   14      7    4    3   55   30\nKC  A     12   11    1   76   28      6    6    0   48   15      6    5    1   28   13\nMIL A     13    9    4   83   64      7    4    3   37   37      6    5    1   46   27\nMIN A     13    9    4   91   66      6    4    2   40   31      7    5    2   51   35\nNY  A     12    6    6   54   57      6 

In [17]:
data = first_pre.find_next_siblings('pre')[0].text
data

'                   Total                      Home                       Away\nTeam       G    W    L   RS   RA      G    W    L   RS   RA      G    W    L   RS   RA\nTotal    144  100   44  840  607     72   54   18  400  272     72   46   26  440  335\nBAL A     12   10    2   57   39      6    6    0   30   11      6    4    2   27   28\nBOS A     13    7    6   71   69      7    4    3   34   31      6    3    3   37   38\nCAL A      5    2    3   27   32      2    1    1   10   13      3    1    2   17   19\nCHI A     13    8    5   79   59      7    6    1   47   29      6    2    4   32   30\nDET A     13   10    3   91   44      6    6    0   36   14      7    4    3   55   30\nKC  A     12   11    1   76   28      6    6    0   48   15      6    5    1   28   13\nMIL A     13    9    4   83   64      7    4    3   37   37      6    5    1   46   27\nMIN A     13    9    4   91   66      6    4    2   40   31      7    5    2   51   35\nNY  A     12    6    6   54   57      6 

In [18]:
table = pd.DataFrame([x.strip('\t') for x in data.split('\n')], columns=["unsplit"])
table.drop(table.index[0:3], axis=0, inplace=True)
table.drop(table.index[-1], axis=0, inplace=True)
table = table.reset_index()
table = table.drop(columns="index")
table

Unnamed: 0,unsplit
0,BAL A 12 10 2 57 39 6 6 ...
1,BOS A 13 7 6 71 69 7 4 ...
2,CAL A 5 2 3 27 32 2 1 ...
3,CHI A 13 8 5 79 59 7 6 ...
4,DET A 13 10 3 91 44 6 6 ...
5,KC A 12 11 1 76 28 6 6 ...
6,MIL A 13 9 4 83 64 7 4 ...
7,MIN A 13 9 4 91 66 6 4 ...
8,NY A 12 6 6 54 57 6 2 ...
9,OAK A 7 7 0 34 17 4 4 ...


In [19]:
table.shape

(13, 1)

In [20]:
my_cols = [
    'opponent', 'league', 'total_g', 'total_w', 'total_l', 'total_rs', 'total_ra', 
    'home_g', 'home_w', 'home_l', 'home_rs', 'home_ra', 
    'away_g', 'away_w', 'away_l', 'away_rs', 'away_ra'
]
df = pd.DataFrame() 
df[my_cols] = table['unsplit'].str.split(r'\s+',expand=True)
df["team"] = "CLE"
df

Unnamed: 0,opponent,league,total_g,total_w,total_l,total_rs,total_ra,home_g,home_w,home_l,home_rs,home_ra,away_g,away_w,away_l,away_rs,away_ra,team
0,BAL,A,12,10,2,57,39,6,6,0,30,11,6,4,2,27,28,CLE
1,BOS,A,13,7,6,71,69,7,4,3,34,31,6,3,3,37,38,CLE
2,CAL,A,5,2,3,27,32,2,1,1,10,13,3,1,2,17,19,CLE
3,CHI,A,13,8,5,79,59,7,6,1,47,29,6,2,4,32,30,CLE
4,DET,A,13,10,3,91,44,6,6,0,36,14,7,4,3,55,30,CLE
5,KC,A,12,11,1,76,28,6,6,0,48,15,6,5,1,28,13,CLE
6,MIL,A,13,9,4,83,64,7,4,3,37,37,6,5,1,46,27,CLE
7,MIN,A,13,9,4,91,66,6,4,2,40,31,7,5,2,51,35,CLE
8,NY,A,12,6,6,54,57,6,2,4,21,30,6,4,2,33,27,CLE
9,OAK,A,7,7,0,34,17,4,4,0,20,12,3,3,0,14,5,CLE


In [21]:
type_dict = {
    'total_g': 'int', 
    'total_w': 'int',
    'total_l': 'int',
    'total_rs': 'int',
    'total_ra': 'int',
    'home_g': 'int', 
    'home_w': 'int',
    'home_l': 'int',
    'home_rs': 'int',
    'home_ra': 'int',
    'away_g': 'int', 
    'away_w': 'int',
    'away_l': 'int',
    'away_rs': 'int',
    'away_ra': 'int',
}
df = df.astype(type_dict)
df

Unnamed: 0,opponent,league,total_g,total_w,total_l,total_rs,total_ra,home_g,home_w,home_l,home_rs,home_ra,away_g,away_w,away_l,away_rs,away_ra,team
0,BAL,A,12,10,2,57,39,6,6,0,30,11,6,4,2,27,28,CLE
1,BOS,A,13,7,6,71,69,7,4,3,34,31,6,3,3,37,38,CLE
2,CAL,A,5,2,3,27,32,2,1,1,10,13,3,1,2,17,19,CLE
3,CHI,A,13,8,5,79,59,7,6,1,47,29,6,2,4,32,30,CLE
4,DET,A,13,10,3,91,44,6,6,0,36,14,7,4,3,55,30,CLE
5,KC,A,12,11,1,76,28,6,6,0,48,15,6,5,1,28,13,CLE
6,MIL,A,13,9,4,83,64,7,4,3,37,37,6,5,1,46,27,CLE
7,MIN,A,13,9,4,91,66,6,4,2,40,31,7,5,2,51,35,CLE
8,NY,A,12,6,6,54,57,6,2,4,21,30,6,4,2,33,27,CLE
9,OAK,A,7,7,0,34,17,4,4,0,20,12,3,3,0,14,5,CLE


In [22]:
df["total_win_pct"] = (df["total_w"] / df["total_g"]).astype(float)
df

Unnamed: 0,opponent,league,total_g,total_w,total_l,total_rs,total_ra,home_g,home_w,home_l,home_rs,home_ra,away_g,away_w,away_l,away_rs,away_ra,team,total_win_pct
0,BAL,A,12,10,2,57,39,6,6,0,30,11,6,4,2,27,28,CLE,0.833333
1,BOS,A,13,7,6,71,69,7,4,3,34,31,6,3,3,37,38,CLE,0.538462
2,CAL,A,5,2,3,27,32,2,1,1,10,13,3,1,2,17,19,CLE,0.4
3,CHI,A,13,8,5,79,59,7,6,1,47,29,6,2,4,32,30,CLE,0.615385
4,DET,A,13,10,3,91,44,6,6,0,36,14,7,4,3,55,30,CLE,0.769231
5,KC,A,12,11,1,76,28,6,6,0,48,15,6,5,1,28,13,CLE,0.916667
6,MIL,A,13,9,4,83,64,7,4,3,37,37,6,5,1,46,27,CLE,0.692308
7,MIN,A,13,9,4,91,66,6,4,2,40,31,7,5,2,51,35,CLE,0.692308
8,NY,A,12,6,6,54,57,6,2,4,21,30,6,4,2,33,27,CLE,0.5
9,OAK,A,7,7,0,34,17,4,4,0,20,12,3,3,0,14,5,CLE,1.0


In [23]:
df2 = df[["team", "opponent", "league", "total_win_pct"]]
df2

Unnamed: 0,team,opponent,league,total_win_pct
0,CLE,BAL,A,0.833333
1,CLE,BOS,A,0.538462
2,CLE,CAL,A,0.4
3,CLE,CHI,A,0.615385
4,CLE,DET,A,0.769231
5,CLE,KC,A,0.916667
6,CLE,MIL,A,0.692308
7,CLE,MIN,A,0.692308
8,CLE,NY,A,0.5
9,CLE,OAK,A,1.0


In [24]:
my_hrefs[0]

'TATL01995.htm'

In [25]:
my_hrefs

['TATL01995.htm',
 'TNYN01995.htm',
 'TPHI01995.htm',
 'TFLO01995.htm',
 'TMON01995.htm',
 'TCIN01995.htm',
 'THOU01995.htm',
 'TCHN01995.htm',
 'TSLN01995.htm',
 'TPIT01995.htm',
 'TLAN01995.htm',
 'TCOL01995.htm',
 'TSDN01995.htm',
 'TSFN01995.htm',
 'TBOS01995.htm',
 'TNYA01995.htm',
 'TBAL01995.htm',
 'TDET01995.htm',
 'TTOR01995.htm',
 'TCLE01995.htm',
 'TKCA01995.htm',
 'TCHA01995.htm',
 'TMIL01995.htm',
 'TMIN01995.htm',
 'TSEA01995.htm',
 'TCAL01995.htm',
 'TTEX01995.htm',
 'TOAK01995.htm']

In [28]:
# Create function to scrape all relevant pages
base_url = "https://www.retrosheet.org/boxesetc/"



def get_head2head(year, hrefs, cols, teams, type_dict):
    session = requests.session()
    data_frames = []  # List to store each event2 DataFrame
    div_count = 0
    
    for href in hrefs:
        result = session.get(base_url + str(year) + "/" + hrefs[div_count])
        soup = BeautifulSoup(result.text, 'html.parser')
        
        first_pre = soup.find('h4')
        data = first_pre.find_next_siblings('pre')[0].text
        
        table = pd.DataFrame([x.strip('\t') for x in data.split('\n')], columns=["unsplit"])
        table.drop(table.index[0:3], axis=0, inplace=True)
        table.drop(table.index[-1], axis=0, inplace=True)
        table = table.reset_index()
        table = table.drop(columns="index")
        
        event = pd.DataFrame()
        event[cols] = table['unsplit'].str.split(r'\s+', expand=True)
        event["team"] = teams[div_count]
        event = event.astype(type_dict)
        event["total_win_pct"] = (event["total_w"] / event["total_g"]).astype(float)
        
        event.loc[(event["total_win_pct"].between(0, 0.02778)), "roll"] = 11
        event.loc[(event["total_win_pct"].between(0.027778, 0.055556)), "roll"] = 12
        event.loc[(event["total_win_pct"].between(0.055556, 0.083333)), "roll"] = 13
        event.loc[(event["total_win_pct"].between(0.083333, 0.111111)), "roll"] = 14
        event.loc[(event["total_win_pct"].between(0.111111, 0.138889)), "roll"] = 15
        event.loc[(event["total_win_pct"].between(0.138889, 0.166667)), "roll"] = 16
        event.loc[(event["total_win_pct"].between(0.166667, 0.194444)), "roll"] = 21
        event.loc[(event["total_win_pct"].between(0.194444, 0.222222)), "roll"] = 22
        event.loc[(event["total_win_pct"].between(0.222222, 0.25)), "roll"] = 23
        event.loc[(event["total_win_pct"].between(0.25, 0.277778)), "roll"] = 24
        event.loc[(event["total_win_pct"].between(0.277778, 0.305556)), "roll"] = 25
        event.loc[(event["total_win_pct"].between(0.305556, 0.333333)), "roll"] = 26
        event.loc[(event["total_win_pct"].between(0.333333, 0.361111)), "roll"] = 31
        event.loc[(event["total_win_pct"].between(0.361111, 0.388889)), "roll"] = 32
        event.loc[(event["total_win_pct"].between(0.388889, 0.416667)), "roll"] = 33
        event.loc[(event["total_win_pct"].between(0.416667, 0.444444)), "roll"] = 34
        event.loc[(event["total_win_pct"].between(0.444444, 0.472222)), "roll"] = 35
        event.loc[(event["total_win_pct"].between(0.472222, 0.5)), "roll"] = 36
        event.loc[(event["total_win_pct"].between(0.50, 0.527778)), "roll"] = 41
        event.loc[(event["total_win_pct"].between(0.527777, 0.555556)), "roll"] = 42
        event.loc[(event["total_win_pct"].between(0.555556, 0.583333)), "roll"] = 43
        event.loc[(event["total_win_pct"].between(0.583333, 0.611111)), "roll"] = 44
        event.loc[(event["total_win_pct"].between(0.611111, 0.638889)), "roll"] = 45
        event.loc[(event["total_win_pct"].between(0.638889, 0.666667)), "roll"] = 46
        event.loc[(event["total_win_pct"].between(0.666667, 0.694444)), "roll"] = 51
        event.loc[(event["total_win_pct"].between(0.694444, 0.722222)), "roll"] = 52
        event.loc[(event["total_win_pct"].between(0.722222, 0.75)), "roll"] = 53
        event.loc[(event["total_win_pct"].between(0.750000, 0.777778)), "roll"] = 54
        event.loc[(event["total_win_pct"].between(0.777778, 0.805556)), "roll"] = 55
        event.loc[(event["total_win_pct"].between(0.805556, 0.833333)), "roll"] = 56
        event.loc[(event["total_win_pct"].between(0.833333, 0.861111)), "roll"] = 61
        event.loc[(event["total_win_pct"].between(0.861111, 0.888889)), "roll"] = 62
        event.loc[(event["total_win_pct"].between(0.888889, 0.916667)), "roll"] = 63
        event.loc[(event["total_win_pct"].between(0.916667, 0.944444)), "roll"] = 64
        event.loc[(event["total_win_pct"].between(0.944444, 0.972222)), "roll"] = 65
        event.loc[(event["total_win_pct"].between(0.972222, 1)), "roll"] = 66
        event["roll"] = event["roll"].replace(np.nan, "")
        
        event2 = event[["team", "opponent", "league", "total_win_pct", "roll"]]
        data_frames.append(event2)  # Append DataFrame to list instead of using .append()
        
        div_count += 1
    
    # Concatenate all DataFrames at once (more efficient)
    events = pd.concat(data_frames, ignore_index=True)
    return events

In [59]:
pd.set_option('display.max_rows', 400)
pd.set_option('display.max_columns', 50)
win_pct_1995 = get_head2head(1995, my_hrefs, my_cols, my_teams, type_dict)
win_pct_1995

Unnamed: 0,team,opponent,league,total_win_pct,roll
0,ATL,CHI,N,0.666667,46.0
1,ATL,CIN,N,0.615385,45.0
2,ATL,COL,N,0.692308,51.0
3,ATL,FLA,N,0.769231,54.0
4,ATL,HOU,N,0.5,41.0
5,ATL,LA,N,0.555556,42.0
6,ATL,MON,N,0.692308,51.0
7,ATL,NY,N,0.384615,32.0
8,ATL,PHI,N,0.538462,42.0
9,ATL,PIT,N,0.666667,46.0


In [60]:
win_pct_1995 = win_pct_1995.sort_values(['league', 'team', 'opponent'], ascending=[True, True, True])
win_pct_1995

Unnamed: 0,team,opponent,league,total_win_pct,roll
208,BAL,BOS,A,0.307692,26.0
209,BAL,CAL,A,0.692308,51.0
210,BAL,CHI,A,0.857143,61.0
211,BAL,CLE,A,0.166667,16.0
212,BAL,DET,A,0.615385,45.0
213,BAL,KC,A,0.444444,35.0
214,BAL,MIL,A,0.583333,44.0
215,BAL,MIN,A,0.333333,31.0
216,BAL,NY,A,0.461538,35.0
217,BAL,OAK,A,0.416667,33.0


In [61]:
table_1995 = pd.pivot_table(win_pct_1995, values='roll', index=['team'],
                    columns=['opponent', 'league'], aggfunc="sum", fill_value="", sort=False)
table_1995

opponent,BOS,CAL,CHI,CLE,DET,KC,MIL,MIN,NY,OAK,SEA,TEX,TOR,BAL,CHI,CIN,COL,FLA,HOU,LA,MON,NY,PHI,PIT,SD,SF,STL,ATL
league,A,A,A,A,A,A,A,A,A,A,A,A,A,A,N,N,N,N,N,N,N,N,N,N,N,N,N,N
team,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2,Unnamed: 25_level_2,Unnamed: 26_level_2,Unnamed: 27_level_2,Unnamed: 28_level_2
BAL,26.0,51.0,61.0,16.0,45.0,35.0,44.0,31.0,35.0,33.0,35.0,55.0,42.0,,,,,,,,,,,,,,,
BOS,,55.0,45.0,35.0,45.0,44.0,46.0,42.0,32.0,46.0,44.0,34.0,45.0,51.0,,,,,,,,,,,,,,
CAL,22.0,,61.0,44.0,54.0,33.0,52.0,45.0,44.0,35.0,42.0,35.0,55.0,26.0,,,,,,,,,,,,,,
CHA,32.0,16.0,,32.0,46.0,45.0,35.0,54.0,41.0,44.0,26.0,33.0,42.0,16.0,,,,,,,,,,,,,,
CLE,42.0,33.0,45.0,,54.0,63.0,51.0,51.0,41.0,66.0,42.0,46.0,54.0,61.0,,,,,,,,,,,,,,
DET,32.0,24.0,31.0,23.0,,34.0,45.0,44.0,32.0,33.0,41.0,31.0,42.0,32.0,,,,,,,,,,,,,,
KCA,33.0,44.0,32.0,14.0,43.0,,61.0,35.0,25.0,32.0,44.0,43.0,44.0,42.0,,,,,,,,,,,,,,
MIL,31.0,25.0,42.0,26.0,32.0,16.0,,51.0,35.0,54.0,44.0,33.0,44.0,33.0,,,,,,,,,,,,,,
MIN,35.0,32.0,23.0,26.0,33.0,42.0,26.0,,34.0,33.0,31.0,32.0,22.0,46.0,,,,,,,,,,,,,,
NYA,45.0,33.0,31.0,41.0,45.0,52.0,42.0,43.0,,26.0,26.0,46.0,64.0,42.0,,,,,,,,,,,,,,


In [54]:
table_1995.to_excel('../data/Fast Action Chart 1995.xlsx', engine='openpyxl')

In [55]:
win_pct_1995["pct"] = round(win_pct_1995["total_win_pct"], 2) * 100

In [56]:
prob_1995 = pd.pivot_table(win_pct_1995, values='pct', index=['team'],
                    columns=['opponent', 'league'], aggfunc="sum", fill_value="", sort=False)
prob_1995

opponent,BOS,CAL,CHI,CLE,DET,KC,MIL,MIN,NY,OAK,SEA,TEX,TOR,BAL,CHI,CIN,COL,FLA,HOU,LA,MON,NY,PHI,PIT,SD,SF,STL,ATL
league,A,A,A,A,A,A,A,A,A,A,A,A,A,A,N,N,N,N,N,N,N,N,N,N,N,N,N,N
team,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2,Unnamed: 25_level_2,Unnamed: 26_level_2,Unnamed: 27_level_2,Unnamed: 28_level_2
BAL,31.0,69.0,86.0,17.0,62.0,44.0,58.0,33.0,46.0,42.0,46.0,80.0,54.0,,,,,,,,,,,,,,,
BOS,,79.0,62.0,46.0,62.0,60.0,67.0,56.0,38.0,67.0,58.0,43.0,62.0,69.0,,,,,,,,,,,,,,
CAL,21.0,,83.0,60.0,75.0,42.0,71.0,62.0,58.0,46.0,54.0,46.0,80.0,31.0,,,,,,,,,,,,,,
CHA,38.0,17.0,,38.0,67.0,62.0,46.0,77.0,50.0,58.0,31.0,42.0,55.0,14.0,,,,,,,,,,,,,,
CLE,54.0,40.0,62.0,,77.0,92.0,69.0,69.0,50.0,100.0,56.0,67.0,77.0,83.0,,,,,,,,,,,,,,
DET,38.0,25.0,33.0,23.0,,43.0,62.0,58.0,38.0,40.0,50.0,33.0,54.0,38.0,,,,,,,,,,,,,,
KCA,40.0,58.0,38.0,8.0,57.0,,83.0,46.0,30.0,38.0,58.0,57.0,58.0,56.0,,,,,,,,,,,,,,
MIL,33.0,29.0,54.0,31.0,38.0,17.0,,69.0,45.0,78.0,60.0,42.0,58.0,42.0,,,,,,,,,,,,,,
MIN,44.0,38.0,23.0,31.0,42.0,54.0,31.0,,43.0,42.0,33.0,38.0,20.0,67.0,,,,,,,,,,,,,,
NYA,62.0,42.0,33.0,50.0,62.0,70.0,55.0,57.0,,31.0,31.0,67.0,92.0,54.0,,,,,,,,,,,,,,


In [40]:
prob_1995.to_excel('../data/Fast Action Chart w Probs 1995.xlsx', engine='openpyxl')

# Generate random score

In [None]:
win_runs = max(1, np.random.normal(5.4, 3, 1).round(0))
lose_runs = max(0, np.random.normal(2.4, 2, 1).round(0))

print(f'Winner score: {win_runs}')
print(f'Loser score: {lose_runs}')

In [None]:
import scipy.stats
import pandas as pd
lower = 0
upper = 24
mu = 4.103472883
sigma = 2.916943731
N = 1

samples = scipy.stats.truncnorm.rvs(
          (lower-mu)/sigma,(upper-mu)/sigma,loc=mu,scale=sigma,size=N)
win_runs = max(1.0, scipy.stats.truncnorm.rvs(
          (lower-mu)/sigma,(upper-mu)/sigma,loc=mu,scale=sigma,size=N).round(0))
lose_runs = min(win_runs - 1.0, scipy.stats.truncnorm.rvs(
          (lower-mu)/sigma,(upper-mu)/sigma,loc=mu,scale=sigma,size=N).round(0))
print(f'Winner score: {win_runs}')
print(f'Loser score: {lose_runs}')

score_df = pd.DataFrame()
score_df['win_score'] = win_runs
score_df['lose_score'] = lose_runs
score_df

In [None]:
lose_score_list = []

lose_zeros = [0 for _ in range(305)]
lose_ones = [1 for _ in range(424)]
lose_twos = [2 for _ in range(440)]
lose_threes = [3 for _ in range(367)]
lose_fours = [4 for _ in range(238)]
lose_fives = [5 for _ in range(149)]
lose_sixes = [6 for _ in range(80)]
lose_sevens = [7 for _ in range(50)]
lose_eights = [8 for _ in range(24)]
lose_nines = [9 for _ in range(18)]
lose_tens = [10 for _ in range(4)]
lose_elevens = [11 for _ in range(2)]
lose_twelves = [12 for _ in range(1)]


win_score_list = []

win_ones = [1 for _ in range(51)]
win_twos = [2 for _ in range(176)]
win_threes = [3 for _ in range(289)]
win_fours = [4 for _ in range(312)]
win_fives = [5 for _ in range(323)]
win_sixes = [6 for _ in range(251)]
win_sevens = [7 for _ in range(205)]
win_eights = [8 for _ in range(158)]
win_nines = [9 for _ in range(132)]
win_tens = [10 for _ in range(81)]
win_elevens = [11 for _ in range(49)]
win_twelves = [12 for _ in range(30)]
win_thirteens = [13 for _ in range(20)]
win_fourteens = [14 for _ in range(7)]
win_fifteens = [15 for _ in range(7)]
win_sixteens = [16 for _ in range(5)]
win_seventeens = [17 for _ in range(3)]
win_nineteens = [19, 19]
win_twenty4s = [24]


In [None]:
lose_score_list.extend(lose_zeros)
lose_score_list.extend(lose_ones)
lose_score_list.extend(lose_twos)
lose_score_list.extend(lose_threes)
lose_score_list.extend(lose_fours)
lose_score_list.extend(lose_fives)
lose_score_list.extend(lose_sixes)
lose_score_list.extend(lose_sevens)
lose_score_list.extend(lose_eights)
lose_score_list.extend(lose_nines)
lose_score_list.extend(lose_tens)
lose_score_list.extend(lose_elevens)
lose_score_list.extend(lose_twelves)


win_score_list.extend(win_ones)
win_score_list.extend(win_twos)
win_score_list.extend(win_threes)
win_score_list.extend(win_fours)
win_score_list.extend(win_fives)
win_score_list.extend(win_sixes)
win_score_list.extend(win_sevens)
win_score_list.extend(win_eights)
win_score_list.extend(win_nines)
win_score_list.extend(win_tens)
win_score_list.extend(win_elevens)
win_score_list.extend(win_twelves)
win_score_list.extend(win_thirteens)
win_score_list.extend(win_fourteens)
win_score_list.extend(win_fifteens)
win_score_list.extend(win_sixteens)
win_score_list.extend(win_seventeens)
win_score_list.extend(win_nineteens)
win_score_list.extend(win_twenty4s)

In [None]:
len(win_score_list)

In [None]:
import random

winRuns=list()
loseRuns=list()

win_runs = random.choices(win_score_list, k=1)
lose_runs = random.choices(lose_score_list, k=1)

print(win_runs)
print(lose_runs)

win_runs[0] = max(1, win_runs[0])
lose_runs[0] = min(win_runs[0] - 1, lose_runs[0])

print(win_runs)
print(lose_runs)

winRuns.extend(win_runs)
loseRuns.extend(lose_runs)

print(winRuns)

# winRuns = winRuns.append(win_runs)
# loseRuns = loseRuns.append(lose_runs)

# winRuns
# # loseRuns

# score_df = pd.DataFrame()
# score_df['win_score'] = win_runs
# score_df['lose_score'] = lose_runs
# score_df

In [None]:
winRuns

In [None]:
def scores(win_score_list, lose_score_list, N):
    score_df=pd.DataFrame()
    win_runs=list()
    lose_runs=list()
    winRuns = random.choices(win_score_list, k=N)
    loseRuns = random.choices(lose_score_list, k=N)
    i=0
    for i in range(N):
        if winRuns[i] <= loseRuns[i]:
#             winRuns[i], loseRuns[i] = loseRuns[i], winRuns[i]
#         elif winRuns[i] == loseRuns[i]:
            new_win = random.choices(win_score_list, k=1)
            new_lose = random.choices(lose_score_list, k=1)
            winRuns[i], loseRuns[i] = new_win[0], new_lose[0]
            
#         winRuns[i] = max(1, winRuns[i])
#         loseRuns[i] = min(winRuns[i] - 1, loseRuns[i])
        
    score_df['win score'] = winRuns
    score_df['lose score'] = loseRuns
    
    return score_df
        
    

In [None]:
scores_df = scores(win_score_list, lose_score_list, 2106)

In [None]:
scores_df

In [None]:
sns.histplot(scores_df, x=scores_df['win score'])

In [None]:
sns.histplot(scores_df, x=scores_df['lose score'])

In [None]:
scores_df['win score'].value_counts()

In [None]:
scores_df['win score'].mean()

In [None]:
scores_df['lose score'].value_counts()

In [None]:
scores_df['lose score'].mean()

In [None]:
win_runs = random.choices(win_score_list, k=3000)
lose_runs = random.choices(lose_score_list, k=3000)

my_scores = pd.DataFrame({'win_score': win_runs, 'lose_score': lose_runs})

print(my_scores['win_score'].mean())
print(my_scores['lose_score'].mean())

In [None]:
sns.jointplot(x='win_score', y='lose_score', data=my_scores, kind='kde', color='orange')

In [None]:
my_scores.to_csv('../data/1978 random score list.csv', index=False)

In [None]:
actuals = pd.read_csv('../1978_actuals.csv')
actuals.head()

In [None]:
sns.jointplot(x='win_score', y='lose_score', data=actuals, kind='kde', color='orange')

In [None]:
joint = actuals.drop(columns=['away_score', 'home_score'])
joint.head()


In [None]:
new_scores = np.random.multivariate_normal(joint.mean(), joint.cov(), size=2106)

In [None]:
joint_df = pd.DataFrame(new_scores, columns=['win_score', 'lose_score'])

In [None]:
joint_df['win_score'] = round(joint_df['win_score']).astype(int)
joint_df['lose_score'] = round(joint_df['lose_score']).astype(int)
joint_df

In [None]:
joint_df['win_score'].value_counts()

In [None]:
joint_df['lose_score'].mean()

In [None]:
joint_df.to_csv('../data/1978 random score list.csv', index=False)

In [None]:
n_samples = 2106
amin, amax = 1, 24
bmin, bmax = 0, 12

samples = np.zeros((0, 2))   # 2 columns now
while samples.shape[0] < n_samples: 
    s = np.random.multivariate_normal(joint.mean(), joint.cov(), size=(n_samples,))
    accepted = s[(np.min(s - [amin, bmin], axis=1) >= 0) & (np.max(s - [amax, bmax], axis=1) <= 0)]
    samples = np.concatenate((samples, accepted), axis=0)
samples = samples[:n_samples, :]

In [None]:
samples

In [None]:
joint_df = pd.DataFrame(samples, columns=['win_score', 'lose_score'])
joint_df.head()

In [None]:
joint_df['win_score'].mean()

In [None]:
joint_df['lose_score'].mean()

In [None]:
joint_df['win_score'] = round(joint_df['win_score'], 0).astype(int)
joint_df['lose_score'] = round(joint_df['lose_score'], 0).astype(int)
joint_df.head()

In [None]:
print(joint_df['win_score'].mean())
print(joint_df['lose_score'].mean())

In [None]:
joint_df.to_csv('../data/1978 random score list.csv', index=False)