# Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import requests
from lxml import html
from bs4 import BeautifulSoup
import re
from urllib.parse import urlparse, parse_qs

import pprint

from openpyxl import load_workbook

from datetime import datetime

# Get data from Retrosheet

In [2]:
# HTTP session used to download the season index page.
session_a = requests.session()

In [3]:
# Retrosheet page for the 1975 season; the links to the team pages are read from it.
url_a = 'https://www.retrosheet.org/boxesetc/1975/Y_1975.htm'

In [4]:
# Download the season page; a.text holds the raw HTML.
a = session_a.get(url_a)

In [5]:
# Parse the downloaded HTML; the bare name on the last line displays the parsed document.
soup_a = BeautifulSoup(a.text, 'html.parser')
soup_a

<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0//EN" "https://www.w3.org/TR/REC-html40/strict.dtd">

<html dir="LTR" lang="EN">
<pre><a href="../MISC/Y_descr.htm">Read Me</a></pre>
<head>
<title>The 1975 Season</title>
<link href="https://www.retrosheet.org/menubar/menubar.css" rel="stylesheet" type="text/css"/>
<script src="https://www.retrosheet.org/menubar/menubar.js" type="text/javascript"></script>
</head>
<body>
<p class="nopad"><img alt="Retrosheet" class="bancenter" height="46" src="https://www.retrosheet.org/menubar/retro-logo.gif" width="400"/></p>
<div class="mbcenter">
<ul class="nav">
<li><a href="https://www.retrosheet.org/">Home</a></li>
<li><a href="#">About ↓</a>
<ul>
<li><a href="https://www.retrosheet.org/site.htm">Overview</a></li>
<li><a href="https://www.retrosheet.org/faq.htm">FAQ</a></li>
</ul>
</li>
<li><a href="#">Games/People/Parks ↓</a>
<ul>
<li><a href="#">People →</a>
<ul>
<li><a href="https://www.retrosheet.org/boxesetc/index.html#Players">Players</a></li>
<l

In [6]:
# Collect every <a> element of the season page, in document order.
tags = soup_a.find_all("a")
tags


[<a href="../MISC/Y_descr.htm">Read Me</a>,
 <a href="https://www.retrosheet.org/">Home</a>,
 <a href="#">About ↓</a>,
 <a href="https://www.retrosheet.org/site.htm">Overview</a>,
 <a href="https://www.retrosheet.org/faq.htm">FAQ</a>,
 <a href="#">Games/People/Parks ↓</a>,
 <a href="#">People →</a>,
 <a href="https://www.retrosheet.org/boxesetc/index.html#Players">Players</a>,
 <a href="https://www.retrosheet.org/boxesetc/index.html#Managers">Managers</a>,
 <a href="https://www.retrosheet.org/boxesetc/index.html#Coaches">Coaches</a>,
 <a href="https://www.retrosheet.org/boxesetc/index.html#Umpires">Umpires</a>,
 <a href="https://www.retrosheet.org/transactions/index.html">Transactions</a>,
 <a href="#">Games →</a>,
 <a href="https://www.retrosheet.org/boxesetc/index.html">Regular season</a>,
 <a href="https://www.retrosheet.org/Playoff%20Games.htm">Tiebreaker playoffs</a>,
 <a href="https://www.retrosheet.org/boxesetc/MISC/masterPS.htm">Post-season</a>,
 <a href="https://www.retrosheet

In [7]:
# Index 46 is where the slice of team-page links used below starts (the Pirates page).
tags[46].attrs['href']

'../1975/TPIT01975.htm'

In [8]:
# Reduce each team-page link to its file name (the last segment of the URL path).
# NOTE(review): the 46:70 slice is tied to the layout of the 1975 season page;
# confirm the positions before reusing this for another year.
my_hrefs = [
    urlparse(anchor.get('href')).path.rsplit("/", 1)[-1]
    for anchor in tags[46:70]
]
my_hrefs

['TPIT01975.htm',
 'TPHI01975.htm',
 'TNYN01975.htm',
 'TSLN01975.htm',
 'TCHN01975.htm',
 'TMON01975.htm',
 'TCIN01975.htm',
 'TLAN01975.htm',
 'TSFN01975.htm',
 'TSDN01975.htm',
 'TATL01975.htm',
 'THOU01975.htm',
 'TBOS01975.htm',
 'TBAL01975.htm',
 'TNYA01975.htm',
 'TCLE01975.htm',
 'TMIL01975.htm',
 'TDET01975.htm',
 'TOAK01975.htm',
 'TKCA01975.htm',
 'TTEX01975.htm',
 'TMIN01975.htm',
 'TCHA01975.htm',
 'TCAL01975.htm']

In [9]:
# Characters 1-3 of each file name are the team code, e.g. 'TPIT01975.htm' -> 'PIT'.
my_teams = [file_name[1:4] for file_name in my_hrefs]

In [10]:
# Separate HTTP session used to fetch a single team page.
session = requests.session()

In [11]:
# 1975 Pittsburgh Pirates team page, used to work out the parsing steps on one example.
url = "https://www.retrosheet.org/boxesetc/1975/TPIT01975.htm"

In [12]:
# Download the team page; s.text holds the raw HTML.
s = session.get(url)

In [13]:
# Parse the team page HTML.
soup = BeautifulSoup(s.text, 'html.parser')

In [14]:
# Display the parsed team page to inspect its structure.
soup

<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0//EN" "https://www.w3.org/TR/REC-html40/strict.dtd">

<html dir="LTR" lang="EN">
<pre><a href="../MISC/Tdescr.htm">Read Me</a></pre>
<head>
<title>The 1975 Pittsburgh Pirates</title>
<link href="https://www.retrosheet.org/menubar/menubar.css" rel="stylesheet" type="text/css"/>
<script src="https://www.retrosheet.org/menubar/menubar.js" type="text/javascript"></script>
</head>
<body>
<p class="nopad"><img alt="Retrosheet" class="bancenter" height="46" src="https://www.retrosheet.org/menubar/retro-logo.gif" width="400"/></p>
<div class="mbcenter">
<ul class="nav">
<li><a href="https://www.retrosheet.org/">Home</a></li>
<li><a href="#">About ↓</a>
<ul>
<li><a href="https://www.retrosheet.org/site.htm">Overview</a></li>
<li><a href="https://www.retrosheet.org/faq.htm">FAQ</a></li>
</ul>
</li>
<li><a href="#">Games/People/Parks ↓</a>
<ul>
<li><a href="#">People →</a>
<ul>
<li><a href="https://www.retrosheet.org/boxesetc/index.html#Players">Players<

In [15]:
# On the Pirates page the seventh <pre> block (index 6) holds the head-to-head
# record table as fixed-width plain text.
data = soup.find_all('pre')[6].text
data

'                   Total                      Home                       Away\nTeam       G    W    L   RS   RA      G    W    L   RS   RA      G    W    L   RS   RA\nTotal    161   92   69  712  565     80   52   28  348  270     81   40   41  364  295\nATL N     12    8    4   60   36      6    4    2   29   18      6    4    2   31   18\nCHI N     18   12    6  111   56      9    6    3   37   21      9    6    3   74   35\nCIN N     12    6    6   55   55      6    4    2   29   26      6    2    4   26   29\nHOU N     11    5    6   44   48      5    4    1   32   20      6    1    5   12   28\nLA  N     12    7    5   46   39      6    3    3   20   22      6    4    2   26   17\nMON N     18   11    7   82   61      9    5    4   33   33      9    6    3   49   28\nNY  N     18   13    5   67   56      9    5    4   30   40      9    8    1   37   16\nPHI N     18    7   11   73   79      9    6    3   38   28      9    1    8   35   51\nSD  N     12    8    4   50   35      6 

In [16]:
# One text line per row, then keep only the per-opponent lines: the first three
# lines (two header lines and the "Total" row) and the final line are dropped.
# Built as a single chain so the cell never mutates a frame in place.
table = (
    pd.DataFrame([x.strip('\t') for x in data.split('\n')], columns=["unsplit"])
    .iloc[3:-1]
    .reset_index(drop=True)
)
table

Unnamed: 0,unsplit
0,ATL N 12 8 4 60 36 6 4 ...
1,CHI N 18 12 6 111 56 9 6 ...
2,CIN N 12 6 6 55 55 6 4 ...
3,HOU N 11 5 6 44 48 5 4 ...
4,LA N 12 7 5 46 39 6 3 ...
5,MON N 18 11 7 82 61 9 5 ...
6,NY N 18 13 5 67 56 9 5 ...
7,PHI N 18 7 11 73 79 9 6 ...
8,SD N 12 8 4 50 35 6 5 ...
9,SF N 12 5 7 43 46 6 4 ...


In [17]:
# Row and column count of the trimmed table (a single "unsplit" text column).
table.shape

(11, 1)

In [18]:
# Names for the 17 whitespace-separated fields of each table line: opponent,
# league, then G/W/L/RS/RA for the total, home and away splits.
my_cols = [
    'opponent', 'league', 'total_g', 'total_w', 'total_l', 'total_rs', 'total_ra', 
    'home_g', 'home_w', 'home_l', 'home_rs', 'home_ra', 
    'away_g', 'away_w', 'away_l', 'away_rs', 'away_ra'
]
df = pd.DataFrame() 
# Raw string: '\s' in a normal string literal is an invalid escape sequence.
df[my_cols] = table['unsplit'].str.split(r'\s+', expand=True)
# The example page is the Pirates page, so every row belongs to PIT.
df["team"] = "PIT"
df

Unnamed: 0,opponent,league,total_g,total_w,total_l,total_rs,total_ra,home_g,home_w,home_l,home_rs,home_ra,away_g,away_w,away_l,away_rs,away_ra,team
0,ATL,N,12,8,4,60,36,6,4,2,29,18,6,4,2,31,18,PIT
1,CHI,N,18,12,6,111,56,9,6,3,37,21,9,6,3,74,35,PIT
2,CIN,N,12,6,6,55,55,6,4,2,29,26,6,2,4,26,29,PIT
3,HOU,N,11,5,6,44,48,5,4,1,32,20,6,1,5,12,28,PIT
4,LA,N,12,7,5,46,39,6,3,3,20,22,6,4,2,26,17,PIT
5,MON,N,18,11,7,82,61,9,5,4,33,33,9,6,3,49,28,PIT
6,NY,N,18,13,5,67,56,9,5,4,30,40,9,8,1,37,16,PIT
7,PHI,N,18,7,11,73,79,9,6,3,38,28,9,1,8,35,51,PIT
8,SD,N,12,8,4,50,35,6,5,1,29,17,6,3,3,21,18,PIT
9,SF,N,12,5,7,43,46,6,4,2,24,16,6,1,5,19,30,PIT


In [19]:
# Every games/wins/losses/runs-scored/runs-allowed column of the three splits
# is cast to int; keys are generated in the same order as the column list.
type_dict = {
    split + "_" + stat: 'int'
    for split in ("total", "home", "away")
    for stat in ("g", "w", "l", "rs", "ra")
}
# Convert the count columns from the strings produced by the split to integers.
df = df.astype(type_dict)
df

Unnamed: 0,opponent,league,total_g,total_w,total_l,total_rs,total_ra,home_g,home_w,home_l,home_rs,home_ra,away_g,away_w,away_l,away_rs,away_ra,team
0,ATL,N,12,8,4,60,36,6,4,2,29,18,6,4,2,31,18,PIT
1,CHI,N,18,12,6,111,56,9,6,3,37,21,9,6,3,74,35,PIT
2,CIN,N,12,6,6,55,55,6,4,2,29,26,6,2,4,26,29,PIT
3,HOU,N,11,5,6,44,48,5,4,1,32,20,6,1,5,12,28,PIT
4,LA,N,12,7,5,46,39,6,3,3,20,22,6,4,2,26,17,PIT
5,MON,N,18,11,7,82,61,9,5,4,33,33,9,6,3,49,28,PIT
6,NY,N,18,13,5,67,56,9,5,4,30,40,9,8,1,37,16,PIT
7,PHI,N,18,7,11,73,79,9,6,3,38,28,9,1,8,35,51,PIT
8,SD,N,12,8,4,50,35,6,5,1,29,17,6,3,3,21,18,PIT
9,SF,N,12,5,7,43,46,6,4,2,24,16,6,1,5,19,30,PIT


In [20]:
# Overall winning percentage against each opponent: wins divided by games.
df["total_win_pct"] = (df["total_w"] / df["total_g"]).astype(float)
df

Unnamed: 0,opponent,league,total_g,total_w,total_l,total_rs,total_ra,home_g,home_w,home_l,home_rs,home_ra,away_g,away_w,away_l,away_rs,away_ra,team,total_win_pct
0,ATL,N,12,8,4,60,36,6,4,2,29,18,6,4,2,31,18,PIT,0.666667
1,CHI,N,18,12,6,111,56,9,6,3,37,21,9,6,3,74,35,PIT,0.666667
2,CIN,N,12,6,6,55,55,6,4,2,29,26,6,2,4,26,29,PIT,0.5
3,HOU,N,11,5,6,44,48,5,4,1,32,20,6,1,5,12,28,PIT,0.454545
4,LA,N,12,7,5,46,39,6,3,3,20,22,6,4,2,26,17,PIT,0.583333
5,MON,N,18,11,7,82,61,9,5,4,33,33,9,6,3,49,28,PIT,0.611111
6,NY,N,18,13,5,67,56,9,5,4,30,40,9,8,1,37,16,PIT,0.722222
7,PHI,N,18,7,11,73,79,9,6,3,38,28,9,1,8,35,51,PIT,0.388889
8,SD,N,12,8,4,50,35,6,5,1,29,17,6,3,3,21,18,PIT,0.666667
9,SF,N,12,5,7,43,46,6,4,2,24,16,6,1,5,19,30,PIT,0.416667


In [21]:
# Narrow view: identifying columns plus the winning percentage.
df2 = df[["team", "opponent", "league", "total_win_pct"]]
df2

Unnamed: 0,team,opponent,league,total_win_pct
0,PIT,ATL,N,0.666667
1,PIT,CHI,N,0.666667
2,PIT,CIN,N,0.5
3,PIT,HOU,N,0.454545
4,PIT,LA,N,0.583333
5,PIT,MON,N,0.611111
6,PIT,NY,N,0.722222
7,PIT,PHI,N,0.388889
8,PIT,SD,N,0.666667
9,PIT,SF,N,0.416667


In [22]:
# Quick look at the first team page file name in the list.
my_hrefs[0]

'TPIT01975.htm'

In [23]:
# Create function to scrape all relevant pages
base_url = "https://www.retrosheet.org/boxesetc/"

# Seconds to wait for a page before giving up instead of hanging indefinitely.
_REQUEST_TIMEOUT = 30

# Positions in `hrefs` whose pages keep the head-to-head table in the seventh
# <pre> block (index 6); every other page keeps it in the sixth (index 5).
# NOTE(review): presumably these are the four division winners, whose pages seem
# to carry one extra <pre> block. The positions match the 1975 link order only;
# confirm against the site before using another season.
_SEVENTH_PRE_POSITIONS = frozenset({0, 6, 12, 18})

# (low, high, roll) bands, applied in order with both ends inclusive, so when a
# value satisfies two neighbouring bands the later (higher) roll wins.
# The bounds are reproduced verbatim from the original chart logic.
# NOTE(review): the bounds are 1/36 steps rounded to six decimals, so a
# percentage that is an exact multiple of 1/36 falls in the upper band when the
# bound is exact or rounded down (0.5 -> 41) but in the lower band when the
# bound was rounded up (2/3 -> 46). Kept as-is so existing charts reproduce;
# confirm which convention is wanted before changing it.
_ROLL_BANDS = (
    (0, 0.02778, 11),
    (0.027778, 0.055556, 12),
    (0.055556, 0.083333, 13),
    (0.083333, 0.111111, 14),
    (0.111111, 0.138889, 15),
    (0.138889, 0.166667, 16),
    (0.166667, 0.194444, 21),
    (0.194444, 0.222222, 22),
    (0.222222, 0.25, 23),
    (0.25, 0.277778, 24),
    (0.277778, 0.305556, 25),
    (0.305556, 0.333333, 26),
    (0.333333, 0.361111, 31),
    (0.361111, 0.388889, 32),
    (0.388889, 0.416667, 33),
    (0.416667, 0.444444, 34),
    (0.444444, 0.472222, 35),
    (0.472222, 0.5, 36),
    (0.50, 0.527778, 41),
    (0.527777, 0.555556, 42),
    (0.555556, 0.583333, 43),
    (0.583333, 0.611111, 44),
    (0.611111, 0.638889, 45),
    (0.638889, 0.666667, 46),
    (0.666667, 0.694444, 51),
    (0.694444, 0.722222, 52),
    (0.722222, 0.75, 53),
    (0.750000, 0.777778, 54),
    (0.777778, 0.805556, 55),
    (0.805556, 0.833333, 56),
    (0.833333, 0.861111, 61),
    (0.861111, 0.888889, 62),
    (0.888889, 0.916667, 63),
    (0.916667, 0.944444, 64),
    (0.944444, 0.972222, 65),
    (0.972222, 1, 66),
)


def _parse_head2head(data, cols, team, type_dict):
    """Turn the text of one head-to-head <pre> block into a per-opponent frame."""
    # Drop the two header lines and the "Total" row at the top, and the final line.
    rows = [line.strip('\t') for line in data.split('\n')][3:-1]
    table = pd.DataFrame(rows, columns=["unsplit"])

    # Raw string: '\s' in a normal string literal is an invalid escape sequence.
    event = table["unsplit"].str.split(r'\s+', expand=True)
    event.columns = cols  # raises ValueError if a line does not have len(cols) fields
    event["team"] = team
    event = event.astype(type_dict)
    event["total_win_pct"] = (event["total_w"] / event["total_g"]).astype(float)

    # Start from "no roll" and let each band overwrite the rows it matches.
    event["roll"] = np.nan
    for low, high, roll in _ROLL_BANDS:
        event.loc[event["total_win_pct"].between(low, high), "roll"] = roll
    # Rows that matched no band (e.g. a NaN percentage) show as blank.
    event["roll"] = event["roll"].replace(np.nan, "")

    return event[["team", "opponent", "league", "total_win_pct", "roll"]]


def get_head2head(year, hrefs, cols, teams, type_dict):
    """Scrape the head-to-head table of every team page into one frame.

    Parameters
    ----------
    year : int or str
        Season; used as the directory part of each page URL.
    hrefs : sequence of str
        Team page file names (e.g. 'TPIT01975.htm'), one per team.
    cols : list of str
        Names for the whitespace-separated fields of a table line. Must
        contain 'opponent', 'league', 'total_w' and 'total_g'.
    teams : sequence of str
        Team code for each entry of ``hrefs``, in the same order.
    type_dict : dict
        Column-to-dtype mapping passed to ``DataFrame.astype``.

    Returns
    -------
    pandas.DataFrame
        Columns ``team``, ``opponent``, ``league``, ``total_win_pct`` and
        ``roll``, with a fresh 0..n-1 index. An empty frame is returned when
        ``hrefs`` is empty.

    Raises
    ------
    requests.HTTPError
        If a page request returns an error status.
    IndexError
        If a page has fewer <pre> blocks than expected, or ``teams`` is
        shorter than ``hrefs``.
    """
    frames = []
    # The context manager closes the session's connections when scraping ends.
    with requests.session() as session:
        for position, href in enumerate(hrefs):
            result = session.get(
                base_url + str(year) + "/" + href, timeout=_REQUEST_TIMEOUT
            )
            # Fail loudly on an error page instead of trying to parse it.
            result.raise_for_status()

            soup = BeautifulSoup(result.text, 'html.parser')
            pre_index = 6 if position in _SEVENTH_PRE_POSITIONS else 5
            data = soup.find_all('pre')[pre_index].text

            frames.append(_parse_head2head(data, cols, teams[position], type_dict))

    if not frames:
        return pd.DataFrame()
    # DataFrame.append no longer exists in pandas 2.x; concatenate once instead.
    return pd.concat(frames, ignore_index=True)


In [24]:
# Raise the display limit so the whole result frame can be shown.
pd.set_option('display.max_rows', 300)
# Scrape every 1975 team page listed in my_hrefs (one request per page).
win_pct_1975 = get_head2head(1975, my_hrefs, my_cols, my_teams, type_dict)
win_pct_1975

Unnamed: 0,team,opponent,league,total_win_pct,roll
0,PIT,ATL,N,0.666667,46.0
1,PIT,CHI,N,0.666667,46.0
2,PIT,CIN,N,0.5,41.0
3,PIT,HOU,N,0.454545,35.0
4,PIT,LA,N,0.583333,44.0
5,PIT,MON,N,0.611111,45.0
6,PIT,NY,N,0.722222,53.0
7,PIT,PHI,N,0.388889,32.0
8,PIT,SD,N,0.666667,46.0
9,PIT,SF,N,0.416667,33.0


In [25]:
# One row per team and one column per (opponent, league) pair, each cell holding
# that matchup's roll; matchups with no data are left blank.
# Each (team, opponent, league) combination presumably appears only once, so
# "sum" just passes the single value through. The string name is used because
# newer pandas warns when a NumPy callable such as np.sum is passed here.
table_1975 = pd.pivot_table(win_pct_1975, values='roll', index=['team'],
                    columns=['opponent', 'league'], aggfunc="sum", fill_value="")
table_1975

opponent,ATL,BAL,BOS,CAL,CHI,CHI,CIN,CLE,DET,HOU,...,MON,NY,NY,OAK,PHI,PIT,SD,SF,STL,TEX
league,N,A,A,A,A,N,N,A,A,N,...,N,A,N,A,N,N,N,N,N,A
team,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
ATL,,,,,,33.0,16.0,,,46.0,...,46.0,,31.0,,33.0,31.0,32.0,35.0,24.0,
BAL,,,41.0,41.0,45.0,,,42.0,54.0,,...,,35.0,,31.0,,,,,,44.0
BOS,,41.0,,41.0,46.0,,,32.0,53.0,,...,,51.0,,41.0,,,,,,46.0
CAL,,41.0,41.0,,41.0,,,24.0,42.0,,...,,44.0,,32.0,,,,,,41.0
CHA,,32.0,31.0,41.0,,,,44.0,33.0,,...,,41.0,,41.0,,,,,,24.0
CHN,44.0,,,,,,14.0,,,44.0,...,41.0,,32.0,,46.0,31.0,33.0,33.0,45.0,
CIN,61.0,,,,,63.0,,,,53.0,...,46.0,,46.0,,44.0,41.0,45.0,53.0,46.0,
CLE,,35.0,45.0,54.0,33.0,,,,46.0,,...,,41.0,,16.0,,,,,,33.0
DET,,24.0,24.0,35.0,44.0,,,31.0,,,...,,31.0,,41.0,,,,,,14.0
HOU,31.0,,,,,33.0,24.0,,,,...,46.0,,31.0,,41.0,42.0,41.0,24.0,26.0,


In [26]:
# Save the chart as an Excel workbook via openpyxl. The path is relative to the
# notebook's working directory; the ../data directory is assumed to exist.
table_1975.to_excel('../data/Fast Action Chart 1975.xlsx', engine='openpyxl')