In [3]:
import pandas as pd 

red_sox_data = pd.read_csv('../data/cleaned/Boston_Red_Sox_Roster_Data_cleaned.csv')

print(red_sox_data.head())

   Season             Name  Age Born  B  T      Ht     Wt           DoB  Yrs  \
0    1908  Frank Arellanes   26   US  R  R   6' 0"  180.0  Jan 28, 1882    0   
1    1908    Jimmy Barrett   33   US  L  R   5' 7"  170.0  Mar 28, 1875   10   
2    1908       King Brady   27   US  R  R   6' 0"  190.0  May 28, 1881    4   
3    1908    Fred Burchell   28   US  R  L  5' 11"  190.0  Jul 14, 1879    3   
4    1908  Walter Carlisle   26   GB  B  R   5' 9"  154.0   Jul 6, 1881    0   

   ...  LF  CF  RF  OF  DH  PH  PR   WAR  All-Star  HOF  
0  ...   0   0   0   0   -   2   0   1.3        No   No  
1  ...   0   2   0   2   -   1   0  -0.1        No   No  
2  ...   0   0   0   0   -   0   0   0.4        No   No  
3  ...   0   0   0   0   -   1   0   0.1        No   No  
4  ...   3   0   0   3   -   0   1  -0.1        No   No  

[5 rows x 30 columns]


# **Basic Data Points**

1. <u>List of Red Sox Hall of Famers</u>

In [4]:

hof_players = red_sox_data[red_sox_data['HOF'] == 'Yes']
print('Total Hall of Fame Players:', hof_players['Name'].nunique(),'\n')

distinct_hof_players = hof_players['Name'].drop_duplicates().reset_index(drop=True)
print('Red Sox Hall of Fame Players:\n',distinct_hof_players)


Total Hall of Fame Players: 37 

Red Sox Hall of Fame Players:
 0         Tris Speaker
1             Cy Young
2         Jack Chesbro
3         Harry Hooper
4            Babe Ruth
5         Herb Pennock
6           Waite Hoyt
7          Red Ruffing
8         Rick Ferrell
9          Lefty Grove
10          Joe Cronin
11         Jimmie Foxx
12       Heinie Manush
13         Bobby Doerr
14        Ted Williams
15          Al Simmons
16        Lou Boudreau
17         George Kell
18    Carl Yastrzemski
19       Dick Williams
20        Carlton Fisk
21       Luis Aparicio
22      Orlando Cepeda
23       Juan Marichal
24            Jim Rice
25      Fergie Jenkins
26    Dennis Eckersley
27          Tony Pérez
28          Wade Boggs
29          Tom Seaver
30           Lee Smith
31        Andre Dawson
32      Pedro Martinez
33    Rickey Henderson
34         David Ortiz
35         John Smoltz
36       Adrian Beltré
Name: Name, dtype: object


2. <u>Count and List of Red Sox All Stars</u>

In [5]:
all_stars = red_sox_data[red_sox_data['All-Star'] == 'Yes']
print("Total All Stars: ", all_stars['Name'].nunique(),'\n')

distinct_all_stars = all_stars['Name'].drop_duplicates().reset_index(drop=True)
print('Red Sox All Stars:\n',distinct_all_stars)

Total All Stars:  117 

Red Sox All Stars:
 0        Rick Ferrell
1          Joe Cronin
2         Lefty Grove
3         Jimmie Foxx
4          Doc Cramer
            ...      
112     Craig Kimbrel
113     Steven Wright
114        Chris Sale
115     J.D. Martinez
116    Mitch Moreland
Name: Name, Length: 117, dtype: object


3. <u>Count of Red Sox Players by Country</u>

In [6]:
countries = red_sox_data['Born'].unique()
print(countries)

countries_count = red_sox_data['Born'].value_counts()
print(countries_count)

['US' 'GB' 'DK' 'DE' 'IE' 'CU' 'CA' 'SK' 'PL' 'MX' 'PR' 'VI' 'VE' 'PA'
 'DO' 'NL' 'CO' 'KR' 'JP' 'NI' 'JM' 'TW' 'AW' 'SA' 'HK']
Born
US    3845
DO     129
PR      79
VE      78
CU      51
CA      38
JP      28
MX      17
AW       8
DK       7
PA       7
KR       7
TW       5
CO       5
NI       5
SK       4
DE       4
VI       4
JM       3
SA       2
IE       2
NL       2
GB       2
PL       1
HK       1
Name: count, dtype: int64


4. <u>Average Age of Red Sox Player by Decade</u>

In [7]:
import math

decades=(red_sox_data['Season']//10)*10

avg_age_by_decade = red_sox_data.groupby(decades)['Age'].mean()
avg_age_by_decade_rounded_up = avg_age_by_decade.apply(math.ceil)
avg_age_by_decade_rounded_up_sorted = avg_age_by_decade_rounded_up.sort_values(ascending=False)


print(avg_age_by_decade_rounded_up_sorted)

Season
1940    30
2000    30
1950    29
1990    29
2010    29
1920    28
1930    28
1960    28
1970    28
1980    28
2020    28
1900    27
1910    27
Name: Age, dtype: int64


### **Performance Trends**

1. <u>Average WAR by Season</u>

In [8]:
red_sox_data['WAR'] = pd.to_numeric(red_sox_data['WAR'], errors='coerce')

avg_war = red_sox_data.groupby('Season')['WAR'].mean()
print('Average WAR per Season\n',avg_war)

Average WAR per Season
 Season
1908    0.925641
1909    0.975000
1910    1.357143
1911    0.995349
1912    2.048000
          ...   
2016    1.046000
2017    0.848980
2018    1.270455
2019    0.923404
2020    0.268085
Name: WAR, Length: 113, dtype: float64


### **Data Viz**

In [11]:
import os

os.chdir('/Users/joeybiotti/Workspace/red_sox_notebook/notebooks')

print(os.getcwd())

%run ../scripts/visualization.py

/Users/joeybiotti/Workspace/red_sox_notebook/notebooks
Visualization complete!


In [None]:

%run ../scripts/visualization.py

# Example function call from visualization.py
# Assuming visualization.py has a function named `plot_all_stars`
plot_all_stars(all_stars)