# Importing and reading in data

In addition to our normal pandas and numpy, I've chosen to import some packages associated with web scraping and preprocessing, things that will be utilized later on in the project.

In [1]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
import re
import os
from multiprocessing import Pool
from concurrent.futures import ThreadPoolExecutor

In [2]:
import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)

In [3]:
wswinners = pd.read_csv('/Users/jantmann17/Desktop/Portfolio-Projects/MLB-Expansion-Team-Project/WSWinners.csv')

# Year-by-Year MLB Team Data

In this section I will web scrape for team data year-by-year, such that drafting players can be compared against what statistics winning teams display. A side note on WAR: the average war for a World Series-winning team, is 47.0, with an average WAR for batting players at 29.7 and pitchers 17.3. 

Source: https://sabr.org/journal/article/war-and-the-world-series-is-war-an-indicator-of-october-success/

In [4]:
wswinners

Unnamed: 0,Year,AL Winner,Wins,Wins.1,NL Winner,Series MVP
0,2022,Houston Astros,4,2,Philadelphia Phillies,Jeremy Peña
1,2021,Houston Astros,2,4,Atlanta Braves,Jorge Soler
2,2020,Tampa Bay Rays,2,4,Los Angeles Dodgers,Corey Seager
3,2019,Houston Astros,3,4,Washington Nationals,Stephen Strasburg
4,2018,Boston Red Sox,4,1,Los Angeles Dodgers,Steve Pearce
...,...,...,...,...,...,...
113,1909,Detroit Tigers,3,4,Pittsburgh Pirates,
114,1908,Detroit Tigers,1,4,Chicago Cubs,
115,1907,Detroit Tigers,0,4,Chicago Cubs,
116,1906,Chicago White Sox,4,2,Chicago Cubs,


In [5]:
wswinners = wswinners.drop(columns = ['Series MVP'])
wswinners.head(30)

Unnamed: 0,Year,AL Winner,Wins,Wins.1,NL Winner
0,2022,Houston Astros,4,2,Philadelphia Phillies
1,2021,Houston Astros,2,4,Atlanta Braves
2,2020,Tampa Bay Rays,2,4,Los Angeles Dodgers
3,2019,Houston Astros,3,4,Washington Nationals
4,2018,Boston Red Sox,4,1,Los Angeles Dodgers
5,2017,Houston Astros,4,3,Los Angeles Dodgers
6,2016,Cleveland Indians,3,4,Chicago Cubs
7,2015,Kansas City Royals,4,1,New York Mets
8,2014,Kansas City Royals,3,4,San Francisco Giants
9,2013,Boston Red Sox,4,2,St. Louis Cardinals


In [6]:
wswinners = wswinners.drop(28)
wswinners.head(30)

Unnamed: 0,Year,AL Winner,Wins,Wins.1,NL Winner
0,2022,Houston Astros,4,2,Philadelphia Phillies
1,2021,Houston Astros,2,4,Atlanta Braves
2,2020,Tampa Bay Rays,2,4,Los Angeles Dodgers
3,2019,Houston Astros,3,4,Washington Nationals
4,2018,Boston Red Sox,4,1,Los Angeles Dodgers
5,2017,Houston Astros,4,3,Los Angeles Dodgers
6,2016,Cleveland Indians,3,4,Chicago Cubs
7,2015,Kansas City Royals,4,1,New York Mets
8,2014,Kansas City Royals,3,4,San Francisco Giants
9,2013,Boston Red Sox,4,2,St. Louis Cardinals


In [7]:
wswinners.dtypes

Year         object
AL Winner    object
Wins         object
Wins.1       object
NL Winner    object
dtype: object

In [8]:
wswinners['Year'] = wswinners['Year'].astype(int)

In [9]:
real_winners = []
for index, row in wswinners.iterrows():
    if row['Wins'] > row['Wins.1']:
        real_winners.append(row['AL Winner'])
    else:
        real_winners.append(row['NL Winner'])

In [10]:
real_winners

['Houston Astros',
 'Atlanta Braves',
 'Los Angeles Dodgers',
 'Washington Nationals',
 'Boston Red Sox',
 'Houston Astros',
 'Chicago Cubs',
 'Kansas City Royals',
 'San Francisco Giants',
 'Boston Red Sox',
 'San Francisco Giants',
 'St. Louis Cardinals',
 'San Francisco Giants',
 'New York Yankees',
 'Philadelphia Phillies',
 'Boston Red Sox',
 'St. Louis Cardinals',
 'Chicago White Sox',
 'Boston Red Sox',
 'Florida Marlins',
 'Anaheim Angels',
 'Arizona Diamondbacks',
 'New York Yankees',
 'New York Yankees',
 'New York Yankees',
 'Florida Marlins',
 'New York Yankees',
 'Atlanta Braves',
 'Toronto Blue Jays',
 'Toronto Blue Jays',
 'Minnesota Twins',
 'Cincinnati Reds',
 'Oakland Athletics',
 'Los Angeles Dodgers',
 'Minnesota Twins',
 'New York Mets',
 'Kansas City Royals',
 'Detroit Tigers',
 'Baltimore Orioles',
 'St. Louis Cardinals',
 'Los Angeles Dodgers',
 'Philadelphia Phillies',
 'Pittsburgh Pirates',
 'New York Yankees',
 'New York Yankees',
 'Cincinnati Reds',
 'Cincin

In [11]:
wswinners['World Series Champions'] = real_winners
wswinners = wswinners.drop(columns = ['AL Winner', 'Wins', 'Wins.1', 'NL Winner'])
wswinners

Unnamed: 0,Year,World Series Champions
0,2022,Houston Astros
1,2021,Atlanta Braves
2,2020,Los Angeles Dodgers
3,2019,Washington Nationals
4,2018,Boston Red Sox
...,...,...
113,1909,Pittsburgh Pirates
114,1908,Chicago Cubs
115,1907,Chicago Cubs
116,1906,Chicago White Sox


In [12]:
# have to make slightly different team names list because baseball reference does them differently
bref_team_names = ['ARI', 'ATL', 'BAL', 'BOS', 'CHC', 'CHW', 'CIN', 'CLE', 'COL','DET', 'HOU', 'KCR', 'ANA', 'LAD', 'FLA', 
              'MIL', 'MIN', 'NYM', 'NYY', 'OAK', 'PHI', 'PIT', 'SDP', 'SEA', 'SFG', 'STL', 'TBD', 'TEX', 'TOR', 'WSN']

In [13]:
team_dataframes = []
for team_abbr in bref_team_names:
    url = f'https://www.baseball-reference.com/teams/{team_abbr}/batteam.shtml#all_yby_team_bat'
    
    response = requests.get(url)
    
    if response.status_code == 200:
        # Parse the HTML content of the page
        soup = BeautifulSoup(response.text, 'html.parser')

        # Find the "Year-by-Year Team Batting" table by its id
        tables = soup.find_all('table')

        if tables:
            # Convert the first table data to a DataFrame
            team_df = pd.read_html(str(tables[0]))[0]

            # Add a "Team" column to store the team's abbreviation
            team_df['Team'] = team_abbr

            # Append the DataFrame to the list
            team_dataframes.append(team_df)


        else:
            print(f"No tables found on the page for {team_abbr}")
    else:
        print(f"Failed to retrieve data for {team_abbr}")
        
# Concatenate all DataFrames into a single DataFrame
all_team_data = pd.concat(team_dataframes, ignore_index=True)

# Save the combined data to a CSV file (optional)
all_team_data.to_csv('team_batting_data.csv', index=False)

# Display the first few rows of the combined DataFrame
print(all_team_data.head())

   Year       Lg   W    L  Finish   R/G    G    PA    AB    R     H   2B  3B  \
0  2023  NL West  84   78       2  4.60  162  6124  5436  746  1359  274  44   
1  2022  NL West  74   88       4  4.33  162  6027  5351  702  1232  262  24   
2  2021  NL West  52  110       5  4.19  162  6144  5489  679  1297  308  31   
3  2020  NL West  25   35       5  4.48   60  2238  1997  269   482  101  12   
4  2019  NL West  85   77       2  5.02  162  6315  5633  813  1419  288  40   

    HR    RBI     SB    CS   BB    SO     BA    OBP    SLG    OPS    E   DP  \
0  166  706.0  166.0  26.0  540  1247  0.250  0.322  0.408  0.730   56  134   
1  173  658.0  104.0  29.0  531  1341  0.230  0.304  0.385  0.689   86  134   
2  144  644.0   43.0  16.0  537  1465  0.236  0.309  0.382  0.692  100  113   
3   58  255.0   23.0   7.0  181   461  0.241  0.312  0.391  0.704   35   54   
4  220  778.0   88.0  14.0  540  1360  0.252  0.323  0.434  0.757   86  136   

    Fld%  BatAge Team  
0  0.990    27.4  AR

In [14]:
all_team_data_batting = all_team_data
all_team_data_batting

Unnamed: 0,Year,Lg,W,L,Finish,R/G,G,PA,AB,R,H,2B,3B,HR,RBI,SB,CS,BB,SO,BA,OBP,SLG,OPS,E,DP,Fld%,BatAge,Team
0,2023,NL West,84,78,2,4.60,162,6124,5436,746,1359,274,44,166,706.0,166.0,26.0,540,1247,0.250,0.322,0.408,0.730,56,134,0.990,27.4,ARI
1,2022,NL West,74,88,4,4.33,162,6027,5351,702,1232,262,24,173,658.0,104.0,29.0,531,1341,0.230,0.304,0.385,0.689,86,134,0.985,26.5,ARI
2,2021,NL West,52,110,5,4.19,162,6144,5489,679,1297,308,31,144,644.0,43.0,16.0,537,1465,0.236,0.309,0.382,0.692,100,113,0.983,28.9,ARI
3,2020,NL West,25,35,5,4.48,60,2238,1997,269,482,101,12,58,255.0,23.0,7.0,181,461,0.241,0.312,0.391,0.704,35,54,0.983,29.1,ARI
4,2019,NL West,85,77,2,5.02,162,6315,5633,813,1419,288,40,220,778.0,88.0,14.0,540,1360,0.252,0.323,0.434,0.757,86,136,0.986,28.7,ARI
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2799,1973,NL East,79,83,4,4.12,162,6255,5369,668,1345,190,23,125,613.0,77.0,68.0,695,777,0.251,0.340,0.364,0.704,163,156,0.974,27.7,WSN
2800,1972,NL East,70,86,5,3.29,156,5820,5156,513,1205,156,22,91,462.0,68.0,66.0,474,828,0.234,0.303,0.325,0.628,134,141,0.978,26.9,WSN
2801,1971,NL East,71,90,5,3.84,162,6098,5335,622,1312,197,29,88,567.0,51.0,43.0,543,800,0.246,0.322,0.343,0.666,150,164,0.976,28.3,WSN
2802,1970,NL East,73,89,6,4.24,162,6251,5411,687,1284,211,35,136,646.0,65.0,45.0,659,972,0.237,0.323,0.365,0.687,141,193,0.977,27.7,WSN


In [15]:
team_dataframes = []
for team_abbr in bref_team_names:
    url = f'https://www.baseball-reference.com/teams/{team_abbr}/pitchteam.shtml#all_yby_team_pitch'
    
    response = requests.get(url)
    
    if response.status_code == 200:
        # Parse the HTML content of the page
        soup = BeautifulSoup(response.text, 'html.parser')

        # Find the "Year-by-Year Team Batting" table by its id
        tables = soup.find_all('table')

        if tables:
            # Convert the first table data to a DataFrame
            team_df = pd.read_html(str(tables[0]))[0]

            # Add a "Team" column to store the team's abbreviation
            team_df['Team'] = team_abbr

            # Append the DataFrame to the list
            team_dataframes.append(team_df)


        else:
            print(f"No tables found on the page for {team_abbr}")
    else:
        print(f"Failed to retrieve data for {team_abbr}")
        
# Concatenate all DataFrames into a single DataFrame
all_team_data_pitching = pd.concat(team_dataframes, ignore_index=True)

# Save the combined data to a CSV file (optional)
all_team_data_pitching.to_csv('team_pitching_data.csv', index=False)

# Display the first few rows of the combined DataFrame
print(all_team_data_pitching.head())

   Year       Lg   W    L  Finish  RA/G   ERA    G  CG  tSho  SV      IP  \
0  2023  NL West  84   78       2  4.70  4.47  162   1     9  44  1435.1   
1  2022  NL West  74   88       4  4.57  4.25  162   0    10  33  1430.0   
2  2021  NL West  52  110       5  5.51  5.11  162   3     4  22  1417.1   
3  2020  NL West  25   35       5  4.92  4.84   60   0     1  13   518.1   
4  2019  NL West  85   77       2  4.59  4.25  162   0    11  45  1465.0   

      H    R   ER   HR   BB    SO   WHIP   SO9   HR9    E   DP   Fld%  PAge  \
0  1375  761  713  197  525  1351  1.324  8.47  1.24   56  134  0.990  28.5   
1  1345  740  676  191  504  1216  1.293  7.65  1.20   86  134  0.985  30.0   
2  1480  893  804  232  555  1238  1.436  7.86  1.47  100  113  0.983  28.5   
3   506  295  279   93  235   524  1.430  9.10  1.61   35   54  0.983  27.7   
4  1400  743  691  220  516  1427  1.308  8.77  1.35   86  136  0.986  28.6   

  Team  
0  ARI  
1  ARI  
2  ARI  
3  ARI  
4  ARI  


In [16]:
all_team_data_pitching

Unnamed: 0,Year,Lg,W,L,Finish,RA/G,ERA,G,CG,tSho,SV,IP,H,R,ER,HR,BB,SO,WHIP,SO9,HR9,E,DP,Fld%,PAge,Team
0,2023,NL West,84,78,2,4.70,4.47,162,1,9,44,1435.1,1375,761,713,197,525,1351,1.324,8.47,1.24,56,134,0.990,28.5,ARI
1,2022,NL West,74,88,4,4.57,4.25,162,0,10,33,1430.0,1345,740,676,191,504,1216,1.293,7.65,1.20,86,134,0.985,30.0,ARI
2,2021,NL West,52,110,5,5.51,5.11,162,3,4,22,1417.1,1480,893,804,232,555,1238,1.436,7.86,1.47,100,113,0.983,28.5,ARI
3,2020,NL West,25,35,5,4.92,4.84,60,0,1,13,518.1,506,295,279,93,235,524,1.430,9.10,1.61,35,54,0.983,27.7,ARI
4,2019,NL West,85,77,2,4.59,4.25,162,0,11,45,1465.0,1400,743,691,220,516,1427,1.308,8.77,1.35,86,136,0.986,28.6,ARI
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2799,1973,NL East,79,83,4,4.33,3.71,162,26,6,38,1451.2,1356,702,599,128,681,866,1.403,5.37,0.79,163,156,0.974,26.2,WSN
2800,1972,NL East,70,86,5,3.90,3.59,156,39,11,23,1401.1,1281,609,559,103,579,888,1.327,5.70,0.66,134,141,0.978,25.9,WSN
2801,1971,NL East,71,90,5,4.50,4.12,162,49,8,25,1434.1,1418,729,656,133,658,829,1.447,5.20,0.83,150,164,0.976,27.0,WSN
2802,1970,NL East,73,89,6,4.98,4.50,162,29,10,32,1438.2,1434,807,720,162,716,914,1.494,5.72,1.01,141,193,0.977,26.7,WSN


In [17]:
all_team_data_pitching.to_csv('/Users/jantmann17/Desktop/Portfolio-Projects/MLB-Expansion-Team-Project/all_team_data_pitching.csv', index = False)

In [18]:
all_team_data_batting.to_csv('/Users/jantmann17/Desktop/Portfolio-Projects/MLB-Expansion-Team-Project/all_team_data_batting.csv', index = False)

In [19]:
name_abbr_dict = {'Arizona Diamondbacks':'ARI', 'Atlanta Braves':'ATL', 'Baltimore Orioles':'BAL', 
                  'Boston Red Sox':'BOS', 'Chicago Cubs':'CHC', 'Chicago White Sox':'CHW', 'Cincinatti Reds':'CIN',
                  'Cleveland Indians': 'CLE', 'Colorado Rockies':'COL', 'Detroit Tigers':'DET', 'Houston Astros':'HOU',
                  'Kansas City Royals':'KCR', 'Anaheim Angels':'ANA', 'Los Angeles Dodgers':'LAD', 
                  'Florida Marlins':'FLA', 'Milwaukee Brewers':'MIL', 'Minnesota Twins':'MIN', 'New York Mets':'NYM',
                  'New York Yankees':'NYY', 'Oakland Athletics':'OAK', 'Philadelphia Phillies':'PHI', 
                  'Pittsburgh Pirates':'PIT', 'San Diego Padres':'SDP', 'Seattle Mariners':'SEA', 
                  'San Francisco Giants':'SFG', 'St. Louis Cardinals':'STL', 'Tampa Bay Rays':'TBD', 'Texas Rangers':'TEX', 
                  'Toronto Blue Jays':'TOR', 'Washington Nationals':'WSN'}

In [None]:
all_team_data_batting = pd.read_csv('/Users/jantmann17/Desktop/Portfolio-Projects/MLB-Expansion-Team-Project/all_team_data_batting.csv')

In [None]:
all_team_data_batting

In [20]:
wswinners['Team'] = wswinners['World Series Champions'].map(name_abbr_dict)
wswinners

Unnamed: 0,Year,World Series Champions,Team
0,2022,Houston Astros,HOU
1,2021,Atlanta Braves,ATL
2,2020,Los Angeles Dodgers,LAD
3,2019,Washington Nationals,WSN
4,2018,Boston Red Sox,BOS
...,...,...,...
113,1909,Pittsburgh Pirates,PIT
114,1908,Chicago Cubs,CHC
115,1907,Chicago Cubs,CHC
116,1906,Chicago White Sox,CHW


In [21]:
wsbat = all_team_data_batting.merge(wswinners, on=['Year', 'Team'], how = 'left')
wsbat['World Series Champions'].count()

98

In [22]:
wspitch = all_team_data_pitching.merge(wswinners, on=['Year', 'Team'], how = 'left')
wspitch['World Series Champions'].count()

98

In [None]:
wspitching = pd.merge(all_team_data_pitching, wswinners, on=['Year', 'Team'], how='inner')

In [None]:
wspitching

In [None]:
wschamps = wspitching.pop('World Series Champions')
wspitching.insert(1, wschamps.name, wschamps)
wspitching = wspitching.drop(columns = ['Team', 'Lg', 'W', 'L', 'Finish'])
wspitching = wspitching.sort_values(by = 'Year', ascending = False).reset_index()
wspitching = wspitching.drop(columns = ['index'])
wspitching

In [None]:
wschamps1 = wsbatting.pop('World Series Champions')
wsbatting.insert(1, wschamps.name, wschamps)
wsbatting = wsbatting.drop(columns = ['Team', 'Lg', 'W', 'L', 'Finish'])
wsbatting = wsbatting.sort_values(by = 'Year', ascending = False).reset_index()
wsbatting = wsbatting.drop(columns = ['index'])
wsbatting

In [None]:
wsbattingmeans = wsbatting.drop(columns = ['Year', 'G']).mean()
wspitchingmeans = wspitching.drop(columns = ['Year', 'G']).mean()
wsbattingmeans

In [None]:
wspitchingmeans

## Placeholder -- below testing on refactoring batting and pitching dataframes to check for WS winner

In [None]:
all_team_data_batting = all_team_data_batting[(all_team_data_batting['Year'] >= 1998) & (all_team_data_batting['Year'] <= 2022)]
all_team_data_pitching = all_team_data_pitching[(all_team_data_pitching['Year'] >= 1998) & (all_team_data_pitching['Year'] <= 2022)]

In [None]:
all_team_data_batting

In [None]:
all_team_data_pitching

In [23]:
batting_df_WS = all_team_data_batting.merge(wswinners, on=['Year', 'Team'], how='left')
batting_df_WS['WS_Winner'] = batting_df_WS['World Series Champions'].apply(lambda x: 'Yes' if pd.notna(x) else 'No')
batting_df_WS.drop(columns=['World Series Champions'], inplace=True)
batting_df_WS = batting_df_WS.drop(columns = ['Lg', 'W', 'L', 'G', 'Finish'])
teamname = batting_df_WS.pop('Team')
batting_df_WS.insert(1, teamname.name, teamname)
batting_df_WS = batting_df_WS[batting_df_WS['Year'] != 2020]
batting_df_WS = batting_df_WS.rename(columns = {'2B':'Doubles', '3B':'Triples'})
batting_df_WS

Unnamed: 0,Year,Team,R/G,PA,AB,R,H,Doubles,Triples,HR,RBI,SB,CS,BB,SO,BA,OBP,SLG,OPS,E,DP,Fld%,BatAge,WS_Winner
0,2023,ARI,4.60,6124,5436,746,1359,274,44,166,706.0,166.0,26.0,540,1247,0.250,0.322,0.408,0.730,56,134,0.990,27.4,No
1,2022,ARI,4.33,6027,5351,702,1232,262,24,173,658.0,104.0,29.0,531,1341,0.230,0.304,0.385,0.689,86,134,0.985,26.5,No
2,2021,ARI,4.19,6144,5489,679,1297,308,31,144,644.0,43.0,16.0,537,1465,0.236,0.309,0.382,0.692,100,113,0.983,28.9,No
4,2019,ARI,5.02,6315,5633,813,1419,288,40,220,778.0,88.0,14.0,540,1360,0.252,0.323,0.434,0.757,86,136,0.986,28.7,No
5,2018,ARI,4.28,6157,5460,693,1283,259,50,176,658.0,79.0,25.0,560,1460,0.235,0.310,0.397,0.707,75,152,0.988,29.2,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2799,1973,WSN,4.12,6255,5369,668,1345,190,23,125,613.0,77.0,68.0,695,777,0.251,0.340,0.364,0.704,163,156,0.974,27.7,No
2800,1972,WSN,3.29,5820,5156,513,1205,156,22,91,462.0,68.0,66.0,474,828,0.234,0.303,0.325,0.628,134,141,0.978,26.9,No
2801,1971,WSN,3.84,6098,5335,622,1312,197,29,88,567.0,51.0,43.0,543,800,0.246,0.322,0.343,0.666,150,164,0.976,28.3,No
2802,1970,WSN,4.24,6251,5411,687,1284,211,35,136,646.0,65.0,45.0,659,972,0.237,0.323,0.365,0.687,141,193,0.977,27.7,No


In [24]:
pitching_df_WS = all_team_data_pitching.merge(wswinners, on=['Year', 'Team'], how='left')
pitching_df_WS['WS_Winner'] = pitching_df_WS['World Series Champions'].apply(lambda x: 'Yes' if pd.notna(x) else 'No')
pitching_df_WS.drop(columns=['World Series Champions'], inplace=True)
pitching_df_WS = pitching_df_WS.drop(columns = ['Lg', 'W', 'L', 'G', 'Finish'])
teamname = pitching_df_WS.pop('Team')
pitching_df_WS.insert(1, teamname.name, teamname)
pitching_df_WS = pitching_df_WS[pitching_df_WS['Year'] != 2020]
pitching_df_WS

Unnamed: 0,Year,Team,RA/G,ERA,CG,tSho,SV,IP,H,R,ER,HR,BB,SO,WHIP,SO9,HR9,E,DP,Fld%,PAge,WS_Winner
0,2023,ARI,4.70,4.47,1,9,44,1435.1,1375,761,713,197,525,1351,1.324,8.47,1.24,56,134,0.990,28.5,No
1,2022,ARI,4.57,4.25,0,10,33,1430.0,1345,740,676,191,504,1216,1.293,7.65,1.20,86,134,0.985,30.0,No
2,2021,ARI,5.51,5.11,3,4,22,1417.1,1480,893,804,232,555,1238,1.436,7.86,1.47,100,113,0.983,28.5,No
4,2019,ARI,4.59,4.25,0,11,45,1465.0,1400,743,691,220,516,1427,1.308,8.77,1.35,86,136,0.986,28.6,No
5,2018,ARI,3.98,3.72,2,9,39,1463.0,1313,644,605,174,522,1448,1.254,8.91,1.07,75,152,0.988,29.6,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2799,1973,WSN,4.33,3.71,26,6,38,1451.2,1356,702,599,128,681,866,1.403,5.37,0.79,163,156,0.974,26.2,No
2800,1972,WSN,3.90,3.59,39,11,23,1401.1,1281,609,559,103,579,888,1.327,5.70,0.66,134,141,0.978,25.9,No
2801,1971,WSN,4.50,4.12,49,8,25,1434.1,1418,729,656,133,658,829,1.447,5.20,0.83,150,164,0.976,27.0,No
2802,1970,WSN,4.98,4.50,29,10,32,1438.2,1434,807,720,162,716,914,1.494,5.72,1.01,141,193,0.977,26.7,No


In [25]:
pitching_df_WS.to_csv('/Users/jantmann17/Desktop/Portfolio-Projects/MLB-Expansion-Team-Project/team_pitching_df.csv', index = False)
batting_df_WS.to_csv('/Users/jantmann17/Desktop/Portfolio-Projects/MLB-Expansion-Team-Project/team_batting_df.csv', index = False)

In [26]:
merged_df = pitching_df_WS.merge(batting_df_WS, on=['Year', 'Team'], how='inner')

In [27]:
merged_df

Unnamed: 0,Year,Team,RA/G,ERA,CG,tSho,SV,IP,H_x,R_x,ER,HR_x,BB_x,SO_x,WHIP,SO9,HR9,E_x,DP_x,Fld%_x,PAge,WS_Winner_x,R/G,PA,AB,R_y,H_y,Doubles,Triples,HR_y,RBI,SB,CS,BB_y,SO_y,BA,OBP,SLG,OPS,E_y,DP_y,Fld%_y,BatAge,WS_Winner_y
0,2023,ARI,4.70,4.47,1,9,44,1435.1,1375,761,713,197,525,1351,1.324,8.47,1.24,56,134,0.990,28.5,No,4.60,6124,5436,746,1359,274,44,166,706.0,166.0,26.0,540,1247,0.250,0.322,0.408,0.730,56,134,0.990,27.4,No
1,2022,ARI,4.57,4.25,0,10,33,1430.0,1345,740,676,191,504,1216,1.293,7.65,1.20,86,134,0.985,30.0,No,4.33,6027,5351,702,1232,262,24,173,658.0,104.0,29.0,531,1341,0.230,0.304,0.385,0.689,86,134,0.985,26.5,No
2,2021,ARI,5.51,5.11,3,4,22,1417.1,1480,893,804,232,555,1238,1.436,7.86,1.47,100,113,0.983,28.5,No,4.19,6144,5489,679,1297,308,31,144,644.0,43.0,16.0,537,1465,0.236,0.309,0.382,0.692,100,113,0.983,28.9,No
3,2019,ARI,4.59,4.25,0,11,45,1465.0,1400,743,691,220,516,1427,1.308,8.77,1.35,86,136,0.986,28.6,No,5.02,6315,5633,813,1419,288,40,220,778.0,88.0,14.0,540,1360,0.252,0.323,0.434,0.757,86,136,0.986,28.7,No
4,2018,ARI,3.98,3.72,2,9,39,1463.0,1313,644,605,174,522,1448,1.254,8.91,1.07,75,152,0.988,29.6,No,4.28,6157,5460,693,1283,259,50,176,658.0,79.0,25.0,560,1460,0.235,0.310,0.397,0.707,75,152,0.988,29.2,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2769,1973,WSN,4.33,3.71,26,6,38,1451.2,1356,702,599,128,681,866,1.403,5.37,0.79,163,156,0.974,26.2,No,4.12,6255,5369,668,1345,190,23,125,613.0,77.0,68.0,695,777,0.251,0.340,0.364,0.704,163,156,0.974,27.7,No
2770,1972,WSN,3.90,3.59,39,11,23,1401.1,1281,609,559,103,579,888,1.327,5.70,0.66,134,141,0.978,25.9,No,3.29,5820,5156,513,1205,156,22,91,462.0,68.0,66.0,474,828,0.234,0.303,0.325,0.628,134,141,0.978,26.9,No
2771,1971,WSN,4.50,4.12,49,8,25,1434.1,1418,729,656,133,658,829,1.447,5.20,0.83,150,164,0.976,27.0,No,3.84,6098,5335,622,1312,197,29,88,567.0,51.0,43.0,543,800,0.246,0.322,0.343,0.666,150,164,0.976,28.3,No
2772,1970,WSN,4.98,4.50,29,10,32,1438.2,1434,807,720,162,716,914,1.494,5.72,1.01,141,193,0.977,26.7,No,4.24,6251,5411,687,1284,211,35,136,646.0,65.0,45.0,659,972,0.237,0.323,0.365,0.687,141,193,0.977,27.7,No


In [28]:
pd.set_option('display.max_columns', None)

In [29]:
merged_df = merged_df.drop(columns=['WS_Winner_x', 'E_y', 'DP_y', 'Fld%_y', 'PA', 'AB']).rename(columns = {'WS_Winner_y':'WS_Winner', 'H_x':'Hits_Allowed', 
                                                                       'R_x':'Runs_Allowed', 'HR_x':'HR_Allowed',
                                                                      'BB_x':'Walks_Allowed', 'SO_x':'SO_Pitch',
                                                                      'E_x':'Errors_Committed', 'DP_x':'GDP',
                                                                      'Fld%_x':'Fielding_Percentage', 'R_y': 'Runs_Scored',
                                                                      'H_y':'Hits_For', 'HR_y':'HR_For', 'BB_y':'Walks_For', 
                                                                      'SO_y':'SO_Bat', 'BatAge':'Batting_Age', 'PAge':'Pitching_Age',
                                                                      'RA/G':'Runs_Allowed_Per_Game', 'R/G':'Runs_Per_Game',
                                                                      '2B':'Doubles', '3B':'Triples'})

In [30]:
merged_df

Unnamed: 0,Year,Team,Runs_Allowed_Per_Game,ERA,CG,tSho,SV,IP,Hits_Allowed,Runs_Allowed,ER,HR_Allowed,Walks_Allowed,SO_Pitch,WHIP,SO9,HR9,Errors_Committed,GDP,Fielding_Percentage,Pitching_Age,Runs_Per_Game,Runs_Scored,Hits_For,Doubles,Triples,HR_For,RBI,SB,CS,Walks_For,SO_Bat,BA,OBP,SLG,OPS,Batting_Age,WS_Winner
0,2023,ARI,4.70,4.47,1,9,44,1435.1,1375,761,713,197,525,1351,1.324,8.47,1.24,56,134,0.990,28.5,4.60,746,1359,274,44,166,706.0,166.0,26.0,540,1247,0.250,0.322,0.408,0.730,27.4,No
1,2022,ARI,4.57,4.25,0,10,33,1430.0,1345,740,676,191,504,1216,1.293,7.65,1.20,86,134,0.985,30.0,4.33,702,1232,262,24,173,658.0,104.0,29.0,531,1341,0.230,0.304,0.385,0.689,26.5,No
2,2021,ARI,5.51,5.11,3,4,22,1417.1,1480,893,804,232,555,1238,1.436,7.86,1.47,100,113,0.983,28.5,4.19,679,1297,308,31,144,644.0,43.0,16.0,537,1465,0.236,0.309,0.382,0.692,28.9,No
3,2019,ARI,4.59,4.25,0,11,45,1465.0,1400,743,691,220,516,1427,1.308,8.77,1.35,86,136,0.986,28.6,5.02,813,1419,288,40,220,778.0,88.0,14.0,540,1360,0.252,0.323,0.434,0.757,28.7,No
4,2018,ARI,3.98,3.72,2,9,39,1463.0,1313,644,605,174,522,1448,1.254,8.91,1.07,75,152,0.988,29.6,4.28,693,1283,259,50,176,658.0,79.0,25.0,560,1460,0.235,0.310,0.397,0.707,29.2,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2769,1973,WSN,4.33,3.71,26,6,38,1451.2,1356,702,599,128,681,866,1.403,5.37,0.79,163,156,0.974,26.2,4.12,668,1345,190,23,125,613.0,77.0,68.0,695,777,0.251,0.340,0.364,0.704,27.7,No
2770,1972,WSN,3.90,3.59,39,11,23,1401.1,1281,609,559,103,579,888,1.327,5.70,0.66,134,141,0.978,25.9,3.29,513,1205,156,22,91,462.0,68.0,66.0,474,828,0.234,0.303,0.325,0.628,26.9,No
2771,1971,WSN,4.50,4.12,49,8,25,1434.1,1418,729,656,133,658,829,1.447,5.20,0.83,150,164,0.976,27.0,3.84,622,1312,197,29,88,567.0,51.0,43.0,543,800,0.246,0.322,0.343,0.666,28.3,No
2772,1970,WSN,4.98,4.50,29,10,32,1438.2,1434,807,720,162,716,914,1.494,5.72,1.01,141,193,0.977,26.7,4.24,687,1284,211,35,136,646.0,65.0,45.0,659,972,0.237,0.323,0.365,0.687,27.7,No


In [31]:
pitchage = merged_df.pop('Pitching_Age')
merged_df.insert(36, pitchage.name, pitchage)
merged_df.iloc[:, 2:37] = merged_df.iloc[:, 2:37].astype(float)
merged_df

Unnamed: 0,Year,Team,Runs_Allowed_Per_Game,ERA,CG,tSho,SV,IP,Hits_Allowed,Runs_Allowed,ER,HR_Allowed,Walks_Allowed,SO_Pitch,WHIP,SO9,HR9,Errors_Committed,GDP,Fielding_Percentage,Runs_Per_Game,Runs_Scored,Hits_For,Doubles,Triples,HR_For,RBI,SB,CS,Walks_For,SO_Bat,BA,OBP,SLG,OPS,Batting_Age,Pitching_Age,WS_Winner
0,2023,ARI,4.70,4.47,1.0,9.0,44.0,1435.1,1375.0,761.0,713.0,197.0,525.0,1351.0,1.324,8.47,1.24,56.0,134.0,0.990,4.60,746.0,1359.0,274.0,44.0,166.0,706.0,166.0,26.0,540.0,1247.0,0.250,0.322,0.408,0.730,27.4,28.5,No
1,2022,ARI,4.57,4.25,0.0,10.0,33.0,1430.0,1345.0,740.0,676.0,191.0,504.0,1216.0,1.293,7.65,1.20,86.0,134.0,0.985,4.33,702.0,1232.0,262.0,24.0,173.0,658.0,104.0,29.0,531.0,1341.0,0.230,0.304,0.385,0.689,26.5,30.0,No
2,2021,ARI,5.51,5.11,3.0,4.0,22.0,1417.1,1480.0,893.0,804.0,232.0,555.0,1238.0,1.436,7.86,1.47,100.0,113.0,0.983,4.19,679.0,1297.0,308.0,31.0,144.0,644.0,43.0,16.0,537.0,1465.0,0.236,0.309,0.382,0.692,28.9,28.5,No
3,2019,ARI,4.59,4.25,0.0,11.0,45.0,1465.0,1400.0,743.0,691.0,220.0,516.0,1427.0,1.308,8.77,1.35,86.0,136.0,0.986,5.02,813.0,1419.0,288.0,40.0,220.0,778.0,88.0,14.0,540.0,1360.0,0.252,0.323,0.434,0.757,28.7,28.6,No
4,2018,ARI,3.98,3.72,2.0,9.0,39.0,1463.0,1313.0,644.0,605.0,174.0,522.0,1448.0,1.254,8.91,1.07,75.0,152.0,0.988,4.28,693.0,1283.0,259.0,50.0,176.0,658.0,79.0,25.0,560.0,1460.0,0.235,0.310,0.397,0.707,29.2,29.6,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2769,1973,WSN,4.33,3.71,26.0,6.0,38.0,1451.2,1356.0,702.0,599.0,128.0,681.0,866.0,1.403,5.37,0.79,163.0,156.0,0.974,4.12,668.0,1345.0,190.0,23.0,125.0,613.0,77.0,68.0,695.0,777.0,0.251,0.340,0.364,0.704,27.7,26.2,No
2770,1972,WSN,3.90,3.59,39.0,11.0,23.0,1401.1,1281.0,609.0,559.0,103.0,579.0,888.0,1.327,5.70,0.66,134.0,141.0,0.978,3.29,513.0,1205.0,156.0,22.0,91.0,462.0,68.0,66.0,474.0,828.0,0.234,0.303,0.325,0.628,26.9,25.9,No
2771,1971,WSN,4.50,4.12,49.0,8.0,25.0,1434.1,1418.0,729.0,656.0,133.0,658.0,829.0,1.447,5.20,0.83,150.0,164.0,0.976,3.84,622.0,1312.0,197.0,29.0,88.0,567.0,51.0,43.0,543.0,800.0,0.246,0.322,0.343,0.666,28.3,27.0,No
2772,1970,WSN,4.98,4.50,29.0,10.0,32.0,1438.2,1434.0,807.0,720.0,162.0,716.0,914.0,1.494,5.72,1.01,141.0,193.0,0.977,4.24,687.0,1284.0,211.0,35.0,136.0,646.0,65.0,45.0,659.0,972.0,0.237,0.323,0.365,0.687,27.7,26.7,No


In [32]:
merged_df.to_csv('/Users/jantmann17/Desktop/Portfolio-Projects/MLB-Expansion-Team-Project/all_team_data.csv', index = False)

In [33]:
merged_df.dtypes

Year                       int64
Team                      object
Runs_Allowed_Per_Game    float64
ERA                      float64
CG                       float64
tSho                     float64
SV                       float64
IP                       float64
Hits_Allowed             float64
Runs_Allowed             float64
ER                       float64
HR_Allowed               float64
Walks_Allowed            float64
SO_Pitch                 float64
WHIP                     float64
SO9                      float64
HR9                      float64
Errors_Committed         float64
GDP                      float64
Fielding_Percentage      float64
Runs_Per_Game            float64
Runs_Scored              float64
Hits_For                 float64
Doubles                  float64
Triples                  float64
HR_For                   float64
RBI                      float64
SB                       float64
CS                       float64
Walks_For                float64
SO_Bat    