Import libraries

In [1]:
import pandas as pd
import numpy as np

Begin merge and analysis of dataframes

In [2]:
#read in data files
total_payroll = pd.read_csv(r'cleansed_data\total_payroll.csv')
mlb_win_totals = pd.read_csv(r'cleansed_data\mlb_win_totals.csv')
total_attendance = pd.read_csv(r'cleansed_data\total_attendance.csv')

In [3]:
#check payroll data
total_payroll.head()

Unnamed: 0,Year,Team,Total Payroll
0,2021,Los Angeles Dodgers,235412876
1,2021,New York Yankees,191205631
2,2021,Boston Red Sox,180261996
3,2021,Los Angeles Angels,177353000
4,2021,Philadelphia Phillies,174009000


In [4]:
#check team win data
mlb_win_totals.head()

Unnamed: 0,Year,G,ARI,ATL,BAL,BOS,CHC,CHW,CIN,CLE,...,PHI,PIT,SDP,SFG,SEA,STL,TBR,TEX,TOR,WSN
0,2023,162,84,104,101,78,83,61,82,76,...,90,76,82,79,88,71,99,90,89,71
1,2022,162,74,101,83,78,74,81,62,92,...,87,62,89,81,90,93,86,68,92,55
2,2021,162,52,88,52,92,71,93,83,80,...,82,61,79,107,90,90,100,60,91,65


In [5]:
#check attendance data
total_attendance.dtypes

Year           int64
Team          object
Attendance     int64
Avg Att        int64
dtype: object

Update team names in mlb_win_totals to match total_payroll

In [6]:
#need to check all team name values in mlb_win_totals
print(mlb_win_totals.columns.to_numpy().tolist())

['Year', 'G', 'ARI', 'ATL', 'BAL', 'BOS', 'CHC', 'CHW', 'CIN', 'CLE', 'COL', 'DET', 'HOU', 'KCR', 'LAA', 'LAD', 'MIA', 'MIL', 'MIN', 'NYM', 'NYY', 'OAK', 'PHI', 'PIT', 'SDP', 'SFG', 'SEA', 'STL', 'TBR', 'TEX', 'TOR', 'WSN']


In [7]:
#renaming mlb_win_totals columns to match total_payroll columns
mlb_win_totals = mlb_win_totals.rename(columns={'ARI': 'Arizona Diamondbacks', 
                                                'ATL': 'Atlanta Braves', 
                                                'BAL': 'Baltimore Orioles', 
                                                'BOS': 'Boston Red Sox', 
                                                'CHC': 'Chicago Cubs', 
                                                'CHW': 'Chicago White Sox', 
                                                'CIN': 'Cincinnati Reds', 
                                                'CLE': 'Cleveland Guardians', 
                                                'COL': 'Colorado Rockies', 
                                                'DET': 'Detroit Tigers', 
                                                'HOU': 'Houston Astros', 
                                                'KCR': 'Kansas City Royals', 
                                                'LAA': 'Los Angeles Angels', 
                                                'LAD': 'Los Angeles Dodgers', 
                                                'MIA': 'Miami Marlins', 
                                                'MIL': 'Milwaukee Brewers', 
                                                'MIN': 'Minnesota Twins', 
                                                'NYM': 'New York Mets', 
                                                'NYY': 'New York Yankees', 
                                                'OAK': 'Oakland Athletics', 
                                                'PHI': 'Philadelphia Phillies', 
                                                'PIT': 'Pittsburgh Pirates', 
                                                'SDP': 'San Diego Padres', 
                                                'SFG': 'San Francisco Giants', 
                                                'SEA': 'Seattle Mariners', 
                                                'STL': 'St. Louis Cardinals', 
                                                'TBR': 'Tampa Bay Rays', 
                                                'TEX': 'Texas Rangers', 
                                                'TOR': 'Toronto Blue Jays', 
                                                'WSN': 'Washington Nationals'})

print(mlb_win_totals.columns.to_numpy().tolist())

['Year', 'G', 'Arizona Diamondbacks', 'Atlanta Braves', 'Baltimore Orioles', 'Boston Red Sox', 'Chicago Cubs', 'Chicago White Sox', 'Cincinnati Reds', 'Cleveland Guardians', 'Colorado Rockies', 'Detroit Tigers', 'Houston Astros', 'Kansas City Royals', 'Los Angeles Angels', 'Los Angeles Dodgers', 'Miami Marlins', 'Milwaukee Brewers', 'Minnesota Twins', 'New York Mets', 'New York Yankees', 'Oakland Athletics', 'Philadelphia Phillies', 'Pittsburgh Pirates', 'San Diego Padres', 'San Francisco Giants', 'Seattle Mariners', 'St. Louis Cardinals', 'Tampa Bay Rays', 'Texas Rangers', 'Toronto Blue Jays', 'Washington Nationals']


In [8]:
#write updated mlb_win_totals to CSV
#mlb_win_totals.to_csv('mlb_win_totals.csv', index=None, header=True)

#check mlb_wins dataframe
mlb_win_totals.head()

Unnamed: 0,Year,G,Arizona Diamondbacks,Atlanta Braves,Baltimore Orioles,Boston Red Sox,Chicago Cubs,Chicago White Sox,Cincinnati Reds,Cleveland Guardians,...,Philadelphia Phillies,Pittsburgh Pirates,San Diego Padres,San Francisco Giants,Seattle Mariners,St. Louis Cardinals,Tampa Bay Rays,Texas Rangers,Toronto Blue Jays,Washington Nationals
0,2023,162,84,104,101,78,83,61,82,76,...,90,76,82,79,88,71,99,90,89,71
1,2022,162,74,101,83,78,74,81,62,92,...,87,62,89,81,90,93,86,68,92,55
2,2021,162,52,88,52,92,71,93,83,80,...,82,61,79,107,90,90,100,60,91,65


Need to adjust the dataframe so the Year + Teams is a column in mlb_win_total

In [9]:
#use melt to arrange mlb_win_totals in a more usable format
mlb_win_totals=mlb_win_totals.melt(id_vars=['Year', 'G'], var_name='Team', value_name='Wins')

mlb_win_totals.head(30)

Unnamed: 0,Year,G,Team,Wins
0,2023,162,Arizona Diamondbacks,84
1,2022,162,Arizona Diamondbacks,74
2,2021,162,Arizona Diamondbacks,52
3,2023,162,Atlanta Braves,104
4,2022,162,Atlanta Braves,101
5,2021,162,Atlanta Braves,88
6,2023,162,Baltimore Orioles,101
7,2022,162,Baltimore Orioles,83
8,2021,162,Baltimore Orioles,52
9,2023,162,Boston Red Sox,78


In [10]:
#check mlb_wins_total shape
mlb_win_totals.shape

(90, 4)

In [11]:
#check payroll shape
total_payroll.shape

(90, 3)

Build keys to merge data on

In [12]:
#combine year and team on payroll DF
win_cols = ['Year', 'Team']
mlb_win_totals['Team Season'] = mlb_win_totals[win_cols].apply(lambda row: ' '.join(row.values.astype(str)), axis=1)

#move Team Season to front of DF
tese = mlb_win_totals.pop("Team Season")
mlb_win_totals.insert(0, "Team Season", tese)

mlb_win_totals.head()

Unnamed: 0,Team Season,Year,G,Team,Wins
0,2023 Arizona Diamondbacks,2023,162,Arizona Diamondbacks,84
1,2022 Arizona Diamondbacks,2022,162,Arizona Diamondbacks,74
2,2021 Arizona Diamondbacks,2021,162,Arizona Diamondbacks,52
3,2023 Atlanta Braves,2023,162,Atlanta Braves,104
4,2022 Atlanta Braves,2022,162,Atlanta Braves,101


In [13]:
#combine year and team on mlb wins DF
pay_cols = ['Year', 'Team']
total_payroll['Team Season'] = total_payroll[pay_cols].apply(lambda row: ' '.join(row.values.astype(str)), axis=1)

#move Team Season to front of DF
tese2 = total_payroll.pop("Team Season")
total_payroll.insert(0, "Team Season", tese2)

total_payroll.head()

Unnamed: 0,Team Season,Year,Team,Total Payroll
0,2021 Los Angeles Dodgers,2021,Los Angeles Dodgers,235412876
1,2021 New York Yankees,2021,New York Yankees,191205631
2,2021 Boston Red Sox,2021,Boston Red Sox,180261996
3,2021 Los Angeles Angels,2021,Los Angeles Angels,177353000
4,2021 Philadelphia Phillies,2021,Philadelphia Phillies,174009000


Merge the dataframes

In [14]:
payroll_wins = total_payroll.merge(mlb_win_totals, how='right')

In [15]:
#moving columns for clean dataframe
games = payroll_wins.pop("G")
payroll_wins.insert(1, "G", games)

#drop redunadant columns
payroll_wins.drop(['Year', 'Team'], axis=1, inplace=True)

#rname Games column for clarity
payroll_wins=payroll_wins.rename(columns={'G': 'Games Played'})

payroll_wins.head()

Unnamed: 0,Team Season,Games Played,Total Payroll,Wins
0,2023 Arizona Diamondbacks,162,112763571,84
1,2022 Arizona Diamondbacks,162,75993333,74
2,2021 Arizona Diamondbacks,162,89077233,52
3,2023 Atlanta Braves,162,199727500,104
4,2022 Atlanta Braves,162,173935000,101


In [16]:
#breakout year and team to seperate columns
payroll_wins[['Year', 'Team']] = payroll_wins['Team Season'].str.split(' ', n=1, expand=True)

#move Year and Team to front of DF
years = payroll_wins.pop("Year")
payroll_wins.insert(0, "Year", years)

teams = payroll_wins.pop("Team")
payroll_wins.insert(1, "Team", teams)

#drop team season column
payroll_wins.drop('Team Season', axis=1, inplace=True)

#cast Year as interger 
payroll_wins['Year'] = payroll_wins['Year'].astype(int)
payroll_wins.dtypes

Year              int32
Team             object
Games Played      int64
Total Payroll     int64
Wins              int64
dtype: object

In [17]:
#merge in attendance doc
atten_wins_payroll = pd.merge(payroll_wins, total_attendance, on=['Year', 'Team'])

atten_wins_payroll

Unnamed: 0,Year,Team,Games Played,Total Payroll,Wins,Attendance,Avg Att
0,2023,Arizona Diamondbacks,162,112763571,84,1961182,24212
1,2022,Arizona Diamondbacks,162,75993333,74,1605199,19817
2,2021,Arizona Diamondbacks,162,89077233,52,1043010,12876
3,2023,Atlanta Braves,162,199727500,104,3191505,39401
4,2022,Atlanta Braves,162,173935000,101,3129931,38641
...,...,...,...,...,...,...,...
85,2022,Toronto Blue Jays,162,168070905,92,2653830,32763
86,2021,Toronto Blue Jays,162,137133333,91,809557,10119
87,2023,Washington Nationals,162,79983095,71,1865832,23034
88,2022,Washington Nationals,162,114623095,55,2026401,25017


In [18]:
#write payroll_wins to csv
atten_wins_payroll.to_csv(r'cleansed_data\atten_wins_payroll.csv', index=None, header=True)