In [113]:
import pandas as pd
import pandas as pd
from pandas.api.types import is_numeric_dtype, is_integer_dtype
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LinearRegression
from clean_baseball_data import match

# Objective: 
### &emsp;The goal of this file is to transform the cleaned data for 2024 into the same format as the data in TeamWins_UpdatedVersion.csv.  I will need to join the rosters data with the statistics data for 2024 and then aggregate it so that there are statistics that are representative of each team as a whole.

### Join player pitching/hitting data with data that contains the players on the 2024 rosters.

In [116]:
def is_Pitcher(pos):
    """Return True when the position code contains the letter 'P', else False.

    The check is a plain containment test, so any code with a capital 'P'
    anywhere in it (for example 'SP' or 'RP') counts as a pitcher.
    """
    return 'P' in pos
pitching=pd.read_csv('pitchingAllHistory_clean.csv') #Data contains career stats of every pitcher in mlb history
hitting=pd.read_csv('hittingAllHistory_clean.csv') #Data contains career stats of every hitter in mlb history.
rosters=pd.read_csv("PlayerTeamsAll.csv") #File contains this year's (2024) rosters: player name, team, position and roster status.

L=["60-Day IL","Optioned","Reassigned",'Projected Restricted List (visa)','Released','Projected Injured List','Projected Restricted List',
   'Projected Restricted List (SUSP)','Projected Injured List (MiLB)'] #List of roster statuses to exclude.

# Take out players who likely will not be playing. The .copy() makes the filtered
# frame independent, so adding the 'Pitcher' column below does not write into a
# slice of the original frame (which triggers SettingWithCopyWarning).
rosters=rosters[~rosters['Status'].isin(L)].copy()
rosters['Pitcher']=rosters['Pos'].apply(is_Pitcher) #Make a column which indicates if a player is a pitcher or not.
rosters=rosters.rename(columns={'Name':'PLAYER'}) #Rename name column to player for joining purposes.
rosters.info() #info() prints its own report and returns None, so wrapping it in print() only adds a stray "None".

# The joins below match on the player's name only. Names are not unique, so a
# roster player can pick up the career line of a namesake; such rows have to be
# removed by hand afterwards.
roster_hitting=rosters[~rosters['Pitcher']] #Get dataFrame containing only hitters

joined_hitting=pd.merge(roster_hitting,hitting,on='PLAYER',how='inner') #Merge the roster data with the hitters career stats data.

roster_pitching=rosters[rosters['Pitcher']] #Get dataFrame containing only pitchers

joined_pitching=pd.merge(roster_pitching,pitching, on='PLAYER',how='inner') #Join rosters data with pitchers career stats data.


<class 'pandas.core.frame.DataFrame'>
Index: 902 entries, 0 to 1977
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  902 non-null    int64 
 1   PLAYER      902 non-null    object
 2   Team        902 non-null    object
 3   Pos         902 non-null    object
 4   Status      901 non-null    object
 5   Pitcher     902 non-null    bool  
dtypes: bool(1), int64(1), object(4)
memory usage: 43.2+ KB
None


# Pitching
**When the team statistics are calculated, they will be sensitive to extreme values caused by rookies who, for
example, have pitched only a few innings and gave up an exorbitant number of runs in those innings.  For example,
I re-ran the scrape after the Dodgers played their opening games in Seoul and the Dodgers' team ERA was above 8, because
rookie Yamamoto gave up 5 runs in 1 inning, making his career ERA 45.  Therefore, I am removing pitchers who have not pitched at least 
10 innings.**

In [118]:
# Keep only pitchers whose career innings pitched (IP) is at least 10, so that
# tiny samples do not dominate the team averages.
joined_pitching = joined_pitching.loc[joined_pitching['IP'].ge(10)]
joined_pitching.head(n=10)

Unnamed: 0,Unnamed: 0_x,PLAYER,Team,Pos,Status,Pitcher,Unnamed: 0_y,W,L,ERA,...,IP,H,R,ER,HR,HB,BB,SO,WHIP,AVG
0,12,Zac Gallen,ARI,SP,Starting Rotation,True,3253,39,31,3.21,...,667.1,534,254,238,73,28.0,204,729,1.11,0.217
1,13,Merrill Kelly,ARI,SP,Starting Rotation,True,3065,48,43,3.8,...,750.2,683,334,317,96,13.0,233,681,1.22,0.242
2,14,Brandon Pfaadt,ARI,SP,Starting Rotation,True,6471,3,9,5.72,...,96.0,109,63,61,22,3.0,26,94,1.41,0.282
3,15,Ryne Nelson,ARI,SP,Starting Rotation,True,5556,9,9,4.88,...,162.1,168,91,88,26,1.0,52,112,1.36,0.271
4,16,Tommy Henry,ARI,SP,Starting Rotation,True,5921,8,8,4.57,...,136.0,133,70,69,22,7.0,56,100,1.39,0.255
5,20,Kevin Ginkel,ARI,RP,Closer,True,2537,13,5,3.58,...,163.1,134,82,65,16,5.0,70,177,1.25,0.223
6,21,Scott McGough,ARI,RP,Setup Man,True,4191,2,7,5.14,...,77.0,72,47,44,14,2.0,34,90,1.38,0.242
7,22,Miguel Castro,ARI,RP,Setup Man,True,728,22,28,4.15,...,447.0,384,225,206,55,22.0,230,402,1.37,0.231
8,23,Ryan Thompson,ARI,RP,Middle Reliever,True,2873,8,9,3.57,...,133.2,114,64,53,14,8.0,36,120,1.12,0.226
9,24,Kyle Nelson,ARI,RP,Middle Reliever,True,3140,9,5,4.27,...,103.1,98,54,49,14,6.0,37,105,1.31,0.246


**Classify players as either Bullpen or Starting Rotation.  These categories will be used for weighting later.**

In [120]:

# Work on an independent copy: joined_pitching was produced by boolean filtering,
# and assigning columns on such a slice raises SettingWithCopyWarning.
joined_pitching = joined_pitching.copy()

# Change the status column so that players are categorized as either
# 'Starting Rotation' or 'Bullpen' (every other role, including a missing one,
# becomes 'Bullpen').
joined_pitching['Status'] = joined_pitching['Status'].where(
    joined_pitching['Status'] == 'Starting Rotation', 'Bullpen'
)

# Rate statistics must be floats so that they are averaged rather than summed
# when the per-team aggregation picks a function by dtype.
joined_pitching = joined_pitching.astype({'WHIP': float, 'ERA': float, 'AVG': float})

# Drop the leftover CSV index columns if they are present. errors='ignore'
# replaces the former bare `except:`, which hid every kind of failure, and makes
# the cell safe to re-run after the columns are already gone.
joined_pitching = joined_pitching.drop(columns=['Unnamed: 0_x', 'Unnamed: 0_y'], errors='ignore')

In [121]:
class CalcTeamStatistics:
    """Aggregate player rows into one row per (Team, Status) pair.

    On construction the frame is stripped of every column whose name starts with
    'Unnamed', 'PLAYER' is renamed to 'Player', the numeric columns are recorded
    in ``features`` and the grouped result is stored in ``grouped``.
    """

    def __init__(self, df):
        unnamed_cols = [name for name in df.columns if name.startswith('Unnamed')]
        self.df = df.drop(columns=unnamed_cols).rename(columns={'PLAYER': 'Player'})
        # Boolean columns also count as numeric for is_numeric_dtype.
        self.features = [name for name in self.df.columns if is_numeric_dtype(self.df[name])]
        self.grouped = self.group()

    def feature_dict(self):
        """Map each feature to its aggregation: 'sum' for integer columns and for
        'IP', 'mean' for everything else.

        NOTE(review): if IP is stored in baseball notation (.1/.2 meaning thirds
        of an inning), summing it as a decimal is slightly off -- confirm.
        """
        return {
            name: 'sum' if (name == 'IP' or is_integer_dtype(self.df[name])) else 'mean'
            for name in self.features
        }

    def group(self):
        """Return the features aggregated per Team and Status, with both keys as
        ordinary columns."""
        per_role = self.df.groupby(['Team', 'Status']).agg(self.feature_dict())
        return per_role.reset_index()
class weightPitching(CalcTeamStatistics):
    """Combine each team's 'Starting Rotation' and 'Bullpen' rows into one
    weighted pitching line per team.

    Construction runs the whole pipeline: group per (Team, Status), attach a
    weight to every row, collapse to one row per team, then rescale the count
    columns. ``get()`` returns the result with '_pitch' appended to each feature.
    """

    def __init__(self, df):
        super().__init__(df)
        self.regrouped = self.starter_depth()
        self.weighted = self.weightedAverage()
        self.totals()

    def starter_depth(self):
        """Attach a 'Weight' column to the grouped rows.

        The starters' weight is (IP / G) / 9 computed on the team's
        'Starting Rotation' row; the team's other row gets one minus that value.
        Teams without a 'Starting Rotation' row are dropped by the inner merge.
        """
        in_rotation = self.grouped['Status'] == 'Starting Rotation'
        starters = self.grouped[in_rotation].copy()
        starters['starter contribution'] = (starters['IP'] / starters['G']) / 9
        updated_grouped = pd.merge(
            self.grouped,
            starters[['starter contribution', 'Team']],
            on='Team',
            how='inner',
        )

        def role_weight(row):
            share = row['starter contribution']
            if row['Status'] == 'Starting Rotation':
                return share
            return 1 - share

        updated_grouped['Weight'] = updated_grouped.apply(role_weight, axis=1)
        return updated_grouped.drop(columns=['starter contribution'])

    def weightedAverage(self):
        """Scale the non-integer features (except 'IP') by 'Weight', then sum
        every feature per team.

        The scaling is written back into ``self.regrouped``, so that frame holds
        weighted values afterwards. The feature list is printed as a side effect.
        """
        print(self.features)
        for name in self.features:
            if not is_integer_dtype(self.regrouped[name]) and name != 'IP':
                self.regrouped[name] = self.regrouped[name] * self.regrouped['Weight']
        # Every feature is summed: the rate columns were already weighted above.
        return self.regrouped.groupby('Team').agg(dict.fromkeys(self.features, 'sum'))

    def totals(self):
        """Rescale every integer feature except 'G' to a per-(162*9)-innings
        figure, in place on ``self.weighted``.

        162*9 is presumably games per season times innings per game -- confirm.
        """
        for name in self.features:
            if name == 'G' or not is_integer_dtype(self.weighted[name]):
                continue
            self.weighted[name] = (self.weighted[name] / self.weighted['IP']) * 162 * 9

    def get(self):
        """Return the per-team frame with '_pitch' appended to every feature name."""
        return self.weighted.rename(columns={name: name + '_pitch' for name in self.features})
class weightedHitting(CalcTeamStatistics):
    """Combine each team's per-status hitting rows into one weighted line.

    ``weights`` maps a Status value to its multiplier; a Status missing from the
    mapping raises KeyError while the weights are applied.
    """

    def __init__(self, df, weights):
        # weights must be set before the base constructor runs the grouping.
        self.weights = weights
        super().__init__(df)
        print(self.grouped)
        self.weightedAvg()
        self.regrouped = self.regroup()

    def weightedAvg(self):
        """Multiply every non-integer feature of each grouped row by the weight
        of that row's Status, in place on ``self.grouped``."""
        scaled_cols = [name for name in self.features if not is_integer_dtype(self.grouped[name])]
        for ind in self.grouped.index:
            factor = self.weights[self.grouped.loc[ind, 'Status']]
            for name in scaled_cols:
                self.grouped.at[ind, name] = self.grouped.loc[ind, name] * factor

    def regroup(self):
        """Sum every feature per team and return the result with 'Team' as a column."""
        how = dict.fromkeys(self.features, 'sum')
        return self.grouped.groupby('Team').agg(how).reset_index()

    def get(self):
        """Return the team frame after replacing every integer-typed feature with
        the prediction of a LinearModel fitted for that column."""
        for name in self.features:
            if is_integer_dtype(self.regrouped[name]):
                self.regrouped = LinearModel(self.regrouped, name).predict()
        return self.regrouped

class LinearModel:
    """Linear regression that predicts one column of ``df`` from a few hitting
    columns, trained on "TeamWins_UpdatedVersion.csv".

    The training file is read and the model fitted on every construction.
    """

    def __init__(self, df, col):
        self.df = df
        self.col = col
        self.data = pd.read_csv("TeamWins_UpdatedVersion.csv")

        self.features = self.Features()

        self.X = self.data[self.features]
        self.y = self.data[self.col]
        self.model = LinearRegression()
        self.model.fit(self.X, self.y)

    def Features(self):
        """Return the predictor names: 'AB', 'AVG', 'OBP' (plus 'SLG' when the
        target is 'HR'), with the target column itself removed."""
        candidates = ['AB', 'AVG', 'OBP'] + (['SLG'] if self.col == 'HR' else [])
        return [name for name in candidates if name != self.col]

    def predict(self):
        """Overwrite the target column of ``self.df`` with the model's predictions
        and return that same frame (the caller's frame is modified in place)."""
        self.df[self.col] = self.model.predict(self.df[self.features])
        return self.df


**Now we need to create a weighted average for each statistic that best represents the team as a whole. For pitching
I will weight based on two categories: starting rotation and bullpen.  To do this I will find the average depth of the team's starting pitching (innings pitched per game appeared in) and divide it by nine, which gives the fraction of a game that the team's starters will generally pitch.  This value will 
become the weight that is applied to the starters' statistics; the rest of the weight will be given to the bullpen.**


**The methods in weightPitching use the datatype of a column to determine which aggregate function to apply to it:
float columns are averaged while integer columns are summed.  The one exception is IP, which is a float column but is summed.**

In [123]:

# Build the aggregated, weighted pitching line for every team.
pitching_weighter = weightPitching(joined_pitching)
team_pitching = pitching_weighter.get()


['Pitcher', 'W', 'L', 'ERA', 'G', 'GS', 'CG', 'SHO', 'SV', 'IP', 'H', 'R', 'ER', 'HR', 'HB', 'BB', 'SO', 'WHIP', 'AVG']


# Collective Team Pitching Statistics

In [125]:
# Preview the first five teams of the weighted pitching table.
team_pitching.head(n=5)

Unnamed: 0_level_0,Pitcher_pitch,W_pitch,L_pitch,ERA_pitch,G_pitch,GS_pitch,CG_pitch,SHO_pitch,SV_pitch,IP_pitch,H_pitch,R_pitch,ER_pitch,HR_pitch,HB_pitch,BB_pitch,SO_pitch,WHIP_pitch,AVG_pitch
Team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
ARI,1.0,89.034259,79.078352,4.339505,2235,160.147885,2.560091,0.853364,39.254721,5125.6,1321.860075,692.931169,643.151631,176.077337,10.386579,535.05892,1348.883253,1.300416,0.246777
ATL,1.0,93.381581,82.593817,3.692941,3693,171.929987,4.382529,1.517029,49.050614,8649.8,1294.868783,656.536567,597.878101,157.602488,36.388034,480.055493,1578.047585,1.207876,0.228941
BAL,1.0,83.973206,79.105194,3.993434,2483,150.90837,0.608501,0.608501,138.73834,4792.1,1250.166315,658.398614,606.067486,179.812191,12.804783,476.456668,1529.772751,1.238738,0.240627
BOS,1.0,82.099921,84.802799,4.000799,2121,169.605599,1.013579,0.0,155.077629,4315.4,1328.464569,688.220327,646.663577,191.904343,14.494526,479.760856,1542.329796,1.277692,0.244622
CHC,1.0,87.718572,74.309364,3.746728,1902,200.858769,2.793585,1.676151,34.640455,5219.1,1350.977755,665.71133,612.353854,190.522504,19.389335,434.9612,1389.529229,1.262023,0.242798


# Hitting

In [127]:
# List every roster status that occurs among the hitters.
print('Hitting Categories')
hitter_statuses = joined_hitting['Status'].unique()
for status in hitter_statuses:
    print('\t', status)

Hitting Categories
	 Lineup Regular
	 Platoon vs R
	 Bench
	 10-Day IL
	 DFA
	 Platoon vs L
	 15-Day IL


In [128]:
# DFA stands for "Designated for Assignment"; the message previously said
# "Reassignment".
print('Remove Players Designated for Assignment')
print('Length Before',len(joined_hitting))
# .copy() detaches the filtered rows so later column assignments on
# joined_hitting do not operate on a slice of the merged frame.
joined_hitting=joined_hitting[joined_hitting['Status']!='DFA'].copy()
print('Length After', len(joined_hitting))

Remove Players Designated for Reassignment
Length Before 400
Length After 393


In [129]:
# Show the statuses that are left after the DFA rows were removed.
print('Remaining Hitter Categories')
remaining_statuses = joined_hitting['Status'].unique()
remaining_statuses

Remaining Hitter Categories


array(['Lineup Regular', 'Platoon vs R', 'Bench', '10-Day IL',
       'Platoon vs L', '15-Day IL'], dtype=object)

**Look at players that are on the injured list**

In [131]:
# Statuses of hitters who are available to play; every other remaining status
# is an injured-list designation ('10-Day IL', '15-Day IL').
not_interested=['Lineup Regular', 'Platoon vs R', 'Bench',
       'Platoon vs L' ]
# Select the injured-list rows directly. The earlier version compared against the
# non-existent label '10-Day-IL', counted status categories instead of players,
# and relied on a positional `count!=5` check that skipped one IL category.
data=joined_hitting[~joined_hitting['Status'].isin(not_interested)]
count=len(data)
print(f'{count} players on the IL')
data.head()
    

6 players on the IL


Unnamed: 0,Unnamed: 0_x,PLAYER,Team,Pos,Status,Pitcher,Unnamed: 0_y,G,AB,R,...,HR,RBI,BB,SO,SB,CS,AVG,OBP,SLG,OPS
11,30,Randal Grichuk,ARI,OF,10-Day IL,False,1335,1142,3968,536,...,191,556,236,1086.0,27,18,0.249,0.296,0.465,0.761
52,431,Vaughn Grissom,BOS,2B,10-Day IL,False,9952,64,216,29,...,5,27,13,49.0,5,3,0.287,0.339,0.407,0.746
53,432,Rob Refsnyder,BOS,OF,10-Day IL,False,4245,378,891,127,...,13,84,114,224.0,17,7,0.244,0.336,0.342,0.678
67,498,Patrick Wisdom,CHC,INF/OF,10-Day IL,False,4227,380,1153,176,...,80,184,122,481.0,18,8,0.214,0.298,0.468,0.766
80,662,Max Stassi,CHW,C,10-Day IL,False,3952,403,1137,136,...,41,128,119,379.0,0,0,0.212,0.295,0.361,0.656


**Similar to pitching, I needed a weighted average for the team's hitting statistics.  How to weight each player category
was not as easy to determine for hitting.  I decided to categorize a player as either lineup regular or bench and 
assigned 85% of the weight to the lineup regular category and the remaining 15% to the bench.  This is probably generally pretty 
accurate; however, different teams are managed differently, and some teams give certain bench players a lot of 
at-bats in certain situations.  For example, certain bench players will get more at-bats than a lineup regular if the
pitcher is left-handed.  I encourage you to come up with different categories and weights to make this more accurate.**

In [133]:

def get_hitter_category(x):
    """Map a hitter row to 'Lineup Regular', 'Platoon' or 'Bench'.

    Parameters
    ----------
    x : mapping or pandas.Series
        Must provide 'Status' (roster status) and 'OPS' (numeric).

    Returns
    -------
    str
        'Lineup Regular', 'Platoon' and 'Bench' statuses map to themselves
        ('Platoon vs R' / 'Platoon vs L' both become 'Platoon'). Any other
        status, including a missing one, is classified by OPS: above 0.79 is
        'Lineup Regular', above 0.69 is 'Platoon', otherwise 'Bench'.
    """
    status = x['Status']
    if status == 'Lineup Regular':
        return status
    if status in ('Platoon vs R', 'Platoon vs L'):
        return 'Platoon'
    # Equality, not `in 'Bench'`: the substring test also matched '' or 'B' and
    # raised TypeError when the status was NaN.
    if status == 'Bench':
        return 'Bench'

    ops = x['OPS']
    if ops > 0.79:
        return 'Lineup Regular'
    if ops > 0.69:
        return 'Platoon'
    return 'Bench'
# Drop the leftover CSV index columns if they are present. errors='ignore'
# replaces the former bare `except:` (which hid every kind of failure), and
# drop() returns a new frame, so the assignments below no longer act on a slice.
joined_hitting = joined_hitting.drop(columns=['Unnamed: 0_x', 'Unnamed: 0_y'], errors='ignore')

# Rate statistics become floats and counting statistics become integers; the
# per-team aggregation chooses mean vs. sum from these dtypes.
joined_hitting = joined_hitting.astype({
    'AVG': float,
    'OPS': float,
    'SLG': float,
    'OBP': float,
    'RBI': int,
    'SB': int,
    'CS': int,
    'SO': int,
})

# Reduce the roster statuses to 'Lineup Regular' / 'Platoon' / 'Bench'.
joined_hitting['Status'] = joined_hitting.apply(get_hitter_category, axis=1)

# If a team does not have 9 players listed as 'Lineup Regular', promote other
# hitters until a full 9-man lineup is filled: 'Platoon' players first, then
# 'Bench' players. Within each category players are taken in the order their
# rows appear in joined_hitting -- no ranking by performance is applied.
# Everyone who is still not a 'Lineup Regular' afterwards is placed on the bench.
inds_to_change = []
for team in joined_hitting['Team'].unique():
    data = joined_hitting[joined_hitting['Team'] == team]
    amount = 9 - int((data['Status'] == 'Lineup Regular').sum())
    if amount <= 0:
        continue
    platoon_inds = data[data['Status'] == 'Platoon'].index[:amount]
    # Bench players only fill the slots the platoon players could not.
    bench_inds = data[data['Status'] == 'Bench'].index[:amount - len(platoon_inds)]
    inds_to_change.extend(platoon_inds)
    inds_to_change.extend(bench_inds)

joined_hitting.loc[inds_to_change, 'Status'] = 'Lineup Regular'
joined_hitting['Status'] = joined_hitting['Status'].where(
    joined_hitting['Status'] == 'Lineup Regular', 'Bench'
)
                    


In [134]:
# Print, for every team, how many hitters ended up in each category.
for team in joined_hitting['Team'].unique():
    print(team)
    team_rows = joined_hitting.loc[joined_hitting['Team'] == team]
    print(team_rows['Status'].value_counts())

ARI
Status
Lineup Regular    9
Bench             3
Name: count, dtype: int64
ATL
Status
Lineup Regular    9
Bench             1
Name: count, dtype: int64
BAL
Status
Lineup Regular    9
Bench             4
Name: count, dtype: int64
BOS
Status
Lineup Regular    9
Bench             6
Name: count, dtype: int64
CHC
Status
Lineup Regular    9
Bench             5
Name: count, dtype: int64
CHW
Status
Lineup Regular    9
Bench             4
Name: count, dtype: int64
CIN
Status
Lineup Regular    9
Bench             5
Name: count, dtype: int64
CLE
Status
Lineup Regular    9
Bench             4
Name: count, dtype: int64
COL
Status
Lineup Regular    9
Bench             3
Name: count, dtype: int64
DET
Status
Lineup Regular    9
Bench             3
Name: count, dtype: int64
HOU
Status
Lineup Regular    9
Bench             4
Name: count, dtype: int64
KCR
Status
Lineup Regular    9
Bench             4
Name: count, dtype: int64
LAA
Status
Lineup Regular    9
Bench             5
Name: count, dtype: int64

**Most teams have nearly the right balance of starters and bench players.  Atlanta, however, only has 1 bench player.  There is also the issue of new players that don't have much statistical history.  These players could potentially skew the results significantly.  Below, for every player with 100 or fewer at-bats, I blend the player's stats with the average of their category (40% the player's own value, 60% the category average).**

In [136]:

# Pull the stat lines of low-sample hitters toward the average of their category.
# main_stats: 'Status' plus every numeric column except 'AB' (AB is only used
# below as the sample-size gate and is itself left unchanged).
main_stats=joined_hitting[['Status']+[col for col in joined_hitting.columns if is_numeric_dtype(joined_hitting[col]) and col!='AB']]
# league_avg: one row per Status holding the mean of each numeric column over the
# hitters currently in joined_hitting.
league_avg=main_stats.groupby('Status').agg({col:'mean' for col in main_stats.columns if is_numeric_dtype(main_stats[col])}).reset_index()
for col in league_avg.columns:
    # Skip the grouping key and the columns that are not player statistics.
    if col not in ['Status','Pitcher','Unnamed: 0_y','Unnamed: 0_x']:
        # Hitters with more than 100 AB keep their own value; all others get
        # 0.4 * own value + 0.6 * the mean of their Status group.
        # NOTE(review): the blended values are floats, so an integer column turns
        # into a float column as soon as one row is blended; code that chooses
        # sum vs. mean from the column dtype will then treat it as a float column.
        joined_hitting[col]=joined_hitting.apply(lambda x: x[col] if x['AB']>100 else x[col]*0.4+league_avg[league_avg['Status']==x['Status']][col].values[0]*0.6, axis=1)


**Some players who share a name with a current player got into the data.  For example, there are two Josh Bells in the data set.  I looked up these five players and filtered out the rows that were incorrect (the stats did not represent the player who is actually on the team).**

In [138]:
df = joined_hitting
# keep=False flags every row of a repeated name, not only the later occurrences.
repeated_name = df['PLAYER'].duplicated(keep=False)
duplicates = df.loc[repeated_name]
duplicates

Unnamed: 0,PLAYER,Team,Pos,Status,Pitcher,G,AB,R,H,2B,...,HR,RBI,BB,SO,SB,CS,AVG,OBP,SLG,OPS
95,José Ramírez,CLE,3B,Lineup Regular,False,1293.0,4757,784.0,1327.0,325.0,...,216.0,746.0,546.0,646.0,202.0,48.0,0.279,0.355,0.499,0.854
96,José Ramírez,CLE,3B,Lineup Regular,False,368.202222,0,167.742222,298.015556,60.993333,...,47.915556,159.895556,116.92,274.037778,20.242222,6.466667,0.153736,0.196,0.259011,0.455011
177,Will Smith,LAD,C,Lineup Regular,False,486.0,1680,273.0,441.0,85.0,...,91.0,308.0,216.0,362.0,9.0,0.0,0.263,0.358,0.484,0.842
178,Will Smith,LAD,C,Lineup Regular,False,529.402222,1,167.742222,298.415556,60.993333,...,47.915556,160.695556,116.92,274.037778,20.242222,6.466667,0.553736,0.596,0.659011,1.255011
186,Josh Bell,MIA,1B,Lineup Regular,False,1002.0,3497,488.0,908.0,186.0,...,152.0,542.0,469.0,754.0,4.0,13.0,0.26,0.347,0.453,0.8
187,Josh Bell,MIA,1B,Lineup Regular,False,100.0,272,24.0,53.0,7.0,...,4.0,22.0,10.0,92.0,0.0,1.0,0.195,0.223,0.265,0.488
255,Nick Allen,OAK,SS,Lineup Regular,False,206.0,602,60.0,129.0,17.0,...,8.0,39.0,36.0,116.0,8.0,3.0,0.214,0.26,0.289,0.549
256,Nick Allen,OAK,SS,Lineup Regular,False,100.0,500,41.0,116.0,13.0,...,0.0,36.0,33.0,73.0,8.0,0.0,0.232,0.288,0.278,0.566
328,Tom Murphy,SFG,C,Bench,False,315.0,911,114.0,222.0,45.0,...,48.0,126.0,90.0,325.0,3.0,1.0,0.244,0.313,0.456,0.769
329,Tom Murphy,SFG,C,Bench,False,296.0,289,21.0,42.0,7.0,...,1.0,18.0,13.0,96.0,0.0,0.0,0.145,0.181,0.18,0.361


In [139]:
# Row labels of the stat lines that belong to a different player with the same
# name as a current player. These are hard-coded index labels, so they are only
# valid for the exact row order produced by the merge above.
drop_inds = [96, 178, 187, 256, 329]
is_wrong_namesake = joined_hitting.index.isin(drop_inds)
joined_hitting = joined_hitting.loc[~is_wrong_namesake]


# Aggregate the player statistics into team statistics using the described weights.

In [141]:
# Share of the team line given to each hitter category.
weights = {
    'Lineup Regular': 0.85,
    'Bench': 0.15,
}
X = weightedHitting(joined_hitting, weights)
# DataFrame with the aggregated team averages for each statistic.
team_hitting = X.get()

   Team          Status  Pitcher            G     AB           R           H  \
0   ARI           Bench      0.0   660.666667   5704  205.333333  453.666667   
1   ARI  Lineup Regular      0.0   681.000000  20603  329.666667  577.000000   
2   ATL           Bench      0.0   206.278049     13   80.092683  149.687805   
3   ATL  Lineup Regular      0.0   701.666667  22265  351.777778  632.555556   
4   BAL           Bench      0.0   543.669512   5857  194.673171  380.771951   
5   BAL  Lineup Regular      0.0   374.666667  11880  178.333333  338.222222   
6   BOS           Bench      0.0   202.746341   2586   67.448780  133.147967   
7   BOS  Lineup Regular      0.0   386.778025  11141  208.571358  357.890617   
8   CHC           Bench      0.0   220.200000   3132   93.200000  149.600000   
9   CHC  Lineup Regular      0.0   605.622469  17920  297.371358  541.979506   
10  CHW           Bench      0.0   278.689024   2079   93.596341  183.293902   
11  CHW  Lineup Regular      0.0   702.5

In [142]:
# Preview the first five teams of the weighted hitting table.
team_hitting.head(n=5)

Unnamed: 0,Team,Pitcher,G,AB,R,H,2B,3B,HR,RBI,BB,SO,SB,CS,AVG,OBP,SLG,OPS
0,ARI,0.0,677.95,5477.997698,311.016667,558.5,114.416667,13.783333,91.861111,300.044444,228.7,566.844444,30.977778,11.011111,0.252406,0.325217,0.420094,0.745311
1,ATL,0.0,627.358374,5517.16059,311.025014,560.125393,112.275745,9.215041,105.457073,333.178509,206.047019,555.655257,21.594499,8.166531,0.261645,0.332859,0.458879,0.791738
2,BAL,0.0,400.017093,5489.636395,180.784309,344.604682,72.796714,6.709316,50.199268,179.167683,118.097588,332.382425,20.66918,7.605799,0.251917,0.318453,0.421964,0.740417
3,BOS,0.0,359.173272,5512.739862,187.402971,324.17922,73.362921,6.366531,57.170426,183.438036,112.609355,347.769815,26.811589,7.383681,0.257128,0.322291,0.441921,0.764212
4,CHC,0.0,547.809099,5476.622053,266.745654,483.12258,98.404926,9.381802,75.673136,267.25458,185.745778,500.056901,33.481765,10.590741,0.251883,0.324394,0.424575,0.74897


# Join the Pitching and hitting stats and add flags for the division and league.

In [144]:
# One row per team: pitching and hitting lines side by side, tagged with the
# season and keyed by the team code.
team_stats_24 = (
    pd.merge(team_pitching, team_hitting, how='inner', on='Team')
    .assign(Year=2024)
    .rename(columns={'Team': 'CODE'})
)

# leagueInfo.csv contains the current teams with their leagues and divisions.
leagues = pd.read_csv('leagueInfo.csv')[['CODE', 'division_id', 'league_id']]

# Print both sets of team codes and their sizes so a mismatch is visible before
# the join below silently drops any team that appears on only one side.
print(leagues['CODE'].sort_values().unique())
print(len(leagues))
print(team_stats_24["CODE"].sort_values().unique())
print(len(team_stats_24))

# Add the league/division information and rename so the columns match the
# format of the training data.
team_stats_24 = pd.merge(leagues, team_stats_24, on=['CODE']).rename(
    columns={'division_id': 'DIVISION', 'league_id': 'LEAGUE'}
)
team_stats_24.to_csv('2024teamStatsProjections2.csv')

['ARI' 'ATL' 'BAL' 'BOS' 'CHC' 'CHW' 'CIN' 'CLE' 'COL' 'DET' 'HOU' 'KCR'
 'LAA' 'LAD' 'MIA' 'MIL' 'MIN' 'NYM' 'NYY' 'OAK' 'PHI' 'PIT' 'SDP' 'SEA'
 'SFG' 'STL' 'TBR' 'TEX' 'TOR' 'WSN']
30
['ARI' 'ATL' 'BAL' 'BOS' 'CHC' 'CHW' 'CIN' 'CLE' 'COL' 'DET' 'HOU' 'KCR'
 'LAA' 'LAD' 'MIA' 'MIL' 'MIN' 'NYM' 'NYY' 'OAK' 'PHI' 'PIT' 'SDP' 'SEA'
 'SFG' 'STL' 'TBR' 'TEX' 'TOR' 'WSN']
30
