In [1]:
import pandas as pd
import numpy as np

Create dataframe with player_id column and HoF_nomination column (with all values equaling 1) for all players nominated for the Hall of Fame

In [2]:
df_hof = pd.read_csv('~/Desktop/Springboard/CapstoneThree/data/raw/hall_of_fame.csv')
df_hof = df_hof[df_hof['category']=='Player']
hof_ids = pd.DataFrame(df_hof['player_id'].unique(), columns=['player_id']).set_index('player_id')
hof_ids['HoF_nomination']=1

Read in general player info, batting, and appearances (specifically games played and games started columns) dataframes

In [3]:
df_player = pd.read_csv('~/Desktop/Springboard/CapstoneThree/data/raw/player.csv')
df_batting = pd.read_csv('~/Desktop/Springboard/CapstoneThree/data/raw/batting.csv')
df_appearances = pd.read_csv('~/Desktop/Springboard/CapstoneThree/data/raw/appearances.csv')

Create temporary dataframe by merging the batting and appearance datasets and group by player_id mean. We now have one row of data with average stats for each player's career

In [4]:
df_temp = pd.merge(left=df_batting, right=df_appearances[['player_id','g_all','gs']], on='player_id')
df_temp = df_temp.groupby(['player_id']).mean().drop(['year', 'stint'], axis=1)

Merge temporary dataframe with general player info columns of player df

In [5]:
df_temp = pd.merge(left=df_temp, right=df_player[['player_id','weight','height','debut','final_game']], on='player_id').set_index('player_id')

Convert final and debut games to datetime and create new career length column

In [6]:
df_temp['final_game'] = pd.to_datetime(df_temp['final_game'])
df_temp['debut'] = pd.to_datetime(df_temp['debut'])

df_temp['career_length']=df_temp['final_game'] - df_temp['debut']
df_temp['career_length']=df_temp['career_length'].dt.days

Left join temp df with HoF df, creating a new column "HoF_nomination" where a 1 indicates the player was nominated and a 0 indicates they were not

In [7]:
df_temp = pd.merge(left=df_temp, right=hof_ids, on='player_id', how='left')
df_temp['HoF_nomination']=df_temp['HoF_nomination'].fillna(0)

Read in pitching df, filter out years before 1936, group by player_id to get career averages

In [8]:
df_pitching = pd.read_csv('~/Desktop/Springboard/CapstoneThree/data/raw/pitching.csv')
df_pitching = df_pitching[df_pitching['year']>=1936]
df_pitching = df_pitching.groupby(['player_id']).mean()

Outer join temp df and pitching df with an Indicator. This way we can filter non-pitchers (right_only) from pitchers (both). Drop all player ineligible for HoF because their career length was less than 10 years.

In [9]:
df_merged = pd.merge(left=df_pitching, right=df_temp, how='outer', on='player_id', indicator=True)
print(df_merged['HoF_nomination'].value_counts())
df_merged = df_merged[df_merged['career_length']>3284]
print(df_merged['HoF_nomination'].value_counts())

0.0    17491
1.0     1162
Name: HoF_nomination, dtype: int64
0.0    2864
1.0    1117
Name: HoF_nomination, dtype: int64


Bin players into decades by average year

In [10]:
bins=[1929.9,1939.9,1949.9,1959.9,1969.9,1979.9,1989.9,1999.9,2009.9,2019.9]
df_merged['Binned_Decade'] = pd.cut(df_merged['year'], bins)

dummy = pd.get_dummies(df_merged['Binned_Decade'])
df_merged = pd.concat([df_merged,dummy], axis=1)
df_merged.drop(columns=['year', 'Binned_Decade'], inplace=True)

Create pitchers dataframe my filtering on the merge indicator

In [11]:
df_pitchers = df_merged[df_merged['_merge']=='both']
df_pitchers.drop('_merge', axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Drop columns that are irrelevant to pitchers or that had many NaNs

In [12]:
columns = list(df_pitchers.columns)
drop_column_numbers = [0,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44]
drop_columns = [columns[i] for i in drop_column_numbers]
df_pitchers.drop(drop_columns, axis=1, inplace=True)

Create dataframe df_ineligible_pitchers that includes all pitchers ineligible for HoF nomination because they are either currently playing or retired less than 5 years ago. Delete all these pitchers from main pitcher dataframe.

In [13]:
df_ineligible_pitchers = df_pitchers[df_pitchers['final_game'] > '2011-01-01']
df_pitchers = df_pitchers[df_pitchers['final_game'] <= '2011-01-01']

Replace NaNs with means, drop debut/final game columns since they are not int/floats and have already been used to filter and create decade bins.

In [14]:
df_pitchers.fillna(df_pitchers.mean(), inplace=True)
df_pitchers.drop(columns=['debut', 'final_game'], inplace=True)

df_ineligible_pitchers.fillna(df_ineligible_pitchers.mean(), inplace=True)
df_ineligible_pitchers.drop(columns=['debut', 'final_game'], inplace=True)

Write out csv of both pitcher dataframes

In [15]:
df_pitchers.to_csv('~/Desktop/Springboard/CapstoneThree/data/final/pitchers.csv', index=False)
df_ineligible_pitchers.to_csv('~/Desktop/Springboard/CapstoneThree/data/final/ineligible_pitchers.csv', index=False)

Create batters dataframe my filtering on the merge indicator

In [16]:
df_batters = df_merged[df_merged['_merge']=='right_only']
df_batters.drop('_merge', axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Drop columns that are irrelevant to batters or that had many NaNs

In [17]:
columns = list(df_batters.columns)
drop_column_numbers = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,35,38,41,42,44]
drop_columns = [columns[i] for i in drop_column_numbers]
df_batters.drop(drop_columns, axis=1, inplace=True)

Create dataframe df_ineligible_batters that includes all batters ineligible for HoF nomination because they are either currently playing or retired less than 5 years ago. Delete all these batters from main batter dataframe.

In [18]:
df_ineligible_batters = df_batters[df_batters['final_game'] > '2011-01-01']
df_batters = df_batters[df_batters['final_game'] <= '2011-01-01']

Replace NaNs with means, drop debut/final game columns since they are not int/floats and have already been used to filter and create decade bins.

In [19]:
df_batters.fillna(df_batters.mean(), inplace=True)
df_batters.drop(columns=['debut', 'final_game'], inplace=True)

df_ineligible_batters.fillna(df_ineligible_batters.mean(), inplace=True)
df_ineligible_batters.drop(columns=['debut', 'final_game'], inplace=True)

Write out csv of both batter dataframes

In [20]:
df_batters.to_csv('~/Desktop/Springboard/CapstoneThree/data/final/batters.csv', index=False)
df_ineligible_batters.to_csv('~/Desktop/Springboard/CapstoneThree/data/final/ineligible_batters.csv', index=False)