# COGS 108 - EDA Checkpoint

# Names

- Ant Man
- Hulk
- Iron Man
- Thor
- Wasp

<a id='research_question'></a>
# Research Question

*Fill in your research question here*

# Setup

In [371]:
# import packages and setup visuals
import os
import unicodedata

import numpy as np
import pandas as pd

# Data Cleaning

Describe your data cleaning steps here.

In [372]:
# merge all contract files in the contracts directory into one dataframe
directory = 'data/contracts/'
# os.listdir returns entries in arbitrary order and also lists non-data
# entries (e.g. .DS_Store, .ipynb_checkpoints); keep only CSV files and sort
# them so the merged frame has the same row order on every run
filepaths = [
    os.path.join(directory, filename)
    for filename in sorted(os.listdir(directory))
    if filename.lower().endswith('.csv')
]
contracts = pd.concat([pd.read_csv(filepath) for filepath in filepaths])
print('contracts shape: ', contracts.shape)

contracts shape:  (4996, 18)


In [373]:
# set 'Age' to the value in 'Age 7/1/21' or 'Age 7/1/22' when it is NaN.
# Back-filling across the three columns picks the first non-missing value from
# left to right; adding the columns (with fill_value=0) would instead sum the
# ages if a row ever had more than one of them populated.
age_columns = ['Age', 'Age 7/1/21', 'Age 7/1/22']
contracts['Age'] = contracts[age_columns].bfill(axis=1).iloc[:, 0]

In [374]:
# keep only the columns used in the rest of the cleaning
kept_columns = ['Player', "Pos'n", 'Age', 'Term', 'AAV']
contracts = contracts[kept_columns]

# remove pitchers: rows whose position string contains "hp"
# (only rows where the check is exactly False are kept)
is_pitcher = contracts["Pos'n"].str.contains("hp")
contracts = contracts[is_pitcher.eq(False)]

# discard rows that have a NaN in any remaining column
contracts = contracts.dropna(axis='index')

# the position column was only needed for the pitcher filter
contracts = contracts.drop(columns="Pos'n")
display(contracts)

Unnamed: 0,Player,Age,Term,AAV
0,"Cano, Robinson",31.0,2014-23,"$24,000,000"
1,"Ellsbury, Jacoby",30.0,2014-20,"$21,857,143"
2,"Choo, Shin-Soo",31.0,2014-20,"$18,571,429"
3,"McCann, Brian",30.0,2014-18,"$17,000,000"
4,"Granderson, Curtis",33.0,2014-17,"$15,000,000"
...,...,...,...,...
66,"Chisenhall, Lonnie",30.0,2019,"$2,750,000"
70,"Maldonado, Martin",32.0,2019,"$2,500,000"
75,"McCann, Brian",35.0,2019,"$2,000,000"
77,"Harrison, Josh",31.0,2019,"$2,000,000"


In [375]:
# functions to standardize variables
def salary_to_int(str_in):
    """Convert a salary string such as '$24,000,000' to an int.

    Dollar signs and thousands separators are stripped before conversion.
    """
    digits = str_in
    for symbol in ('$', ','):
        digits = digits.replace(symbol, '')
    return int(digits)

def term_to_year(str_in):
    """Return the first year of a contract term as an int.

    Accepts a range such as '2014-23' (returns 2014), a single year such as
    '2019' or 2019, and float-like years such as 2019.0, which pandas produces
    when a term column is read as floating point.

    Raises:
        ValueError: if the leading part of the term is not numeric.
    """
    first_part = str(str_in).strip().split('-')[0]
    try:
        return int(first_part)
    except ValueError:
        # e.g. '2019.0' -- int() rejects the decimal point, so go through float
        return int(float(first_part))

def standardize_name(str_in):
    """Normalise a player name into a compact key for joining datasets.

    Steps:
      * 'Last, First' is reordered to 'First Last'.
      * 'Jr.' / 'Sr.' suffixes and any remaining periods are removed.
      * Accented letters are reduced to their base letter (e.g. 'é' -> 'e',
        'í' -> 'i', 'ñ' -> 'n'), not just 'é' and 'á', so the same player
        spelled with and without diacritics produces the same key.
      * Spaces are removed.

    Example: 'Cano, Robinson' -> 'RobinsonCano'.
    """
    if ',' in str_in:
        name_list = str(str_in).split(',')
        str_in = name_list[1].strip() + " " + name_list[0].strip()
    for suffix in ('Jr.', 'Sr.'):
        str_in = str_in.replace(suffix, '')
    str_in = str_in.replace('.', '')
    # decompose each accented character into base letter + combining mark,
    # then drop the combining marks to keep only the base letters
    decomposed = unicodedata.normalize('NFKD', str_in)
    str_in = ''.join(ch for ch in decomposed if not unicodedata.combining(ch))
    return str_in.replace(' ', '')

def standardize_age(age):
    """Convert an age to float and round it to zero decimal places.

    The result stays a float (e.g. 31.4 -> 31.0).
    """
    numeric_age = float(age)
    return round(numeric_age, 0)

In [376]:
# run each contracts column through its standardizing helper
contracts = contracts.assign(
    Player=contracts['Player'].apply(standardize_name),
    Age=contracts['Age'].apply(standardize_age),
    AAV=contracts['AAV'].apply(salary_to_int),
    Term=contracts['Term'].apply(term_to_year),
)
contracts

Unnamed: 0,Player,Age,Term,AAV
0,RobinsonCano,31.0,2014,24000000
1,JacobyEllsbury,30.0,2014,21857143
2,Shin-SooChoo,31.0,2014,18571429
3,BrianMcCann,30.0,2014,17000000
4,CurtisGranderson,33.0,2014,15000000
...,...,...,...,...
66,LonnieChisenhall,30.0,2019,2750000
70,MartinMaldonado,32.0,2019,2500000
75,BrianMcCann,35.0,2019,2000000
77,JoshHarrison,31.0,2019,2000000


In [377]:
# give the contracts columns the names used throughout the rest of the notebook
contract_column_names = {
    'Player': 'playerName',
    'Age': 'playerAge',
    'Term': 'year',
    'AAV': 'yearSalary',
}
contracts = contracts.rename(columns=contract_column_names)
contracts.head()

Unnamed: 0,playerName,playerAge,year,yearSalary
0,RobinsonCano,31.0,2014,24000000
1,JacobyEllsbury,30.0,2014,21857143
2,Shin-SooChoo,31.0,2014,18571429
3,BrianMcCann,30.0,2014,17000000
4,CurtisGranderson,33.0,2014,15000000


In [378]:
# load the People and Batting tables and report their sizes
people, batting = (
    pd.read_csv(path)
    for path in ('data/batting/People.csv', 'data/batting/Batting.csv')
)
print('people shape: ', people.shape)
print('batting shape: ', batting.shape)

people shape:  (20370, 24)
batting shape:  (110495, 22)


In [379]:
# keep the player/season identifiers and the counting stats used below
batting_columns = ['playerID', 'yearID', 'AB', 'H', '2B', '3B', 'HR', 'BB', 'HBP', 'SF']
batting = batting[batting_columns]

In [380]:
# a player can have several rows for the same year; show one such player
# (abramca01) before and after collapsing to one row per player-season
display(batting[batting['playerID'] == 'abramca01'])
# use the built-in grouped sum instead of a per-group Python lambda;
# min_count=1 leaves a stat as NaN when every row in the group is missing it
# (a plain sum would turn it into 0), and as_index=False keeps playerID and
# yearID as ordinary columns
batting = batting.groupby(['playerID', 'yearID'], as_index=False).sum(min_count=1)
display(batting[batting['playerID'] == 'abramca01'])

Unnamed: 0,playerID,yearID,AB,H,2B,3B,HR,BB,HBP,SF
33904,abramca01,1949,24,2,1,0,0,7,0.0,
34477,abramca01,1950,44,9,1,0,0,9,0.0,
35053,abramca01,1951,150,42,8,0,3,36,0.0,
35670,abramca01,1952,10,2,0,0,0,2,0.0,
35671,abramca01,1952,158,44,9,2,2,19,0.0,
36303,abramca01,1953,448,128,10,6,15,58,0.0,
36889,abramca01,1954,42,6,1,1,0,10,0.0,0.0
36890,abramca01,1954,423,124,22,7,6,72,4.0,1.0
37466,abramca01,1955,309,75,12,3,6,89,3.0,3.0
38121,abramca01,1956,3,1,0,0,0,2,0.0,0.0


Unnamed: 0,playerID,yearID,AB,H,2B,3B,HR,BB,HBP,SF
187,abramca01,1949,24,2,1,0,0,7,0.0,
188,abramca01,1950,44,9,1,0,0,9,0.0,
189,abramca01,1951,150,42,8,0,3,36,0.0,
190,abramca01,1952,168,46,9,2,2,21,0.0,
191,abramca01,1953,448,128,10,6,15,58,0.0,
192,abramca01,1954,465,130,23,8,6,82,4.0,1.0
193,abramca01,1955,309,75,12,3,6,89,3.0,3.0
194,abramca01,1956,3,1,0,0,0,2,0.0,0.0


In [381]:
# remove player-seasons that have a NaN in any column
batting = batting.dropna(axis='index')

# remove player-seasons without a single at-bat
has_at_bats = batting['AB'].gt(0)
batting = batting[has_at_bats]
batting.shape

(53054, 10)

In [382]:
# functions used for calculating batting average (AVG), on-base percentage (OBP), slugging (SLG) and on-base plus slugging (named OBS here)
def calc_avg(h, ab):
    """Batting average: hits divided by at-bats."""
    average = h / ab
    return average
    
def calc_obp(h, bb, hbp, ab, sf):
    """On-base percentage: (H + BB + HBP) / (AB + BB + SF + HBP)."""
    times_on_base = h + bb + hbp
    opportunities = ab + bb + sf + hbp
    return times_on_base / opportunities
    
def calc_tb(h, two_b, three_b, hr):
    """Total bases: singles count 1, doubles 2, triples 3, home runs 4."""
    # hits that are not doubles, triples or home runs are singles
    singles = h - two_b - three_b - hr
    weighted_extra_bases = two_b * 2 + three_b * 3 + hr * 4
    return singles + weighted_extra_bases

def calc_slg(tb, ab):
    """Slugging: total bases divided by at-bats."""
    slugging = tb / ab
    return slugging

def calc_obs(obp, slg):
    """On-base plus slugging: the sum of OBP and SLG."""
    combined = obp + slg
    return combined

In [383]:
# compute the unrounded rate statistics from the counting stats
avg = calc_avg(h=batting['H'], ab=batting['AB'])
obp = calc_obp(h=batting['H'], bb=batting['BB'], hbp=batting['HBP'], ab=batting['AB'], sf=batting['SF'])
tb = calc_tb(h=batting['H'], two_b=batting['2B'], three_b=batting['3B'], hr=batting['HR'])
slg = calc_slg(tb, batting['AB'])
# On Base Plus Slugging is built from the unrounded OBP and SLG, then rounded
obs = calc_obs(obp, slg)

# batting is a row-filtered slice at this point, so writing columns into it
# with batting[...] = ... raises SettingWithCopyWarning; assign() returns a
# new frame with the four columns appended (rounded to 3 decimals) instead
batting = batting.assign(
    AVG=avg.round(3),
    OBP=obp.round(3),
    SLG=slg.round(3),
    OBS=obs.round(3),
)

batting.head()

Unnamed: 0,playerID,yearID,AB,H,2B,3B,HR,BB,HBP,SF,AVG,OBP,SLG,OBS
1,aardsda01,2006,2,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,aardsda01,2008,1,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
8,aardsda01,2015,1,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
9,aaronha01,1954,468,131,27,6,13,28,3.0,4.0,0.28,0.322,0.447,0.769
10,aaronha01,1955,602,189,37,9,27,49,3.0,4.0,0.314,0.366,0.54,0.906


In [384]:
# keep the identifiers, at-bats and derived stats; rename yearID to year
stat_columns = ['playerID', 'yearID', 'AB', 'AVG', 'OBP', 'SLG', 'OBS']
batting = batting[stat_columns].rename(columns={'yearID': 'year'})

In [385]:
# preview the batting frame (pandas shows only the first and last rows)
batting

Unnamed: 0,playerID,year,AB,AVG,OBP,SLG,OBS
1,aardsda01,2006,2,0.000,0.000,0.000,0.000
3,aardsda01,2008,1,0.000,0.000,0.000,0.000
8,aardsda01,2015,1,0.000,0.000,0.000,0.000
9,aaronha01,1954,468,0.280,0.322,0.447,0.769
10,aaronha01,1955,602,0.314,0.366,0.540,0.906
...,...,...,...,...,...,...,...
102185,zuverge01,1954,66,0.136,0.149,0.152,0.301
102186,zuverge01,1955,27,0.185,0.214,0.222,0.437
102187,zuverge01,1956,17,0.118,0.167,0.118,0.284
102188,zuverge01,1957,23,0.130,0.231,0.130,0.361


In [386]:
# build a "First Last" name for every player and keep only the lookup columns
people = people.assign(name=people['nameFirst'] + ' ' + people['nameLast'])
people = people[['playerID', 'name']]

# attach the name to each batting row, rename it for consistency with
# contracts, standardize it, and put the columns in their final order.
# Doing this as one chain writes playerName on a fresh frame; assigning it
# after a column-reordering slice raises SettingWithCopyWarning.
batting = (
    batting
    .merge(people, how='left', on='playerID')
    .rename(columns={'name': 'playerName'})
    .assign(playerName=lambda frame: frame['playerName'].apply(standardize_name))
    .loc[:, ['playerID', 'playerName', 'year', 'AB', 'AVG', 'OBP', 'SLG', 'OBS']]
)
batting.head()

Unnamed: 0,playerID,playerName,year,AB,AVG,OBP,SLG,OBS
0,aardsda01,DavidAardsma,2006,2,0.0,0.0,0.0,0.0
1,aardsda01,DavidAardsma,2008,1,0.0,0.0,0.0,0.0
2,aardsda01,DavidAardsma,2015,1,0.0,0.0,0.0,0.0
3,aaronha01,HankAaron,1954,468,0.28,0.322,0.447,0.769
4,aaronha01,HankAaron,1955,602,0.314,0.366,0.54,0.906


In [404]:
# keep only batting rows whose player name also appears in contracts
contract_names = contracts['playerName'].unique()
in_contracts = batting['playerName'].isin(list(contract_names))
batting = batting[in_contracts]

# NOTE: 22 names in contracts are missing from batting, caused by naming disparities between the databases
matched_names = batting['playerName'].unique()
print('number of missing names: ', len(contract_names) - len(matched_names))
batting.shape

number of missing names:  22


(10324, 8)

# Data Analysis & Results (EDA)

Carry out EDA on your dataset(s); Describe in this section

In [389]:
## YOUR CODE HERE
## FEEL FREE TO ADD MULTIPLE CELLS PER SECTION