In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

import acquire

# acquire and clean data
---
The data was collected from two sources, so the acquire process involved not only acquiring the data from each source but also correcting formatting discrepancies between the two data sets so that they could be merged into one.

These steps included, but were not limited to:

- downloading each csv and converting them to dataframes
- removing erroneous columns and rows created during the downloading process
- creating the necessary 'name' column in one dataframe while correcting formatting issues in the name column of the other dataframe
- merging the dataframes on the name and year columns to account for the fact that players had an entry for each year they played, which resulted in many players having multiple entries under the same name

---

The sources were [Baseball Savant](https://baseballsavant.mlb.com/leaderboard/custom?year=2022,2021,2020,2019,2018&type=batter&filter=&sort=4&sortDir=desc&min=10&selections=player_age,b_ab,b_total_pa,b_total_hits,b_single,b_double,b_triple,b_home_run,b_strikeout,b_walk,on_base_plus_slg,b_rbi,b_lob,b_total_bases,pitch_count,pull_percent,straightaway_percent,opposite_percent,batted_ball,&chart=false&x=player_age&y=player_age&r=no&chartType=beeswarm) (Major League Baseball's official source for in-depth analytics and Statcast data) and [Baseball-Reference](https://www.baseball-reference.com/leagues/majors/2022-value-batting.shtml). Both sites let you generate csv files from their data, which is updated daily.

<mark>Note: for the Baseball-Reference data, I had to create a separate csv for each year's stats, then generate a single csv by joining those together in an Excel spreadsheet.</mark>

Since the data is updated daily, should you choose to recreate my process by using the links provided rather than the csv files provided, your data will look slightly different.


All of the following functions will be combined into one "wrangle" function. To inspect their code, please see the acquire.py file in the git repository.

In [7]:
# Bring in the data through the project's acquire module (the implementation
# lives in acquire.py and is not shown in this notebook).
# NOTE(review): presumably this returns the merged two-source frame described
# in the markdown above — the saved output shows both index_x and index_y
# columns; confirm against acquire.py.
df = acquire.mlb_data()
# Preview the first rows as a quick sanity check of the load.
df.head()

Unnamed: 0,index_x,last,first,id,year,age,ab,pa,hits,single,...,oppo,batted,name,index_y,team,raa,waa,owar,salary,playerid
0,1927,Ellis,A.J.,454560,2018,37,151,183,41,32,...,26.1,119,AJ Ellis,5274.0,SDP,-2,-0.3,0.8,"$1,250,000",ellisaj01
1,2195,Pollock,AJ,572041,2018,30,413,460,106,59,...,19.6,321,AJ Pollock,5883.0,ARI,9,0.7,2.1,"$7,750,000",polloaj01
2,2841,Pollock,AJ,572041,2019,31,308,342,82,51,...,18.5,238,AJ Pollock,4498.0,LAD,-9,-1.0,1.3,"$4,000,000",polloaj01
3,757,Pollock,AJ,572041,2020,32,196,210,54,29,...,26.8,153,AJ Pollock,3143.0,LAD,2,0.2,1.2,"$15,000,000",polloaj01
4,1310,Pollock,AJ,572041,2021,33,384,422,114,65,...,22.5,306,AJ Pollock,1783.0,LAD,17,1.6,2.9,"$18,000,000",polloaj01


In [8]:
# Clean the data and get it ready for exploration (logic lives in acquire.py).
# In the saved output below, salary is numeric rather than a "$"-formatted
# string and the `impact` and `ppa` columns are present after this step.
# NOTE(review): this rebinds `df`, so the raw frame from the previous cell is
# no longer reachable, and re-running this cell on its own feeds already-cleaned
# data back into acquire.clean — confirm that is safe, or re-run from the top.
df = acquire.clean(df)
# Preview the cleaned rows.
df.head()

Unnamed: 0,index_x,last,first,id,year,age,ops,tb,pitches_faced,pull,...,batted,name,team,raa,waa,owar,salary,playerid,impact,ppa
0,1927,Ellis,A.J.,454560,2018,37,0.716,52,793,37.8,...,119,AJ Ellis,SDP,-2,-0.3,0.8,1250000.0,ellisaj01,-0.049,4.333333
1,2195,Pollock,AJ,572041,2018,30,0.799,200,1720,44.5,...,321,AJ Pollock,ARI,9,0.7,2.1,7750000.0,polloaj01,0.039,3.73913
2,2841,Pollock,AJ,572041,2019,31,0.795,144,1250,42.0,...,238,AJ Pollock,LAD,-9,-1.0,1.3,4000000.0,polloaj01,0.012,3.654971
3,757,Pollock,AJ,572041,2020,32,0.881,111,746,43.1,...,153,AJ Pollock,LAD,2,0.2,1.2,15000000.0,polloaj01,0.171,3.552381
4,1310,Pollock,AJ,572041,2021,33,0.892,206,1465,38.2,...,306,AJ Pollock,LAD,17,1.6,2.9,18000000.0,polloaj01,0.147,3.471564


In [10]:
# Split the data into train / validate / test sets, then fill the null values
# that weren't dropped in the previous steps (both steps happen inside
# acquire.split_fill; see acquire.py for the details).
train, validate, test = acquire.split_fill(df)
# Count the remaining nulls per column in the training set to confirm the fill.
train.isna().sum()

index_x          0
last             0
first            0
id               0
year             0
age              0
ops              0
tb               0
pitches_faced    0
pull             0
center           0
oppo             0
batted           0
name             0
team             0
raa              0
waa              0
owar             0
salary           0
playerid         0
impact           0
ppa              0
dtype: int64

### our data is now ready to explore!