### Extract stats from milb player pages

In [615]:
import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
import pandas as pd
import numpy as np
import os
import csv
import re
from collections import OrderedDict

### Functions For Getting Data From Header

Helper Functions

In [653]:
def make_height(h_str):
    """Convert a dash-separated height string into a single number.

    The string is split on '-' and every piece is parsed as a float. The
    result is the first piece times 12 plus the second piece, i.e. a
    'feet-inches' string such as '6-2' becomes 74.0 inches.

    Args:
        h_str: Height text with at least two '-'-separated numeric pieces.

    Returns:
        The combined height as a float.
    """
    pieces = [float(part) for part in h_str.split('-')]
    major, minor = pieces[0], pieces[1]
    return major * 12 + minor

In [654]:
def add_position_info(summary_dict):
    """Add a 0/1 indicator entry for each tracked position keyword.

    The value under 'positions' is lower-cased and checked for each keyword
    as a substring; the keyword itself becomes the new dictionary key.

    Args:
        summary_dict: Dictionary that already contains a 'positions' string.

    Returns:
        The same dictionary object, updated in place.
    """
    listed = summary_dict['positions'].lower()

    for keyword in ('first', 'second', 'third', 'shortstop', 'catcher', 'outfielder'):
        summary_dict[keyword] = 1 if keyword in listed else 0

    return summary_dict

In [734]:
def add_mlb_war(summary_dict, player_summary_soup, milb_id=None):
    """Record MLB debut status, MLB id, career WAR and the MiLB id.

    A player counts as having reached MLB when the header contains an anchor
    whose href matches "debuts". For those players the MLB id is read from
    the href of the anchor that follows the debut link, and WAR is read from
    the <div class="p1"> element.

    Args:
        summary_dict: Dictionary to update in place.
        player_summary_soup: Parsed header section of the player page.
        milb_id: Identifier stored under 'milb_id'. When omitted, the value
            is derived from the module-level ``html_files[ix]`` (file name
            without '.html'), which is what callers relied on before this
            parameter existed.

    Returns:
        The same dictionary with 'war', 'mlb', 'mlb_id' and 'milb_id' set.
        For players without a debut link: war is NaN, mlb is 0 and mlb_id
        is the string 'none'.
    """
    # Determine if the player made the majors
    debut_links = player_summary_soup.find_all('a', href=re.compile("debuts"))
    if debut_links:
        debut = 1

        # The id sits in the query string of the anchor after the debut link.
        # Non-greedy so that only the text up to the FIRST '&' is captured.
        id_href = debut_links[0].find_next('a')['href']
        mlb_id = re.search(r"id=(.*?)&", id_href).group(1)

        # Career WAR: prefer the node two siblings after the first <p>; when
        # that sibling chain is missing, fall back to the first <p> itself.
        career_table = player_summary_soup.find('div', class_='p1')
        first_paragraph = career_table.find('p')
        try:
            war_text = first_paragraph.next_sibling.next_sibling.string
        except AttributeError:
            war_text = first_paragraph.string
        war = float(war_text)
    else:
        debut = 0
        mlb_id = 'none'
        war = np.nan

    if milb_id is None:
        # Backward-compatible fallback to the globals set by the driver loop
        milb_id = html_files[ix].replace('.html', '')

    summary_dict['war'] = war
    summary_dict['mlb'] = debut
    summary_dict['mlb_id'] = mlb_id
    summary_dict['milb_id'] = milb_id

    return summary_dict
    

Main Function

In [735]:
def make_summary_dict(player_summary_soup):
    """Build the per-player summary dictionary from the page header.

    Steps, in order:
      1. add_mlb_war fills 'war', 'mlb', 'mlb_id' and 'milb_id'.
      2. Every <strong> label is scanned for 'Position', 'Bats', 'Throws'
         and 'Draft'; the value is taken from the node two elements after
         the label.
      3. <span> elements with itemprop 'height' / 'weight' supply the
         physical measurements.
      4. 'draft' defaults to NaN when no draft label was found.
      5. add_position_info adds the 0/1 position indicator keys.

    Args:
        player_summary_soup: Parsed header section of the player page.

    Returns:
        The populated summary dictionary.

    Raises:
        KeyError: from add_position_info when no 'Position' label was found,
            because it reads summary_dict['positions'].
    """
    summary_dict = dict()
    # Get whether player made MLB and total WAR
    summary_dict = add_mlb_war(summary_dict, player_summary_soup)

    for su in player_summary_soup.find_all('strong'):
        label = su.text
        if 'Position' in label:
            summary_dict['positions'] = re.sub(
                '\n', '', su.next_element.next_element).strip()
        if 'Bats' in label:
            b_string = su.next_element.next_element
            if 'Right' in b_string:
                summary_dict['bats'] = 'right'
            elif 'Left' in b_string:
                summary_dict['bats'] = 'left'
            elif 'Both' in b_string:
                summary_dict['bats'] = 'both'
        if 'Throws' in label:
            t_string = su.next_element.next_element
            # Anything that does not mention 'Right' is recorded as 'left'
            summary_dict['throws'] = 'right' if 'Right' in t_string else 'left'
        if 'Draft' in label:
            # The last draft_round link is used; keep only its digits
            draft_link = player_summary_soup.find_all(
                'a', href=re.compile('draft_round'))[-1]
            summary_dict['draft'] = float(re.sub('[^0-9]', '', draft_link.string))

    for sp in player_summary_soup.find_all('span'):
        # .get() returns None for spans without an itemprop attribute, so no
        # exception handling is needed and unrelated errors are not swallowed
        itemprop = sp.attrs.get('itemprop')
        if itemprop == 'height':
            summary_dict['height'] = make_height(sp.text)
        elif itemprop == 'weight':
            summary_dict['weight'] = float(re.sub(r'[^0-9]', '', sp.text))

    # Add not Drafted. Done after the loop so the default is applied even
    # when the header contains no <span> elements at all.
    summary_dict.setdefault('draft', np.nan)

    summary_dict = add_position_info(summary_dict)

    return summary_dict

Function for putting the player summary fields in a fixed order

In [736]:
def make_summary_dict_sorted(summary_dict):
    """Return the summary fields as an OrderedDict in a fixed key order.

    Only the keys listed below are copied; any other keys in the input are
    left out of the result. A missing key raises KeyError.

    Args:
        summary_dict: Mapping containing every key in the fixed order list.

    Returns:
        A new OrderedDict holding the listed keys in that order.
    """
    column_order = (
        'milb_id', 'mlb_id', 'mlb', 'war', 'draft', 'bats', 'throws',
        'height', 'weight', 'positions', 'first', 'second', 'third',
        'shortstop', 'catcher', 'outfielder',
    )
    return OrderedDict((key, summary_dict[key]) for key in column_order)


### Batting Table Creation and Cleaning

In [737]:
def make_standard_batting(batter_page):
    """Extract the season rows of the standard batting table as a DataFrame.

    The first <table class="sortable stats_table"> on the page is used. Column
    names come from the <th> cells of its first row. Data rows are read in
    document order and reading stops at the first row whose leading <th>
    either has no single string (``.string`` is None) or contains a letter.
    For each kept row the leading <th> text, with everything from the first
    '-' onward removed, becomes the first value, followed by the stripped
    text of every <td>.

    Args:
        batter_page: Parsed player page.

    Returns:
        A DataFrame of strings with one row per kept table row.
    """
    batter_standard_data = batter_page.find('table', class_='sortable stats_table')

    # Populate a list containing column headers
    header_html = batter_standard_data.find('tr')
    header_name_list = [item.text.strip() for item in header_html.find_all('th')]

    data_list = []
    for ix, row in enumerate(batter_standard_data.find_all('tr')):
        row_label = row.find('th')

        try:
            has_letters = re.search('[a-zA-Z]', row_label.string)
        except TypeError:
            # .string is None: the label is not a single plain string
            break

        if ix == 0:
            # Header row: checked above like every other row, never stored
            continue
        if has_letters:
            break

        # Raw string: '\-' in a normal literal is an invalid escape sequence
        temp_year = re.sub(r'\-.*$', '', row_label.text.strip())
        data_list.append(
            [temp_year] + [item.text.strip() for item in row.find_all('td')])

    batting_df = pd.DataFrame(data_list, columns=header_name_list)

    return batting_df

In [738]:
def clean_batter_df(df):
    """Return a new frame with the known batting columns, numbers parsed.

    The numeric columns are passed through pd.to_numeric and the text columns
    are copied unchanged. Columns not named here are dropped, and the output
    column order is the numeric columns followed by the text columns.

    Args:
        df: Batting table containing every column named below.

    Returns:
        A new DataFrame with the selected columns.
    """
    numeric_fields = (
        'Year', 'Age', 'AgeDif', 'G', 'PA', 'AB', 'R', 'H', '2B', '3B', 'HR',
        'RBI', 'SB', 'CS', 'BB', 'SO', 'BA', 'OBP', 'SLG', 'OPS',
        'TB', 'GDP', 'HBP', 'SH', 'SF', 'IBB',
    )
    text_fields = ('Tm', 'Lg', 'Lev', 'Aff')

    # dicts keep insertion order, so numeric columns come first
    columns = {name: pd.to_numeric(df[name]) for name in numeric_fields}
    columns.update((name, df[name]) for name in text_fields)

    return pd.DataFrame(columns)

## Loop through Player Files

In [775]:
# Directory holding the saved player pages; the trailing slash lets file
# names be appended with plain string concatenation.
fpath = 'milb_player_pages/'
# File names found in that directory. add_mlb_war also reads this list as
# html_files[ix] to derive the milb_id, so the name must stay module-level.
html_files = os.listdir('milb_player_pages')
# Directory prefix for the per-player season CSV files.
out_path = 'milb_batter_files/'

In [787]:
# Driver: parse each saved player page, write one season-stats CSV per
# non-pitcher and append one row per non-pitcher to the summary CSV.
# newline='' is the mode the csv module documents for files handed to a
# writer; without it extra blank lines can appear on some platforms.
with open('milb_batter_summaries.csv', 'w', newline='') as csvfile:

    fieldnames = ['milb_id','mlb_id','mlb','war','draft','bats','throws','height','weight',
             'positions','first','second','third','shortstop','catcher','outfielder']

    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()

    # NOTE(review): only the first three files are processed — this looks
    # like a trial-run limit; confirm before relying on the output.
    # The loop variable must stay named `ix` and count from the start of
    # html_files, because add_mlb_war reads the global html_files[ix].
    for ix, fi in enumerate(html_files[0:3]):
        # Context manager closes the page even if parsing raises
        with open(fpath + fi, 'r') as html_f:
            batter_page = BeautifulSoup(html_f.read(), "lxml")

        # Find Header Data
        player_summary_soup = batter_page.find('div', {'id': 'info'})

        summary_dict = make_summary_dict(player_summary_soup)
        if 'Pitcher' not in summary_dict['positions']:
            summary_dict = make_summary_dict_sorted(summary_dict)

            # Season By Season Stats
            batter_df = make_standard_batting(batter_page)
            batter_df_cleaned = clean_batter_df(batter_df)

            batter_df_cleaned['mlb_id'] = summary_dict['mlb_id']
            batter_df_cleaned['milb_id'] = summary_dict['milb_id']

            # if player made major leagues, use that ID for file name
            if summary_dict['mlb'] == 1:
                file_id = summary_dict['mlb_id']
            else:
                file_id = summary_dict['milb_id']
            batter_df_cleaned.to_csv(out_path + file_id + '.csv')

            # Write to summary file
            writer.writerow(summary_dict)

        # Progress indicator: index of the file just handled
        print(ix)
    

0
1
2
