In [7]:
from bs4 import BeautifulSoup
from selenium import webdriver
import pandas as pd
import html5lib
import requests
import numpy as np
import os
import re


In [4]:
def scrape(url):
    chromedriver = "/Users/ryanmurray/Downloads/chromedriver" # path to the chromedriver executable
    os.environ["webdriver.chrome.driver"] = chromedriver


    driver = webdriver.Chrome(chromedriver)

    driver.get(url)
    soup = BeautifulSoup(driver.page_source,'lxml')
    driver.quit()
    
    return soup

def parse_table(soup,table="table_name"):
    
    #takes table with "table_name" from scraped website and outputs pandas dataframe
    tab = soup.find("table",{"id": table})
    rows = tab.find_all('tr')
    
    #get header and remove leading and trailing blank strings
    header = rows[0].text.split('\n')
    header.remove('')
    header.remove('')
    
    #create list of rows
    rows_list = []
    for row in rows:
        cur_row = []
        for c in row:
            try: cur_row.append(c.text)
            except: cur_row.append('')
        if len(cur_row) == len(header):
            rows_list.append(cur_row)
            
    return pd.DataFrame(rows_list,columns=header)


In [3]:
def is_lhp(p):
    return '*' in p
def bat_hand(p):
    if '*' in p:
        return 'lhb'
    elif '#' in p:
        return 'swh'
    else:
        return 'rhb'

def drop_punc(n):
    return re.sub(r'[^\w\s]','',n)

def clean_pitching(df,year):
    df['Salary'] = df['Salary'].apply(lambda x: (x.replace('$','').replace(',','')))
    df['lhp'] = df['Name'].apply(is_lhp)
    df['Name'] = df['Name'].apply(drop_punc)
    df['year'] = year
    cols = df.columns.tolist()
    for col in cols:
        try:
            df[col] = pd.to_numeric(df[col])
        except:
            pass
    df['log_sal'] = df['Salary'].apply(lambda x: np.log10(x))
    df.columns = [x.lower() for x in df.columns.tolist()]
    return df.dropna()    

def clean_batting(df,year):
    df['Salary'] = df['Salary'].apply(lambda x: (x.replace('$','').replace(',','')))
    df['bat_hand'] = df['Name'].apply(bat_hand)
    df['Name'] = df['Name'].apply(drop_punc)
    df['year'] = year
    cols = df.columns.tolist()
    for col in cols:
        try:
            df[col] = pd.to_numeric(df[col])
        except:
            pass
    df['log_sal'] = df['Salary'].apply(lambda x: np.log10(x))
    df.columns = [x.lower() for x in df.columns.tolist()]
    return df.dropna()        

In [12]:
import time
def scrape_parse_clean_p(year):
    url1 = 'https://www.baseball-reference.com/leagues/MLB/'
    url2 = '-value-pitching.shtml'
    url = url1+str(year)+url2
    table = 'players_value_pitching'
    soup = scrape(url)
    df = parse_table(soup, table)
    clean_df = clean_pitching(df,year)
    return clean_df

def scrape_parse_clean_b(year):
    url1 = 'https://www.baseball-reference.com/leagues/MLB/'
    url2 = '-value-batting.shtml'
    url = url1+str(year)+url2
    table = 'players_value_batting'
    soup = scrape(url)
    df = parse_table(soup, table)
    clean_df = clean_batting(df,year)
    return clean_df

def scrape_parse_clean_tb():
    url='https://www.baseball-reference.com/leagues/MLB/2017-value-batting.shtml'
    soup = scrape(url)
    time.sleep(10)
    table = 'teamss_value_batting'

    df = parse_table(soup, table)
    
    #clean_df = clean_batting(df,year)
    return clean_df


In [50]:
def create_train(stat,start,end):
    df_list = []
    if stat == 'pitching':
        for year in range(start,end+1):
            df_list.append(scrape_parse_clean_p(year))
    elif stat == 'batting':
        for year in range(start,end+1):
            df_list.append(scrape_parse_clean_b(year))
    df = pd.concat(df_list)
    return df
        
        
        
        

In [18]:
url_2015 = 'https://www.baseball-reference.com/leagues/MLB/2015-value-pitching.shtml'
url_2016 = 'https://www.baseball-reference.com/leagues/MLB/2016-value-pitching.shtml'
url_2017 = 'https://www.baseball-reference.com/leagues/MLB/2017-value-pitching.shtml'

table_name = 'players_value_pitching'



In [19]:
p_2015 = scrape_parse_clean_p(url_2015,table_name)
p_2016 = scrape_parse_clean_p(url_2016,table_name)
p_2017 = scrape_parse_clean_p(url_2017,table_name)

In [20]:
p_3 = pd.concat([p_2015,p_2016,p_2017])

In [23]:
p_3.to_pickle("pitching_2015_2017.pkl")

In [34]:
b_2016_url = 'https://www.baseball-reference.com/leagues/MLB/2016-value-batting.shtml'

b_2017_url = 'https://www.baseball-reference.com/leagues/MLB/2017-value-batting.shtml'
b_table_name = 'players_value_batting'

In [35]:
b_2017 = scrape_parse_clean_b(b_2017_url,b_table_name)
b_2016 = scrape_parse_clean_b(b_2016_url,b_table_name)

In [36]:
b_2016.to_pickle("batting_2016.pkl")
b_2017.to_pickle("batting_2017.pkl")

In [55]:
b_00_05 = create_train('batting',2000,2005)

In [59]:
b_06_08 = create_train('batting',2006,2008)

In [61]:
b_09_11 = create_train('batting',2009,2011)

In [62]:
b_12_14 = create_train('batting',2012,2014)

In [63]:
b_15_16 = create_train('batting',2015,2016)

In [65]:
b_train = pd.concat([b_00_05,b_06_08,b_09_11,b_12_14,b_15_16])

In [69]:
b_train.to_pickle('batting_00_16.pkl')

In [68]:
b_test = scrape_parse_clean_b(2017)

In [70]:
b_test.to_pickle('batting_17.pkl')

In [13]:
team_2017 = scrape_parse_clean_tb()

AttributeError: 'NoneType' object has no attribute 'find_all'