In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
# from sqlalchemy import create_engine, MetaData, Table, Column, Integer, String
# from sqlalchemy.ext.declarative import declarative_base
# from sqlalchemy.orm import sessionmaker

## Scrape USA Today to get salaries for MLB players

In [2]:
url = "https://www.usatoday.com/sports/mlb/salaries/"
#use pandas to scrape tables into list of data frames
tables = pd.read_html(url)

In [3]:
#select the first table
salaries = tables[0]
salaries.head(3)

Unnamed: 0,rank,Name,Team,POS,Salary,Years,Total Value,Avg Annual
0,--,Max Scherzer,WSH,SP,"$42,142,857",7 (2015-21),"$210,000,000","$30,000,000"
1,--,Stephen Strasburg,WSH,SP,"$36,428,571",7 (2017-23),"$175,000,000","$25,000,000"
2,--,Mike Trout,LAA,CF,"$34,083,333",6 (2015-20),"$144,500,000","$24,083,333"


## Scrape FanGraphs to get starting lineups for each team

In [4]:
#create list of all MLB team names
teams =['braves', 'marlins','mets','phillies', 'nationals', 'cubs', 'reds', 'brewers', 'pirates', 'cardinals','diamondbacks',
        'rockies', 'dodgers','padres','giants','orioles','redsox','yankees','rays','bluejays','whitesox','indians','tigers',
        'royals','twins','astros','angels','athletics', 'mariners','rangers']

In [5]:
#Create empy lists and dataframes to store data for the scrape
ndf=pd.DataFrame()
url_list = []

#loop to create list of url's for each team's lineup
for i in teams:
    url = "https://www.fangraphs.com/teams/{}/depth-chart".format(i)
    url_list.append(url)

#loop through each team's url to scrape their starting lineup
for x in url_list:
    res = requests.get(x)
    soup = BeautifulSoup(res.content, 'lxml')
    table = soup.find_all('div', {'class':'team-stats-table'})
    df = pd.read_html(str(table))

    lineup = []

    for frame in df:
        lineup.append(frame.loc[0])

    #temp df to hold single team lineup    
    tdf= pd.DataFrame(lineup)
    #add positions
    tdf['position'] = ['C','1B','2B', 'SS', '3B', 'LF', 'CF', 'RF', 'DH', 'ALL', 'SP', 'RP', 'AP']

    #add team names
    tn = x.split('/')
    tdf['team_name'] = tn[4]
    tdf = tdf.reset_index()
    tdf = tdf.set_index('position')
    
    #drop the old index and non-useful positions
    tdf = tdf.drop('index', axis=1)
    tdf = tdf.drop(['ALL', 'AP'])

    #add results to result df
    ndf = pd.concat([ndf,tdf])
    #clear temp df
    tdf= pd.DataFrame()


In [6]:
ndf = ndf.reset_index()
ndf['table_index'] = np.arange(len(ndf))
ndf.head(3)


Unnamed: 0,position,Name,PA,AVG,OBP,SLG,wOBA,Bat,Fld,BsR,...,IP,K/9,BB/9,HR/9,BABIP,LOB%,ERA,FIP,team_name,table_index
0,C,Brian McCann,36.0,0.242,0.323,0.409,0.312,-0.6,-0.1,-0.1,...,,,,,,,,,braves,0
1,1B,Freddie Freeman,91.0,0.298,0.39,0.549,0.386,4.4,0.3,0.0,...,,,,,,,,,braves,1
2,2B,Ozzie Albies,88.0,0.279,0.333,0.467,0.334,0.3,0.7,0.3,...,,,,,,,,,braves,2


## Join the salary and lineup dataframes

In [7]:
df_merge = pd.merge(ndf, salaries, on='Name', how='left')

In [17]:
#drop unwanted columns
#df_merge = df_merge.drop('rank', 1)
#df_merge = df_merge.drop('index', 1)
#df_merge = df_merge.drop('table_index', 1)
df_merge.count()

position       330
Name           330
PA             270
AVG            270
OBP            270
SLG            270
wOBA           270
Bat            270
Fld            270
BsR            270
WAR            330
IP              60
K/9             60
BB/9            60
HR/9            60
BABIP           60
LOB%            60
ERA             60
FIP             60
team_name      330
Team           270
POS            270
Salary         270
Years          270
Total Value    270
Avg Annual     270
dtype: int64

## Load into Database

In [12]:
connection_string = "postgres:nickel13@localhost:5432/mlb_db"
engine = create_engine(f'postgresql://{connection_string}')

In [14]:
df_merge.to_sql(name='starting_rosters', con=engine, if_exists='append', index=True)

ProgrammingError: (psycopg2.ProgrammingError) incomplete placeholder: '%(' without ')'
[SQL: INSERT INTO starting_rosters (index, position, "Name", "PA", "AVG", "OBP", "SLG", "wOBA", "Bat", "Fld", "BsR", "WAR", "IP", "K/9", "BB/9", "HR/9", "BABIP", "LOB%%", "ERA", "FIP", team_name, table_index, "Team", "POS", "Salary", "Years", "Total Value", "Avg Annual") VALUES (%(index)s, %(position)s, %(Name)s, %(PA)s, %(AVG)s, %(OBP)s, %(SLG)s, %(wOBA)s, %(Bat)s, %(Fld)s, %(BsR)s, %(WAR)s, %(IP)s, %(K/9)s, %(BB/9)s, %(HR/9)s, %(BABIP)s, %(LOB%)s, %(ERA)s, %(FIP)s, %(team_name)s, %(table_index)s, %(Team)s, %(POS)s, %(Salary)s, %(Years)s, %(Total Value)s, %(Avg Annual)s)]
[parameters: ({'index': 0, 'position': 'C', 'Name': 'Brian McCann', 'PA': 36.0, 'AVG': 0.242, 'OBP': 0.32299999999999995, 'SLG': 0.409, 'wOBA': 0.312, 'Bat': -0.6, 'Fld': -0.1, 'BsR': -0.1, 'WAR': 0.1, 'IP': None, 'K/9': None, 'BB/9': None, 'HR/9': None, 'BABIP': None, 'LOB%': None, 'ERA': None, 'FIP': None, 'team_name': 'braves', 'table_index': 0, 'Team': 'ATL', 'POS': 'C', 'Salary': '$2,000,000', 'Years': '1 (2019)', 'Total Value': '$2,000,000', 'Avg Annual': '$2,000,000'}, {'index': 1, 'position': '1B', 'Name': 'Freddie Freeman', 'PA': 91.0, 'AVG': 0.298, 'OBP': 0.39, 'SLG': 0.5489999999999999, 'wOBA': 0.386, 'Bat': 4.4, 'Fld': 0.3, 'BsR': 0.0, 'WAR': 0.6, 'IP': None, 'K/9': None, 'BB/9': None, 'HR/9': None, 'BABIP': None, 'LOB%': None, 'ERA': None, 'FIP': None, 'team_name': 'braves', 'table_index': 1, 'Team': 'ATL', 'POS': '1B', 'Salary': '$21,359,375', 'Years': '8 (2014-21)', 'Total Value': '$135,000,000', 'Avg Annual': '$16,875,000'}, {'index': 2, 'position': '2B', 'Name': 'Ozzie Albies', 'PA': 88.0, 'AVG': 0.27899999999999997, 'OBP': 0.33299999999999996, 'SLG': 0.467, 'wOBA': 0.33399999999999996, 'Bat': 0.3, 'Fld': 0.7, 'BsR': 0.3, 'WAR': 0.4, 'IP': None, 'K/9': None, 'BB/9': None, 'HR/9': None, 'BABIP': None, 'LOB%': None, 'ERA': None, 'FIP': None, 'team_name': 'braves', 'table_index': 2, 'Team': 'ATL', 'POS': '2B', 'Salary': '$575,000', 'Years': '1 (2019)', 'Total Value': '$575,000', 'Avg Annual': '$575,000'}, {'index': 3, 'position': 'SS', 'Name': 'Dansby Swanson', 'PA': 81.0, 'AVG': 0.251, 'OBP': 0.321, 'SLG': 0.41600000000000004, 'wOBA': 0.309, 'Bat': -1.5, 'Fld': 0.1, 'BsR': 0.2, 'WAR': 0.2, 'IP': None, 'K/9': None, 'BB/9': None, 'HR/9': None, 'BABIP': None, 'LOB%': None, 'ERA': None, 'FIP': None, 'team_name': 'braves', 'table_index': 3, 'Team': 'ATL', 'POS': 'SS', 'Salary': '$585,000', 'Years': '1 (2019)', 'Total Value': '$585,000', 'Avg Annual': '$585,000'}, {'index': 4, 'position': '3B', 'Name': 'Josh Donaldson', 'PA': 80.0, 'AVG': 0.263, 'OBP': 0.374, 'SLG': 0.517, 'wOBA': 0.37200000000000005, 'Bat': 2.9, 'Fld': 0.2, 'BsR': 0.0, 'WAR': 0.6, 'IP': None, 'K/9': None, 'BB/9': None, 'HR/9': None, 'BABIP': None, 'LOB%': None, 'ERA': None, 'FIP': None, 'team_name': 'braves', 'table_index': 4, 'Team': 'ATL', 'POS': '3B', 'Salary': '$23,000,000', 'Years': '1 (2019)', 'Total Value': '$23,000,000', 'Avg Annual': '$23,000,000'}, {'index': 5, 'position': 'LF', 'Name': 'Austin Riley', 'PA': 42.0, 'AVG': 0.247, 'OBP': 0.303, 'SLG': 0.455, 'wOBA': 0.315, 'Bat': -0.6, 'Fld': 0.2, 'BsR': 0.0, 'WAR': 0.1, 'IP': None, 'K/9': None, 'BB/9': None, 'HR/9': None, 'BABIP': None, 'LOB%': None, 'ERA': None, 'FIP': None, 'team_name': 'braves', 'table_index': 5, 'Team': None, 'POS': None, 'Salary': None, 'Years': None, 'Total Value': None, 'Avg Annual': None}, {'index': 6, 'position': 'CF', 'Name': 'Ronald Acuna Jr.', 'PA': 74.0, 'AVG': 0.27899999999999997, 'OBP': 0.354, 'SLG': 0.51, 'wOBA': 0.36, 'Bat': 1.9, 'Fld': 0.2, 'BsR': 0.2, 'WAR': 0.5, 'IP': None, 'K/9': None, 'BB/9': None, 'HR/9': None, 'BABIP': None, 'LOB%': None, 'ERA': None, 'FIP': None, 'team_name': 'braves', 'table_index': 6, 'Team': 'ATL', 'POS': 'LF', 'Salary': '$560,000', 'Years': '1 (2019)', 'Total Value': '$560,000', 'Avg Annual': '$560,000'}, {'index': 7, 'position': 'RF', 'Name': 'Matt Joyce', 'PA': 36.0, 'AVG': 0.242, 'OBP': 0.34600000000000003, 'SLG': 0.419, 'wOBA': 0.32799999999999996, 'Bat': -0.1, 'Fld': -0.2, 'BsR': 0.0, 'WAR': 0.0, 'IP': None, 'K/9': None, 'BB/9': None, 'HR/9': None, 'BABIP': None, 'LOB%': None, 'ERA': None, 'FIP': None, 'team_name': 'braves', 'table_index': 7, 'Team': 'ATL', 'POS': 'OF', 'Salary': '$1,250,000', 'Years': '1 (2019)', 'Total Value': '$1,250,000', 'Avg Annual': '$1,250,000'}  ... displaying 10 of 330 total bound parameter sets ...  {'index': 328, 'position': 'SP', 'Name': 'Mike Minor', 'PA': None, 'AVG': None, 'OBP': None, 'SLG': None, 'wOBA': None, 'Bat': None, 'Fld': None, 'BsR': None, 'WAR': 0.4, 'IP': 25.0, 'K/9': 8.5, 'BB/9': 2.8, 'HR/9': 1.4, 'BABIP': 0.304, 'LOB%': '71.1 %', 'ERA': 4.39, 'FIP': 4.38, 'team_name': 'rangers', 'table_index': 328, 'Team': 'TEX', 'POS': 'SP', 'Salary': '$9,833,333', 'Years': '3 (2018-20)', 'Total Value': '$28,000,000', 'Avg Annual': '$9,333,333'}, {'index': 329, 'position': 'RP', 'Name': 'Jose Leclerc', 'PA': None, 'AVG': None, 'OBP': None, 'SLG': None, 'wOBA': None, 'Bat': None, 'Fld': None, 'BsR': None, 'WAR': 0.2, 'IP': 8.0, 'K/9': 12.8, 'BB/9': 5.1, 'HR/9': 0.9, 'BABIP': 0.309, 'LOB%': '75.9 %', 'ERA': 3.43, 'FIP': 3.5, 'team_name': 'rangers', 'table_index': 329, 'Team': 'TEX', 'POS': 'RP', 'Salary': '$1,687,500', 'Years': '4 (2019-22)', 'Total Value': '$14,750,000', 'Avg Annual': '$3,687,500'})]
(Background on this error at: http://sqlalche.me/e/f405)

## Code used in my previous version to add to MySQL

In [None]:
# engine = create_engine('mysql+mysqlconnector://jsoltis:###########@jsoltis-db1.chsfidy10kbp.us-west-2.rds.amazonaws.com:3306/MLB')
# engine.connect()
# Base = declarative_base()
# Session = sessionmaker()
# Session.configure(bind=engine)
# session = Session()
# dfmerge.to_sql(name='starting_rosters', con = engine, if_exists='replace',)