In [38]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
# from sqlalchemy import create_engine, MetaData, Table, Column, Integer, String
# from sqlalchemy.ext.declarative import declarative_base
# from sqlalchemy.orm import sessionmaker

## Scrape USA Today to get salaries for MLB players

In [24]:
url = "https://www.usatoday.com/sports/mlb/salaries/"
#use pandas to scrape tables into list of data frames
tables = pd.read_html(url)

In [32]:
#select the first table
salaries = tables[0]
salaries.head(3)

Unnamed: 0,rank,Name,Team,POS,Salary,Years,Total Value,Avg Annual
0,--,Max Scherzer,WSH,SP,"$42,142,857",7 (2015-21),"$210,000,000","$30,000,000"
1,--,Stephen Strasburg,WSH,SP,"$36,428,571",7 (2017-23),"$175,000,000","$25,000,000"
2,--,Mike Trout,LAA,CF,"$34,083,333",6 (2015-20),"$144,500,000","$24,083,333"


## Scrape FanGraphs to get starting lineups for each team

In [26]:
#create list of all MLB team names
teams =['braves', 'marlins','mets','phillies', 'nationals', 'cubs', 'reds', 'brewers', 'pirates', 'cardinals','diamondbacks',
        'rockies', 'dodgers','padres','giants','orioles','redsox','yankees','rays','bluejays','whitesox','indians','tigers',
        'royals','twins','astros','angels','athletics', 'mariners','rangers']

In [27]:
#Create empy lists and dataframes to store data for the scrape
ndf=pd.DataFrame()
url_list = []

#loop to create list of url's for each team's lineup
for i in teams:
    url = "https://www.fangraphs.com/teams/{}/depth-chart".format(i)
    url_list.append(url)

#loop through each team's url to scrape their starting lineup
for x in url_list:
    res = requests.get(x)
    soup = BeautifulSoup(res.content, 'lxml')
    table = soup.find_all('div', {'class':'team-stats-table'})
    df = pd.read_html(str(table))

    lineup = []

    for frame in df:
        lineup.append(frame.loc[0])

    #temp df to hold single team lineup    
    tdf= pd.DataFrame(lineup)
    #add positions
    tdf['position'] = ['C','1B','2B', 'SS', '3B', 'LF', 'CF', 'RF', 'DH', 'ALL', 'SP', 'RP', 'AP']

    #add team names
    tn = x.split('/')
    tdf['team_name'] = tn[4]
    tdf = tdf.reset_index()
    tdf = tdf.set_index('position')
    
    #drop the old index and non-useful positions
    tdf = tdf.drop('index', axis=1)
    tdf = tdf.drop(['ALL', 'AP'])

    #add results to result df
    ndf = pd.concat([ndf,tdf])
    #clear temp df
    tdf= pd.DataFrame()


In [30]:
ndf = ndf.reset_index()
ndf['table_index'] = np.arange(len(ndf))
ndf.head(3)


Unnamed: 0,index,position,Name,PA,AVG,OBP,SLG,wOBA,Bat,Fld,...,IP,K/9,BB/9,HR/9,BABIP,LOB%,ERA,FIP,team_name,table_index
0,0,C,Brian McCann,36.0,0.242,0.323,0.409,0.312,-0.6,-0.1,...,,,,,,,,,braves,0
1,1,1B,Freddie Freeman,91.0,0.297,0.389,0.548,0.386,4.4,0.3,...,,,,,,,,,braves,1
2,2,2B,Ozzie Albies,88.0,0.278,0.333,0.467,0.334,0.3,0.7,...,,,,,,,,,braves,2


## Join the salary and lineup dataframes

In [33]:
df_merge = pd.merge(ndf, salaries, on='Name', how='left')

In [37]:
#drop unwanted columns
df_merge = df_merge.drop('rank', 1)
df_merge = df_merge.drop('index', 1)
df_merge = df_merge.drop('table_index', 1)
df_merge.head()

Unnamed: 0,position,Name,PA,AVG,OBP,SLG,wOBA,Bat,Fld,BsR,...,LOB%,ERA,FIP,team_name,Team,POS,Salary,Years,Total Value,Avg Annual
0,C,Brian McCann,36.0,0.242,0.323,0.409,0.312,-0.6,-0.1,-0.1,...,,,,braves,ATL,C,"$2,000,000",1 (2019),"$2,000,000","$2,000,000"
1,1B,Freddie Freeman,91.0,0.297,0.389,0.548,0.386,4.4,0.3,0.0,...,,,,braves,ATL,1B,"$21,359,375",8 (2014-21),"$135,000,000","$16,875,000"
2,2B,Ozzie Albies,88.0,0.278,0.333,0.467,0.334,0.3,0.7,0.3,...,,,,braves,ATL,2B,"$575,000",1 (2019),"$575,000","$575,000"
3,SS,Dansby Swanson,81.0,0.251,0.321,0.415,0.309,-1.5,0.1,0.2,...,,,,braves,ATL,SS,"$585,000",1 (2019),"$585,000","$585,000"
4,3B,Josh Donaldson,80.0,0.263,0.374,0.517,0.372,2.9,0.2,0.0,...,,,,braves,ATL,3B,"$23,000,000",1 (2019),"$23,000,000","$23,000,000"


## Load into Database

In [None]:
connection_string = "postgres:postgres@localhost:5432/mlb_db"
engine = create_engine(f'postgresql://{connection_string}')

In [None]:
df_merge.to_sql(name='starting_rosters', con=engine, if_exists='append', index=True)

## Code used in my previous version to add to MySQL

In [None]:
# engine = create_engine('mysql+mysqlconnector://jsoltis:###########@jsoltis-db1.chsfidy10kbp.us-west-2.rds.amazonaws.com:3306/MLB')
# engine.connect()
# Base = declarative_base()
# Session = sessionmaker()
# Session.configure(bind=engine)
# session = Session()
# dfmerge.to_sql(name='starting_rosters', con = engine, if_exists='replace',)