@@ -1,26 +1,36 @@
import xlsxwriter
import json
import pprint
import re
import datetime
import openpyxl

#
# CODE GOLF
ordinal = lambda n: "%d%s" % (n,"tsnrhtdd"[(n/10%10!=1)*(n%10<4)*n%10::4])

n_previous_games = 5
col_offset = n_previous_games + 2
n_previous_games = 4
n_games_per_sheet = 8

header_row = 0
result_row = 1
runs_row = 2
# ================================================================================================================

ratings_data_column_start = ord("l") - 96
col_offset_home_away = 2
col_offset_games = 4

formats = {}
header_row = 9
result_row = 10
runs_row = 11

###################################################################################################v
# ================================================================================================================

def WriteHeaderData(team_data, worksheet, col_offset=0, is_home=None):
profile_data_column_start = ord("d") - 96
profile_data_row_start = 5

profile_row_offset_home_away = 1
profile_row_offset_game = 3

profile_neat_layout_column_jump = (ord("q") - 96) - profile_data_column_start # second set of games starts at col Q

# ================================================================================================================

def WriteHeader(team_data, worksheet, col_offset=0, is_home=None):

if is_home is None:
home_away = "TEAM: "
@@ -29,73 +39,117 @@ def WriteHeaderData(team_data, worksheet, col_offset=0, is_home=None):
else:
home_away = "AWAY: "

worksheet.write(header_row, col_offset+0, home_away + team_data['team_id'], formats['default']) # team name
worksheet.cell(column=col_offset, row=header_row, value="{} {}".format(home_away, team_data['team_id']))


def WriteResult(team_data, worksheet, col_offset=0):

for i in range(n_previous_games):
worksheet.write(header_row, col_offset+1+i, ordinal(i+1), formats['default'])
worksheet.cell(column=col_offset + i, row=result_row, value="{}".format( team_data['W/L'][i] ))

def WriteResultData(team_data, worksheet, col_offset=0):
def WriteRuns(team_data, worksheet, col_offset=0):

worksheet.write(result_row, col_offset+0, "RESULT", formats['default'])
for i in range(n_previous_games):
score = team_data['Score'][i]
score = re.search(r'\d+', score).group() # get only the first number using .search()
worksheet.cell(column=col_offset + i, row=runs_row, value=score)

for i in range(5):
worksheet.write(result_row, col_offset+i+1, team_data['W/L'][i], formats['default'])
# ================================================================================================================

def WriteProfile(pitcher_data, worksheet, row_offset=0, col_offset=0):

def WriteRunsData(team_data, worksheet, col_offset=0):
worksheet.cell(column=col_offset, row=row_offset, value=pitcher_data['team_id'])
worksheet.cell(column=col_offset + 1, row=row_offset, value=pitcher_data['name'])
# print(pitcher_data)

worksheet.write(runs_row, col_offset+0, "RUNS", formats['default'])
# Write previous decisions
for i in range(n_previous_games):
try:
worksheet.cell(column=col_offset + 2 + i, row=row_offset, value=pitcher_data['DECISION'][i])
except:
worksheet.cell(column=col_offset + 2 + i, row=row_offset, value="NO DATA")

for i in range(5):
score = team_data['Score'][i]
score = re.search(r'\d+', score).group() # get only the first number using .search()
worksheet.write(runs_row, col_offset+i+1, score, formats['default'])
# print("\n\n", "="*50, "\n")

# ================================================================================================================

###################################################################################################v

def WriteDataToExcel(data):

global formats
base = openpyxl.load_workbook(filename="worksheets/RUN-SHEET-BASE.xlsx")

# = = = = = = = = = = = = = = = = = = = = = = RATINGS = = = = = = = = = = = = = = = = = =

all_ratings = [base['RATINGS - 1'], base['RATINGS - 2']]

# Create a workbook and add a worksheet.
workbook = xlsxwriter.Workbook('MLB_{}.xlsx'.format(datetime.datetime.today().strftime('%Y-%m-%d')))
worksheet = workbook.add_worksheet()
print(base.sheetnames)

formats['default'] = workbook.add_format({'bold': True, 'center_across': True})
formats['border'] = workbook.add_format({'right': True})
total_col_offset = -1
sheet_idx = -1

total_col_offset = 0
for i, game in enumerate(data):

if i % n_games_per_sheet == 0: # Aneryin spreads games over sheets, so do this too...
sheet_idx += 1 # Go to the next sheet
total_col_offset = ratings_data_column_start + 1 # Start at the beginning column

ratings = all_ratings[sheet_idx]

for game in data:
home = game['HOME']
away = game['AWAY']

WriteHeaderData(home, worksheet, total_col_offset, is_home=True)
WriteResultData(home, worksheet, total_col_offset)
WriteRunsData(home, worksheet, total_col_offset)
WriteHeader(home, ratings, total_col_offset - 1, is_home=True)
WriteResult(home, ratings, total_col_offset)
WriteRuns(home, ratings, total_col_offset)

total_col_offset += n_previous_games + col_offset_home_away

WriteHeader(away, ratings, total_col_offset - 1, is_home=False)
WriteResult(away, ratings, total_col_offset)
WriteRuns(away, ratings, total_col_offset)

total_col_offset += n_previous_games + col_offset_games

# = = = = = = = = = = = = = = = = = = = = = PROFILING = = = = = = = = = = = = = = = = = = = =

profiling = base['PROFILING']

pitcher_row = -1
pitcher_col = -1

for i, game in enumerate(data):

if i % n_games_per_sheet == 0: # Aneryin spreads games over sheets, so do this too...
pitcher_row = profile_data_row_start
pitcher_col = profile_data_column_start + (profile_neat_layout_column_jump * round(i/n_games_per_sheet, 0))

try:
home = game['HOME']['pitcher']
away = game['AWAY']['pitcher']
except:
pitcher_row += profile_row_offset_home_away
pitcher_row += profile_row_offset_game
continue

WriteProfile(home, profiling, pitcher_row, pitcher_col)

total_col_offset += col_offset
pitcher_row += profile_row_offset_home_away

WriteHeaderData(away, worksheet, total_col_offset, is_home=False)
WriteResultData(away, worksheet, total_col_offset)
WriteRunsData(away, worksheet, total_col_offset)
WriteProfile(away, profiling, pitcher_row, pitcher_col)

total_col_offset += col_offset + 2
pitcher_row += profile_row_offset_game

# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =

base.save("MLB_{}.xlsx".format(datetime.datetime.today().strftime('%Y-%m-%d')))


workbook.close()
# ================================================================================================================

if __name__ == "__main__":

def RecreateSheet():
with open('temp_game_data.json', 'r') as fp:
data = json.load(fp)
# pprint.pprint(data)
WriteDataToExcel(data)

#
# with open('temp_game_data.json', 'r') as fp:
# data = json.load(fp)
#
# WriteDataToExcel(data)
#
#
#
# pprint.pprint(data)
@@ -4,8 +4,9 @@
from bs4 import Comment
import pprint
import json
# from .scrape_mlb import

from output_bbal_ref import WriteDataToExcel

pp = pprint.PrettyPrinter(indent=4)

base_mlb = "https://www.baseball-reference.com"
@@ -44,54 +45,107 @@ def GetSchedule():
# break
#
# pp.pprint(all_games)
WriteDataToExcel(all_games)
# with open('temp_game_data.json', 'w') as fp:
# json.dump(all_games, fp)
# WriteDataToExcel(all_games)
with open('temp_game_data.json', 'w') as fp:
json.dump(all_games, fp)

def GetPreviewStats(preview_url):

preview_html = requests.get(preview_url, headers=header)

soup = BeautifulSoup(preview_html.content, 'html.parser')

pitcher_names = soup.findAll("span", {"class": "section_anchor", "data-label": re.compile('.*Stats')})
pitcher_names = [x['data-label'].replace(" Stats", "") for x in pitcher_names]

# NOTE:
# For some nuts reason, the last 10 games data IS IN A COMMENT...
# I imagine that, as the page loads in the browser, some javascript formats it into a proper table...
comments = soup.find_all(string=lambda text: isinstance(text, Comment))

preview_data = {}
game_data = {}

pitcher_idx = 0

for c in comments:

comment_soup = BeautifulSoup(c, 'html.parser')

# - - - - - - - - - - - - - - - - - - TEAM STATS - - - - - - - - - - - - - - - - - -

tables = comment_soup.findAll("table", {"id": re.compile('last10_.*')}) # We want tables like 'id=last10_NYY'

if len(tables) > 0:
if len(tables):

last10 = tables[0]

team_data = {
"team_id": last10.get("id")[-3:] # last10_**NYY**
}

theader = last10.find("thead").find("tr").findAll("th")
table_header = last10.find("thead").find("tr").findAll("th")

for header_cell in theader: # Get the column names from the header -> keys
for header_cell in table_header: # Get the column names from the header -> keys
header_text = header_cell.text.strip()
team_data[header_text] = []

game_rows = last10.find("tbody").findAll("tr") # Get actual data from table rows -> values
for r in game_rows:
game_results = r.findAll('td')
for i, cell in enumerate(game_results):
team_data[theader[i].text].append(cell.text.strip()) # Store relevant data for the key in an []
for i, pitcher_stat in enumerate(game_results):
team_data[table_header[i].text].append(pitcher_stat.text.strip()) # Store relevant data for the key in an []

if len(game_data) == 0:
game_data["AWAY"] = team_data
else:
game_data["HOME"] = team_data # HOME is always second in basketball reference

# - - - - - - - - - - - - - - - - - - PITCHER STATS - - - - - - - - - - - - - - - - - -

tables = comment_soup.findAll("table", {"id": re.compile('^sp_.*')})

if len(tables):

pitcher = tables[0]

pitcher_data = {
"name": pitcher_names[pitcher_idx]
}

did_see_spacer = False
spacer_idx = -1

pitcher_rows = pitcher.find("tbody").findAll("tr") # Get actual data from table rows -> values
for i, r in enumerate(pitcher_rows):

# First few rows are historical data, we dont want that
if not did_see_spacer and r.get("class") is not None and "spacer" in r.get("class"):
did_see_spacer = True
spacer_idx = i

if did_see_spacer and i > spacer_idx+1: # Skip the spacer & 1 row!

for pitcher_stat in r.findAll('td'):

try:
stat = pitcher_stat['data-stat'] # Unlike team stats, the name of the stat is an attr
except:
continue # Sometimes there is an extra column in the html. Skip.

if stat not in pitcher_data:
pitcher_data[stat] = []

pitcher_data[stat].append(pitcher_stat.text.strip())

if len(preview_data) == 0:
preview_data["AWAY"] = team_data
if pitcher_idx == 0:
pitcher_data['team_id'] = game_data["AWAY"]["team_id"]
game_data["AWAY"]['pitcher'] = pitcher_data
else:
preview_data["HOME"] = team_data # HOME is always second in basketball reference
game_data["HOME"]['pitcher'] = pitcher_data
pitcher_data['team_id'] = game_data["HOME"]["team_id"]
pitcher_idx += 1

return preview_data
return game_data

if __name__ == "__main__":
GetSchedule()
BIN -171 Bytes ~$2017-08-03_MLB.xlsx
Binary file not shown.
BIN -171 Bytes ~$RUN-SHEET-BASE.xlsx
Binary file not shown.