# Webscrape for Hockey Team

## goals
1. Scrape one team's schedule from its page `HTML`
1. Start from the preceding league page, which lists all teams, to find each team's schedule link

In [884]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from datetime import datetime as dt
import re
from urllib.parse import urlparse, urljoin

In [885]:
# League landing page that links to every team, and the teams to look up on it.
url_main_league = 'https://stats.sharksice.timetoscore.com/display-stats.php?league=1'
team_names = ['Team Beer','Team America']

# A timeout keeps the notebook from hanging if the stats site stops responding,
# and raise_for_status() reports an HTTP error here instead of letting an error
# page be parsed into an empty link list further down.
page_hockey_main = requests.get(url_main_league, timeout=30)
page_hockey_main.raise_for_status()

# Parse the page and collect every anchor tag that carries an href attribute.
soup_main = BeautifulSoup(page_hockey_main.content,"html.parser")
link_names = soup_main.find_all('a',href=True)

In [886]:
# Find links for Team Names.
# `parsed` is kept so the URL components stay available for interactive
# inspection (parsed.scheme, parsed.netloc, parsed.path, ...).
parsed = urlparse(url_main_league)

# Collect one row per anchor whose visible text is exactly one of the wanted
# team names, in the order the anchors appear on the page.
wanted_names = set(team_names)
link_rows = []
for link in link_names:
    link_text = link.text.strip()
    if link_text in wanted_names:
        # urljoin resolves the href against the league page URL, so relative,
        # root-relative and absolute hrefs all produce a valid URL.
        link_rows.append({
            'team_name': link_text,
            'href_link': urljoin(url_main_league, link['href']),
        })

# Build the frame once from the collected rows: this gives a unique 0..n-1
# index and still yields the expected (empty) columns when nothing matched.
df_links = pd.DataFrame(link_rows, columns=['team_name','href_link'])
df_links

Unnamed: 0,team_name,href_link
0,Team America,https://stats.sharksice.timetoscore.com/displa...
0,Team Beer,https://stats.sharksice.timetoscore.com/displa...


## Select the team to process

In [887]:
# Work on the first team found in the link table; its row holds the team name
# and the URL of that team's schedule page.
first_team_row = df_links.iloc[0]
team_name = first_team_row['team_name']
hockey_url = first_team_row['href_link']
print(team_name, hockey_url, sep='\n')

# Manual overrides for running against a fixed team:
# team_name  = 'Team Beer'
# hockey_url = 'https://stats.sharksice.timetoscore.com/display-schedule?team=4637&season=55&league=1&stat_class=1'
# team_name  = 'Team America'
# hockey_url = 'https://stats.sharksice.timetoscore.com/display-schedule?team=2297&season=55&league=1&stat_class=1'

Team America
https://stats.sharksice.timetoscore.com/display-schedule?team=2297&season=55&league=1&stat_class=1


In [888]:
# Download the selected team's schedule page. The timeout avoids an indefinite
# hang, and raise_for_status() stops here on an HTTP error instead of parsing
# an error page that contains no schedule table.
hockey_url_check = requests.get(hockey_url, timeout=30)
hockey_url_check.raise_for_status()

# Parse the page and collect every <table> element on it.
soup = BeautifulSoup(hockey_url_check.content,"html.parser")
tables = soup.find_all('table')

In [889]:
# Locate the schedule table: a <table> whose markup contains "Game Results".
# It should be index zero, but double check.
matching_indices = [
    index for index, table in enumerate(tables) if "Game Results" in str(table)
]
if not matching_indices:
    # Fail loudly. Otherwise table_index would be undefined on a fresh kernel,
    # or silently keep the value left over from a previously processed page.
    raise ValueError('No table containing "Game Results" was found on the team page')

# When several tables match, keep the last one (same choice the plain loop made).
table_index = matching_indices[-1]
print(table_index)

0


## read table and fix data

In [890]:
from io import StringIO

# Parse the schedule table located above. The markup is wrapped in StringIO
# because passing a literal HTML string to read_html is deprecated in recent
# pandas releases. The table is picked via table_index (the verified position)
# rather than a hard-coded 0.
df_hockey_games = pd.read_html(StringIO(str(tables[table_index])), flavor='bs4')[0]

# The parsed header has more than one level; drop the top level so columns are
# addressed by their plain names.
df_hockey_games = df_hockey_games.droplevel(0,axis=1)

# Keep the original row position as an explicit 'index' column.
df_hockey_games.reset_index(inplace=True)

# The table has two goal columns, 'Goals' and 'Goals.1'; label them by side.
# NOTE(review): assumes the first goals column belongs to the away team - confirm.
df_hockey_games.rename(columns={'Goals':'Goals_Away','Goals.1':'Goals_Home'},inplace=True)

# Turn literal "NaN" strings into real missing values.
df_hockey_games.replace("NaN", np.nan, inplace = True)

# These two columns are not used by the rest of the notebook.
df_hockey_games.drop(columns=['Box Score','Scoresheet'],inplace=True)

## indexing

#### get datetime

In [891]:
# The Date column carries no year, so the current calendar year is inserted
# between the date and the time before parsing.
current_year = pd.to_datetime('today').strftime('%Y')
date_time_text = df_hockey_games['Date'] + ' ' + current_year + ' ' + df_hockey_games['Time']

# One-column frame holding the combined "<Date> <year> <Time>" text.
current_datetime = date_time_text.to_frame(name='Date_Time')

# Parse the combined text (weekday, month, day, year, 12-hour time) and store
# the resulting timestamps on the games table.
current_datetime_fix = pd.to_datetime(
    current_datetime['Date_Time'],
    format='%a %b %d %Y %I:%M %p',
)
df_hockey_games['Game_datetime'] = current_datetime_fix

#### get bench and jersey color

In [892]:
# team_name = 'Team America'
# True on the rows where the selected team is listed in the Home column.
idx_team_side = df_hockey_games['Home'].eq(team_name)

# Home rows get side 'Home' with a 'Light' jersey; every other row gets
# 'Away' with a 'Dark' jersey.
df_hockey_games['Team_side'] = idx_team_side.map({True: 'Home', False: 'Away'})
df_hockey_games['Jersey'] = idx_team_side.map({True: 'Light', False: 'Dark'})

#### fix goals and determine shootout

In [893]:
# Goal cells are handled as text: a trailing 'S' on either side's goals marks
# a game decided by shootout.
home_goals_text = df_hockey_games['Goals_Home'].astype(str)
away_goals_text = df_hockey_games['Goals_Away'].astype(str)

# Combine both sides element-wise with `|`. A Python `or` between two lists
# just returns the first list, so the away column would never be checked.
shootout_decider = home_goals_text.str.endswith('S') | away_goals_text.str.endswith('S')
df_hockey_games['Shootout_decider'] = shootout_decider

# Keep only the first run of digits from each goals cell (NaN when there is
# none, e.g. games not played yet). The raw string avoids the invalid-escape
# warning for `\d`; expand=False returns a Series rather than a one-column frame.
df_hockey_games['Goals_Home'] = home_goals_text.str.extract(r'(\d+)', expand=False)
df_hockey_games['Goals_Away'] = away_goals_text.str.extract(r'(\d+)', expand=False)

#### Check whether there are games left

In [894]:
# A game is upcoming when its start time is later than the current moment.
right_now = pd.to_datetime('today')
games_left = df_hockey_games['Game_datetime'].gt(right_now)
# Alternative check based on the last character of the Game column:
# games_left = pd.DataFrame(df_hockey_games['Game'].astype(str).str[-1] != '*')

df_hockey_games['Upcoming_game'] = games_left

#### Neat game time

In [895]:
# Human-friendly rendering of the game time, e.g. "Sat - 10 Sep @ 08:00 PM".
neat_format = '%a - %d %b @ %I:%M %p'
game_times = df_hockey_games['Game_datetime']
df_hockey_games['Game_datetime_neat'] = game_times.dt.strftime(neat_format)

# The raw Date and Time text columns are superseded by Game_datetime.
df_hockey_games.drop(columns=['Date','Time'], inplace=True)

#### Str Replace Rinks and add team name

In [896]:
# Tag every row with the team this schedule belongs to.
df_hockey_games['team_name'] = team_name

# Shorten rink names by removing the literal 'San Jose ' text.
rink_names = df_hockey_games['Rink']
df_hockey_games['Rink'] = rink_names.str.replace('San Jose ', '', regex=False)

## combine all teams data


In [901]:
# TODO: combine the schedule data of all teams here (not implemented yet)

## reorder

In [897]:
# Columns that should come first, in this order; all remaining columns follow
# in their existing relative order.
cols_to_order = ['index', 'Game', 'team_name', 'Game_datetime_neat', 'Rink', 'Team_side', 'Jersey']
trailing_columns = [
    column for column in df_hockey_games.columns if column not in cols_to_order
]
new_columns = cols_to_order + trailing_columns
df_hockey_games = df_hockey_games.loc[:, new_columns]
df_hockey_games

Unnamed: 0,index,Game,team_name,Game_datetime_neat,Rink,Team_side,Jersey,League,Level,Away,Goals_Away,Home,Goals_Home,Type,Game_datetime,Shootout_decider,Upcoming_game
0,0,328022*,Team America,Sat - 10 Sep @ 08:00 PM,Grey,Away,Dark,SIAHL@SJ,Adult Division 6A,Team America,4.0,Beerbears on Ice,3.0,Regular 1,2022-09-10 20:00:00,True,False
1,1,354475*,Team America,Mon - 19 Sep @ 11:15 PM,Grey,Home,Light,SIAHL@SJ,Adult Division 6A,Cereal Killers,7.0,Team America,4.0,Regular 2,2022-09-19 23:15:00,False,False
2,2,351696*,Team America,Sun - 25 Sep @ 05:45 PM,Sharks,Away,Dark,SIAHL@SJ,Adult Division 6A,Team America,2.0,Beerbears on Ice,3.0,Regular 3,2022-09-25 17:45:00,False,False
3,3,360402*,Team America,Wed - 28 Sep @ 09:45 PM,White (C),Away,Dark,SIAHL@SJ,Adult Division 6A,Team America,2.0,Stampede,2.0,Regular 4,2022-09-28 21:45:00,False,False
4,4,369297,Team America,Sun - 09 Oct @ 09:00 PM,Orange (N),Away,Dark,SIAHL@SJ,Adult Division 6A,Team America,,K-Wings,,Regular 5,2022-10-09 21:00:00,False,True
5,5,343483,Team America,Tue - 18 Oct @ 11:15 PM,Grey,Home,Light,SIAHL@SJ,Adult Division 6A,Kraken More Beers,,Team America,,Regular 6,2022-10-18 23:15:00,False,True


## subq-date (schedule sorted by date)

In [898]:
# Schedule view: a subset of the game columns, ordered chronologically.
# Game_datetime is only needed for the sort and is dropped afterwards.
schedule_columns = [
    'team_name', 'Upcoming_game', 'Game_datetime', 'Game_datetime_neat',
    'Rink', 'Jersey', 'Team_side',
]
schedule_data = (
    df_hockey_games[schedule_columns]
    .sort_values(by='Game_datetime', ascending=True)
    .drop(columns='Game_datetime')
)

## All games and remaining games

In [900]:
# Keep only the rows flagged as upcoming, drop the flag column and renumber
# the rows from zero.
is_upcoming = schedule_data['Upcoming_game']
schedule_data_remaining = (
    schedule_data[is_upcoming]
    .drop(columns='Upcoming_game')
    .reset_index(drop=True)
)
print(schedule_data)
print(schedule_data_remaining)

Unnamed: 0,team_name,Game_datetime_neat,Rink,Jersey,Team_side
0,Team America,Sun - 09 Oct @ 09:00 PM,Orange (N),Dark,Away
1,Team America,Tue - 18 Oct @ 11:15 PM,Grey,Light,Home
