# Datasets
A brainstorm of what datasets would be useful

## By player per game
A dictionary of players and their stats in every game they have played

## By team per game
A dictionary of all teams and overall stats for each game they've played

## Ratings
A dictionary of all players and their current ratings. These are aggregate ratings, not game-by-game values.
Sources could include:
* NBA official stats
* 2K
* Other free stats sources

## Data structure
It might be useful to have custom data structures to access all this data. For example, I could have separate data structures for box scores, player stats, and team stats.

In [1]:
import datetime as dt
from bs4 import BeautifulSoup
import requests
import re

In [2]:
# ESPN
# Download and parse ESPN's NBA scoreboard page for a single day.
START_DATE = dt.datetime(2018, 10, 17)
url_start = 'http://www.espn.com.au/nba/scoreboard/_/date/'

data_day = dt.datetime(2018, 10, 17)

# The URL ends with the date as yyyymmdd; strftime zero-pads month and day.
date_string = data_day.strftime('%Y%m%d')

# Validate before building the URL so a malformed date never reaches the network.
if len(date_string) != 8:
    raise ValueError('date_string needs to be in yyyymmdd format, date_string is ' + date_string)

url = url_start + date_string

# Without a timeout, requests can wait on an unresponsive server indefinitely.
response = requests.get(url, timeout=15)
# Fail on 4xx/5xx responses instead of parsing an error page.
response.raise_for_status()
source = response.text
soup = BeautifulSoup(source, 'lxml')

In [3]:
# NBA
# Download and parse the NBA global schedule page for a single day.
START_DATE = dt.datetime(2018, 10, 17)
# NOTE(review): the date is appended after '#', i.e. inside the URL fragment.
# Fragments are normally not sent to the server, so this request likely
# returns the same page for every date -- confirm.
url_start = 'https://au.global.nba.com/schedule/#!/'

data_day = dt.datetime(2018, 10, 17)

# The URL ends with the date as yyyy-mm-dd; strftime zero-pads month and day.
date_string = data_day.strftime('%Y-%m-%d')

# Validate before building the URL so a malformed date never reaches the network.
if len(date_string) != 10:
    raise ValueError('date_string needs to be in yyyy-mm-dd format, date_string is ' + date_string)

url = url_start + date_string

# Without a timeout, requests can wait on an unresponsive server indefinitely.
response = requests.get(url, timeout=15)
# Fail on 4xx/5xx responses instead of parsing an error page.
response.raise_for_status()
source = response.text
soup = BeautifulSoup(source, 'lxml')

In [4]:
# Locate the league schedule table in the parsed page.
# Selectors tried earlier, kept for reference:
# games = soup.findAll('a', attrs={'href': re.compile('boxscore')})
# games = soup.findAll('div', attrs={'id': 'events'})
games = soup.find_all('table', attrs={'id': 'sib-league-schedule'})
# Exactly one schedule table is expected. Raise instead of assert (asserts are
# stripped under -O) and report how many tables were actually found.
if len(games) != 1:
    raise ValueError('expected exactly one sib-league-schedule table, found ' + str(len(games)))
games_table = games[0]

In [5]:
# Print the "links" cell(s) of every game row in the schedule table.
# find_all is the current Beautiful Soup name; findAll is the legacy alias.
for game in games_table.find_all('tr', attrs={'data-ng-repeat': "game in date.games"}):
    links = game.find_all('td', attrs={'class': 'links'})
    print(links)

[<td class="links" data-ng-show="!paginate || currentpage==1">
<sib3-game-urls class="dark-theme" game="game" types="live,leaguepass,stats-boxscore,stats-preview,stats-playbyplay,highlights"></sib3-game-urls>
</td>]


# Using tutorial
[This tutorial](http://practicallypredictable.com/2017/12/21/web-scraping-nba-team-matchups-box-scores/) shows how to scrape box score data.
A notebook-viewer version is available [here](https://nbviewer.jupyter.org/github/practicallypredictable/posts/blob/master/basketball/nba/notebooks/scrape-stats_nba-team_matchups.ipynb).

In [8]:
from itertools import chain
from pathlib import Path
from time import sleep
from datetime import datetime
import requests
from tqdm import tqdm
# NOTE(review): presumably turns off tqdm's background monitor thread -- confirm against the tqdm docs.
tqdm.monitor_interval = 0
import numpy as np
import pandas as pd
# Let pandas show up to 999 rows and 999 columns before truncating displayed output.
pd.options.display.max_rows = 999
pd.options.display.max_columns = 999

In [10]:
# Browser-like User-Agent string, assembled from adjacent string literals.
USER_AGENT = (
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) '
    'AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/61.0.3163.100 Safari/537.36'
)

# Header mapping that carries only the user-agent entry.
REQUEST_HEADERS = {'user-agent': USER_AGENT}

In [21]:
# Request team game logs for one season from the stats.nba.com endpoint.
NBA_URL = 'http://stats.nba.com/stats/teamgamelogs'
NBA_ID = '00'

# Short keys mapped to the values sent as the SeasonType query parameter.
NBA_SEASON_TYPES = {
    'regular': 'Regular Season',
    'playoffs': 'Playoffs',
    'preseason': 'Pre Season',
}

season = '2016-17'
season_type = NBA_SEASON_TYPES['regular']

nba_params = {
    'LeagueID': NBA_ID,
    'Season': season,
    'SeasonType': season_type,
}
# Pass the browser-like REQUEST_HEADERS defined earlier in this notebook; they
# were previously defined but never sent.
# NOTE(review): the read timeout recorded for this cell is consistent with the
# server ignoring requests that lack a browser-like User-Agent -- confirm.
r = requests.get(
    NBA_URL,
    params=nba_params,
    headers=REQUEST_HEADERS,
    allow_redirects=False,
    timeout=15,
)
# Redirects are not followed, so anything other than 200 (including 3xx) is
# treated as a failure and reported with the status and final URL.
if r.status_code != 200:
    raise requests.HTTPError(
        'unexpected status {} from {}'.format(r.status_code, r.url),
        response=r,
    )

ReadTimeout: HTTPConnectionPool(host='stats.nba.com', port=80): Read timed out. (read timeout=15)