# Import Data 

In [78]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
url = 'https://www.basketball-reference.com/leagues/NBA_2023_games.html'
page = requests.get(url)
soup = BeautifulSoup(page.content, 'html.parser')
tbl = soup.find("table",{"id":"schedule"})
schedule = pd.read_html(str(tbl))[0]

months = ["november", "december", "january", "february", "march", "april"]
for  i in months:
    url = 'https://www.basketball-reference.com/leagues/NBA_2023_games-{0}.html'.format(i)
    page = requests.get(url)
    soup = BeautifulSoup(page.content, 'html.parser')
    tbl = soup.find("table",{"id":"schedule"})
    temp_data_frame = pd.read_html(str(tbl))[0]
    schedule = schedule.append(temp_data_frame)

In [79]:
schedule

Unnamed: 0,Date,Start (ET),Visitor/Neutral,PTS,Home/Neutral,PTS.1,Unnamed: 6,Unnamed: 7,Attend.,Arena,Notes
0,"Tue, Oct 18, 2022",7:30p,Philadelphia 76ers,117.0,Boston Celtics,126.0,Box Score,,19156.0,TD Garden,
1,"Tue, Oct 18, 2022",10:00p,Los Angeles Lakers,109.0,Golden State Warriors,123.0,Box Score,,18064.0,Chase Center,
2,"Wed, Oct 19, 2022",7:00p,Orlando Magic,109.0,Detroit Pistons,113.0,Box Score,,20190.0,Little Caesars Arena,
3,"Wed, Oct 19, 2022",7:00p,Washington Wizards,114.0,Indiana Pacers,107.0,Box Score,,15027.0,Gainbridge Fieldhouse,
4,"Wed, Oct 19, 2022",7:30p,Houston Rockets,107.0,Atlanta Hawks,117.0,Box Score,,17878.0,State Farm Arena,
...,...,...,...,...,...,...,...,...,...,...,...
65,"Sun, Apr 9, 2023",3:30p,Utah Jazz,,Los Angeles Lakers,,,,,Crypto.com Arena,
66,"Sun, Apr 9, 2023",3:30p,New Orleans Pelicans,,Minnesota Timberwolves,,,,,Target Center,
67,"Sun, Apr 9, 2023",3:30p,Memphis Grizzlies,,Oklahoma City Thunder,,,,,Paycom Center,
68,"Sun, Apr 9, 2023",3:30p,Los Angeles Clippers,,Phoenix Suns,,,,,Footprint Center,


In [80]:
schedule.columns

Index(['Date', 'Start (ET)', 'Visitor/Neutral', 'PTS', 'Home/Neutral', 'PTS.1',
       'Unnamed: 6', 'Unnamed: 7', 'Attend.', 'Arena', 'Notes'],
      dtype='object')

# Clean the data

In [81]:
# Make sure all columns have names and the names match coding standards (lowercase with "_" for spaces)
# Get rid of unneeded columns
schedule = schedule.rename(columns = {"PTS":"visitor_pts", "PTS.1":"home_points","Unnamed:7":"OT",'Visitor/Neutral': 'visitor','Home/Neutral': 'home'})
schedule.columns = schedule.columns.str.replace(" ", "_")
schedule.columns = schedule.columns.str.lower()
schedule = schedule.drop(["unnamed:_6", 'notes'], axis = 1)
schedule.columns

Index(['date', 'start_(et)', 'visitor', 'visitor_pts', 'home', 'home_points',
       'unnamed:_7', 'attend.', 'arena'],
      dtype='object')

In [82]:
# parse each team's data 
list_of_team_names = schedule['visitor'].unique()
list_of_team_names.sort()
list_of_team_names

array(['Atlanta Hawks', 'Boston Celtics', 'Brooklyn Nets',
       'Charlotte Hornets', 'Chicago Bulls', 'Cleveland Cavaliers',
       'Dallas Mavericks', 'Denver Nuggets', 'Detroit Pistons',
       'Golden State Warriors', 'Houston Rockets', 'Indiana Pacers',
       'Los Angeles Clippers', 'Los Angeles Lakers', 'Memphis Grizzlies',
       'Miami Heat', 'Milwaukee Bucks', 'Minnesota Timberwolves',
       'New Orleans Pelicans', 'New York Knicks', 'Oklahoma City Thunder',
       'Orlando Magic', 'Philadelphia 76ers', 'Phoenix Suns',
       'Portland Trail Blazers', 'Sacramento Kings', 'San Antonio Spurs',
       'Toronto Raptors', 'Utah Jazz', 'Washington Wizards'], dtype=object)

In [85]:
# Make a dictionary where the key is the team name and the value is a dataframe of all the games
dict_of_dfs = {}
for team in list_of_team_names:
    dict_of_dfs[team] = schedule.query('visitor == @team or home == @team')

In [101]:
# Check to make sure it works
dict_of_dfs["San Antonio Spurs"]

Unnamed: 0,date,start_(et),visitor,visitor_pts,home,home_points,unnamed:_7,attend.,arena
10,"Wed, Oct 19, 2022",8:00p,Charlotte Hornets,129.0,San Antonio Spurs,102.0,,16236.0,AT&T Center
17,"Fri, Oct 21, 2022",7:00p,San Antonio Spurs,137.0,Indiana Pacers,134.0,,12073.0,Gainbridge Fieldhouse
27,"Sat, Oct 22, 2022",6:00p,San Antonio Spurs,114.0,Philadelphia 76ers,105.0,,19822.0,Wells Fargo Center
49,"Mon, Oct 24, 2022",8:00p,San Antonio Spurs,115.0,Minnesota Timberwolves,106.0,,15347.0,Target Center
61,"Wed, Oct 26, 2022",8:00p,San Antonio Spurs,122.0,Minnesota Timberwolves,134.0,,16165.0,Target Center
...,...,...,...,...,...,...,...,...,...
7,"Sun, Apr 2, 2023",6:00p,San Antonio Spurs,,Sacramento Kings,,,,Golden 1 Center
27,"Tue, Apr 4, 2023",10:00p,San Antonio Spurs,,Phoenix Suns,,,,Footprint Center
38,"Thu, Apr 6, 2023",8:00p,Portland Trail Blazers,,San Antonio Spurs,,,,Moody Center
54,"Sat, Apr 8, 2023",4:00p,Minnesota Timberwolves,,San Antonio Spurs,,,,Moody Center


# Team Report

Plan is to use dataframes as the data structures to hold the data for the reports. There will be a "full_season_report" df and then one for 30 days and one for 10 days. Each report's columns will be the metrics and each report's rows will be the teams. This all could probably be one df but I'm planning on splitting it to make it more digestible. 

## Full Season Report

In [102]:
# Create list of the metrics to calculate
metrics = ['wins', 'losses', 'win_%', 'tot_pts', 'tot_pts_against', 'avg_pts', 'avg_pts_against', 'home_wins', 'home_losses', 'home_win_%' \
    'home_avg_pts', 'home_avg_pts_against','away_wins', 'away_losses', 'away_win_%', 'away_avg_pts', 'away_avg_pts_against' \
    'back_wins', 'back_losses', 'back_win_%', 'back_avg_pts', 'back_avg_pts_against']

# - Home record
#     - home avg pts scored
#     - home avg pts against
# - Away record
#     - away avg pts scored
#     - away avgs pts against
# - Back to backs
#     - AVG PTS scored
#     - AVG PTS against


# Create the empty df
full_season_report = pd.DataFrame(columns=[metrics])

# Day Report 

In [106]:
# Get all games for the day

In [107]:
# for each game 
# calcualte basic metrics from team report 
# get the prediction for this game 
# get the previous results b/t these 2 teams 
# get how hard their schedule was for the past 10, 21, 30 days