# Import Data 

In [222]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
url = 'https://www.basketball-reference.com/leagues/NBA_2023_games.html'
page = requests.get(url)
soup = BeautifulSoup(page.content, 'html.parser')
tbl = soup.find("table",{"id":"schedule"})
schedule = pd.read_html(str(tbl))[0]

months = ["november", "december", "january", "february", "march", "april"]
for  i in months:
    url = 'https://www.basketball-reference.com/leagues/NBA_2023_games-{0}.html'.format(i)
    page = requests.get(url)
    soup = BeautifulSoup(page.content, 'html.parser')
    tbl = soup.find("table",{"id":"schedule"})
    temp_data_frame = pd.read_html(str(tbl))[0]
    schedule = schedule.append(temp_data_frame)

In [223]:
schedule

Unnamed: 0,Date,Start (ET),Visitor/Neutral,PTS,Home/Neutral,PTS.1,Unnamed: 6,Unnamed: 7,Attend.,Arena,Notes
0,"Tue, Oct 18, 2022",7:30p,Philadelphia 76ers,117.0,Boston Celtics,126.0,Box Score,,19156.0,TD Garden,
1,"Tue, Oct 18, 2022",10:00p,Los Angeles Lakers,109.0,Golden State Warriors,123.0,Box Score,,18064.0,Chase Center,
2,"Wed, Oct 19, 2022",7:00p,Orlando Magic,109.0,Detroit Pistons,113.0,Box Score,,20190.0,Little Caesars Arena,
3,"Wed, Oct 19, 2022",7:00p,Washington Wizards,114.0,Indiana Pacers,107.0,Box Score,,15027.0,Gainbridge Fieldhouse,
4,"Wed, Oct 19, 2022",7:30p,Houston Rockets,107.0,Atlanta Hawks,117.0,Box Score,,17878.0,State Farm Arena,
...,...,...,...,...,...,...,...,...,...,...,...
65,"Sun, Apr 9, 2023",3:30p,Utah Jazz,,Los Angeles Lakers,,,,,Crypto.com Arena,
66,"Sun, Apr 9, 2023",3:30p,New Orleans Pelicans,,Minnesota Timberwolves,,,,,Target Center,
67,"Sun, Apr 9, 2023",3:30p,Memphis Grizzlies,,Oklahoma City Thunder,,,,,Paycom Center,
68,"Sun, Apr 9, 2023",3:30p,Los Angeles Clippers,,Phoenix Suns,,,,,Footprint Center,


In [224]:
schedule.columns

Index(['Date', 'Start (ET)', 'Visitor/Neutral', 'PTS', 'Home/Neutral', 'PTS.1',
       'Unnamed: 6', 'Unnamed: 7', 'Attend.', 'Arena', 'Notes'],
      dtype='object')

# Clean the data

In [225]:
# Make sure all columns have names and the names match coding standards (lowercase with "_" for spaces)
# Get rid of unneeded columns
schedule = schedule.rename(columns = {"PTS":"visitor_pts", "PTS.1":"home_pts","Unnamed:7":"OT",'Visitor/Neutral': 'visitor','Home/Neutral': 'home'})
schedule.columns = schedule.columns.str.replace(" ", "_")
schedule.columns = schedule.columns.str.lower()
schedule = schedule.drop(["unnamed:_6", 'notes'], axis = 1)
schedule.columns

Index(['date', 'start_(et)', 'visitor', 'visitor_pts', 'home', 'home_pts',
       'unnamed:_7', 'attend.', 'arena'],
      dtype='object')

In [226]:
# parse each team's data 
list_of_team_names = schedule['visitor'].unique()
list_of_team_names.sort()
list_of_team_names

array(['Atlanta Hawks', 'Boston Celtics', 'Brooklyn Nets',
       'Charlotte Hornets', 'Chicago Bulls', 'Cleveland Cavaliers',
       'Dallas Mavericks', 'Denver Nuggets', 'Detroit Pistons',
       'Golden State Warriors', 'Houston Rockets', 'Indiana Pacers',
       'Los Angeles Clippers', 'Los Angeles Lakers', 'Memphis Grizzlies',
       'Miami Heat', 'Milwaukee Bucks', 'Minnesota Timberwolves',
       'New Orleans Pelicans', 'New York Knicks', 'Oklahoma City Thunder',
       'Orlando Magic', 'Philadelphia 76ers', 'Phoenix Suns',
       'Portland Trail Blazers', 'Sacramento Kings', 'San Antonio Spurs',
       'Toronto Raptors', 'Utah Jazz', 'Washington Wizards'], dtype=object)

In [227]:
# Make a dictionary where the key is the team name and the value is a dataframe of all the games
dict_of_dfs = {}
for team in list_of_team_names:
    dict_of_dfs[team] = schedule.query('visitor == @team or home == @team')

In [228]:
# Check to make sure it works
dict_of_dfs["San Antonio Spurs"]

Unnamed: 0,date,start_(et),visitor,visitor_pts,home,home_pts,unnamed:_7,attend.,arena
10,"Wed, Oct 19, 2022",8:00p,Charlotte Hornets,129.0,San Antonio Spurs,102.0,,16236.0,AT&T Center
17,"Fri, Oct 21, 2022",7:00p,San Antonio Spurs,137.0,Indiana Pacers,134.0,,12073.0,Gainbridge Fieldhouse
27,"Sat, Oct 22, 2022",6:00p,San Antonio Spurs,114.0,Philadelphia 76ers,105.0,,19822.0,Wells Fargo Center
49,"Mon, Oct 24, 2022",8:00p,San Antonio Spurs,115.0,Minnesota Timberwolves,106.0,,15347.0,Target Center
61,"Wed, Oct 26, 2022",8:00p,San Antonio Spurs,122.0,Minnesota Timberwolves,134.0,,16165.0,Target Center
...,...,...,...,...,...,...,...,...,...
7,"Sun, Apr 2, 2023",6:00p,San Antonio Spurs,,Sacramento Kings,,,,Golden 1 Center
27,"Tue, Apr 4, 2023",10:00p,San Antonio Spurs,,Phoenix Suns,,,,Footprint Center
38,"Thu, Apr 6, 2023",8:00p,Portland Trail Blazers,,San Antonio Spurs,,,,Moody Center
54,"Sat, Apr 8, 2023",4:00p,Minnesota Timberwolves,,San Antonio Spurs,,,,Moody Center


In [229]:
# Add column to determine if the game was a win. First create a function to determine W or L
def determine_win(team_of_interest, away_team, home_pts,away_pts):
    """Return whether a given team of interest has won or loss the game.""" 
    # If team of interest is away team
    if team_of_interest == away_team:
        if away_pts > home_pts:
            return 'W'
        else:
            return 'L'
    # If team of intest is home team
    else:
        if away_pts > home_pts:
            return 'L'
        else:
            return 'W'

In [230]:

# dict_of_dfs['San Antonio Spurs']
# vecfunc = np.vectorize(determine_win)
# df = dict_of_dfs['San Antonio Spurs']
# # out = vecfunc(dict_of_dfs['San Antonio Spurs']['home_pts'],dict_of_dfs['San Antonio Spurs']['visitor_pts'] )
# out = vecfunc('San Antonio Spurs', df['visitor'], df['home_pts'],df['visitor_pts'] )
# print(out)
# df.insert(loc = len(df.columns), column='Result', value=out)
# # df['Result'] = out
# # df
# df
# Add column to each dataframe that holds the result of the game (either W or L)
    # Use vectorize from numpy to itterate over a dataframe since its faster. 
vec_determine_win = np.vectorize(determine_win)
for i,v in dict_of_dfs.items():
    temp_results = vec_determine_win(i,v['visitor'], v['home_pts'], v['visitor_pts'])
    v.insert(loc = len(v.columns), column='result', value=temp_results)


In [231]:
# Test
dict_of_dfs['New Orleans Pelicans'] 

Unnamed: 0,date,start_(et),visitor,visitor_pts,home,home_pts,unnamed:_7,attend.,arena,result
5,"Wed, Oct 19, 2022",7:30p,New Orleans Pelicans,130.0,Brooklyn Nets,108.0,,18003.0,Barclays Center,W
16,"Fri, Oct 21, 2022",7:00p,New Orleans Pelicans,124.0,Charlotte Hornets,112.0,,19287.0,Spectrum Center,W
39,"Sun, Oct 23, 2022",7:00p,Utah Jazz,122.0,New Orleans Pelicans,121.0,OT,18665.0,Smoothie King Center,L
52,"Tue, Oct 25, 2022",7:30p,Dallas Mavericks,111.0,New Orleans Pelicans,113.0,,14020.0,Smoothie King Center,W
78,"Fri, Oct 28, 2022",10:00p,New Orleans Pelicans,111.0,Phoenix Suns,124.0,,17071.0,Footprint Center,L
...,...,...,...,...,...,...,...,...,...,...
1,"Sat, Apr 1, 2023",8:30p,Los Angeles Clippers,,New Orleans Pelicans,,,,Smoothie King Center,W
22,"Tue, Apr 4, 2023",8:00p,Sacramento Kings,,New Orleans Pelicans,,,,Smoothie King Center,W
33,"Wed, Apr 5, 2023",8:00p,Memphis Grizzlies,,New Orleans Pelicans,,,,Smoothie King Center,W
48,"Fri, Apr 7, 2023",8:00p,New York Knicks,,New Orleans Pelicans,,,,Smoothie King Center,W


# Team Report

Plan is to use dataframes as the data structures to hold the data for the reports. There will be a "full_season_report" df and then one for 30 days and one for 10 days. Each report's columns will be the metrics and each report's rows will be the teams. This all could probably be one df but I'm planning on splitting it to make it more digestible. 

## Full Season Report

In [232]:
# Create list of the metrics to calculate
metrics = ['wins', 'losses', 'win_%', 'tot_pts', 'tot_pts_against', 'avg_pts', 'avg_pts_against', 'home_wins', 'home_losses', 'home_win_%' \
    'home_avg_pts', 'home_avg_pts_against','away_wins', 'away_losses', 'away_win_%', 'away_avg_pts', 'away_avg_pts_against' \
    'back_wins', 'back_losses', 'back_win_%', 'back_avg_pts', 'back_avg_pts_against']

# - Home record
#     - home avg pts scored
#     - home avg pts against
# - Away record
#     - away avg pts scored
#     - away avgs pts against
# - Back to backs
#     - AVG PTS scored
#     - AVG PTS against


# Create the empty df
full_season_report = pd.DataFrame(columns=[metrics])
full_season_report

Unnamed: 0,wins,losses,win_%,tot_pts,tot_pts_against,avg_pts,avg_pts_against,home_wins,home_losses,home_win_%home_avg_pts,home_avg_pts_against,away_wins,away_losses,away_win_%,away_avg_pts,away_avg_pts_againstback_wins,back_losses,back_win_%,back_avg_pts,back_avg_pts_against


In [233]:
def determine_margin(team_of_interest, away_team, home_pts,away_pts):
    """Return how much a team has won or loss by.""" 
    # First call function to determine if they won 
    result = determine_win(team_of_interest, away_team, home_pts,away_pts)
    # Determine the margin of game 
    margin = home_pts - away_pts

    # Determine if margin is pos or neg based off if they won. 
    if result == 'W':
        return margin
    else:
        return (margin * -1)


In [234]:
# Practice using vectorize from numpy to itterate over a dataframe since its faster. 
dict_of_dfs['San Antonio Spurs']
vecfunc = np.vectorize(determine_win)
df = dict_of_dfs['San Antonio Spurs']
# out = vecfunc(dict_of_dfs['San Antonio Spurs']['home_pts'],dict_of_dfs['San Antonio Spurs']['visitor_pts'] )
out = vecfunc('San Antonio Spurs', df['visitor'], df['home_pts'],df['visitor_pts'] )
print(out)

['L' 'W' 'W' 'W' 'L' 'W' 'W' 'L' 'L' 'L' 'L' 'L' 'W' 'L' 'L' 'L' 'L' 'L'
 'L' 'L' 'L' 'L' 'L' 'L' 'W' 'W' 'W' 'L' 'L' 'W' 'L' 'L' 'W' 'L' 'W' 'L'
 'L' 'L' 'W' 'L' 'L' 'L' 'L' 'L' 'W' 'L' 'L' 'L' 'L' 'L' 'L' 'L' 'L' 'L'
 'L' 'L' 'L' 'L' 'L' 'L' 'L' 'L' 'W' 'W' 'L' 'W' 'W' 'W' 'W' 'W' 'W' 'L'
 'L' 'L' 'L' 'W' 'L' 'L' 'L' 'W' 'W' 'L']


# Day Report 

In [235]:
# Get all games for the day

In [236]:
# for each game 
# calcualte basic metrics from team report 
# get the prediction for this game 
# get the previous results b/t these 2 teams 
# get how hard their schedule was for the past 10, 21, 30 days