In [1]:
"""
This Script will scrape all of the data, clean it, and prepare it for visualization
"""

'\nThis Script will scrape all of the data, clean it, and prepare it for visualization\n'

In [2]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import os

In [3]:
# Download the NBA_2024 per-game stats page and parse it for scraping.
url = 'https://www.basketball-reference.com/leagues/NBA_2024_per_game.html'

# A timeout keeps this cell from hanging forever if the server never answers.
page = requests.get(url, timeout=30)

# Fail loudly on HTTP errors (rate limiting, 404, ...) instead of silently
# parsing an error page that does not contain the stats table.
page.raise_for_status()

soup = BeautifulSoup(page.text, 'html')

In [4]:
# Locate the stats table and read its column titles.
table = soup.find('table', class_="stats_table")

# soup.find returns None when nothing matches; stop here with a clear message
# rather than failing on the next line with an opaque AttributeError.
if table is None:
    raise ValueError("No <table class='stats_table'> found in the downloaded page")

# The first 'th' is the rank column ("RK"), which is skipped; the slice keeps
# the 29 header cells that follow it.
titles = table.find_all('th')[1:30]

# Keep only the visible text of each header cell.
table_titles = [title.text.strip() for title in titles]

In [5]:
# Abbreviated table header -> descriptive column name.
# Headers that are not listed here keep their scraped text.
column_renames = {
    'Pos': 'Position',
    'Tm': 'Team',
    'G': 'Games Played',
    'GS': 'Games Started',
    'MP': 'Minutes Played Per Game',
    'FG': 'Field Goals Per Game',
    'FGA': 'Field Goal Attempts Per Game',
    'FG%': 'Field Goal Percentage',
    '3P': '3-Point Field Goals Per Game',
    '3PA': '3-Point Field Goal Attempts Per Game',
    '3P%': '3-Point Field Goal Percentage',
    '2P': '2-Point Field Goals Per Game',
    '2PA': '2-Point Field Goal Attempts Per Game',
    '2P%': '2-Point Field Goal Percentage',
    'eFG%': 'Effective Field Goal Percentage',
    'FT': 'Free Throws Per Game',
    'FTA': 'Free Throw Attempts Per Game',
    'FT%': 'Free Throw Percentage',
    'ORB': 'Offensive Rebounds Per Game',
    'DRB': 'Defensive Rebounds Per Game',
    'TRB': 'Total Rebounds Per Game',
    'AST': 'Assists Per Game',
    'STL': 'Steals Per Game',
    'BLK': 'Blocks Per Game',
    'TOV': 'Turnovers Per Game',
    'PF': 'Personal Fouls Per Game',
    'PTS': 'Points Per Game',
}

# Empty frame whose columns are the scraped headers, relabelled where a
# descriptive name is defined above.
df = pd.DataFrame(columns=table_titles).rename(columns=column_renames)

In [6]:
# Every row of the table is wrapped in a 'tr' tag; collect them all.
column_data = table.find_all(name='tr')

In [7]:
# Fill the DataFrame with one row per data line of the scraped table.
# column_data[0] is the table's header row, so iteration starts at index 1.

# A usable row must supply exactly one cell per scraped column title.
expected_cells = len(table_titles)

player_rows = []
for row in column_data[1:]:
    # every data point other than the rank number lives in a 'td' tag
    cells = [cell.text.strip() for cell in row.find_all('td')]

    # The site restates the column titles every 20 players; those rows do not
    # carry the expected number of 'td' cells and are skipped.
    if len(cells) == expected_cells:
        player_rows.append(cells)

# Build the frame once from the collected rows rather than appending row by
# row (which is quadratic and duplicates rows if the cell is re-run). Only the
# leading scraped columns are used, so re-running stays valid even after
# derived columns have been appended to df by later cells.
df = pd.DataFrame(player_rows, columns=df.columns[:expected_cells])

In [8]:
# Blank table cells are scraped as empty strings. Mark them as missing with an
# explicit NaN: replace(..., None) is version-dependent (older pandas ignored
# the explicit None and forward-filled from the previous row instead).
df = df.replace("", float("nan"))

# Convert columns into appropriate types so the data can be manipulated later.
string_columns = ['Player', 'Position', 'Team']
integer_columns = ['Age', 'Games Played', 'Games Started']
float_columns = [
    'Minutes Played Per Game',
    'Field Goals Per Game',
    'Field Goal Attempts Per Game',
    'Field Goal Percentage',
    '3-Point Field Goals Per Game',
    '3-Point Field Goal Attempts Per Game',
    '3-Point Field Goal Percentage',
    '2-Point Field Goals Per Game',
    '2-Point Field Goal Attempts Per Game',
    '2-Point Field Goal Percentage',
    'Effective Field Goal Percentage',
    'Free Throws Per Game',
    'Free Throw Attempts Per Game',
    'Free Throw Percentage',
    'Offensive Rebounds Per Game',
    'Defensive Rebounds Per Game',
    'Total Rebounds Per Game',
    'Assists Per Game',
    'Steals Per Game',
    'Blocks Per Game',
    'Turnovers Per Game',
    'Personal Fouls Per Game',
    'Points Per Game',
]

column_types = {
    **dict.fromkeys(string_columns, 'string'),
    **dict.fromkeys(integer_columns, 'int64'),
    **dict.fromkeys(float_columns, 'float64'),
}

df = df.astype(column_types)

In [9]:
# Calculate average fantasy points per game for every row at once.
# Scoring: +1 per point, +1 per made three, -1 per field goal attempt,
# +2 per made field goal, -1 per free throw attempt, +1 per made free throw,
# +1 per rebound, +2 per assist, +4 per steal, +4 per block, -2 per turnover.
fantasy_points = (
    df['Points Per Game']
    + df['3-Point Field Goals Per Game']
    - df['Field Goal Attempts Per Game']
    + 2 * df['Field Goals Per Game']
    - df['Free Throw Attempts Per Game']
    + df['Free Throws Per Game']
    + df['Total Rebounds Per Game']
    + 2 * df['Assists Per Game']
    + 4 * df['Steals Per Game']
    + 4 * df['Blocks Per Game']
    - 2 * df['Turnovers Per Game']
)

# Round to the nearest whole number (ties go to the even value, as with the
# built-in round) and store as integers.
df['Average Fantasy Points Per Game'] = fantasy_points.round().astype('int64')

In [10]:
# Calculate total points: games played times points per game, rounded to the
# nearest whole number. Computed column-wise instead of looping over rows.
df['Total Points'] = (
    (df['Games Played'] * df['Points Per Game']).round().astype('int64')
)

In [11]:
# Calculate total minutes: games played times minutes per game, rounded to the
# nearest whole number. Computed column-wise instead of looping over rows.
df['Total Minutes'] = (
    (df['Games Played'] * df['Minutes Played Per Game']).round().astype('int64')
)

In [12]:
# Calculate total rebounds: games played times rebounds per game, rounded to
# the nearest whole number. Computed column-wise instead of looping over rows.
df['Total Rebounds'] = (
    (df['Games Played'] * df['Total Rebounds Per Game']).round().astype('int64')
)

In [13]:
# Calculate total assists: games played times assists per game, rounded to the
# nearest whole number. Computed column-wise instead of looping over rows.
df['Total Assists'] = (
    (df['Games Played'] * df['Assists Per Game']).round().astype('int64')
)

In [14]:
# Build a frame of all player rows, keeping the repeat appearances of a player
# on different teams but leaving out the 'TOT' (season total) rows.

# Order by team and give the rows a fresh 0..n-1 index.
df_no_totals = df.sort_values(by='Team').reset_index(drop=True)

# Index labels of the rows whose team is 'TOT', i.e. the season-total rows.
ind = df_no_totals[df_no_totals['Team'] == 'TOT'].index

# Remove the 'TOT' rows, then order by player name and reindex again.
df_no_totals = (
    df_no_totals
    .drop(index=ind)
    .sort_values(by='Player')
    .reset_index(drop=True)
)

In [15]:
# Temporary frame holding only the rows whose team is 'TOT'.
# It is combined with the single-team players later to build df_totals.

# Order by player name and give the rows a fresh 0..n-1 index.
df_only_totals = df.sort_values(by='Player').reset_index(drop=True)

# Index labels of every row whose team is not 'TOT'.
ind = df_only_totals[df_only_totals['Team'] != 'TOT'].index

# Dropping those rows leaves only the 'TOT' rows.
df_only_totals = df_only_totals.drop(index=ind)

In [16]:
# Build a frame in which every player occurs exactly once, showing season
# totals for all players.

# Players with a single row; keep=False drops every occurrence of any player
# that appears more than once.
df_without_totals = df.drop_duplicates(subset=['Player'], keep=False)

# Combine them with the 'TOT' rows of the repeated players, order by player
# name and reindex. The reindexed frame is assigned back: previously the
# result of reset_index was only displayed and then discarded, so df_totals
# kept its old, non-unique index.
df_totals = (
    pd.concat([df_without_totals, df_only_totals])
    .sort_values(by='Player')
    .reset_index(drop=True)
)

df_totals

Unnamed: 0,Player,Position,Age,Team,Games Played,Games Started,Minutes Played Per Game,Field Goals Per Game,Field Goal Attempts Per Game,Field Goal Percentage,...,Steals Per Game,Blocks Per Game,Turnovers Per Game,Personal Fouls Per Game,Points Per Game,Average Fantasy Points Per Game,Total Points,Total Minutes,Total Rebounds,Total Assists
0,A.J. Green,SG,24,MIL,56,0,11.0,1.5,3.5,0.423,...,0.2,0.1,0.2,0.9,4.5,8,252,616,62,28
1,A.J. Lawson,SG,23,DAL,42,0,7.4,1.3,2.9,0.446,...,0.2,0.1,0.3,0.5,3.2,6,134,311,50,21
2,AJ Griffin,SF,20,ATL,20,0,8.6,0.9,3.1,0.290,...,0.1,0.1,0.4,0.3,2.4,3,48,172,18,6
3,Aaron Gordon,PF,28,DEN,73,73,31.5,5.5,9.8,0.556,...,0.8,0.6,1.4,1.9,13.9,31,1015,2300,474,256
4,Aaron Holiday,PG,27,HOU,78,1,16.3,2.4,5.3,0.446,...,0.5,0.1,0.7,1.6,6.6,13,515,1271,125,140
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
567,Zach LaVine,SG,28,CHI,25,23,34.9,6.8,15.0,0.452,...,0.8,0.3,2.1,2.3,19.5,33,488,872,130,98
568,Zavier Simpson,PG,26,MEM,7,0,23.0,2.4,7.7,0.315,...,1.0,0.4,1.4,1.6,6.0,16,42,161,20,25
569,Zeke Nnaji,PF,23,DEN,58,0,9.9,1.2,2.6,0.463,...,0.3,0.7,0.5,1.4,3.2,9,186,574,128,35
570,Ziaire Williams,SF,22,MEM,51,15,20.4,2.9,7.4,0.397,...,0.7,0.2,1.3,1.7,8.2,15,418,1040,178,76


In [17]:
# Get rid of the multiple positions listed for players who were traded
# mid-season (e.g. 'PF-C') by keeping only the first position listed.
#
# The whole column is assigned at once: the previous element-wise
# df_totals['Position'].iloc[i] = ... form is chained assignment, which
# triggers SettingWithCopyWarning and does not modify df_totals at all when
# pandas Copy-on-Write is enabled.
# The pattern removes the first '-' and everything after it, so values without
# a '-' are left unchanged and the column keeps its dtype.
df_totals['Position'] = df_totals['Position'].str.replace(r'-.*$', '', regex=True)

In [18]:
# Save these DataFrames to csv files to be used later for visualization and
# analysis.
#
# The output directory can be overridden with the NBA_DATA_DIR environment
# variable so the notebook can run on other machines; when it is not set, the
# original location is used.
path = os.environ.get(
    'NBA_DATA_DIR',
    '/Users/leohsu/Desktop/Predict NBA Points Per Game/data',
)

# exist_ok=True creates the directory when it is missing and is a no-op when
# it is already there, without a separate existence check.
os.makedirs(path, exist_ok=True)

df_totals.to_csv(os.path.join(path, 'df_totals.csv'), index=False)
df_no_totals.to_csv(os.path.join(path, 'df_no_totals.csv'), index=False)