Importing all important libraries

In [2]:
import pandas as pd
from urllib.request import urlopen
from bs4 import BeautifulSoup
from bs4 import SoupStrainer
from bs4 import Comment
import re
import numpy as np

Define method for grabbing All-NBA team

In [2]:

def grab_AllNBA(year, n_players=15):
    """Scrape the All-NBA selections for one season from basketball-reference.

    The league summary page keeps the markup of its ``div#all_honors`` section
    inside an HTML comment; player names are read out of that comment's raw text.

    Args:
        year: Season identifier substituted into the ``NBA_{year}.html`` URL.
        n_players: Number of names to return, taken in page order.
            Defaults to 15, the value that used to be hard-coded.

    Returns:
        list[str]: The first ``n_players`` strings found between ``.html'>``
        and ``</a>&nbsp;`` in the commented-out markup.

    Raises:
        IndexError: If the page has no ``div`` with id ``all_honors``.
        ValueError: If that div holds no HTML comment, or the comment contains
            fewer than ``n_players`` names.
    """
    url_allNBA = "https://www.basketball-reference.com/leagues/NBA_{}.html".format(year)
    # Parse inside the context manager so the HTTP response is always closed.
    with urlopen(url_allNBA) as html_allNBA:
        soup2 = BeautifulSoup(html_allNBA, "html.parser")

    all_nba = soup2.find("div", id="all_honors")
    if all_nba is None:
        raise IndexError("no div#all_honors found at {}".format(url_allNBA))

    # Search every descendant in document order; this finds the same comment as
    # looking inside the first child, but does not break if the first child
    # happens to be a bare text node.
    comment = all_nba.find(string=lambda text: isinstance(text, Comment))
    if comment is None:
        raise ValueError("no HTML comment inside div#all_honors at {}".format(url_allNBA))
    comment_str = str(comment)

    # From looking at the page output, names of the All-NBA players sit between
    # .html'> and </a>&nbsp; -- a non-greedy match reproduces "first opening
    # marker, then first closing marker" without re-slicing the string each time.
    begin_str = ".html'>"
    end_str = "</a>&nbsp;"
    name_pattern = re.compile(re.escape(begin_str) + r"(.*?)" + re.escape(end_str), re.DOTALL)
    all_nba_arr = name_pattern.findall(comment_str)[:n_players]
    if len(all_nba_arr) < n_players:
        raise ValueError(
            "expected {} All-NBA names for {} but found {}".format(n_players, year, len(all_nba_arr))
        )
    return all_nba_arr



In [3]:
#grab training data from Basketball_reference and clean it
# Years handed to the scrapers for the training and testing datasets.
training_year_arr = list(range(1989, 2017))
testing_year_arr = list(range(2017, 2021))
# Table names substituted into the per-table URL by grab_NBA_Data.
type_of_stats = [
    "per_game",
    "per_minute",
    "per_poss",
    "advanced",
]

def _scrape_stat_table(year, stat_type, id_cols):
    """Download one league-wide stat table and return it as a cleaned DataFrame.

    Args:
        year: Season identifier substituted into the page URL.
        stat_type: Table name substituted into the page URL (e.g. "per_game").
        id_cols: Column names that identify a player row and must keep their
            names; every other column gets a ``_{stat_type}`` suffix.

    Returns:
        pandas.DataFrame: One row per distinct ``Player`` value.

    Raises:
        ValueError: If any of ``id_cols`` is missing from the scraped table.
    """
    url = "https://www.basketball-reference.com/leagues/NBA_{}_{}.html".format(year, stat_type)
    # Close the response once parsed, and name the parser explicitly (same one
    # grab_AllNBA uses) so the result does not depend on which parsers are installed.
    with urlopen(url) as html:
        soup = BeautifulSoup(html, "html.parser")

    table_rows = soup.find_all('tr')
    # The first <tr> supplies the column names. Its leading <th> is skipped
    # because the body rows are read from <td> cells only (presumably the rank
    # column is a <th> in body rows -- confirm against the page).
    headers = [th.getText() for th in table_rows[0].find_all('th')][1:]
    player_stats = [[td.getText() for td in row.find_all('td')] for row in table_rows[1:]]
    stats = pd.DataFrame(player_stats, columns=headers)

    # changing column types to Strings
    stats = stats.convert_dtypes()
    # removing duplicate names (the first row for each player is kept)
    stats = stats.drop_duplicates("Player")
    # filtering null rows (rows without <td> cells produce an all-null record)
    stats = stats[stats.Player.notnull()]
    # removing repeated and blank column names
    stats = stats.loc[:, ~stats.columns.duplicated()]
    if '' in stats.columns:
        stats = stats.drop('', axis=1)
    if stat_type == "advanced":
        # NOTE(review): position 18 is tied to the current layout of the
        # advanced table; verify which column this removes if the page changes.
        stats = stats.drop(stats.columns[18], axis=1)
    else:
        # deleting Game Started column so it makes joining easier
        stats = stats.drop(["GS"], axis=1)

    missing = [col for col in id_cols if col not in stats.columns]
    if missing:
        raise ValueError(
            "{} table for {} is missing expected columns: {}".format(stat_type, year, missing)
        )
    # Rename by name rather than by position so a reordered table cannot be mislabelled.
    return stats.rename(
        columns={col: col + "_{}".format(stat_type) for col in stats.columns if col not in id_cols}
    )


def _build_season_frame(year, id_cols):
    """Join every table in ``type_of_stats`` for one season and flag All-NBA players."""
    # grabbing ALL-NBA team given year
    all_nba_arr = grab_AllNBA(year)
    # Identity columns other than the join key are taken from the first table only.
    shared_cols = [col for col in id_cols if col != "Player"]

    season = None
    for stat_type in type_of_stats:
        stats = _scrape_stat_table(year, stat_type, id_cols)
        if season is None:
            season = stats
        else:
            # Dropping the shared columns before merging avoids suffix-induced
            # duplicate column names and keeps the first table's values for them.
            # No blanket `except` here: a failed merge must surface, not be
            # silently replaced by the latest table.
            season = season.merge(stats.drop(columns=shared_cols), on="Player")
    if season is None:
        raise ValueError("type_of_stats is empty; nothing to scrape")

    # add year column
    season["Year"] = year
    # remove a single trailing star from the player name
    season["Player"] = season["Player"].str.replace(r"\*\Z", "", regex=True)
    # creating all-nba column from the cleaned names
    season["All_NBA"] = season["Player"].isin(all_nba_arr)
    return season


def grab_NBA_Data(year_arr, file_name):
    """Scrape player statistics for the given seasons and save them as one CSV.

    For every year the tables listed in ``type_of_stats`` are downloaded,
    joined on ``Player``, and tagged with ``Year`` and an ``All_NBA`` flag
    obtained from ``grab_AllNBA``. All seasons are stacked and written to
    ``file_name``.

    Args:
        year_arr: Iterable of season identifiers to scrape.
        file_name: Path of the CSV file to create (overwritten if present).

    Returns:
        None. The result is the file written to ``file_name``; as before, it
        is saved together with the DataFrame index as its first column.
    """
    print("Starting making ",file_name)
    non_changeable_cols = ['Player', 'Pos', 'Age', 'Tm', 'G', 'MP']

    # Collect one frame per season and concatenate once. DataFrame.append no
    # longer exists in current pandas, and the old blanket `except` around it
    # silently kept only the last season when the call failed.
    season_frames = [_build_season_frame(year, non_changeable_cols) for year in year_arr]
    final_training_data = pd.concat(season_frames) if season_frames else pd.DataFrame()

    # save panda to csv, then read it back: the scraped cells are text, and the
    # round-trip presumably lets read_csv infer numeric dtypes; blank cells
    # come back as missing values and are filled with 0.
    final_training_data.to_csv(file_name)
    final_training_data = pd.read_csv(file_name).fillna(0).drop(["Unnamed: 0"], axis=1)
    final_training_data.to_csv(file_name)
    print("Finished making ",file_name)



In [4]:
# Build the training file first, then the testing file.
for season_years, output_file in (
    (training_year_arr, "full_training_data_1988-2016"),
    (testing_year_arr, "testing_data_2017-2020"),
):
    grab_NBA_Data(season_years, output_file)

Starting making  full_training_data_1988-2016
Finished making  full_training_data_1988-2016
Starting making  testing_data_2017-2020
Finished making  testing_data_2017-2020


Different ways to show that there are no null values

In [3]:
# Training file: report its shape, the columns that still contain missing
# values, and the number of missing values in each of those columns.
training_data = pd.read_csv("full_training_data_1988-2016")
print(training_data.shape)
null_col = training_data.columns[training_data.isna().any(axis=0)]
print(null_col)
print(training_data.loc[:, null_col].isna().sum())
# Rows of a single player, for spot-checking (not used further in this cell).
bird = training_data[training_data["Player"] == "Larry Bird"]


# Testing file: same three reports.
testing_data = pd.read_csv("testing_data_2017-2020")
print(testing_data.shape)
null_col = testing_data.columns[testing_data.isna().any(axis=0)]
print(null_col)
print(testing_data.loc[:, null_col].isna().sum())


(12204, 95)
Index([], dtype='object')
Series([], dtype: float64)
(2085, 95)
Index([], dtype='object')
Series([], dtype: float64)


In [5]:
# Reload the testing file and report its shape and the number of distinct
# values in the Player column.
testing_data = pd.read_csv("testing_data_2017-2020")
print(testing_data.shape)
distinct_players = testing_data["Player"].drop_duplicates()
print(distinct_players.shape[0])

864
