# Project Luther

Kenny Leung - kenleung11@gmail.com

Part 2/8 - Data scraping NCAA basketball statistics from 2003 to 2016

This notebook documents the process of scraping NCAA basketball statistics from https://basketball.realgm.com/.

In [1]:
# import the packages
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import random

# Data Scraping

Since I wanted to use this data to predict NBA rookie statistics, I only really needed college data for the 60 players who are drafted into the NBA each year. The website tabulates college statistics of all players since the 2002-2003 season, however, I focused on scraping the first 20 pages, or 2000 players, of each season being fairly confident that all 60 drafted players will be included in the data scraped.

I wanted to test scraping data for one page of the 2003 season to make sure the code works as intended.

In [17]:
url = "https://basketball.realgm.com/ncaa/stats/2003/Averages/Qualified/All/Season/All/points/desc/1/"

response = requests.get(url)
page = response.text
soup = BeautifulSoup(page, "lxml")

# create the column names
column_headers = [th.getText() for th in soup.findAll('tr', limit=2)[0].findAll('th')]

# get the data for each player
data_rows = soup.findAll('tr')[1:]
player_data = [[td.getText() for td in data_rows[i].findAll('td')] for i in range(len(data_rows))]

df = pd.DataFrame(player_data, columns=column_headers)

In [24]:
df.head()

Unnamed: 0,#,Player,Team,GP,MPG,FGM,FGA,FG%,3PM,3PA,...,FT%,TOV,PF,ORB,DRB,RPG,APG,SPG,BPG,PPG
0,1,Henry Domercant,EIU,30,33.7,8.8,19.2,0.46,3.0,6.8,...,0.844,2.3,2.1,2.0,4.8,6.8,2.8,1.4,0.5,28.2
1,2,Ruben Douglas,UNM,28,35.1,7.8,19.6,0.397,3.4,8.5,...,0.841,3.9,2.7,1.9,4.8,6.6,2.1,1.2,0.3,28.0
2,3,Mike Helms,OU,28,34.5,8.6,19.0,0.452,2.6,7.0,...,0.745,3.8,3.1,1.2,2.7,4.0,2.0,1.4,0.2,26.9
3,4,Michael Watson,UMKC,29,38.8,8.5,22.6,0.377,4.1,11.6,...,0.753,3.7,2.4,0.8,2.9,3.7,3.8,1.4,0.2,25.5
4,5,Troy Bell,BC,31,38.6,7.2,16.4,0.441,3.4,8.5,...,0.847,2.5,2.1,1.5,3.0,4.6,3.7,2.3,0.2,25.2


# Scraping All 2003 College Data

Now that I have working code, I looped the code to collect all the first 20 pages of the data for the 2002-2003 NCAA season and appended all the data into one Pandas dataframe.

In [18]:
url_template = "https://basketball.realgm.com/ncaa/stats/2003/Averages/Qualified/All/Season/All/points/desc/{webpage}/"
college_df = pd.DataFrame()

for webpage in range(21):  # for each page
    url = url_template.format(webpage=webpage)  # get the url
    
    response = requests.get(url)
    page = response.text
    
    soup = BeautifulSoup(page, 'html5lib')
    
    column_headers = [th.getText() for th in soup.findAll('tr', limit=2)[0].findAll('th')]
    
    # get our player data
    data_rows = soup.findAll('tr')[1:] 
    player_data = [[td.getText() for td in data_rows[i].findAll('td')] for i in range(len(data_rows))]
    
    # Turn page data into a DataFrame
    page_df = pd.DataFrame(player_data, columns=column_headers)
    
    # Append to the big dataframe
    college_df = college_df.append(page_df, ignore_index=True)
    
    print(webpage)
    
    time.sleep(.5+2*random.random()) # add a random sleep timer

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20


In [20]:
college_df_copy.head()

Unnamed: 0,#,Player,Team,GP,MPG,FGM,FGA,FG%,3PM,3PA,...,FT%,TOV,PF,ORB,DRB,RPG,APG,SPG,BPG,PPG
0,1,Henry Domercant,EIU,30,33.7,8.8,19.2,0.46,3.0,6.8,...,0.844,2.3,2.1,2.0,4.8,6.8,2.8,1.4,0.5,28.2
1,2,Ruben Douglas,UNM,28,35.1,7.8,19.6,0.397,3.4,8.5,...,0.841,3.9,2.7,1.9,4.8,6.6,2.1,1.2,0.3,28.0
2,3,Mike Helms,OU,28,34.5,8.6,19.0,0.452,2.6,7.0,...,0.745,3.8,3.1,1.2,2.7,4.0,2.0,1.4,0.2,26.9
3,4,Michael Watson,UMKC,29,38.8,8.5,22.6,0.377,4.1,11.6,...,0.753,3.7,2.4,0.8,2.9,3.7,3.8,1.4,0.2,25.5
4,5,Troy Bell,BC,31,38.6,7.2,16.4,0.441,3.4,8.5,...,0.847,2.5,2.1,1.5,3.0,4.6,3.7,2.3,0.2,25.2


# Scraping All College Data from 2003 to 2016

After successfully scraping data from the first 20 pages of one season, I looped the code to scrape the first 20 pages of the seasons from 2003 to 2016. I will not need the college stats of the players in 2017 since I would have no NBA statistics for those players to compare with.

In [23]:
url_template2 = "https://basketball.realgm.com/ncaa/stats/{year}/Averages/Qualified/All/Season/All/points/desc/{webpage}/"
total_college_df = pd.DataFrame()

for year in range(2003, 2017):
    print(year)
    for webpage in range(21):  # for each page
        url = url_template2.format(year=year,webpage=webpage)  # get the url

        response = requests.get(url)
        page = response.text

        soup = BeautifulSoup(page, 'html5lib') # create BS object

        column_headers = [th.getText() for th in soup.findAll('tr', limit=2)[0].findAll('th')]
        # get our player data
        data_rows = soup.findAll('tr')[1:] 
        player_data = [[td.getText() for td in data_rows[i].findAll('td')] for i in range(len(data_rows))]

        # Turn page data into a DataFrame
        page_df = pd.DataFrame(player_data, columns=column_headers)

        # Append to the big dataframe
        total_college_df = total_college_df.append(page_df, ignore_index=True)

        print(webpage)

        time.sleep(.5+2*random.random()) # add a random sleep timer

2003
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
2004
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
2005
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
2006
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
2007
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
2008
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
2009
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
2010
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
2011
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
2012
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
2013
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
2014
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
2015
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
2016
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20


In [90]:
# saving the scraped raw data to a csv file
total_college_df.to_csv('college_raw.csv')

# Cleaning the data

Next, I cleaned the raw college data by dropping unnecessary columns, changing the dtypes of the data, and checking for null values.

In [2]:
# read in csv file
total_college_df_copy = pd.read_csv('college_raw.csv',index_col=0)

In [3]:
total_college_df_copy.head()

Unnamed: 0,#,Player,Team,GP,MPG,FGM,FGA,FG%,3PM,3PA,...,FT%,TOV,PF,ORB,DRB,RPG,APG,SPG,BPG,PPG
0,1,Henry Domercant,EIU,30,33.7,8.8,19.2,0.46,3.0,6.8,...,0.844,2.3,2.1,2.0,4.8,6.8,2.8,1.4,0.5,28.2
1,2,Ruben Douglas,UNM,28,35.1,7.8,19.6,0.397,3.4,8.5,...,0.841,3.9,2.7,1.9,4.8,6.6,2.1,1.2,0.3,28.0
2,3,Mike Helms,OU,28,34.5,8.6,19.0,0.452,2.6,7.0,...,0.745,3.8,3.1,1.2,2.7,4.0,2.0,1.4,0.2,26.9
3,4,Michael Watson,UMKC,29,38.8,8.5,22.6,0.377,4.1,11.6,...,0.753,3.7,2.4,0.8,2.9,3.7,3.8,1.4,0.2,25.5
4,5,Troy Bell,BC,31,38.6,7.2,16.4,0.441,3.4,8.5,...,0.847,2.5,2.1,1.5,3.0,4.6,3.7,2.3,0.2,25.2


In [4]:
total_college_df_copy.columns

Index(['#', 'Player', 'Team', 'GP', 'MPG', 'FGM', 'FGA', 'FG%', '3PM', '3PA',
       '3P%', 'FTM', 'FTA', 'FT%', 'TOV', 'PF', 'ORB', 'DRB', 'RPG', 'APG',
       'SPG', 'BPG', 'PPG'],
      dtype='object')

In [5]:
# drop '#' column
total_college_df_copy = total_college_df_copy.drop(['#'],axis=1)

In [6]:
# change dtypes to numeric values
cols=[i for i in total_college_df_copy.columns if i not in ['Player', 'Team']]
for col in cols:
    total_college_df_copy[col] = total_college_df_copy[col].apply(pd.to_numeric, errors='coerce')

In [7]:
total_college_df_copy[total_college_df_copy['Player'] == 'Kenny Mitchell']

Unnamed: 0,Player,Team,GP,MPG,FGM,FGA,FG%,3PM,3PA,3P%,...,FT%,TOV,PF,ORB,DRB,RPG,APG,SPG,BPG,PPG
15159,Kenny Mitchell,CSU,30,21.6,3.5,6.4,0.539,0.0,0.2,0.0,...,0.701,1.4,2.5,1.9,2.5,4.4,0.8,0.8,0.9,8.5
17125,Kenny Mitchell,,32,20.7,3.4,7.0,0.484,0.0,0.1,0.0,...,0.663,1.6,3.0,1.9,3.2,5.1,0.6,0.8,0.4,8.7


In [98]:
# change this NaN value to the player's college team
total_college_df_copy = total_college_df_copy.set_value(17125,'Team','UVA')

In [99]:
# check for null values
total_college_df_copy.isnull().values.any()

False

In [102]:
# change % symbol in the column headers
total_college_df_copy.columns = total_college_df_copy.columns.str.replace('%', '_Perc')

In [103]:
total_college_df_copy

Unnamed: 0,Player,Team,GP,MPG,FGM,FGA,FG_Perc,3PM,3PA,3P_Perc,...,FT_Perc,TOV,PF,ORB,DRB,RPG,APG,SPG,BPG,PPG
0,Henry Domercant,EIU,30,33.7,8.8,19.2,0.460,3.0,6.8,0.434,...,0.844,2.3,2.1,2.0,4.8,6.8,2.8,1.4,0.5,28.2
1,Ruben Douglas,UNM,28,35.1,7.8,19.6,0.397,3.4,8.5,0.395,...,0.841,3.9,2.7,1.9,4.8,6.6,2.1,1.2,0.3,28.0
2,Mike Helms,OU,28,34.5,8.6,19.0,0.452,2.6,7.0,0.379,...,0.745,3.8,3.1,1.2,2.7,4.0,2.0,1.4,0.2,26.9
3,Michael Watson,UMKC,29,38.8,8.5,22.6,0.377,4.1,11.6,0.350,...,0.753,3.7,2.4,0.8,2.9,3.7,3.8,1.4,0.2,25.5
4,Troy Bell,BC,31,38.6,7.2,16.4,0.441,3.4,8.5,0.402,...,0.847,2.5,2.1,1.5,3.0,4.6,3.7,2.3,0.2,25.2
5,Kee-Kee Clark,SPC,29,38.2,8.0,20.1,0.396,3.8,9.6,0.392,...,0.853,4.0,1.9,0.8,2.4,3.3,4.2,1.4,0.2,24.9
6,Luis Flores,MAN,30,38.9,7.7,16.9,0.455,1.9,4.8,0.386,...,0.902,3.3,2.3,1.6,4.0,5.6,2.9,1.9,0.4,24.6
7,Chris Williams,BSU,30,35.8,7.5,17.6,0.429,2.1,6.5,0.327,...,0.859,2.3,2.6,1.2,2.2,3.4,2.2,1.3,0.1,24.5
8,Mike Sweetney,GU,34,32.0,7.8,14.2,0.547,0.0,0.1,0.000,...,0.738,2.1,2.8,4.1,6.7,10.8,1.9,1.4,3.2,22.8
9,Kevin Martin,WCU,24,31.6,6.7,15.8,0.425,2.1,6.6,0.314,...,0.879,2.9,2.5,1.0,2.8,3.8,1.8,1.4,0.5,22.8


In [104]:
# sort the players by name and count to determine how many years they played in college, and also get their
# average college statistics
counts = total_college_df_copy.groupby('Player', as_index=False).count()['Team']
total_college_df_copy = total_college_df_copy.groupby('Player', as_index=False).mean()
total_college_df_copy['Yrs'] = counts
Yrs = total_college_df_copy['Yrs']
total_college_df_copy.drop(labels=['Yrs'], axis=1,inplace = True)
total_college_df_copy.insert(0, 'Yrs', Yrs)

In [110]:
total_college_df_copy.dtypes

Yrs          int64
Player      object
GP         float64
MPG        float64
FGM        float64
FGA        float64
FG_Perc    float64
3PM        float64
3PA        float64
3P_Perc    float64
FTM        float64
FTA        float64
FT_Perc    float64
TOV        float64
PF         float64
ORB        float64
DRB        float64
RPG        float64
APG        float64
SPG        float64
BPG        float64
PPG        float64
dtype: object

In [111]:
# take a look at the players with the highest points per game average
total_college_df_copy.sort_values(by='PPG',ascending=False)

Unnamed: 0,Yrs,Player,GP,MPG,FGM,FGA,FG_Perc,3PM,3PA,3P_Perc,...,FT_Perc,TOV,PF,ORB,DRB,RPG,APG,SPG,BPG,PPG
4976,1,Henry Domercant,30.000000,33.700000,8.800000,19.200000,0.460000,3.000000,6.800000,0.434000,...,0.844000,2.300000,2.100000,2.000000,4.800000,6.800000,2.800000,1.400,0.500000,28.200000
10969,1,Ruben Douglas,28.000000,35.100000,7.800000,19.600000,0.397000,3.400000,8.500000,0.395000,...,0.841000,3.900000,2.700000,1.900000,4.800000,6.600000,2.100000,1.200,0.300000,28.000000
8162,2,Lester Hudson,32.500000,36.450000,9.300000,20.400000,0.456500,3.550000,9.500000,0.371500,...,0.857500,3.550000,2.650000,2.200000,5.750000,7.850000,4.350000,2.550,0.650000,26.600000
9095,1,Michael Beasley,33.000000,31.500000,9.300000,17.500000,0.532000,1.100000,2.900000,0.379000,...,0.774000,2.900000,2.600000,4.000000,8.400000,12.400000,1.200000,1.300,1.600000,26.200000
497,1,Andre Collins,28.000000,38.100000,9.100000,22.100000,0.414000,4.200000,11.500000,0.366000,...,0.902000,3.600000,2.400000,0.800000,2.800000,3.600000,4.700000,2.300,0.300000,26.100000
7318,4,Kee-Kee Clark,29.500000,38.625000,8.175000,20.350000,0.402500,3.725000,10.125000,0.366250,...,0.854500,3.375000,1.950000,1.100000,2.825000,3.950000,4.250000,2.250,0.175000,25.900000
7581,1,Kevin Durant,35.000000,35.900000,8.700000,18.500000,0.472000,2.300000,5.800000,0.404000,...,0.816000,2.800000,2.000000,3.000000,8.100000,11.100000,1.300000,1.900,1.900000,25.800000
11581,3,Stephen Curry,34.666667,32.566667,8.366667,17.933333,0.467000,3.966667,9.633333,0.411333,...,0.875000,3.033333,2.466667,0.766667,3.766667,4.533333,3.766667,2.100,0.266667,25.333333
12487,1,Troy Bell,31.000000,38.600000,7.200000,16.400000,0.441000,3.400000,8.500000,0.402000,...,0.847000,2.500000,2.100000,1.500000,3.000000,4.600000,3.700000,2.300,0.200000,25.200000
9314,2,Mike Helms,29.000000,32.600000,7.950000,18.450000,0.428500,2.600000,7.300000,0.362500,...,0.747500,3.250000,3.150000,1.000000,2.750000,3.800000,2.250000,1.450,0.350000,25.050000


In [112]:
# save cleaned data to a csv file
total_college_df_copy.to_csv('college_data.csv')