# WNBA Data Scrape
In this notebook, I scraped data from all WNBA players listed on Basketball-Reference.com. I acquired the position names and career stats for per-game summaries for each player.

### Import Libraries

In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
import re


In [2]:
# set preferences to show 25 columns
pd.set_option('display.max_columns', 25)

### Load URL

In [3]:
url = 'https://www.basketball-reference.com/wnba/players'
res = requests.get(url)
res.status_code

200

In [4]:
soup = BeautifulSoup(res.content, 'lxml')

# WNBA Roster

In [5]:
# Empty list to append all info
alphabet = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
all_pages = []
not_wanted = ['WNBA', 'Players', 'Teams', 'Seasons', 'Player Stats', 'Coaches', 'Leaders', 'Awards', 'Support us without the ads? Go Ad-Free.']

for i in range(26):
    # set url to a variable
    url = 'https://www.basketball-reference.com/wnba/players/' + alphabet[i]
    
    # request data
    res = requests.get(url)
    
    # create a soup object
    soup = BeautifulSoup(res.content, 'lxml')
    
    # Get all players names by alphabet letter
    name_index = soup.find('div', {'id': 'content'})
    
    # Iterate through each page and find info 
    for row in name_index.find_all('a'):
        for word in not_wanted:
            if row.text not in not_wanted: 
                player_dict = {}
                player_dict['name'] = row.text
                player_dict['slug'] = row.attrs['href'].replace('/wnba/players/', '')
                all_pages.append(player_dict)

# Convert list to DataFrame  
df_wnba_roster = pd.DataFrame(all_pages)

In [6]:
print(df_wnba_roster.shape)

(8307, 2)


In [7]:
df_wnba_roster.drop_duplicates(inplace=True)

df_wnba_roster.reset_index(inplace=True)

df_wnba_roster.drop(columns='index', inplace=True)

In [8]:
print(df_wnba_roster.shape)

(923, 2)


In [9]:
df_wnba_roster.head()

Unnamed: 0,name,slug
0,Farhiya Abdi,a/abdifa01w.html
1,Tajama Abraham,a/abrahta01w.html
2,Svetlana Abrosimova,a/abrossv01w.html
3,Natalie Achonwa,a/achonna01w.html
4,Jessica Adair,a/adairje01w.html


In [10]:
df_wnba_roster[df_wnba_roster['name'] == 'Michelle Campbell']

Unnamed: 0,name,slug
135,Michelle Campbell,c/campbmi01w.html
136,Michelle Campbell,c/campbmi02w.html


In [11]:
#df_wnba_roster.to_csv('./data/wnba_roster.csv')

# WNBA Positions

In [12]:
all_positions = []

for i in range(len(df_wnba_roster)):
    # set url to a variable
    url = 'https://www.basketball-reference.com/wnba/players/' + df_wnba_roster['slug'].loc[i]
    player = df_wnba_roster['name'][i]
    
    # request data
    res = requests.get(url)
    
    # create a soup object
    soup = BeautifulSoup(res.content, 'lxml')
    
    # Get all player info
    player_info = soup.find('div', {'class': 'players'})
    
    # Iterate through each page and find info 
    for info in player_info.find_all('p', limit=2):
        position_dict = {}
        string = info.text
        string = string.replace('Position:', '')
        string = string.replace('\n', '')
        string = string.replace(' ', '')
        position_dict['name'] = player
        position_dict['position'] = string
        all_positions.append(position_dict)

# Convert list to DataFrame  
wnba_positions = pd.DataFrame(all_positions)

# show head
wnba_positions.head()

Unnamed: 0,name,position
0,Farhiya Abdi,Forward
1,Farhiya Abdi,"6-2, 180lb (188cm, 81kg)"
2,Tajama Abraham,Center
3,Tajama Abraham,"6-2, 190lb (188cm, 86kg)"
4,Svetlana Abrosimova,Forward


In [13]:
forward = wnba_positions['position'] == 'Forward'
forward_ = wnba_positions['position'] == 'Forward-'
center = wnba_positions['position'] == 'Center'
forward_center = wnba_positions['position'] == 'Forward-Center'
center_forward = wnba_positions['position'] == 'Center-Forward'
guard = wnba_positions['position'] == 'Guard'
forward_guard = wnba_positions['position'] == 'Forward-Guard'
guard_forward = wnba_positions['position'] == 'Guard-Forward'


In [14]:
wnba_positions = wnba_positions[forward | center | forward_center | guard | forward_guard | guard_forward | center_forward | forward_]

In [15]:
wnba_positions.head()

Unnamed: 0,name,position
0,Farhiya Abdi,Forward
2,Tajama Abraham,Center
4,Svetlana Abrosimova,Forward
6,Natalie Achonwa,Forward
8,Jessica Adair,Center


In [16]:
wnba_positions.shape

(908, 2)

In [17]:
#wnba_positions.to_csv('./data/wnba_positions.csv')

# WNBA All Stats

In [18]:
all_stats = []

for i in range(len(df_wnba_roster)):
    # set url to a variable
    url = 'https://www.basketball-reference.com/wnba/players/' + df_wnba_roster['slug'].loc[i]
    player = df_wnba_roster['name'][i]
    
    # request data
    res = requests.get(url)
    
    # create a soup object
    soup = BeautifulSoup(res.content, 'lxml')
    
    # Get all players per game stats
    per_game = soup.find('table', {'id': 'wnba_per_game'})
    
    # Iterate through each page and find info 
    for row in per_game.find_all('tr', limit=1):
        # Code block from https://towardsdatascience.com/web-scraping-nba-stats-4b4f8c525994
        headers = [th.getText() for th in soup.findAll('tr', limit=1)[0].findAll('th')]
        headers = headers[1:]
        name = 'name'
        headers = [name] + headers
        rows = soup.findAll('tr')
        player_stats = [[td.getText() for td in rows[i].findAll('td')] for i in range(len(rows))]
        # Code block from https://towardsdatascience.com/web-scraping-nba-stats-4b4f8c525994 
        player_stats = player_stats[1:-1]
        for i in range(len(player_stats)):
            player_stats[i] = [player] + player_stats[i]
            
        all_stats.extend(player_stats)

wnba_all_stats = pd.DataFrame(all_stats, columns = headers) 

# Preview data
print(wnba_all_stats.shape)
wnba_all_stats.head()

(3865, 27)


Unnamed: 0,name,Tm,Age,G,GS,MP,FG,FGA,FG%,3P,3PA,3P%,...,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
0,Farhiya Abdi,LAS,21,17,0,7.2,0.9,2.3,0.385,0.0,0.0,,...,0.5,0.7,0.667,0.2,0.5,0.7,0.5,0.1,0.1,0.2,0.9,2.2
1,Farhiya Abdi,LAS,22,17,0,8.4,1.1,2.6,0.432,0.1,0.4,0.143,...,0.2,0.2,1.0,0.2,0.4,0.5,0.3,0.1,0.0,0.4,0.9,2.5
2,Farhiya Abdi,LAS,23,18,5,13.2,1.6,4.4,0.354,0.5,1.8,0.273,...,0.2,0.4,0.571,0.3,1.4,1.7,0.5,0.4,0.1,0.7,1.7,3.8
3,Tajama Abraham,SAC,21,28,5,15.1,1.7,4.5,0.381,0.0,0.0,,...,0.9,1.4,0.684,1.1,1.3,2.4,0.5,0.4,0.4,1.8,2.5,4.4
4,Tajama Abraham,DET,22,12,0,3.7,0.4,1.2,0.357,0.0,0.0,,...,0.7,1.3,0.533,0.2,0.4,0.6,0.0,0.2,0.1,0.4,0.7,1.5


In [19]:
#wnba_all_stats.to_csv('./data/wnba_all_stats.csv')

# WNBA Career Highlights

In [20]:
all_stats = []

for i in range(len(df_wnba_roster)):
    # set url to a variable
    url = 'https://www.basketball-reference.com/wnba/players/' + df_wnba_roster['slug'].loc[i]
    player = df_wnba_roster['name'][i]
    
    # request data
    res = requests.get(url)
    
    # create a soup object
    soup = BeautifulSoup(res.content, 'lxml')
    
    # Get all players per game stats
    per_game = soup.find('table', {'id': 'wnba_per_game'})
    
    for row in per_game.find_all('tr', limit=1):
        # Code modified from https://towardsdatascience.com/web-scraping-nba-stats-4b4f8c525994
        headers = [th.getText() for th in soup.findAll('tr', limit=1)[0].findAll('th')]
        headers = headers[3:]
        name = 'name'
        headers = [name] + headers
        rows = soup.findAll('tr')
        player_stats = [[td.getText() for td in rows[i].findAll('td')] for i in range(len(rows))]
        player_stats = player_stats[-1]
        player_stats = [player] + player_stats[2:]


        all_stats.append(player_stats)

# Save to DataFrame
wnba_career_highlights = pd.DataFrame(all_stats, columns = headers) 

# Inspect data
print(wnba_career_highlights.shape)
wnba_career_highlights.head()

(923, 25)


Unnamed: 0,name,G,GS,MP,FG,FGA,FG%,3P,3PA,3P%,2P,2PA,2P%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
0,Farhiya Abdi,52,5,9.6,1.2,3.1,0.383,0.2,0.8,0.25,1.0,2.3,0.426,0.3,0.4,0.682,0.2,0.8,1.0,0.4,0.2,0.1,0.4,1.2,2.9
1,Tajama Abraham,40,5,11.7,1.3,3.5,0.379,0.0,0.0,,1.3,3.5,0.379,0.9,1.3,0.642,0.9,1.0,1.9,0.3,0.4,0.3,1.4,2.0,3.5
2,Svetlana Abrosimova,263,153,24.2,3.3,8.4,0.395,0.9,2.5,0.35,2.4,5.9,0.415,1.6,2.5,0.654,1.2,2.9,4.1,2.1,1.3,0.2,2.4,2.3,9.2
3,Natalie Achonwa,120,68,18.2,3.0,5.5,0.542,0.0,0.0,,3.0,5.5,0.542,1.5,2.0,0.768,1.5,2.7,4.2,0.8,0.6,0.5,1.0,2.2,7.5
4,Jessica Adair,51,1,9.9,1.3,2.8,0.454,0.0,0.0,,1.3,2.8,0.454,1.1,1.7,0.625,1.1,1.6,2.7,0.3,0.2,0.4,0.8,1.6,3.6


In [22]:
# convert dtype object to float

# get col names except 'name'
features = [col for col in wnba_career_highlights.columns if col != 'name']

# convert all numeric columns to float
wnba_career_features = wnba_career_highlights[features].apply(pd.to_numeric, errors='coerce')

# merge dataframes with correct datatypes
wnba_career_highlights1 = pd.concat([wnba_career_highlights['name'], wnba_career_features], axis=1)

# show details
print(wnba_career_highlights1.shape)
wnba_career_highlights1.head()

(923, 25)


Unnamed: 0,name,G,GS,MP,FG,FGA,FG%,3P,3PA,3P%,2P,2PA,2P%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
0,Farhiya Abdi,52,5,9.6,1.2,3.1,0.383,0.2,0.8,0.25,1.0,2.3,0.426,0.3,0.4,0.682,0.2,0.8,1.0,0.4,0.2,0.1,0.4,1.2,2.9
1,Tajama Abraham,40,5,11.7,1.3,3.5,0.379,0.0,0.0,,1.3,3.5,0.379,0.9,1.3,0.642,0.9,1.0,1.9,0.3,0.4,0.3,1.4,2.0,3.5
2,Svetlana Abrosimova,263,153,24.2,3.3,8.4,0.395,0.9,2.5,0.35,2.4,5.9,0.415,1.6,2.5,0.654,1.2,2.9,4.1,2.1,1.3,0.2,2.4,2.3,9.2
3,Natalie Achonwa,120,68,18.2,3.0,5.5,0.542,0.0,0.0,,3.0,5.5,0.542,1.5,2.0,0.768,1.5,2.7,4.2,0.8,0.6,0.5,1.0,2.2,7.5
4,Jessica Adair,51,1,9.9,1.3,2.8,0.454,0.0,0.0,,1.3,2.8,0.454,1.1,1.7,0.625,1.1,1.6,2.7,0.3,0.2,0.4,0.8,1.6,3.6


In [23]:
wnba_career_highlights1.dtypes

name     object
G         int64
GS        int64
MP      float64
FG      float64
FGA     float64
FG%     float64
3P      float64
3PA     float64
3P%     float64
2P      float64
2PA     float64
2P%     float64
FT      float64
FTA     float64
FT%     float64
ORB     float64
DRB     float64
TRB     float64
AST     float64
STL     float64
BLK     float64
TOV     float64
PF      float64
PTS     float64
dtype: object

In [24]:
#wnba_career_highlights1.to_csv('./data/wnba_all_career_highlights.csv')

# Add Positions to Career Highlights

In [25]:
wnba = pd.merge(wnba_positions, wnba_career_highlights1)

In [27]:
wnba.head()

Unnamed: 0,name,position,G,GS,MP,FG,FGA,FG%,3P,3PA,3P%,2P,...,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
0,Farhiya Abdi,Forward,52,5,9.6,1.2,3.1,0.383,0.2,0.8,0.25,1.0,...,0.3,0.4,0.682,0.2,0.8,1.0,0.4,0.2,0.1,0.4,1.2,2.9
1,Tajama Abraham,Center,40,5,11.7,1.3,3.5,0.379,0.0,0.0,,1.3,...,0.9,1.3,0.642,0.9,1.0,1.9,0.3,0.4,0.3,1.4,2.0,3.5
2,Svetlana Abrosimova,Forward,263,153,24.2,3.3,8.4,0.395,0.9,2.5,0.35,2.4,...,1.6,2.5,0.654,1.2,2.9,4.1,2.1,1.3,0.2,2.4,2.3,9.2
3,Natalie Achonwa,Forward,120,68,18.2,3.0,5.5,0.542,0.0,0.0,,3.0,...,1.5,2.0,0.768,1.5,2.7,4.2,0.8,0.6,0.5,1.0,2.2,7.5
4,Jessica Adair,Center,51,1,9.9,1.3,2.8,0.454,0.0,0.0,,1.3,...,1.1,1.7,0.625,1.1,1.6,2.7,0.3,0.2,0.4,0.8,1.6,3.6


In [28]:
#wnba.to_csv('./data/WNBA_ALL.csv')