In [60]:
import os
import sys
import time
from io import StringIO
from urllib.parse import urljoin

import lxml
import numpy as np
import pandas as pd
import pymongo
import requests
from bs4 import BeautifulSoup
from dotenv import load_dotenv

# Scrape NBA data and store in MongoDB

In [61]:
# --- basketball-reference.com pages that get scraped ---
HOME_URL = r"https://www.basketball-reference.com/"
PLAYERS_URL = r"https://www.basketball-reference.com/players/"
SEASONS_URL = r"https://www.basketball-reference.com/leagues/"
MVP_URL = r"https://www.basketball-reference.com/awards/mvp.html"
ALL_NBA_URL = r"https://www.basketball-reference.com/awards/all_league.html"
ALL_DEFENSE_URL = r"https://www.basketball-reference.com/awards/all_defense.html"

# --- local CSV files ---
# NOTE(review): DATA_PATH is an absolute path on one specific machine; it has to be
# edited before the notebook can run anywhere else.
DATA_PATH = r"/Users/jeffreysung/Documents/nba-champion-predictor/data"
PLAYERS_PATH, AWARD_PATH, SEASON_PATH = (
    os.path.join(DATA_PATH, csv_name)
    for csv_name in ("players.csv", "awards.csv", "seasons.csv")
)

# --- scraping options ---
# Parser name handed to BeautifulSoup.
PARSER = 'lxml'
# None keeps every player; True / False keeps only rows whose 'Active' column equals that value.
ONLY_ACTIVE_PLAYERS = None

In [62]:
# Load the variables declared in a local .env file (python-dotenv).
load_dotenv()

# --- "player" collection ---
COLLECTION_PLAYER = "player"

# HTML table ids looked up on every player page; each one is stored under the same key
# in the player document.
PLAYER_SECTIONS = [
    "totals",
    "per_game",
    "playoffs_per_game",
    "advanced",
    "playoffs_advanced",
]

# Document keys for the profile values, in the order
# name, position, height, weight, hall of fame flag, active flag.
FIELDS_DEFAULT = [
    "name",
    "position",
    "height",
    "weight",
    "hall_of_fame",
    "active",
]

# Document keys for awards / titles.
FIELD_CHAMPION = "champion"
FIELD_MVP = "mvp"
FIELD_DPOY = "dpoy"
FIELD_ALL_NBA = "all_nba"
FIELD_ALL_DEFENSE = "all_defensive"

# --- "team" collection ---
COLLECTION_TEAM = "team"

In [63]:
class MongoDBAgent:
    """Small convenience wrapper around one MongoDB database accessed through pymongo.

    The client and database handles are private; callers address collections by
    name through ``find``, ``insert_one`` and ``update_one``.
    """
    name = "MongoDBAgent"

    def __init__(self, con_string: str, db: str):
        """Create a client for ``con_string``, select database ``db`` and check the connection.

        Args:
            con_string: MongoDB connection URI passed to ``pymongo.MongoClient``.
            db: name of the database to use.

        Raises:
            Whatever pymongo raises from ``server_info()`` when the server cannot be
            queried (see ``__connect_db``).
        """
        self.__client = pymongo.MongoClient(con_string)
        self.__db = self.__client[db]
        self.__connect_db()


    def __connect_db(self):
        """Ask the server for its info so that connection problems surface at construction time."""
        self.__client.server_info()


    def find(self, collection_name: str, query: dict, count=False):
        """Look up the documents of ``collection_name`` that match ``query``.

        Args:
            collection_name: collection to search.
            query: MongoDB filter document.
            count: when true, return only the number of matching documents.

        Returns:
            * ``int`` - the number of matches, if ``count`` is true;
            * ``None`` - if ``count`` is false and nothing matches;
            * the cursor returned by ``collection.find(query)`` otherwise.
        """
        collection = self.__db[collection_name]

        # Count once and reuse the number for both the `count` answer and the emptiness check.
        n_matches = collection.count_documents(query)
        if count:
            return n_matches
        if n_matches == 0:
            return None
        return collection.find(query)


    def insert_one(self, collection_name: str, data: dict):
        """Insert ``data`` as a new document into ``collection_name``.

        Returns:
            The result object returned by the underlying ``collection.insert_one`` call.
        """
        collection = self.__db[collection_name]
        return collection.insert_one(data)


    def update_one(self, collection_name: str, query: dict, data):
        """Apply the update document ``data`` to the first document matching ``query``.

        No upsert is requested, so nothing is written when no document matches.

        Returns:
            The result object returned by the underlying ``collection.update_one`` call.
        """
        collection = self.__db[collection_name]
        return collection.update_one(filter=query, update=data)

In [41]:
# NOTE(review): USERNAME, PASSWORD and DB are not assigned in any cell shown in this
# notebook, so this cell raises NameError on a fresh kernel. Presumably they are meant
# to come from the .env file loaded by load_dotenv() - confirm and define them first.
mongodb_agent = MongoDBAgent(
    con_string=f"mongodb+srv://{USERNAME}:{PASSWORD}@nba.zhue1fc.mongodb.net/?retryWrites=true&w=majority",
    db=DB,
)

In [30]:
def filter_out_comment(soup: BeautifulSoup) -> BeautifulSoup:
    """Re-parse ``soup`` with every ``<!--`` and ``-->`` marker deleted.

    Only the comment delimiters are removed; whatever was written between them
    stays in the markup and is parsed as ordinary content.
    """
    markup = str(soup)
    for marker in ('<!--', '-->'):
        markup = markup.replace(marker, '')
    return BeautifulSoup(markup, PARSER)

def request_data(url: str, sleep_time_sec: float = 1.0, with_comment: bool = True, timeout: float = 30.0) -> BeautifulSoup:
    """Download ``url`` and return it parsed with BeautifulSoup.

    Args:
        url: page to fetch.
        sleep_time_sec: seconds to sleep before the request is sent.
        with_comment: if True, the page is parsed as received, so HTML comments stay
            comments. If False, the comment delimiters are stripped with
            ``filter_out_comment`` so markup inside comments becomes part of the tree.
        timeout: seconds to wait for the server before giving up.

    Returns:
        The parsed page.

    Raises:
        requests.exceptions.Timeout: the server did not answer within ``timeout``.
        requests.exceptions.HTTPError: the server answered with a 4xx/5xx status.
    """
    time.sleep(sleep_time_sec)

    response = requests.get(url, timeout=timeout)
    # Fail loudly on error pages instead of parsing them as if they were real data.
    response.raise_for_status()

    soup = BeautifulSoup(response.content, PARSER)
    if with_comment:
        return soup
    return filter_out_comment(soup)

## Get player data

In [13]:
# The players index of basketball-reference is split by first letter, so collect one url per letter.

content = request_data(PLAYERS_URL, 1.0, False).find("div", id="div_alphabet")

# letter -> absolute url of that letter's player listing
alphabet_dict = {
    tag.get_text(): urljoin(PLAYERS_URL, tag['href'])
    for tag in content.find_all("a")
}

df_alphabet_urls = pd.DataFrame.from_dict(alphabet_dict, orient="index", columns=["url"])

In [39]:
# Go through each letter of the alphabet and get data for each player (including url with even more detail)
# then save to a csv

letter_frames = []

# The index of df_alphabet_urls holds the letter, the "url" column the listing page for it.
for letter, url in df_alphabet_urls["url"].items():
    print(f"\r{letter}...")

    page = request_data(url, 4.0, False)
    content = page.find("table", id="players")
    if content is None:
        raise RuntimeError(f"No table with id 'players' found at {url}")
    df = pd.read_html(StringIO(str(content)))[0]

    # An asterisk in the player name is used as the Hall of Fame marker; record it, then strip it.
    df['Hall_of_Fame'] = df['Player'].str.contains("*", regex=False)
    df['Player'] = df['Player'].str.replace("*", "", regex=False)

    # One player link per table row. A link whose preceding element is <strong> is
    # flagged as an active player.
    player_links = [a for a in content.find_all("a") if "players" in a['href']]
    df['Active'] = [a.previous_element.name == "strong" for a in player_links]
    df['Url'] = [urljoin(PLAYERS_URL, a['href']) for a in player_links]

    letter_frames.append(df)

dfs = pd.concat(letter_frames, ignore_index=True)
dfs.to_csv(PLAYERS_PATH, index=False, encoding="utf-8-sig")
print("\nSaved to: ", PLAYERS_PATH)

A...
B...
C...
D...
E...
F...
G...
H...
I...
J...
K...
L...
M...
N...
O...
P...
Q...
R...
S...
T...
U...
V...
W...
Y...
Z...

Saved to:  /Users/jeffreysung/Documents/nba-champion-predictor/data/players.csv


In [56]:
# Reload the scraped player list from the CSV written by the scraping cell.
df_players = pd.read_csv(PLAYERS_PATH, encoding="utf-8-sig", index_col=False)

# ONLY_ACTIVE_PLAYERS: None keeps everyone, True / False keeps only rows with that 'Active' value.
if ONLY_ACTIVE_PLAYERS is not None:
    df_players = df_players[df_players['Active'] == ONLY_ACTIVE_PLAYERS]

df_players

Unnamed: 0,Player,From,To,Pos,Ht,Wt,Birth Date,Colleges,Hall_of_Fame,Active,Url
0,Alaa Abdelnaby,1991,1995,F-C,6-10,240.0,"June 24, 1968",Duke,False,False,https://www.basketball-reference.com/players/a...
1,Zaid Abdul-Aziz,1969,1978,C-F,6-9,235.0,"April 7, 1946",Iowa State,False,False,https://www.basketball-reference.com/players/a...
2,Kareem Abdul-Jabbar,1970,1989,C,7-2,225.0,"April 16, 1947",UCLA,True,False,https://www.basketball-reference.com/players/a...
3,Mahmoud Abdul-Rauf,1991,2001,G,6-1,162.0,"March 9, 1969",LSU,False,False,https://www.basketball-reference.com/players/a...
4,Tariq Abdul-Wahad,1998,2003,F,6-6,223.0,"November 3, 1974","Michigan, San Jose State",False,False,https://www.basketball-reference.com/players/a...
...,...,...,...,...,...,...,...,...,...,...,...
5096,Ante Žižić,2018,2020,F-C,6-10,266.0,"January 4, 1997",,False,False,https://www.basketball-reference.com/players/z...
5097,Jim Zoet,1983,1983,C,7-1,240.0,"December 20, 1953",Kent State University,False,False,https://www.basketball-reference.com/players/z...
5098,Bill Zopf,1971,1971,G,6-1,170.0,"June 7, 1948",Duquesne,False,False,https://www.basketball-reference.com/players/z...
5099,Ivica Zubac,2017,2023,C,7-0,240.0,"March 18, 1997",,False,True,https://www.basketball-reference.com/players/z...


In [57]:
def season_to_int(cell_value: str) -> int:
    """Convert a season label into the calendar year in which that season ends.

    Seasons are listed as, for example, "1980-81"; this returns 1981.
    The century rollover works the same way: "1999-00" returns 2000.

    Args:
        cell_value: season label starting with the four-digit first year.
            Surrounding whitespace is ignored.

    Returns:
        The ending year of the season as an int.

    Raises:
        ValueError: the first four characters are not a number.
    """
    season = str(cell_value).strip()
    # The ending year is always the starting year plus one, which also covers
    # the "1999-00" rollover without a special case.
    return int(season[:4]) + 1

In [None]:
# Upload player data to MongoDB
#
# For every player: make sure a document keyed by the player's url exists and carries the
# profile fields, then scrape the player's page and store each stats table found on it.

player_rows = df_players[['Player', 'Pos', 'Ht', 'Wt', 'Hall_of_Fame', 'Active', 'Url']].values

for i, (name, pos, ht, wt, hof, active, url) in enumerate(player_rows, start=1):

    sys.stdout.write(f"\r{i}/{len(df_players)}...")

    # Profile fields. update_one() does not upsert, so the document is created here when it
    # does not exist yet; otherwise the fields would be dropped on a player's first upload.
    player_query = {"player_id": url}
    default_fields = dict(zip(FIELDS_DEFAULT, [name, pos, ht, wt, hof, active]))
    if mongodb_agent.find(collection_name=COLLECTION_PLAYER, query=player_query, count=True) == 0:
        mongodb_agent.insert_one(collection_name=COLLECTION_PLAYER, data={**player_query, **default_fields})
    else:
        mongodb_agent.update_one(collection_name=COLLECTION_PLAYER, query=player_query, data={"$set": default_fields})

    content = request_data(url, 4.0, False)

    for section in PLAYER_SECTIONS:
        table = content.find("table", id=section)
        if table is None:
            continue

        # Filter Row/Columns
        df_table = pd.read_html(StringIO(str(table)))[0]
        df_table = df_table[df_table['Season'].notna()]
        df_table = df_table.drop(columns=[col for col in df_table.columns if "Unnamed:" in col])

        # Season: keep only rows whose label contains "-", then convert the label to an int.
        # .copy() so the column assignments below act on an independent frame, not on a slice.
        df_table = df_table[df_table['Season'].str.contains('-')].copy()
        df_table['Season'] = df_table['Season'].apply(season_to_int)

        # League: any value other than "NBA" is blanked out (NaN); the row itself is kept.
        df_table['Lg'] = df_table['Lg'].where(df_table['Lg'] == "NBA")

        # Team: one id per kept row, read from the first len(df_table) rows of the table body.
        team_ids = []
        for tr in table.find("tbody").find_all("tr")[:len(df_table)]:
            td = tr.find("td", attrs={"data-stat": "team_id"})

            if td is None:
                team_ref = "DidNotPlay"   # row has no team cell
            elif td.a is None:
                team_ref = td.text        # team cell without a link
            else:
                team_ref = td.a['href']
            team_ids.append(urljoin(HOME_URL, team_ref))

        df_table.insert(loc=3, column='Tm_id', value=team_ids)

        # Store the table; the player document is guaranteed to exist at this point.
        mongodb_agent.update_one(collection_name=COLLECTION_PLAYER, query=player_query, data={"$set": {section: df_table.to_dict("records")}})

        # Champion: every "sr_ring" span in the playoff table adds the season found in the
        # element right before it to the champion set.
        if section == "playoffs_per_game":
            for span in table.find("tbody").find_all("span", class_="sr_ring"):
                mongodb_agent.update_one(collection_name=COLLECTION_PLAYER, query=player_query, data={"$addToSet": {FIELD_CHAMPION: {"Season": season_to_int(span.previous_element)}}})

78/5101...

## Get award data

## Get team data