# Scrape NBA data and store in MongoDB

In [60]:
import os
import lxml
import sys
import time
import numpy as np
import pandas as pd
import pymongo
import requests
from dotenv import load_dotenv
from bs4 import BeautifulSoup
from urllib.parse import urljoin

In [61]:
# Pages on basketball-reference.com that the notebook scrapes. Every page URL
# is derived from the site root so the host name is written only once.
HOME_URL = r"https://www.basketball-reference.com/"
PLAYERS_URL = urljoin(HOME_URL, "players/")
ALL_NBA_URL = urljoin(HOME_URL, "awards/all_league.html")
ALL_DEFENSE_URL = urljoin(HOME_URL, "awards/all_defense.html")
MVP_URL = urljoin(HOME_URL, "awards/mvp.html")
SEASONS_URL = urljoin(HOME_URL, "leagues/")

# Local CSV files written by the index-scraping cells and read back by the upload cells.
DATA_PATH = r"/Users/jeffreysung/Documents/nba-champion-predictor/data"
PLAYERS_PATH = os.path.join(DATA_PATH, "players.csv")
AWARD_PATH = os.path.join(DATA_PATH, "awards.csv")
SEASON_PATH = os.path.join(DATA_PATH, "seasons.csv")

# Parser name handed to BeautifulSoup.
PARSER = 'lxml'

# None keeps every player; True/False keeps only rows whose "Active" column
# equals that value (set to True when refreshing, to skip already-stored players).
ONLY_ACTIVE_PLAYERS = None

In [116]:
# Pull the variables from the .env file into the environment
# (the MongoDB USERNAME and PASSWORD are kept there).
load_dotenv()

# --- "player" collection -----------------------------------------------------
COLLECTION_PLAYER = "player"
# ids of the stats tables read from a player page; each becomes a document field.
PLAYER_SECTIONS = [
    "totals",
    "per_game",
    "playoffs_per_game",
    "advanced",
    "playoffs_advanced",
]
# Scalar fields stored for a player, in the order the upload loop zips values onto them.
FIELDS_DEFAULT = [
    "name",
    "position",
    "height",
    "weight",
    "hall_of_fame",
    "active",
]
# Array-valued award fields filled with $addToSet.
FIELD_CHAMPION = "champion"
FIELD_MVP = "mvp"
FIELD_DPOY = "dpoy"
FIELD_ALL_NBA = "all_nba"
FIELD_ALL_DEFENSE = "all_defensive"

# --- "team" collection -------------------------------------------------------
COLLECTION_TEAM = "team"
# ids of the tables read from a season summary page; each becomes a document field.
TEAM_SECTIONS = ["per_game-team", "per_game-opponent", "advanced-team"]
FIELD_PLAYOFF = "playoff"
FIELD_CONFERENCE = "conference"

In [63]:
class MongoDBAgent:
    """Small convenience wrapper around a single MongoDB database.

    Every method takes the collection name first and forwards to the
    corresponding pymongo collection method.
    """

    name = "MongoDBAgent"

    def __init__(self, con_string: str, db: str):
        """Create the client, select database ``db`` and verify the connection.

        Args:
            con_string: MongoDB connection URI.
            db: Name of the database to operate on.
        """
        self.__client = pymongo.MongoClient(con_string)
        self.__db = self.__client[db]
        self.__connect_db()

    def __connect_db(self):
        """Query the server once so connection problems surface at construction time."""
        self.__client.server_info()

    def find(self, collection_name: str, query: dict, count=False):
        """Look up documents matching ``query``.

        Args:
            collection_name: Collection to search.
            query: MongoDB filter document.
            count: When true, return the number of matches instead of the documents.

        Returns:
            The match count if ``count`` is true; otherwise ``None`` when
            nothing matches, or the cursor over the matching documents.
        """
        collection = self.__db[collection_name]
        # Count once and reuse the result for both the "count" and "empty" answers.
        matches = collection.count_documents(query)
        if count:
            return matches
        if matches == 0:
            return None
        return collection.find(query)

    def insert_one(self, collection_name: str, data: dict):
        """Insert ``data`` as a new document and return the driver's insert result."""
        return self.__db[collection_name].insert_one(data)

    def update_one(self, collection_name: str, query: dict, data, upsert: bool = False):
        """Apply the update document ``data`` to the first document matching ``query``.

        Args:
            collection_name: Collection to update.
            query: MongoDB filter document.
            data: Update document, e.g. ``{"$set": {...}}``.
            upsert: When true, create the document if nothing matches. The
                default (False) means an update against a missing document
                changes nothing.

        Returns:
            The driver's update result.
        """
        return self.__db[collection_name].update_one(filter=query, update=data, upsert=upsert)

In [41]:
# Shared database handle used by every upload cell below.
# NOTE(review): USERNAME, PASSWORD and DB are not assigned in any visible cell;
# presumably they are read from the .env file loaded earlier - confirm.
_mongodb_uri = f"mongodb+srv://{USERNAME}:{PASSWORD}@nba.zhue1fc.mongodb.net/?retryWrites=true&w=majority"
mongodb_agent = MongoDBAgent(con_string=_mongodb_uri, db=DB)

In [30]:
def filter_out_comment(soup: BeautifulSoup) -> BeautifulSoup:
    """Strip the HTML comment delimiters from a document and parse it again.

    Only the ``<!--`` and ``-->`` markers are removed; the markup that was
    between them is kept, so it becomes part of the re-parsed tree.
    """
    markup = str(soup)
    for delimiter in ('<!--', '-->'):
        markup = markup.replace(delimiter, '')
    return BeautifulSoup(markup, PARSER)

def request_data(url: str, sleep_time_sec: float = 3.0, with_comment: bool = True,
                 timeout_sec: float = 30.0) -> BeautifulSoup:
    """Download ``url`` and return it parsed with BeautifulSoup.

    Args:
        url: Page to fetch.
        sleep_time_sec: Seconds to wait before sending the request.
        with_comment: When true, return the page as parsed. When false, pass
            it through ``filter_out_comment`` so markup wrapped in HTML
            comment delimiters becomes part of the tree.
        timeout_sec: Seconds to wait for the server before giving up.

    Returns:
        The parsed document.

    Raises:
        requests.HTTPError: The server answered with an error status.
        requests.Timeout: The server did not answer within ``timeout_sec``.
    """
    time.sleep(sleep_time_sec)

    response = requests.get(url, timeout=timeout_sec)
    # Fail here instead of parsing an error page and crashing later on a missing table.
    response.raise_for_status()

    soup = BeautifulSoup(response.content, PARSER)
    return soup if with_comment else filter_out_comment(soup)

## Get player data

In [13]:
# The player index of basketball-reference is split by initial letter, so we
# collect one page URL per letter from the alphabet navigation block.

content = request_data(PLAYERS_URL, 3.0, False).find("div", id="div_alphabet")

# link text (the letter) -> absolute URL of that letter's page
alphabet_dict = {
    link.get_text(): urljoin(PLAYERS_URL, link['href'])
    for link in content.find_all("a")
}

df_alphabet_urls = pd.DataFrame.from_dict(alphabet_dict, orient="index", columns=["url"])

In [39]:
# Go through each letter page, read its player table (adding each player's
# detail-page URL, Hall of Fame flag and active flag), then save everything
# to one CSV.

dfs = []

for letter, url in df_alphabet_urls["url"].items():
    print(f"\r{letter}...")

    content = request_data(url, 3.0, False)
    content = content.find("table", id="players")
    df = pd.read_html(str(content))[0]

    # A "*" in the name marks the Hall of Fame; match it literally, not as a regex.
    df['Hall_of_Fame'] = df['Player'].str.contains("*", regex=False)
    df['Player'] = df['Player'].str.replace("*", "", regex=False)

    # Keep only the links that point at player pages.
    player_links = [a for a in content.find_all("a") if "players" in a['href']]

    # A link that directly follows a <strong> tag marks an active player.
    df['Active'] = [a.previous_element.name == "strong" for a in player_links]
    # Resolve each href against the page it was found on.
    df['Url'] = [urljoin(url, a['href']) for a in player_links]

    dfs.append(df)

dfs = pd.concat(dfs, ignore_index=True)
dfs.to_csv(PLAYERS_PATH, index=False, encoding="utf-8-sig")
print("\nSaved to: ", PLAYERS_PATH)

A...
B...
C...
D...
E...
F...
G...
H...
I...
J...
K...
L...
M...
N...
O...
P...
Q...
R...
S...
T...
U...
V...
W...
Y...
Z...

Saved to:  /Users/jeffreysung/Documents/nba-champion-predictor/data/players.csv


In [56]:
# Load the player index saved by the scraping cell.
df_players = pd.read_csv(PLAYERS_PATH, encoding="utf-8-sig", index_col=False)

# None keeps everyone; True/False keeps only rows with that "Active" value.
if ONLY_ACTIVE_PLAYERS is not None:
    df_players = df_players[df_players['Active'] == ONLY_ACTIVE_PLAYERS]

df_players

Unnamed: 0,Player,From,To,Pos,Ht,Wt,Birth Date,Colleges,Hall_of_Fame,Active,Url
0,Alaa Abdelnaby,1991,1995,F-C,6-10,240.0,"June 24, 1968",Duke,False,False,https://www.basketball-reference.com/players/a...
1,Zaid Abdul-Aziz,1969,1978,C-F,6-9,235.0,"April 7, 1946",Iowa State,False,False,https://www.basketball-reference.com/players/a...
2,Kareem Abdul-Jabbar,1970,1989,C,7-2,225.0,"April 16, 1947",UCLA,True,False,https://www.basketball-reference.com/players/a...
3,Mahmoud Abdul-Rauf,1991,2001,G,6-1,162.0,"March 9, 1969",LSU,False,False,https://www.basketball-reference.com/players/a...
4,Tariq Abdul-Wahad,1998,2003,F,6-6,223.0,"November 3, 1974","Michigan, San Jose State",False,False,https://www.basketball-reference.com/players/a...
...,...,...,...,...,...,...,...,...,...,...,...
5096,Ante Žižić,2018,2020,F-C,6-10,266.0,"January 4, 1997",,False,False,https://www.basketball-reference.com/players/z...
5097,Jim Zoet,1983,1983,C,7-1,240.0,"December 20, 1953",Kent State University,False,False,https://www.basketball-reference.com/players/z...
5098,Bill Zopf,1971,1971,G,6-1,170.0,"June 7, 1948",Duquesne,False,False,https://www.basketball-reference.com/players/z...
5099,Ivica Zubac,2017,2023,C,7-0,240.0,"March 18, 1997",,False,True,https://www.basketball-reference.com/players/z...


In [57]:
def season_to_int(cell_value: str):
    """Convert a season label such as "1980-81" to its ending year (1981).

    The first two characters are taken as the century and the last two as the
    year within it. A label ending in "00" (e.g. "1999-00") rolls over into
    the next century and yields 2000.
    """
    century, year_suffix = cell_value[:2], cell_value[-2:]
    if year_suffix != "00":
        return int(century + year_suffix)
    return (int(century) + 1) * 100

In [68]:
# Upload player data to MongoDB

player_columns = ['Player', 'Pos', 'Ht', 'Wt', 'Hall_of_Fame', 'Active', 'Url']
total_players = len(df_players)

for i, (name, pos, ht, wt, hof, active, url) in enumerate(df_players[player_columns].values, start=1):
    sys.stdout.write(f"\r{i}/{total_players}")

    content = request_data(url, 3.0, False)
    player_query = {"player_id": url}

    # update_one() does not create missing documents, so make sure the player
    # exists first; otherwise the default fields below would be dropped on a first run.
    if mongodb_agent.find(collection_name=COLLECTION_PLAYER, query=player_query, count=True) == 0:
        mongodb_agent.insert_one(collection_name=COLLECTION_PLAYER, data={"player_id": url})

    # All default fields go in one $set instead of one round trip per field.
    default_fields = dict(zip(FIELDS_DEFAULT, [name, pos, ht, wt, hof, active]))
    mongodb_agent.update_one(collection_name=COLLECTION_PLAYER, query=player_query, data={"$set": default_fields})

    for section in PLAYER_SECTIONS:
        table = content.find("table", id=section)
        if table is None:
            continue

        # Filter Row/Columns
        df_table = pd.read_html(str(table))[0]
        df_table = df_table[df_table['Season'].notna()]
        df_table = df_table.drop(columns=[col for col in df_table.columns if "Unnamed:" in col])

        # Season: keep only rows whose label looks like "1980-81", then store the ending year
        df_table = df_table[df_table['Season'].str.contains('-')].copy()
        df_table['Season'] = df_table['Season'].apply(season_to_int)

        # League: values other than "NBA" become NaN (the rows themselves are kept)
        df_table['Lg'] = df_table['Lg'][df_table['Lg'] == "NBA"]

        # Team: one id per kept row, taken from the first len(df_table) body rows
        team_ids = []
        for tr in table.find("tbody").find_all("tr")[:len(df_table)]:
            td = tr.find("td", attrs={"data-stat": "team_id"})

            if td is None:
                team_ref = "DidNotPlay"
            elif td.a is None:
                team_ref = td.text
            else:
                team_ref = td.a['href']
            team_ids.append(urljoin(HOME_URL, team_ref))

        df_table.insert(loc=3, column='Tm_id', value=team_ids)

        # Insert/Update (the document is guaranteed to exist at this point)
        mongodb_agent.update_one(collection_name=COLLECTION_PLAYER, query=player_query, data={"$set": {section: df_table.to_dict("records")}})

        # Champion: a "sr_ring" span follows the season label of a title-winning season
        if section == "playoffs_per_game":
            for span in table.find("tbody").find_all("span", class_="sr_ring"):
                mongodb_agent.update_one(collection_name=COLLECTION_PLAYER, query=player_query, data={"$addToSet": {FIELD_CHAMPION: {"Season": season_to_int(span.previous_element)}}})

5101/5101...

## Get award data

In [75]:
# MVP Voting: one row per season with the URL of that season's award-voting page
content = request_data(url=MVP_URL, sleep_time_sec=3.0, with_comment=False)
table = content.find("table", id="mvp_NBA")
df_table = pd.read_html(str(table))[0]
df_table = df_table.droplevel(0, axis=1)
df_table['Season'] = df_table['Season'].apply(season_to_int)

voting_cells = table.find("tbody").find_all("td", class_="center", attrs={"data-stat": "voting"})
df_table['Voting_Url'] = [urljoin(HOME_URL, td.a['href']) for td in voting_cells]

df_table = df_table[['Season', 'Voting_Url']].copy()

# All NBA: stored as an extra row, with the label in the "Season" column
df_table.loc[len(df_table)] = ["All_NBA", ALL_NBA_URL]

# All Defensive: same convention
df_table.loc[len(df_table)] = ["All_Defensive", ALL_DEFENSE_URL]

df_table.to_csv(AWARD_PATH, index=False, encoding="utf-8-sig")
print("Saved to: ", AWARD_PATH)

Saved to:  /Users/jeffreysung/Documents/nba-champion-predictor/data/awards.csv


In [76]:
df_awards = pd.read_csv(AWARD_PATH, encoding="utf-8-sig")
df_awards

Unnamed: 0,Season,Voting_Url
0,2022,https://www.basketball-reference.com/awards/aw...
1,2021,https://www.basketball-reference.com/awards/aw...
2,2020,https://www.basketball-reference.com/awards/aw...
3,2019,https://www.basketball-reference.com/awards/aw...
4,2018,https://www.basketball-reference.com/awards/aw...
...,...,...
64,1958,https://www.basketball-reference.com/awards/aw...
65,1957,https://www.basketball-reference.com/awards/aw...
66,1956,https://www.basketball-reference.com/awards/aw...
67,All_NBA,https://www.basketball-reference.com/awards/al...


In [81]:
# Upload MVP and DPOY voting results. The last two rows of df_awards are the
# All_NBA / All_Defensive entries, which are handled in a separate cell.
award_fields = {"mvp": FIELD_MVP, "dpoy": FIELD_DPOY}  # table id -> player document field
season_rows = df_awards[['Season', 'Voting_Url']].values[:-2]

for i, (season, voting_url) in enumerate(season_rows, start=1):
    sys.stdout.write(f"\r{i}/{len(season_rows)}")

    content = request_data(voting_url, 3.0, False)

    for table_id, award_field in award_fields.items():
        table = content.find("table", id=table_id)

        # Some pages use the id "nba_mvp" for the MVP table instead.
        if table is None and table_id == "mvp":
            table = content.find("table", id=f"nba_{table_id}")
        if table is None:
            continue

        df_table = pd.read_html(str(table))[0]
        df_table = df_table.droplevel(0, axis=1)
        df_table = df_table[['Rank', 'Player', 'Share']].copy()
        if df_table['Rank'].dtype != np.int64:
            # Strip the "T" from ranks such as "5T"; str() copes with cells that are not strings.
            df_table['Rank'] = df_table['Rank'].apply(lambda cell: int(str(cell).replace("T", "")))

        player_cells = table.find("tbody").find_all("td", attrs={"data-stat": "player"})
        df_table['Player_Urls'] = [urljoin(HOME_URL, td.a['href']) for td in player_cells]

        for rk, share, player_url in df_table[['Rank', 'Share', 'Player_Urls']].values:
            mongodb_agent.update_one(collection_name=COLLECTION_PLAYER, query={"player_id": player_url}, data={"$addToSet": {award_field: {"Season": int(season), "Rank": rk, "Share": share}}})

67/67...

In [105]:
# Upload All-NBA and All-Defensive selections (the last two rows of df_awards).
# label stored in the "Season" column -> (table id on the page, player document field)
award_tables = {
    "All_NBA": ("awards_all_league", FIELD_ALL_NBA),
    "All_Defensive": ("awards_all_defense", FIELD_ALL_DEFENSE),
}

for field, url in df_awards[['Season', 'Voting_Url']].values[-2:]:
    # Skip labels this cell does not know how to handle, rather than reusing a
    # table left over from a previous iteration.
    if field not in award_tables:
        continue
    table_id, award_field = award_tables[field]

    content = request_data(url, 3.0, False)
    print(field if field == "All_NBA" else f"\n{field}")
    table = content.find("table", id=table_id)

    df_table = pd.read_html(str(table))[0]

    keep = df_table['Season'].notna() & (df_table['Lg'] == "NBA") & df_table['Tm'].isin(["1st", "2nd", "3rd"])
    df_table = df_table[keep].drop(columns=[col for col in df_table.columns if col in ['Lg', 'Tm', 'Voting']])
    df_table['Season'] = df_table['Season'].apply(season_to_int)

    # Keep the <tr> elements whose position matches a surviving DataFrame row.
    kept_rows = set(df_table.index)
    list_tr = [tr for idx, tr in enumerate(table.find("tbody").find_all("tr")) if idx in kept_rows]

    for j, (tr, season) in enumerate(zip(list_tr, df_table['Season'].values), start=1):
        sys.stdout.write(f"\r{j}/{len(df_table)}")

        # Player cells carry numeric data-stat values in groups of five; find
        # the first group present in this row.
        # NOTE(review): this never terminates for a row without such a cell.
        start = 1
        while tr.find("td", class_="left", attrs={"data-stat": str(start)}) is None:
            start += 5

        for slot in range(start, start + 5):
            td = tr.find("td", class_="left", attrs={"data-stat": str(slot)})
            if td is None:
                print("break")
                break
            player_url = urljoin(HOME_URL, td.a['href'])
            mongodb_agent.update_one(collection_name=COLLECTION_PLAYER, query={"player_id": player_url}, data={"$addToSet": {award_field: {"Season": int(season)}}})

All_NBA
180/180...
All_Defensive
108/108...

## Get team data

In [107]:
# Build the season index: one row per NBA season with the URLs of its
# summary, standings and playoff-standings pages.
content = request_data(SEASONS_URL, with_comment=False)
content = content.find("table", id="stats")
df = pd.read_html(str(content))[0]
df = df.droplevel(0, axis=1)
df = df.drop(columns=['MVP', 'Rookie of the Year', 'Points', 'Rebounds', 'Assists', 'Win Shares'])
# .copy() so the column assignments below act on an independent frame, not a view.
df = df[df['Lg'] == 'NBA'].copy()

# The season label is the text of a link; its parent tag carries the href.
# find() returns the first node with that text in the table.
df['Url_Season_Summary'] = [
    urljoin(SEASONS_URL, content.find(string=label).parent['href'])
    for label in df['Season'].values
]
df['Url_Season_Standings'] = df['Url_Season_Summary'].apply(lambda cell: cell[:-len(".html")] + "_standings.html")
df['Url_Playoff_Standings'] = df['Url_Season_Standings'].str.replace("leagues", "playoffs")
df['Season'] = df['Season'].apply(season_to_int)

df.to_csv(SEASON_PATH, index=False, encoding="utf-8-sig")
print("Saved to: ", SEASON_PATH)

Saved to:  /Users/jeffreysung/Documents/nba-champion-predictor/data/seasons.csv


In [110]:
# Reload only the three URL columns of the saved season index.
season_url_columns = ['Url_Season_Summary', 'Url_Season_Standings', 'Url_Playoff_Standings']
df_seasons = pd.read_csv(SEASON_PATH, encoding="utf-8-sig", usecols=season_url_columns)
df_seasons

Unnamed: 0,Url_Season_Summary,Url_Season_Standings,Url_Playoff_Standings
0,https://www.basketball-reference.com/leagues/N...,https://www.basketball-reference.com/leagues/N...,https://www.basketball-reference.com/playoffs/...
1,https://www.basketball-reference.com/leagues/N...,https://www.basketball-reference.com/leagues/N...,https://www.basketball-reference.com/playoffs/...
2,https://www.basketball-reference.com/leagues/N...,https://www.basketball-reference.com/leagues/N...,https://www.basketball-reference.com/playoffs/...
3,https://www.basketball-reference.com/leagues/N...,https://www.basketball-reference.com/leagues/N...,https://www.basketball-reference.com/playoffs/...
4,https://www.basketball-reference.com/leagues/N...,https://www.basketball-reference.com/leagues/N...,https://www.basketball-reference.com/playoffs/...
...,...,...,...
69,https://www.basketball-reference.com/leagues/N...,https://www.basketball-reference.com/leagues/N...,https://www.basketball-reference.com/playoffs/...
70,https://www.basketball-reference.com/leagues/N...,https://www.basketball-reference.com/leagues/N...,https://www.basketball-reference.com/playoffs/...
71,https://www.basketball-reference.com/leagues/N...,https://www.basketball-reference.com/leagues/N...,https://www.basketball-reference.com/playoffs/...
72,https://www.basketball-reference.com/leagues/N...,https://www.basketball-reference.com/leagues/N...,https://www.basketball-reference.com/playoffs/...


In [126]:
def _store_team_tables(content, season: int, lg: str):
    """Store every TEAM_SECTIONS table of a season page, creating team documents as needed."""
    for table_id in TEAM_SECTIONS:
        table = content.find("table", id=table_id)
        df_team = pd.read_html(str(table))[0]

        # Change advanced-team columns: flatten the two header levels into
        # "top|bottom", keeping only the bottom label where the top is unnamed.
        if table_id == 'advanced-team':
            df_team.columns = [col[1] if 'Unnamed:' in col[0] else '|'.join([str(level_col) for level_col in col]) for col in df_team.columns]

        # Filter Row/Columns (the last row is dropped)
        df_team = df_team.drop(columns=[col for col in df_team.columns if "Unnamed:" in col])
        df_team = df_team.drop(columns=['Rk', 'Team'])
        df_team = df_team[:-1]

        # Change per_game-opponent columns
        if table_id == 'per_game-opponent':
            df_team.columns = [f"{col}_opp" for col in df_team.columns]

        # Get team url -> team id
        teams_url = [
            urljoin(HOME_URL, td.a['href'])
            for td in table.find("tbody").find_all("td", class_="left", attrs={"data-stat": "team"})
        ]

        if len(teams_url) != len(df_team):
            raise ValueError(
                f"table '{table_id}': found {len(teams_url)} team links but {len(df_team)} data rows"
            )

        # Insert/Update
        for team_url, team_dict in zip(teams_url, df_team.to_dict("records")):
            team_count = mongodb_agent.find(collection_name=COLLECTION_TEAM, query={"team_id": team_url}, count=True)
            if team_count == 0:
                mongodb_agent.insert_one(collection_name=COLLECTION_TEAM, data={"team_id": team_url, "season": int(season), "lg": lg})
            mongodb_agent.update_one(collection_name=COLLECTION_TEAM, query={"team_id": team_url}, data={"$set": {table_id: team_dict}})


def _store_conferences(content):
    """Tag every team on a season page with "East" or "West"."""
    for conference, table_id in [("East", "divs_standings_E"), ("West", "divs_standings_W")]:
        table = content.find("table", id=table_id)

        # No per-conference table: the page has one combined table whose
        # header rows name the division (the original comment dates this
        # layout to seasons before 1970).
        if table is None:
            curr_conference = conference
            table = content.find("table", id="divs_standings_")

            for tr in table.find("tbody").find_all("tr"):
                row_class = tr['class'][0]
                if row_class == "thead" and "East" in tr.text:
                    curr_conference = "East"
                elif row_class == "thead" and "West" in tr.text:
                    curr_conference = "West"

                if row_class == "full_table":
                    mongodb_agent.update_one(collection_name=COLLECTION_TEAM, query={"team_id": urljoin(HOME_URL, tr.a['href'])}, data={"$set": {FIELD_CONFERENCE: curr_conference}})
            # The combined table already covers both conferences.
            return

        # One table per conference
        for th in table.find("tbody").find_all("th", class_="left", attrs={"scope": "row", "data-stat": "team_name"}):
            mongodb_agent.update_one(collection_name=COLLECTION_TEAM, query={"team_id": urljoin(HOME_URL, th.a['href'])}, data={"$set": {FIELD_CONFERENCE: conference}})


def get_season_summary(season: int, lg: str, url: str):
    """Scrape one season summary page and store its team data in MongoDB.

    Args:
        season: Season year stored on newly created team documents.
        lg: League label stored on newly created team documents.
        url: URL of the season summary page.

    Raises:
        ValueError: A stats table has a different number of team links than data rows.
    """
    content = request_data(url, 3.0, with_comment=False)

    # Team documents are created while storing the stats tables, so this must
    # run first: the conference update does not upsert and would otherwise
    # change nothing on a first run.
    _store_team_tables(content, season, lg)
    _store_conferences(content)

In [142]:
def get_season_standing(champ, url: str):
    """Scrape a season's expanded standings and store them on the team documents.

    Args:
        champ: Name of the season's champion. Any non-string value (e.g. NaN
            for a season without a champion) marks every team as non-champion.
        url: URL of the season standings page.

    Raises:
        ValueError: The number of team links differs from the number of table rows.
    """
    content = request_data(url, 3.0, with_comment=False)
    table_id = "expanded_standings"
    table = content.find("table", id=table_id)
    df_team = pd.read_html(str(table))[0]

    # Filter Row/Columns
    df_team = df_team.droplevel(0, axis=1)
    wanted_columns = ['Rk', 'Team', 'Overall', 'Home', 'Road', 'Pre', 'Post', '≤3', '≥10', 'Oct', 'Nov', 'Dec', 'Jan', 'Feb', 'Mar', 'Apr']
    df_team = df_team.drop(columns=[col for col in df_team.columns if col not in wanted_columns])

    # Season
    df_team = df_team.rename(columns={'Rk': 'Rk_Season'})

    # Champion: the name is matched as a plain substring, not as a regex
    if isinstance(champ, str):
        champ_list = df_team['Team'].str.contains(champ, regex=False).to_list()
    else:
        champ_list = [False] * len(df_team)

    # Team
    team_list = df_team.pop("Team")

    # Get team url -> team id
    teams_url = [
        urljoin(HOME_URL, td.a['href'])
        for td in table.find("tbody").find_all("td", class_="left", attrs={"data-stat": "team_name"})
    ]

    # champ_list and team_list come from df_team, so only the links can disagree.
    if len(teams_url) != len(df_team):
        raise ValueError(
            f"table '{table_id}': found {len(teams_url)} team links but {len(df_team)} data rows"
        )

    # Insert/Update: one $set per team carrying name, champion flag and standings
    for team_url, team_dict, champ_bool, team_name in zip(teams_url, df_team.to_dict("records"), champ_list, team_list):
        mongodb_agent.update_one(
            collection_name=COLLECTION_TEAM,
            query={"team_id": team_url},
            data={"$set": {"name": team_name, "champion": champ_bool, table_id: team_dict}},
        )

In [113]:
def champion_share(rank):
    """Map a playoff rank to a share value.

    Rank 0 gives 0, rank 1 gives 100, rank 2 gives 50, ranks 3-4 give 25,
    ranks 5-8 give 12.5, ranks 9-16 give 6.25 and ranks 17-32 give 3.125.
    Any other rank returns None.
    """
    # Ranks 0-4 are matched by equality only.
    exact_shares = {0: 0, 1: 100, 2: 50, 3: 25, 4: 25}
    if rank in exact_shares:
        return exact_shares[rank]

    # The remaining ranks are matched by inclusive range.
    for lowest, highest, share in ((5, 8, 12.5), (9, 16, 6.25), (17, 32, 3.125)):
        if lowest <= rank <= highest:
            return share
    return None

In [115]:
def get_win(cell: str) -> int:
    """Return the leading number of a "wins-losses" record such as "12-4".

    A missing cell (anything ``pd.isna`` reports as missing) counts as 0.
    """
    wins_text = None if pd.isna(cell) else cell.partition('-')[0]
    return 0 if wins_text is None else int(wins_text)

In [147]:
def get_playoff_standing(url: str):
    """Scrape a playoff standings page and store each team's result under FIELD_PLAYOFF.

    Does nothing when the page has no "expanded_standings" table.

    Args:
        url: URL of the playoff standings page.

    Raises:
        ValueError: The number of team links differs from the number of table rows.
    """
    table_id = "expanded_standings"
    content = request_data(url, 3.0, False)
    table = content.find("table", id=table_id)
    if table is None:
        return

    df_team = pd.read_html(str(table))[0]

    # Filter Row/Columns
    df_team = df_team.droplevel(0, axis=1)
    df_team = df_team.drop(columns=[col for col in df_team.columns if col not in ['Rk', 'Overall']])

    # Champion Percent
    df_team['Champion_Percent'] = df_team['Rk'].apply(champion_share)

    # Champion Win share: each team's wins divided by the wins of the first row
    df_team['Overall'] = df_team['Overall'].apply(get_win)
    max_wins = df_team['Overall'].values[0]
    df_team['Overall'] = df_team['Overall'].apply(lambda cell: cell / max_wins)
    df_team = df_team.rename(columns={'Overall': 'Champion_Win_Share'})

    # Get team url -> team id
    teams_url = [
        urljoin(HOME_URL, td.a['href'])
        for td in table.find("tbody").find_all("td", class_="left", attrs={"data-stat": "team_name"})
    ]

    if len(teams_url) != len(df_team):
        raise ValueError(
            f"table '{table_id}': found {len(teams_url)} team links but {len(df_team)} data rows"
        )

    # Insert/Update
    for team_url, team_dict in zip(teams_url, df_team.to_dict("records")):
        mongodb_agent.update_one(collection_name=COLLECTION_TEAM, query={"team_id": team_url}, data={"$set": {FIELD_PLAYOFF: team_dict}})

In [None]:
# Season Summary: scrape every season page. Season and league come from the
# in-memory `df` built by the season-index cell.
for i, url in enumerate(df_seasons['Url_Season_Summary'].unique(), start=1):
    sys.stdout.write(f"\r{i}/{len(df['Url_Season_Summary'].unique())}")
    is_this_season = df['Url_Season_Summary'] == url
    season = df.loc[is_this_season, "Season"].values[0]
    lg = df.loc[is_this_season, "Lg"].values[0]
    get_season_summary(season=season, lg=lg, url=url)

In [144]:
# Season Standings: scrape every standings page. The champion name comes from
# the in-memory `df` built by the season-index cell.
for i, url in enumerate(df_seasons['Url_Season_Standings'].unique(), start=1):
    sys.stdout.write(f"\r{i}/{len(df['Url_Season_Standings'].unique())}")
    is_this_season = df['Url_Season_Standings'] == url
    champ = df.loc[is_this_season, "Champion"].values[0]
    get_season_standing(champ=champ, url=url)

74/74...

In [148]:
# Playoff Standings: scrape every playoff standings page.
for i, url in enumerate(df_seasons['Url_Playoff_Standings'].unique(), start=1):
    sys.stdout.write(f"\r{i}/{len(df['Url_Playoff_Standings'].unique())}")
    get_playoff_standing(url=url)

74/74