In [3]:
import os
import re
import time
import unicodedata
from io import StringIO

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager

In [4]:
def init_driver():
    """Create a headless Chrome WebDriver configured for scraping.

    The ChromeDriver binary is resolved through ``ChromeDriverManager().install()``.

    Returns:
        A ``webdriver.Chrome`` instance with a 7-second page-load timeout.
    """
    options = Options()
    for flag in (
        "--disable-blink-features=AutomationControlled",
        "--disable-infobars",
        "--headless=new",
        "--disable-extensions",
        "--no-sandbox",
        "--disable-dev-shm-usage",
    ):
        options.add_argument(flag)

    # Skip images, stylesheets and fonts so pages load faster.
    blocked_content = dict.fromkeys(
        (
            "profile.managed_default_content_settings.images",
            "profile.managed_default_content_settings.stylesheets",
            "profile.managed_default_content_settings.fonts",
        ),
        2,
    )
    options.add_experimental_option("prefs", blocked_content)

    service = Service(ChromeDriverManager().install())
    browser = webdriver.Chrome(service=service, options=options)
    browser.set_page_load_timeout(7)  # give up quickly on slow pages
    return browser

In [5]:
def draft_year_to_season_string(year):
    """Turn a draft year into the preceding season label, e.g. 2024 -> "2023-24"."""
    season_end = str(year)[-2:]
    return "{}-{}".format(year - 1, season_end)

def clean_name(name):
    """Strip parenthesised notes and generational suffixes from a player name.

    Args:
        name: Raw player name, e.g. "Jabari Smith Jr." or "John Doe (UMD)".

    Returns:
        The name without any "(...)" annotation and without a trailing
        Jr/Sr/II/III/IV/V suffix (case-insensitive, optional period, optional
        preceding comma), with surrounding whitespace removed.
    """
    # remove anything in parentheses (e.g. "(UMD)")
    cleaned_name = re.sub(r'\s*\(.*?\)', '', name)
    # Strip BEFORE matching the suffix: the pattern is anchored with `$`, so
    # trailing whitespace would otherwise keep the suffix from being removed.
    cleaned_name = cleaned_name.strip()
    # remove suffixes (an optional comma covers the "Gary Trent, Jr." spelling)
    cleaned_name = re.sub(r',?\s+(jr|sr|ii|iii|iv|v)\.?$', '', cleaned_name, flags=re.IGNORECASE)
    return cleaned_name.strip()

def clean_school_name(school):
    """Expand a known school abbreviation to its full name.

    Unknown values are returned unchanged.
    """
    full_names = {
        "UConn": "Connecticut",
        "UNC": "North Carolina",
        "LSU": "Louisiana State",
        "USC": "Southern California"
    }
    if school in full_names:
        return full_names[school]
    return school

# SPORTSREFERENCE SCRAPER FOR COLLEGE PLAYERS
def get_per_game_stats(player_name, target_season, school, driver):
    """Scrape one college player's per-game and advanced stats for a season.

    Searches sports-reference.com for the (cleaned) player name. When the
    search lands on a results page, the candidate whose team text contains the
    (alias-expanded) school and "(Men)" and whose last listed year is not after
    ``target_season`` is chosen, preferring the most recent last year.

    Args:
        player_name: Player name as it appears in the prospect pool.
        target_season: Draft year (int); stats are taken from the season
            string produced by ``draft_year_to_season_string``.
        school: School name used to disambiguate search results.
        driver: Selenium WebDriver used for navigation.

    Returns:
        A DataFrame with the per-game and advanced rows for the target season
        merged on "Season" and "Name" as the first column. An empty DataFrame
        is returned when no candidate matches, either table is missing or has
        no row for the season, or any exception occurs (the error is printed).
    """
    try:
        driver.get("https://www.sports-reference.com/cbb/players/")

        # wait for search bar to appear
        search_bar = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "input[name='search']"))
        )
        search_bar.clear()
        search_bar.send_keys(clean_name(player_name))

        search_bar.send_keys(Keys.RETURN)
        time.sleep(3)  # fixed pause while the result (or player) page loads
        soup = BeautifulSoup(driver.page_source, "html.parser")

        # handle search page if any (multiple results)
        if soup.find("div", class_="search-item"):
            candidates = []
            cleaned_school = clean_school_name(school).lower()

            for item in soup.select("div.search-item"):
                name_tag = item.select_one("a")
                url = name_tag.get("href") if name_tag else None
                if not url:
                    # No link means nothing to navigate to; skipping also keeps
                    # the (year, url) tuples sortable.
                    continue

                # Only a text node can carry the "(YYYY-YYYY)" years; a sibling
                # tag is treated as "no years" instead of aborting the lookup.
                sibling = name_tag.next_sibling
                years_text = sibling.strip() if isinstance(sibling, str) else ""

                team_tag = item.select_one("div.search-item-team")
                team_text = team_tag.text if team_tag else ""

                # extract last year played:
                # (\d{4}) captures a 4-digit year, \) is the closing parenthesis
                # and $ anchors to the end, so only the final year is captured.
                year_match = re.search(r'(\d{4})\)$', years_text)
                last_year = int(year_match.group(1)) if year_match else 0

                if cleaned_school in team_text.lower() and "(Men)" in team_text and last_year <= target_season:
                    candidates.append((last_year, url))

            if not candidates:
                print(f"No relevant results found for {player_name}")
                return pd.DataFrame()

            best_candidate = sorted(candidates, reverse=True)[0][1]  # url of the most recent candidate
            driver.get(f"https://www.sports-reference.com{best_candidate}")
            time.sleep(1)
            soup = BeautifulSoup(driver.page_source, "html.parser")

        # At this point `soup` is the player page (direct hit or chosen candidate).
        season = draft_year_to_season_string(target_season)

        table = soup.find("table", id="players_per_game")
        if table is not None:
            # turn into pandas df
            df = pd.read_html(StringIO(str(table)))[0]
            # .copy() so later column changes act on an independent frame
            # rather than a slice of the parsed table.
            df = df[df['Season'] == season].copy()
            df = df.drop(columns=['Team', 'Conf', 'Class', 'Awards'])
            df.insert(0, "Name", player_name)
        else:
            print(f"No per-game stats found for {player_name}")
            df = pd.DataFrame()

        # advanced stats
        advanced_table = soup.find("table", id="players_advanced")
        if advanced_table is not None:
            advanced_df = pd.read_html(StringIO(str(advanced_table)))[0]
            # Drop the columns already present in the per-game frame so the
            # merge on "Season" does not produce duplicated _x/_y columns.
            advanced_df = advanced_df.drop(columns=['Team', 'Conf', 'Class', 'Awards', 'Pos', 'G', 'GS', 'MP'])
            advanced_df = advanced_df[advanced_df['Season'] == season]
        else:
            print(f"No advanced stats found for {player_name}") 
            advanced_df = pd.DataFrame()

        if not df.empty and not advanced_df.empty:
            result_df = pd.merge(df, advanced_df, on="Season")
        else:
            print("something is empty")
            result_df = pd.DataFrame()

        return result_df.reset_index(drop=True)

    except Exception as e:
        # Best-effort scraper: report the failure and let the caller move on.
        print(f"Error for {player_name}: {e}")
        return pd.DataFrame() 

In [6]:
# testing for indiv player
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
try:
    devin_carter_stats = get_per_game_stats("Devin Carter", 2024, "Providence", driver)
finally:
    # Always close the browser; otherwise every run of this cell leaves a
    # Chrome process behind.
    driver.quit()
devin_carter_stats

Unnamed: 0,Name,Season,Pos,G,GS,MP,FG,FGA,FG%,3P,...,BLK%,TOV%,USG%,OWS,DWS,WS,WS/40,OBPM,DBPM,BPM
0,Devin Carter,2023-24,G,33.0,33.0,35.3,6.6,14.0,0.473,2.5,...,2.8,14.1,28.1,3.3,2.7,6.0,0.206,7.3,5.0,12.3


In [7]:
def get_college_prospects(year):
    """Load the prospect pool for ``year`` and keep only the college players.

    Returns:
        A tuple ``(college_df, names, teams)``: the filtered frame plus its
        'name' and 'team' columns as lists.
    """
    pool = pd.read_csv(f"../data/raw/{year}/prospect_pool_{year}.csv")
    is_college = pool['classification'] == 'College'
    college_df = pool[is_college]
    names = college_df['name'].tolist()
    teams = college_df['team'].tolist()
    return college_df, names, teams

def collect_college_stats(year):
    """Scrape stats for every college prospect in the ``year`` pool.

    Opens one Chrome driver, runs ``get_per_game_stats`` for each
    (name, school) pair and always quits the driver afterwards.

    Returns:
        One DataFrame with a row per player whose scrape returned data
        (empty DataFrame if none did).
    """
    _, names, schools = get_college_prospects(year)
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
    # Collect per-player frames and concatenate once at the end; concatenating
    # inside the loop re-copies all accumulated rows on every iteration.
    frames = []

    try: 
        for name, school in zip(names, schools):
            try:
                stats_df = get_per_game_stats(name, year, school, driver)
                if not stats_df.empty:
                    frames.append(stats_df)
            except Exception as e:
                print(f"Error processing {name}: {e}")
                continue
    finally:
        driver.quit()

    result_df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
    #result_df.to_csv(f"../data/processed/{year}/college_stats_{year}.csv", index=False)
    return result_df 

In [8]:
# testing for 2024 pool: scrapes every college prospect in the 2024 pool
# (opens a Chrome window and queries sports-reference.com once per player)
collect_college_stats(2024)

Unnamed: 0,Name,Season,Pos,G,GS,MP,FG,FGA,FG%,3P,...,BLK%,TOV%,USG%,OWS,DWS,WS,WS/40,OBPM,DBPM,BPM
0,Reed Sheppard,2023-24,G,33.0,5.0,28.9,4.3,8.0,0.536,2.3,...,2.5,18.3,18.0,3.0,1.4,4.5,0.188,6.4,5.0,11.4
1,Stephon Castle,2023-24,G,34.0,30.0,27.0,4.0,8.5,0.472,0.6,...,2.1,13.0,22.0,2.5,1.7,4.3,0.186,3.4,3.1,6.5
2,Donovan Clingan,2023-24,C,35.0,33.0,22.5,5.3,8.3,0.639,0.1,...,11.4,7.3,25.1,3.8,2.2,6.0,0.302,8.8,6.2,15.0
3,Rob Dillingham,2023-24,G,32.0,1.0,23.3,5.4,11.3,0.475,2.0,...,0.3,13.5,30.3,2.6,0.7,3.3,0.174,5.6,0.7,6.2
4,Zach Edey,2023-24,C,39.0,39.0,32.0,8.6,13.8,0.623,0.0,...,6.9,10.8,33.4,7.9,2.6,10.5,0.336,13.1,3.7,16.8
5,Cody Williams,2023-24,F,24.0,18.0,28.4,4.5,8.1,0.552,0.7,...,2.4,17.2,20.8,1.2,0.8,2.0,0.118,2.4,1.5,3.9
6,Devin Carter,2023-24,G,33.0,33.0,35.3,6.6,14.0,0.473,2.5,...,2.8,14.1,28.1,3.3,2.7,6.0,0.206,7.3,5.0,12.3
7,Carlton Carrington,2023-24,G,33.0,33.0,33.2,4.8,11.7,0.412,2.0,...,0.9,13.0,23.0,2.2,1.4,3.6,0.133,2.6,1.3,3.9
8,Kel'el Ware,2023-24,C,30.0,30.0,32.2,6.3,10.7,0.586,0.6,...,6.5,10.9,22.9,2.7,1.5,4.2,0.175,5.8,2.7,8.5
9,Jared McCain,2023-24,G,36.0,36.0,31.6,4.9,10.5,0.462,2.4,...,0.2,10.1,21.1,3.3,1.9,5.2,0.182,6.0,2.6,8.5


In [9]:
def get_last_name_initial(name):
    """Return the first character of the player's last name.

    The last name is taken to be the second whitespace-separated word, so a
    trailing suffix ("Ron Holland II") does not affect the result. A
    single-word name falls back to that word's initial instead of raising
    IndexError, and an empty/blank name yields "".
    """
    parts = name.split()
    if not parts:
        return ""
    surname = parts[1] if len(parts) > 1 else parts[0]
    return surname[0]

# BASKETBALLREFERENCE SCRAPER - FOR G LEAGUE IGNITE
def get_gleague_per_game_stats(player_name, target_season, driver):
    """Scrape one G League player's per-game and advanced stats for a season.

    Opens the basketball-reference G League index page for the player's
    last-name initial, finds paragraphs whose link text equals the player name
    (case-insensitive), and keeps those whose latest listed year is not after
    ``target_season``, preferring the most recent.

    Args:
        player_name: Player name as it appears in the prospect pool.
        target_season: Draft year (int); stats are taken from the season
            string produced by ``draft_year_to_season_string``.
        driver: Selenium WebDriver used for navigation.

    Returns:
        A DataFrame with the per-game and advanced rows for the target season
        merged on "Season", with "Name" first and "Pos" ("G", "F", "C" or "")
        second. An empty DataFrame is returned when no player matches, either
        table is missing or has no row for the season, or any exception occurs
        (the error is printed).
    """
    try:
        driver.get(f"https://www.basketball-reference.com/gleague/players/{get_last_name_initial(player_name)}/")
        time.sleep(2)

        soup = BeautifulSoup(driver.page_source, "html.parser")

        matches = []
        for p in soup.select("p"):
            a_tag = p.find("a")
            if a_tag and player_name.lower() == a_tag.text.strip().lower():
                full_text = p.get_text().strip()

                # extract all 4-digit years; \d{4} guarantees int() succeeds
                years = re.findall(r'\d{4}', full_text)
                if years:
                    last_year = max(map(int, years))
                    if last_year <= target_season:
                        matches.append((last_year, a_tag["href"]))

        if not matches:
            print(f"No valid G-League player match for {player_name}")
            return pd.DataFrame()

        # pick the match with the latest last_year
        best_match = sorted(matches, reverse=True)[0][1]
        player_url = "https://www.basketball-reference.com" + best_match

        driver.get(player_url)
        time.sleep(2)
        soup = BeautifulSoup(driver.page_source, "html.parser")

        # parse player page with beautifulsoup
        ## extract position
        position = ""
        info_div = soup.find("div", id="meta")
        if info_div:
            for p in info_div.find_all("p"):
                strong = p.find("strong")
                if strong and "Position" in strong.text:
                    raw_pos = p.get_text().replace("Position:", "").strip().lower()
                    if "guard" in raw_pos:
                        position = "G"
                    elif "forward" in raw_pos:
                        position = "F"
                    elif "center" in raw_pos:
                        # Centers previously fell through with an empty position.
                        position = "C"
                    break

        season = draft_year_to_season_string(target_season)

        table = soup.find("table", id="nbdl_per_game-reg")
        if table is not None:
            # turn into pandas df
            df = pd.read_html(StringIO(str(table)))[0]
            # .copy() so later column changes act on an independent frame
            # rather than a slice of the parsed table.
            df = df[df['Season'] == season].copy()
            df = df.drop(columns=['Tm'])
            df.insert(0, "Name", player_name)
        else:
            print(f"No per-game stats found for {player_name}")
            df = pd.DataFrame()

        # advanced stats
        advanced_table = soup.find("table", id="nbdl_advanced-reg")
        if advanced_table is not None:
            advanced_df = pd.read_html(StringIO(str(advanced_table)))[0]
            # Drop columns already present in the per-game frame before merging.
            advanced_df = advanced_df.drop(columns=['Tm', 'G', 'MP'])
            advanced_df = advanced_df[advanced_df['Season'] == season]
        else:
            print(f"No advanced stats found for {player_name}") 
            advanced_df = pd.DataFrame()

        if not df.empty and not advanced_df.empty:
            result_df = pd.merge(df, advanced_df, on="Season")
        else:
            print("something is empty")
            result_df = pd.DataFrame()

        # insert position column if we have data
        if not result_df.empty:
            result_df.insert(1, "Pos", position)

        return result_df.reset_index(drop=True)

    except Exception as e:
        # Best-effort scraper: report the failure and let the caller move on.
        print(f"Error for {player_name}: {e}")
        return pd.DataFrame() 

In [10]:
# testing for indiv player
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
try:
    tyler_smith_stats = get_gleague_per_game_stats("Tyler Smith", 2024, driver)
finally:
    # Always close the browser; otherwise every run of this cell leaves a
    # Chrome process behind.
    driver.quit()
tyler_smith_stats

Unnamed: 0,Name,Pos,Season,G,GS,MP,FG,FGA,FG%,3P,...,BLK%,TOV%,USG%,ORtg,DRtg,Unnamed: 19,OWS,DWS,WS,WS/48
0,Tyler Smith,F,2023-24,27,2,22.0,4.7,9.9,0.481,1.4,...,4.0,10.0,24.4,111,122,,0.2,0.3,0.5,0.043


In [11]:
def get_gleague_prospects(year):
    """Load the prospect pool for ``year`` and keep only G League players.

    Returns:
        A tuple ``(gleague_df, names)``: the filtered frame and its 'name'
        column as a list.
    """
    pool = pd.read_csv(f"../data/raw/{year}/prospect_pool_{year}.csv")
    is_gleague = pool['classification'] == 'G League'
    gleague_df = pool[is_gleague]
    names = gleague_df['name'].tolist()
    return gleague_df, names

def collect_gleague_stats(year):
    """Scrape stats for every G League prospect in the ``year`` pool.

    Opens one Chrome driver, runs ``get_gleague_per_game_stats`` for each
    name and always quits the driver afterwards.

    Returns:
        One DataFrame with a row per player whose scrape returned data
        (empty DataFrame if none did).
    """
    _, names = get_gleague_prospects(year)
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
    # Collect per-player frames and concatenate once at the end; concatenating
    # inside the loop re-copies all accumulated rows on every iteration.
    frames = []

    try:
        for name in names:
            try:
                stats_df = get_gleague_per_game_stats(name, year, driver)
                if not stats_df.empty:
                    frames.append(stats_df)
            except Exception as e:
                print(f"Error processing {name}: {e}")
                continue
    finally:
        driver.quit()

    result_df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
    # result_df.to_csv(f"../data/processed/{year}/gleague_stats_{year}.csv", index=False)
    return result_df


In [12]:
# testing for 2024 pool: scrapes every G League prospect in the 2024 pool
# (opens a Chrome window and queries basketball-reference.com per player)
collect_gleague_stats(2024)

Unnamed: 0,Name,Pos,Season,G,GS,MP,FG,FGA,FG%,3P,...,BLK%,TOV%,USG%,ORtg,DRtg,Unnamed: 19,OWS,DWS,WS,WS/48
0,Ron Holland,F,2023-24,14,14,33.6,6.9,15.6,0.445,0.9,...,1.7,13.9,27.7,99,118,,-0.6,0.4,-0.2,-0.018
1,Matas Buzelis,F,2023-24,26,26,32.0,5.4,12.1,0.448,0.9,...,5.8,14.6,21.7,97,121,,-1.0,0.4,-0.6,-0.035
2,Tyler Smith,F,2023-24,27,2,22.0,4.7,9.9,0.481,1.4,...,4.0,10.0,24.4,111,122,,0.2,0.3,0.5,0.043


In [13]:
# TANKATHON SCRAPING

def normalise_name(name):
    """Build the dash-separated, lowercase ASCII slug used in Tankathon player URLs.

    Accents are removed, apostrophes become word breaks, periods are dropped
    and the remaining words are joined with dashes, e.g.
    "Ousmane N'Diaye" -> "ousmane-n-diaye".
    """
    # Fold typographic apostrophes into the ASCII one first: the ASCII encode
    # below would otherwise delete them, so the same name would produce a
    # different slug depending on which apostrophe character was typed.
    for fancy_apostrophe in ("\u2019", "\u2018", "\u02bc"):
        name = name.replace(fancy_apostrophe, "'")

    # normalise accents: decompose, then drop the non-ASCII combining marks
    name = unicodedata.normalize("NFKD", name)
    name = name.encode("ascii", "ignore").decode("utf-8")
    # replace apostrophes with a space, remove periods, join words with dashes
    name = name.lower().replace("'", " ").replace(".", "")
    return "-".join(name.split())

def extract_stats_section(soup, header_match_text):
    """Find the stats container that follows a matching section header.

    Scans ``div.stats-header`` elements in document order and, for the first
    one whose text contains ``header_match_text`` (case-insensitive), returns
    its next sibling ``div.stats``. Returns None when no header matches.
    """
    matching_headers = (
        candidate
        for candidate in soup.find_all("div", class_="stats-header")
        if header_match_text.lower() in candidate.get_text().lower()
    )
    first_match = next(matching_headers, None)
    if first_match is None:
        return None
    return first_match.find_next_sibling("div", class_="stats")

def parse_stats_div(stats_div):
    """Convert a stats container into a ``{label: value}`` dict of stripped strings.

    Each ``div.stat-container`` contributes one entry; containers missing
    either the ``stat-label`` or the ``stat-data`` div are skipped.
    """
    label_value_pairs = (
        (box.find("div", class_="stat-label"), box.find("div", class_="stat-data"))
        for box in stats_div.find_all("div", class_="stat-container")
    )
    return {
        label.text.strip(): value.text.strip()
        for label, value in label_value_pairs
        if label and value
    }

def classify_position(raw_position):
    """Collapse a raw position string to "G", "F" or "C".

    The letters are checked in that priority order against the upper-cased
    input; "" is returned when none of them occurs.
    """
    upper_position = raw_position.upper()
    return next((letter for letter in ("G", "F", "C") if letter in upper_position), "")

def get_tankathon_profile_stats(player_name, driver):
    """Scrape a player's Tankathon profile into a one-row DataFrame.

    Loads the profile page for the normalised name, waits up to 4 seconds for
    a ``data-block`` element, then reads the position, the age at draft and
    the "per game averages" / "advanced stats i" / "advanced stats ii" sections.

    Returns:
        A single-row DataFrame with "Name", "Age" and "Pos" followed by the
        scraped stat columns. "Age" and "Pos" are "" when not found. An empty
        DataFrame is returned when no stats are found or on any exception
        (the error is printed).
    """
    try:
        slug = normalise_name(player_name)
        driver.get(f"https://www.tankathon.com/players/{slug}")

        WebDriverWait(driver, 4).until(
            EC.presence_of_element_located((By.CLASS_NAME, "data-block"))
        )

        soup = BeautifulSoup(driver.page_source, "html.parser")

        def first_labelled_data(label_fragment):
            # Data div of the FIRST data-block whose label contains the
            # fragment; None if no label matches or that block has no data div.
            for block in soup.select("div.data-block"):
                label = block.find("div", class_="label")
                if label and label_fragment in label.text.lower():
                    return block.find("div", class_="data")
            return None

        # extract position
        position = ""
        position_div = first_labelled_data("position")
        if position_div:
            position = classify_position(position_div.text.strip())

        # extract age at draft
        age = ""
        age_div = first_labelled_data("age at draft")
        if age_div:
            age_match = re.search(r"\d+\.\d+", age_div.text)
            if age_match:
                age = float(age_match.group())

        # locate every stat section first, then parse the ones that exist
        section_divs = [
            extract_stats_section(soup, "per game averages"),
            extract_stats_section(soup, "advanced stats i"),
            extract_stats_section(soup, "advanced stats ii"),
        ]
        all_stats = {}
        for section_div in section_divs:
            if section_div:
                # later sections overwrite earlier ones on duplicate labels
                all_stats.update(parse_stats_div(section_div))

        if not all_stats:
            print(f"No stats found for {player_name} on Tankathon")
            return pd.DataFrame()

        profile_df = pd.DataFrame([all_stats])
        leading_columns = (("Name", player_name), ("Age", age), ("Pos", position))
        for column_index, (column_name, column_value) in enumerate(leading_columns):
            profile_df.insert(column_index, column_name, column_value)
        return profile_df

    except Exception as e:
        print(f"Error scraping Tankathon profile for {player_name}: {e}")
        return pd.DataFrame()

In [14]:
# testing for indiv player
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
try:
    alexandre_sarr_stats = get_tankathon_profile_stats("Alexandre Sarr", driver)
finally:
    # Always close the browser; otherwise every run of this cell leaves a
    # Chrome process behind.
    driver.quit()
alexandre_sarr_stats

Unnamed: 0,Name,Age,Pos,G,MP,FGM-FGA,FG%,3PM-3PA,3P%,FTM-FTA,...,AST/TO,PER,OWS/40,DWS/40,WS/40,ORTG,DRTG,OBPM,DBPM,BPM
0,Alexandre Sarr,19.15,F,30,18.0,3.6-7.1,0.5,0.5-1.9,0.276,1.9-2.7,...,1.0,20.96,,,,116.0,107.9,,,


In [15]:
def get_noncollege_prospects(year):
    """Load the prospect pool for ``year`` and keep International / Overtime Elite players.

    Returns:
        A tuple ``(noncollege_df, names)``: the filtered frame and its 'name'
        column as a list.
    """
    pool = pd.read_csv(f"../data/raw/{year}/prospect_pool_{year}.csv")
    wanted_classes = ['International', 'Overtime Elite']
    is_noncollege = pool['classification'].isin(wanted_classes)
    noncollege_df = pool[is_noncollege]
    names = noncollege_df['name'].tolist()
    return noncollege_df, names

def collect_noncollege_stats(year):
    """Scrape Tankathon stats for every non-college prospect in the ``year`` pool.

    Uses one driver from ``init_driver``, runs ``get_tankathon_profile_stats``
    for each name and always quits the driver afterwards.

    Returns:
        One DataFrame with a row per player whose scrape returned data, with
        missing values replaced by "" (empty DataFrame if none did).
    """
    _, names = get_noncollege_prospects(year)
    driver = init_driver()
    # Collect per-player frames and concatenate once at the end; concatenating
    # inside the loop re-copies all accumulated rows on every iteration.
    frames = []

    try:
        for name in names:
            try:
                stats_df = get_tankathon_profile_stats(name, driver)
                if not stats_df.empty:
                    frames.append(stats_df)
            except Exception as e:
                print(f"Error processing {name}: {e}")
                continue
    finally:
        driver.quit()

    result_df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
    # result_df.to_csv(f"../data/processed/{year}/noncollege_stats_{year}.csv", index=False)
    # Players lack different stat sections, so the union of columns has gaps.
    result_df = result_df.fillna("")  
    return result_df


In [16]:
# testing for 2024 pool: scrapes Tankathon profiles for the International /
# Overtime Elite prospects
collect_noncollege_stats(2024)

Unnamed: 0,Name,Age,Pos,G,MP,FGM-FGA,FG%,3PM-3PA,3P%,FTM-FTA,...,AST/TO,PER,OWS/40,DWS/40,WS/40,ORTG,DRTG,OBPM,DBPM,BPM
0,Zaccharie Risacher,19.2,F,65,23.3,3.9-8.3,0.47,1.4-3.7,0.387,1.9-2.7,...,0.57,15.19,,,,108.5,104.0,,,
1,Alexandre Sarr,19.15,F,30,18.0,3.6-7.1,0.5,0.5-1.9,0.276,1.9-2.7,...,1.0,20.96,,,,116.0,107.9,,,
2,Tidjane Salaun,18.86,F,54,23.4,3.4-8.3,0.406,1.4-4.4,0.316,1.6-2.2,...,0.7,13.07,,,,105.0,110.0,,,
3,Nikola Topić,18.86,G,23,27.8,5.3-10.6,0.498,1.1-3.7,0.306,2.8-3.2,...,2.4,19.85,,,,121.4,107.9,,,
4,AJ Johnson,19.55,G,29,8.2,1.1-3.2,0.355,0.3-1.2,0.278,0.2-0.4,...,1.15,7.08,,,,85.8,112.2,,,
5,Pacome Dadiet,18.9,G,59,15.3,2.4-4.7,0.502,0.8-2.3,0.358,1.0-1.4,...,0.71,14.31,,,,115.9,110.7,,,
6,Juan Núñez,20.04,G,54,23.1,3.7-7.9,0.47,0.8-2.6,0.319,1.6-2.7,...,2.14,17.31,,,,108.5,106.2,,,
7,Bobi Klintman,21.29,F,26,21.6,3.3-7.6,0.434,1.2-3.5,0.337,2.3-2.9,...,0.53,13.86,,,,103.2,108.4,,,
8,Nikola Djurisic,20.32,G,36,29.9,4.9-10.9,0.444,1.1-3.6,0.305,3.9-5.2,...,1.07,13.67,,,,102.4,112.7,,,
9,Melvin Ajinca,19.98,G,35,24.8,3.3-8.2,0.399,2.0-5.6,0.357,1.6-2.0,...,1.0,12.64,,,,114.5,107.4,,,


In [17]:
# same check against the 2023 pool
collect_noncollege_stats(2023)

Unnamed: 0,Name,Age,Pos,G,MP,FGM-FGA,FG%,3PM-3PA,3P%,FTM-FTA,...,AST/TO,PER,OWS/40,DWS/40,WS/40,ORTG,DRTG,OBPM,DBPM,BPM
0,Victor Wembanyama,19.45,C,44,32.2,7.3-15.6,0.468,1.3-4.7,0.272,5.0-6.1,...,0.84,24.48,,,,109.8,100.2,,,
1,Amen Thompson,20.38,G,21,30.0,6.3-11.7,0.539,0.8-3.0,0.254,3.1-4.6,...,2.13,,,,,,,,,
2,Ausar Thompson,20.38,G,21,29.2,6.4-13.6,0.472,1.6-4.7,0.333,3.0-4.3,...,1.78,,,,,,,,,
3,Bilal Coulibaly,18.9,F,53,24.0,3.9-7.4,0.527,0.8-2.2,0.336,2.3-3.3,...,0.88,17.53,,,,114.4,100.7,,,
4,James Nnaji,18.85,C,56,9.0,1.4-2.0,0.717,0.0-0.0,,0.6-1.2,...,0.38,16.98,,,,113.3,103.5,,,
5,Tristan Vukčević,20.27,F,40,11.6,2.1-3.7,0.558,0.6-1.5,0.373,0.9-1.1,...,1.13,19.37,,,,122.4,106.0,,,
6,Rayan Rupert,19.05,G,31,18.1,2.2-6.0,0.369,0.8-2.6,0.312,1.5-2.1,...,0.74,9.96,,,,97.7,105.1,,,
7,Tarik Biberovic,22.39,F,36,13.1,1.6-4.1,0.385,0.9-2.5,0.348,0.5-0.6,...,0.76,9.54,,,,104.4,113.2,,,
8,Malcolm Cazalon,21.81,G,39,26.6,4.5-10.0,0.45,1.6-5.1,0.32,2.3-2.9,...,1.42,16.53,,,,108.8,110.1,,,
9,Nadir Hifi,20.93,G,33,30.5,5.6-12.1,0.465,1.8-5.4,0.345,3.7-4.4,...,1.62,18.38,,,,116.3,110.2,,,


In [18]:
def get_age_from_tankathon(player_name, driver):
    """Look up a player's age at draft on their Tankathon profile.

    Returns:
        The age as a float taken from the first "age at draft" data block
        containing a decimal number, or "" when none is found or an exception
        occurs (the error is printed).
    """
    try:
        slug = normalise_name(player_name)
        driver.get(f"https://www.tankathon.com/players/{slug}")

        WebDriverWait(driver, 4).until(
            EC.presence_of_element_located((By.CLASS_NAME, "data-block"))
        )

        page = BeautifulSoup(driver.page_source, "html.parser")

        for block in page.select("div.data-block"):
            label = block.find("div", class_="label")
            if not label or "age at draft" not in label.text.lower():
                continue
            data_div = block.find("div", class_="data")
            if not data_div:
                continue
            # keep scanning later blocks if this one holds no decimal number
            age_match = re.search(r"\d+\.\d+", data_div.text)
            if age_match:
                return float(age_match.group())
        return ""

    except Exception as e:
        print(f"Error scraping age for {player_name}: {e}")
        return ""
    
def add_ages_to_df(df, driver):
    """Insert an "Age" column (second position) looked up per player on Tankathon.

    Args:
        df: Stats frame with a 'Name' column. It is modified in place.
        driver: Selenium WebDriver passed to ``get_age_from_tankathon``.

    Returns:
        The same ``df`` object. A frame without a 'Name' column (e.g. the
        empty frame a collector returns when nothing was scraped) is returned
        untouched instead of raising KeyError.
    """
    if "Name" not in df.columns:
        return df

    ages = [get_age_from_tankathon(name, driver) for name in df['Name']]
    df.insert(1, "Age", ages)
    return df

In [19]:
# COMBINE ALL SCRAPER DFs FOR SINGLE CSV 
def collect_draftpool_stats(year):
    """Scrape all prospect groups for ``year``, combine them and write one CSV.

    College and G League frames get an "Age" column via ``add_ages_to_df``;
    the non-college frame is concatenated as returned by its collector.

    Returns:
        The combined DataFrame, which is also written to
        ``../data/processed/{year}/draftpool_stats_{year}.csv``.
    """
    college_stats = collect_college_stats(year)
    gleague_stats = collect_gleague_stats(year)
    noncollege_stats = collect_noncollege_stats(year)

    driver = init_driver()
    try:
        college_stats = add_ages_to_df(college_stats, driver)
        gleague_stats = add_ages_to_df(gleague_stats, driver)
    finally:
        driver.quit()

    combined_df = pd.concat([college_stats, gleague_stats, noncollege_stats], ignore_index=True)

    # Create the output folder first: a missing directory would otherwise make
    # to_csv fail and discard the whole scrape.
    output_dir = f"../data/processed/{year}"
    os.makedirs(output_dir, exist_ok=True)
    combined_df.to_csv(f"{output_dir}/draftpool_stats_{year}.csv", index=False)
    return combined_df

In [20]:
# collect 2024 draft stats (also writes ../data/processed/2024/draftpool_stats_2024.csv)
collect_draftpool_stats(2024)

Unnamed: 0,Name,Age,Season,Pos,G,GS,MP,FG,FGA,FG%,...,Effective FG%EFG%,3PA Rate3PAR,FTA RateFTAR,Proj NBA 3P%NBA 3P%,AST/USG,AST/TO,OWS/40,DWS/40,ORTG,DRTG
0,Reed Sheppard,19.99,2023-24,G,33.0,5.0,28.9,4.3,8.0,0.536,...,,,,,,,,,,
1,Stephon Castle,19.63,2023-24,G,34.0,30.0,27.0,4.0,8.5,0.472,...,,,,,,,,,,
2,Donovan Clingan,20.32,2023-24,C,35.0,33.0,22.5,5.3,8.3,0.639,...,,,,,,,,,,
3,Rob Dillingham,19.46,2023-24,G,32.0,1.0,23.3,5.4,11.3,0.475,...,,,,,,,,,,
4,Zach Edey,22.10,2023-24,C,39.0,39.0,32.0,8.6,13.8,0.623,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69,Ulrich Chomche,18.47,,F,3,,30.0,,,.424,...,.545,.636,.121,,0.81,0.67,,,89.8,75.9
70,Ariel Hukporti,22.19,,C,43,,17.7,,,.563,...,.563,.000,.504,.306,0.31,0.45,,,110.0,100.6
71,Trentyn Flowers,19.28,,G,21,,14.4,,,.436,...,.489,.298,.394,.332,0.35,0.42,,,89.7,115.2
72,Ousmane N'Diaye,20.25,,F,13,,17.6,,,.434,...,.519,.642,.245,,0.11,0.17,,,103.2,120.0


In [21]:
# collect 2023 draft stats (also writes ../data/processed/2023/draftpool_stats_2023.csv)
collect_draftpool_stats(2023)

No valid G-League player match for Leonard Miller
No valid G-League player match for Sidy Cissoko


Unnamed: 0,Name,Age,Season,Pos,G,GS,MP,FG,FGA,FG%,...,Effective FG%EFG%,3PA Rate3PAR,FTA RateFTAR,Proj NBA 3P%NBA 3P%,AST/USG,AST/TO,OWS/40,DWS/40,ORTG,DRTG
0,Brandon Miller,20.57,2022-23,F,37.0,37.0,32.6,6.0,13.9,0.43,...,,,,,,,,,,
1,Anthony Black,19.41,2022-23,G,36.0,36.0,34.9,4.1,9.1,0.453,...,,,,,,,,,,
2,Jarace Walker,19.79,2022-23,F,36.0,35.0,27.6,4.4,9.4,0.465,...,,,,,,,,,,
3,Taylor Hendricks,19.57,2022-23,F,34.0,34.0,34.7,5.4,11.3,0.478,...,,,,,,,,,,
4,Cason Wallace,19.61,2022-23,G,32.0,32.0,32.2,4.3,9.8,0.446,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
65,Tristan Vukčević,20.27,,F,40,,11.6,,,.558,...,.633,.401,.306,.362,0.60,1.13,,,122.4,106.0
66,Rayan Rupert,19.05,,G,31,,18.1,,,.369,...,.436,.428,.348,.350,0.36,0.74,,,97.7,105.1
67,Tarik Biberovic,22.39,,F,36,,13.1,,,.385,...,.490,.601,.142,.373,0.29,0.76,,,104.4,113.2
68,Malcolm Cazalon,21.81,,G,39,,26.6,,,.450,...,.531,.506,.288,.368,0.80,1.42,,,108.8,110.1
