In [91]:
from selenium import webdriver
from selenium.webdriver.common.by import By 
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import time
import pandas as pd
import re
from io import StringIO

In [89]:
def draft_year_to_season_string(year):
    """Build the season label for the season that ends in ``year``.

    The label is ``year - 1``, a hyphen, then the last two characters of
    ``year`` rendered as text (for example ``2024`` -> ``"2023-24"``).
    """
    season_end_suffix = str(year)[-2:]
    return "-".join([str(year - 1), season_end_suffix])

def clean_name(name):
    """Normalize a player name before it is typed into the search box.

    Two things are removed:
      * any parenthesised annotation, e.g. ``"(UMD)"``;
      * a trailing generational suffix (jr, sr, ii, iii, iv, v; matched
        case-insensitively, with an optional period and an optional comma
        in front of it, e.g. ``"Tim Hardaway, Jr."``).

    Args:
        name: Raw player name string.

    Returns:
        The cleaned name with surrounding whitespace stripped.
    """
    # remove anything in parentheses (e.g. "(UMD)")
    cleaned_name = re.sub(r'\s*\(.*?\)', '', name)
    # Trim first: the suffix pattern is anchored with `$`, so trailing
    # whitespace would otherwise hide the suffix from it.
    cleaned_name = cleaned_name.strip()
    # remove suffixes (and the comma that sometimes precedes them)
    cleaned_name = re.sub(r',?\s+(jr|sr|ii|iii|iv|v)\.?$', '', cleaned_name, flags=re.IGNORECASE)
    return cleaned_name.strip()

def clean_school_name(school):
    """Expand a known school abbreviation to its long form.

    Only the exact (case-sensitive) abbreviations listed below are
    translated; any other value is handed back untouched.
    """
    long_form_by_abbreviation = {
        "UConn": "Connecticut",
        "UNC": "North Carolina",
        "LSU": "Louisiana State",
        "USC": "Southern California",
    }
    if school in long_form_by_abbreviation:
        return long_form_by_abbreviation[school]
    return school

def _best_search_result_url(soup, school, target_season):
    """Return the href of the best-matching player on a search-results page, or None."""
    cleaned_school = clean_school_name(school).lower()
    candidates = []

    for item in soup.select("div.search-item"):
        name_tag = item.select_one("a")
        url = name_tag.get("href") if name_tag else None
        if not url:
            # Nothing to navigate to; a None URL must never become a candidate.
            continue

        # The text right after the link carries the years played. The sibling
        # can also be a Tag, which has no usable .strip(), so only accept
        # string nodes (bs4 text nodes are str subclasses).
        sibling = name_tag.next_sibling
        years_text = sibling.strip() if isinstance(sibling, str) else ""
        team_tag = item.select_one("div.search-item-team")
        team_text = team_tag.text if team_tag else ""

        # extract last year played
        # \d{4} matches 4 digit number (year)
        # \) matches the closing parenthesis
        # $ ensures it's at the end, so only last year is captured, not starting year
        # () groups the 4 digit num, so we can extract with .group(1)
        year_match = re.search(r'(\d{4})\)$', years_text)
        last_year = int(year_match.group(1)) if year_match else 0

        if cleaned_school in team_text.lower() and "(Men)" in team_text and last_year <= target_season:
            candidates.append((last_year, url))

    if not candidates:
        return None
    # Highest (last_year, url) tuple == first element of a descending sort.
    return max(candidates)[1]


def _season_rows(soup, table_id, season, drop_columns):
    """Return the ``season`` rows of the HTML table ``table_id``; None if the table is absent."""
    table = soup.find("table", id=table_id)
    if table is None:
        return None
    stats = pd.read_html(StringIO(str(table)))[0]
    stats = stats[stats['Season'] == season]
    # errors='ignore': a table lacking one of these columns should not
    # discard the rest of the player's data.
    return stats.drop(columns=drop_columns, errors='ignore')


def get_per_game_stats(player_name, target_season, school, driver):
    """Scrape one player's per-game and advanced stats for a single season.

    Opens the sports-reference.com college basketball player search, submits
    the cleaned player name, and - if a results list is shown - follows the
    most recent "(Men)" result whose team text contains the school and whose
    last listed year is not after ``target_season``. The ``players_per_game``
    and ``players_advanced`` tables are then read, filtered to the season
    given by ``draft_year_to_season_string(target_season)``, and merged on
    ``Season``.

    Args:
        player_name: Name as it appears in the prospect list; also written to
            the ``Name`` column of the result.
        target_season: Year passed to ``draft_year_to_season_string`` (e.g. 2024).
        school: School name or abbreviation used to disambiguate search results.
        driver: Selenium WebDriver used for navigation. It is not closed here.

    Returns:
        A DataFrame with ``Name`` as the first column followed by the merged
        per-game and advanced columns, or an empty DataFrame when no matching
        result, table, or season row is found or when any exception occurs
        (the exception is printed, not raised).
    """
    try:
        driver.get("https://www.sports-reference.com/cbb/players/")

        # wait for search bar to appear
        search_bar = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "input[name='search']"))
        )
        search_bar.clear()
        search_bar.send_keys(clean_name(player_name))

        search_bar.send_keys(Keys.RETURN)
        time.sleep(3)  # fixed pause to let the results / player page load
        soup = BeautifulSoup(driver.page_source, "html.parser")

        # handle search page if any (multiple results)
        if soup.find("div", class_="search-item"):
            best_candidate = _best_search_result_url(soup, school, target_season)
            if best_candidate is None:
                print(f"No relevant results found for {player_name}")
                return pd.DataFrame()

            driver.get(f"https://www.sports-reference.com{best_candidate}")
            time.sleep(1)
            # re-parse: the browser is now on the player page
            soup = BeautifulSoup(driver.page_source, "html.parser")

        season = draft_year_to_season_string(target_season)

        # per-game stats
        df = _season_rows(soup, "players_per_game", season, ['Team', 'Conf', 'Class', 'Awards'])
        if df is None:
            print(f"No per-game stats found for {player_name}")
            df = pd.DataFrame()
        else:
            # add the player name and move it to the front
            df = df.assign(Name=player_name)
            df = df[["Name"] + [col for col in df.columns if col != "Name"]]

        # advanced stats (Pos/G/GS/MP are dropped here; the per-game table already supplies them)
        advanced_df = _season_rows(
            soup,
            "players_advanced",
            season,
            ['Team', 'Conf', 'Class', 'Awards', 'Pos', 'G', 'GS', 'MP'],
        )
        if advanced_df is None:
            print(f"No advanced stats found for {player_name}")
            advanced_df = pd.DataFrame()

        if not df.empty and not advanced_df.empty:
            result_df = pd.merge(df, advanced_df, on="Season")
        else:
            print("something is empty")
            result_df = pd.DataFrame()

        return result_df.reset_index(drop=True)

    except Exception as e:
        # Best-effort scraper: report the failure and let the caller continue.
        print(f"Error for {player_name}: {e}")
        return pd.DataFrame()

In [84]:
# testing for an individual player
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
try:
    devin_carter_stats = get_per_game_stats("Devin Carter", 2024, "Providence", driver)
finally:
    # always close the browser, otherwise every run of this cell leaks a Chrome process
    driver.quit()
devin_carter_stats

  df = pd.read_html(str(table))[0]
  advanced_df = pd.read_html(str(advanced_table))[0]


Unnamed: 0,Name,Season,Pos,G,GS,MP,FG,FGA,FG%,3P,...,BLK%,TOV%,USG%,OWS,DWS,WS,WS/40,OBPM,DBPM,BPM
0,Devin Carter,2023-24,G,33.0,33.0,35.3,6.6,14.0,0.473,2.5,...,2.8,14.1,28.1,3.3,2.7,6.0,0.206,7.3,5.0,12.3


In [85]:
def get_college_prospects(year, data_dir="../data/raw"):
    """Load the prospect pool for ``year`` and keep only college players.

    Reads ``{data_dir}/{year}/prospect_pool_{year}.csv`` and selects the rows
    whose ``classification`` column equals ``'College'``.

    Args:
        year: Draft year; used for both the sub-directory and the file name.
        data_dir: Directory holding the per-year raw data folders. Defaults to
            the notebook-relative ``"../data/raw"`` location.

    Returns:
        A tuple ``(college_df, names, teams)``: the filtered DataFrame (original
        index preserved), the ``name`` column as a list, and the ``team``
        column as a list.
    """
    draft_pool_df = pd.read_csv(f"{data_dir}/{year}/prospect_pool_{year}.csv")
    college_df = draft_pool_df[draft_pool_df['classification'] == 'College']
    return college_df, college_df['name'].tolist(), college_df['team'].tolist()

def collect_college_stats(year):
    """Scrape season stats for every college prospect in the ``year`` pool.

    Gets the names and schools from ``get_college_prospects(year)``, opens one
    Chrome driver, calls ``get_per_game_stats`` for each player, and stacks the
    non-empty results. The driver is quit even if the loop raises.

    Args:
        year: Draft year, passed to both ``get_college_prospects`` and
            ``get_per_game_stats``.

    Returns:
        A DataFrame with the non-empty per-player results concatenated under a
        fresh 0..n-1 index, or an empty DataFrame if no player yielded data.
    """
    _, names, schools = get_college_prospects(year)
    # Collect per-player frames and concatenate once at the end; growing a
    # DataFrame with concat inside the loop re-copies all prior rows each time.
    player_frames = []
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))

    try:
        for name, school in zip(names, schools):
            try:
                stats_df = get_per_game_stats(name, year, school, driver)
                if not stats_df.empty:
                    player_frames.append(stats_df)
            except Exception as e:
                # one bad player must not stop the rest of the pool
                print(f"Error processing {name}: {e}")
                continue
    finally:
        driver.quit()

    if player_frames:
        result_df = pd.concat(player_frames, ignore_index=True)
    else:
        result_df = pd.DataFrame()

    #result_df.to_csv(f"../data/processed/{year}/college_stats_{year}.csv", index=False)
    return result_df

In [90]:
# testing for 2024 pool
# Bind the result to a name: the scrape is slow, and an unnamed result is only
# reachable through `_` / `Out[N]`, which do not survive a kernel restart.
college_stats_2024 = collect_college_stats(2024)
college_stats_2024

  df = pd.read_html(str(table))[0]
  advanced_df = pd.read_html(str(advanced_table))[0]
  df = pd.read_html(str(table))[0]
  advanced_df = pd.read_html(str(advanced_table))[0]
  df = pd.read_html(str(table))[0]
  advanced_df = pd.read_html(str(advanced_table))[0]
  df = pd.read_html(str(table))[0]
  advanced_df = pd.read_html(str(advanced_table))[0]
  df = pd.read_html(str(table))[0]
  advanced_df = pd.read_html(str(advanced_table))[0]
  df = pd.read_html(str(table))[0]
  advanced_df = pd.read_html(str(advanced_table))[0]
  df = pd.read_html(str(table))[0]
  advanced_df = pd.read_html(str(advanced_table))[0]
  df = pd.read_html(str(table))[0]
  advanced_df = pd.read_html(str(advanced_table))[0]
  df = pd.read_html(str(table))[0]
  advanced_df = pd.read_html(str(advanced_table))[0]
  df = pd.read_html(str(table))[0]
  advanced_df = pd.read_html(str(advanced_table))[0]
  df = pd.read_html(str(table))[0]
  advanced_df = pd.read_html(str(advanced_table))[0]
  df = pd.read_html(str(table))[

Unnamed: 0,Name,Season,Pos,G,GS,MP,FG,FGA,FG%,3P,...,BLK%,TOV%,USG%,OWS,DWS,WS,WS/40,OBPM,DBPM,BPM
0,Reed Sheppard,2023-24,G,33.0,5.0,28.9,4.3,8.0,0.536,2.3,...,2.5,18.3,18.0,3.0,1.4,4.5,0.188,6.4,5.0,11.4
1,Stephon Castle,2023-24,G,34.0,30.0,27.0,4.0,8.5,0.472,0.6,...,2.1,13.0,22.0,2.5,1.7,4.3,0.186,3.4,3.1,6.5
2,Donovan Clingan,2023-24,C,35.0,33.0,22.5,5.3,8.3,0.639,0.1,...,11.4,7.3,25.1,3.8,2.2,6.0,0.302,8.8,6.2,15.0
3,Rob Dillingham,2023-24,G,32.0,1.0,23.3,5.4,11.3,0.475,2.0,...,0.3,13.5,30.3,2.6,0.7,3.3,0.174,5.6,0.7,6.2
4,Zach Edey,2023-24,C,39.0,39.0,32.0,8.6,13.8,0.623,0.0,...,6.9,10.8,33.4,7.9,2.6,10.5,0.336,13.1,3.7,16.8
5,Cody Williams,2023-24,F,24.0,18.0,28.4,4.5,8.1,0.552,0.7,...,2.4,17.2,20.8,1.2,0.8,2.0,0.118,2.4,1.5,3.9
6,Devin Carter,2023-24,G,33.0,33.0,35.3,6.6,14.0,0.473,2.5,...,2.8,14.1,28.1,3.3,2.7,6.0,0.206,7.3,5.0,12.3
7,Carlton Carrington,2023-24,G,33.0,33.0,33.2,4.8,11.7,0.412,2.0,...,0.9,13.0,23.0,2.2,1.4,3.6,0.133,2.6,1.3,3.9
8,Kel'el Ware,2023-24,C,30.0,30.0,32.2,6.3,10.7,0.586,0.6,...,6.5,10.9,22.9,2.7,1.5,4.2,0.175,5.8,2.7,8.5
9,Jared McCain,2023-24,G,36.0,36.0,31.6,4.9,10.5,0.462,2.4,...,0.2,10.1,21.1,3.3,1.9,5.2,0.182,6.0,2.6,8.5
