# 01 Data Collection
### NBA Combine Measurements

In [1]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup

All of the NBA Combine measurements were collected from NBA.com. Because the site renders these tables dynamically with JavaScript rather than serving them as static HTML, they were scraped with Data Miner and then saved to individual CSV files.

In [2]:
rawpath = '../data/raw/Combine CSVs'

# Read one CSV per season (files are named like "2000-01.csv"), tag each frame
# with the season's starting year, and stack them with a single concat.
# Collecting the frames in a list first avoids re-copying every accumulated
# row on each iteration, which is what concatenating inside the loop does.
combine_frames = []
for year1 in range(2000, 2025):
    year2 = str(year1 + 1)[-2:]  # two-digit suffix of the following year
    path = f"{rawpath}/{year1}-{year2}.csv"
    df = pd.read_csv(path)
    df['DRAFT YR'] = year1
    combine_frames.append(df)

combine_data = pd.concat(combine_frames, ignore_index=True)
    

In [3]:
# Write the stacked combine measurements to a single CSV; index=False keeps
# the DataFrame's row index out of the file.
combine_data.to_csv(path_or_buf='../data/raw/combine_data.csv', index=False)

### NCAA Statistics (2003-2024)

The way the column names are stored varies, so I define them manually below and attach them to the dataset.

In [8]:
# Column labels applied, in this exact order, to every row scraped by get_ncaa.
# Grouped several per line for readability; contents and order are unchanged.
ncaa_colnames = [
    '#', 'Player', 'Team',
    'GP', 'MPG', 'PPG',
    'FGM', 'FGA', 'FG%',
    '3PM', '3PA', '3P%',
    'FTM', 'FTA', 'FT%',
    'ORB', 'DRB', 'RPG',
    'APG', 'SPG', 'BPG',
    'TOV', 'PF',
]

In [9]:
def get_ncaa(year, pages=5):
    '''
    Scrape RealGM's NCAA season-averages listing for one season.

    Fetches result pages 1..`pages` of the "Qualified" averages table for
    `year` and stacks every table row into a single DataFrame. The URL asks
    for rows ordered by the `per` key, descending.
    NOTE(review): this was originally described as "the top 1,000 scorers
    (PPG)"; that presumes 200 rows per page and a points ordering -- confirm
    against the site.

    Parameters
    ----------
    year : int
        Season identifier placed in the RealGM URL; also written to the
        'Draft Yr' column of every returned row.
    pages : int, default 5
        Number of result pages to fetch, starting at page 1.

    Returns
    -------
    pandas.DataFrame
        One row per scraped table row, with the columns listed in
        `ncaa_colnames` plus 'Draft Yr'. If no rows are found the frame is
        empty but still carries those columns.

    Raises
    ------
    requests.HTTPError
        If any page responds with an HTTP error status.
    ValueError
        If a scraped row does not have exactly len(ncaa_colnames) cells.
    '''
    # Loop-invariant: a browser-like user agent is sent with every request.
    headers = {'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36'}

    rows = []
    for page in range(1, pages + 1):
        url = f'https://basketball.realgm.com/ncaa/stats/{year}/Averages/Qualified/All/Season/All/per/desc/{page}/'
        # The timeout keeps a stalled connection from hanging the notebook.
        response = requests.get(url, headers=headers, timeout=30)
        # Fail loudly instead of parsing an error page as if it were data.
        response.raise_for_status()
        # 'html' lets BeautifulSoup choose among the installed HTML parsers.
        soup = BeautifulSoup(response.text, 'html')

        for tbrow in soup.find_all("tr")[1:]:  # Excluding column names
            row = [cell.string for cell in tbrow]
            # Validate each row on its own so a short row cannot be silently
            # padded when the DataFrame is built in one go below.
            if len(row) != len(ncaa_colnames):
                raise ValueError(
                    f'{len(ncaa_colnames)} columns expected, '
                    f'scraped row had {len(row)} cells ({url})'
                )
            rows.append(row)

    # Build the frame once rather than concatenating one row at a time.
    df_yr = pd.DataFrame(rows, columns=ncaa_colnames)
    df_yr['Draft Yr'] = year
    return df_yr

In [10]:
# Scrape every season from 2003 through 2024, then stack the per-season frames
# with a single concat. Concatenating inside the loop would re-copy all of the
# previously collected rows on each iteration.
season_frames = [get_ncaa(season) for season in range(2003, 2025)]
ncaa_stats = pd.concat(season_frames, ignore_index=True)

# index=False keeps the DataFrame's row index out of the saved CSV.
ncaa_stats.to_csv('../data/raw/ncaa_data.csv', index=False)