# 01 Data Collection
### NBA Combine Measurements

In [1]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup

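Concatenate the NBA combine data from every season file (2000-01 through 2024-25) into a single DataFrame, tagging each row with its draft year.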

In [12]:
# Read one combine CSV per season (files are named "<start year>-<2-digit end
# year>.csv", e.g. "2000-01.csv") and tag every row with the starting year.
rawpath = '../data/raw/'
season_frames = []
for year1 in range(2000, 2025):
    year2 = str(year1 + 1)[-2:]  # last two digits of the following year
    link = f"{year1}-{year2}"
    path = f"{rawpath}{link}.csv"
    df = pd.read_csv(path)
    df['DRAFT YR'] = year1
    season_frames.append(df)

# Concatenate once at the end: re-concatenating inside the loop copies all
# previously loaded rows on every iteration.
combine_data = pd.concat(season_frames, ignore_index=True)
    

In [13]:
# Persist the stacked combine measurements without writing the row index.
combine_out_path = '../data/processed/combine_data.csv'
combine_data.to_csv(combine_out_path, index=False)

### NCAA Statistics (2003-2024)

In [26]:
# Column labels applied positionally to each scraped NCAA stats table row.
# TODO: probably change this (hard-coded list).
ncaa_colnames = [
    # rank / identity / playing time / scoring
    '#', 'Player', 'Team', 'GP', 'MPG', 'PPG',
    # field goals
    'FGM', 'FGA', 'FG%',
    # three-pointers
    '3PM', '3PA', '3P%',
    # free throws
    'FTM', 'FTA', 'FT%',
    # rebounds
    'ORB', 'DRB', 'RPG',
    # remaining per-game columns
    'APG', 'SPG', 'BPG', 'TOV', 'PF',
]

In [36]:
# Scrape every season from 2003 through 2024 and stack the results.
# NOTE: get_ncaa() is defined in a later cell of this notebook; on a fresh
# kernel that cell must be run before this one, otherwise this raises NameError.
yearly_frames = []
for i in range(2003, 2025):
    df = get_ncaa(i)
    yearly_frames.append(df)

# Concatenate once instead of re-copying the accumulated frame for every season.
ncaa_stats = pd.concat(yearly_frames, ignore_index=True)

ncaa_stats.to_csv('../data/processed/ncaa_data.csv', index=False)

## Example: 2015 NBA Draft

### NBA Combine Measurements

In [3]:
# Single-season example: load the 2015-16 combine file and label it with
# its draft year in one expression.
path = '../data/raw/2015-16.csv'
df = pd.read_csv(path).assign(**{'DRAFT YR': 2015})

### NCAA Stats

#### Getting data from page 1

In [7]:
# Download page 1 of the 2015 stats listing and parse it.
year = 2015
url = f'https://basketball.realgm.com/ncaa/stats/{year}/Averages/Qualified/All/Season/All/per/desc/1/'
# Send a browser-like user agent with the request.
headers = {'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36'}
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
r = requests.get(url, headers=headers, timeout=30)
# Fail loudly on 4xx/5xx instead of parsing an error page as if it were data.
r.raise_for_status()
ncaa15 = BeautifulSoup(r.text, 'html')

In [20]:
# Turn every table row after the first (the column-name row) into a list of
# cell strings, then build the DataFrame in a single call.
tbrows = ncaa15.find_all("tr")[1:] # Exclude col names
rows_2015 = []
for tbrow in tbrows:
    row = [cell.string for cell in tbrow]
    # Keep the strictness of building one frame per row: a row whose width
    # does not match the expected columns is an error, not something to pad.
    if len(row) != len(ncaa_colnames):
        raise ValueError(
            f"{len(ncaa_colnames)} columns expected, scraped row has {len(row)}"
        )
    rows_2015.append(row)
df_2015 = pd.DataFrame(rows_2015, columns=ncaa_colnames)
df_2015

Unnamed: 0,#,Player,Team,GP,MPG,PPG,FGM,FGA,FG%,3PM,...,FTA,FT%,ORB,DRB,RPG,APG,SPG,BPG,TOV,PF
0,1,Tyler Harvey,EWASH,32,36.9,23.1,7.2,15.3,.469,4.0,...,5.5,.852,0.6,3.0,3.6,2.6,1.1,0.1,2.0,1.9
1,2,Zeek Woodley,NSU,32,31.5,22.2,8.2,14.6,.560,1.2,...,5.7,.813,1.7,3.3,5.0,1.1,1.0,0.7,1.2,2.3
2,3,Tyler Haws,BYU,35,32.1,22.2,7.4,15.4,.480,1.5,...,6.7,.873,1.3,3.1,4.5,2.4,0.9,0.3,2.0,1.4
3,4,Denzel Livingston,UIW,29,36.9,21.5,6.6,14.4,.455,1.6,...,8.5,.805,1.5,4.4,5.9,2.9,2.6,1.3,2.8,3.0
4,5,Damion Lee,DREX,27,38.1,21.4,6.3,14.4,.438,2.4,...,7.2,.887,0.9,5.2,6.1,2.3,1.5,0.3,1.9,2.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,196,Ike Nwamu,MER,35,33.2,15.1,4.9,11.4,.428,2.3,...,4.1,.761,0.7,2.6,3.3,1.6,0.7,0.4,2.1,1.8
196,197,Jay Harris,UIC,30,32.5,15.1,4.1,10.6,.384,1.7,...,5.9,.898,0.6,2.4,3.0,3.7,1.3,0.1,2.8,1.9
197,198,Marcquise Reed,RMU,35,27.8,15.1,5.6,11.5,.490,0.9,...,3.7,.779,0.6,1.9,2.5,2.1,1.9,0.0,2.6,2.1
198,199,Dez Wells,UMD,28,30.5,15.1,5.3,11.3,.464,0.9,...,4.6,.806,1.3,4.0,5.3,2.8,1.0,0.4,3.3,2.3


In [35]:
def get_ncaa(year, num_pages=3):
    """Scrape the qualified NCAA per-game averages listing for one season.

    Requests result pages 1..``num_pages`` of the listing, takes every ``<tr>``
    after the first on each page (the first is skipped as the column-name
    row), and stacks the cell strings into one DataFrame.

    Parameters
    ----------
    year : int
        Season value inserted into the URL and written to the 'Draft Yr' column.
    num_pages : int, default 3
        Number of result pages to fetch, starting at page 1.

    Returns
    -------
    pandas.DataFrame
        One row per scraped table row, with columns ``ncaa_colnames`` followed
        by 'Draft Yr'. Cell values are whatever ``.string`` yields for each
        cell (strings, or None where a cell has no single string).

    Raises
    ------
    requests.HTTPError
        If any page responds with a 4xx/5xx status.
    requests.Timeout
        If a page does not respond within the timeout.
    ValueError
        If a scraped row does not have exactly ``len(ncaa_colnames)`` cells.
    """
    # Loop-invariant: the same browser-like user agent is sent for every page.
    headers = {'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36'}
    rows = []
    for page in range(1, num_pages + 1):
        url = f'https://basketball.realgm.com/ncaa/stats/{year}/Averages/Qualified/All/Season/All/per/desc/{page}/'
        # Timeout prevents one stalled request from hanging the whole scrape.
        r = requests.get(url, headers=headers, timeout=30)
        # Surface HTTP errors instead of silently parsing an error page.
        r.raise_for_status()
        ncaa = BeautifulSoup(r.text, 'html')

        tbrows = ncaa.find_all("tr")[1:] # Exclude col names
        for tbrow in tbrows:
            row = [cell.string for cell in tbrow]
            # Reject malformed rows explicitly; a bulk DataFrame build would
            # otherwise pad short rows with missing values.
            if len(row) != len(ncaa_colnames):
                raise ValueError(
                    f"{len(ncaa_colnames)} columns expected, scraped row has "
                    f"{len(row)} (year={year}, page={page})"
                )
            rows.append(row)

    # Build the frame once rather than concatenating a one-row frame per row.
    df_yr = pd.DataFrame(rows, columns=ncaa_colnames)
    df_yr['Draft Yr'] = year
    return df_yr