# Explore Data
Notebook for pulling NFL Combine data and performing initial exploratory data analysis (EDA) on it, in order to compare combine results with overall NFL success.

In [5]:
# Optional one-time environment setup: uncomment the lines below to install the
# packages this notebook imports. Keep them commented out for normal runs.
# !pip install --upgrade pip
# !pip install numpy
# !pip install pandas
# !pip install matplotlib
# !pip install seaborn
# !pip install statsmodels
# !pip install beautifulsoup4
# !pip install requests

In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.formula.api as smf
import requests
from bs4 import BeautifulSoup
import re

%matplotlib inline

In [6]:
# Combine results page filtered to quarterbacks (pos=QB) across all years (year=all)
url = r'https://nflcombineresults.com/nflcombinedata.php?year=all&pos=QB&college='

# Get page content. The timeout keeps a stalled connection from hanging the kernel,
# and raise_for_status() surfaces HTTP errors here instead of parsing an error page.
page = requests.get(url, timeout=30)
page.raise_for_status()
soup = BeautifulSoup(page.content, 'html.parser')

In [29]:
def get_combine(soup=None):
    """Scrape the Quarterback NFL Combine results table into a DataFrame.

    Returns a DataFrame of each QB that participated in the NFL Combine from
    1987-present. The data returned includes:
        - year: Year of participation
        - name: Player Name
        - college: College attended
        - pos: Player Position (QB in this case)
        - height: Player Height (inches)
        - weight: Player Weight (lbs)
        - wonderlic: Wonderlic Cognitive Ability Test Score (0-50 overall, 20 avg intelligence)
        - 40_yard: 40 Yard Dash Time (s)
        - bench_press: Number of 225 lb Repetitions
        - vert_leap: Standing Vertical Jump Height (inches)
        - broad_jump: Standing Long Jump Distance (inches)
        - shuttle: 20 Yard Shuttle Time (s)
        - 3cone: 3-Cone Drill Time (s)

    Parameters
    ----------
    soup : bs4.BeautifulSoup, optional
        Parsed HTML of the combine results page. When omitted, the QB results
        page is downloaded and parsed with 'html.parser'.

    Returns
    -------
    pandas.DataFrame
        One row per table row between the header row and the footer row.
        Column names are the header cells lower-cased, with ' (in)' and
        ' (lbs)' removed and spaces replaced by underscores. Every value is
        the raw cell text (a string); nothing is converted to a number.

    Raises
    ------
    ValueError
        If the page has no table with class 'sortable', or that table has no rows.
    requests.HTTPError
        If the page has to be downloaded and the server returns an error status.
    """

    # Check if soup is not provided
    if soup is None:
        durl = r'https://nflcombineresults.com/nflcombinedata.php?year=all&pos=QB&college='
        # Bound the wait and fail loudly on HTTP errors rather than parsing an error page
        page = requests.get(durl, timeout=30)
        page.raise_for_status()
        soup = BeautifulSoup(page.content, 'html.parser')

    # Find table on the webpage and all trs
    table = soup.find('table', attrs={'class': 'sortable'})
    if table is None:
        # find() returns None when nothing matches; report that explicitly
        raise ValueError("No table with class 'sortable' was found in the page")

    trs = table.find_all('tr')
    if not trs:
        raise ValueError("The 'sortable' table contains no rows")

    # First tr is the header row: normalise its labels into column names
    header = [td.text.lower()
                     .replace(' (in)', '')
                     .replace(' (lbs)', '')
                     .replace(' ', '_')
              for td in trs[0].find_all('td')]

    # Skip the header (first tr) and the footer (last tr); read every other tr as a row
    rows = [[td.text for td in tr.find_all('td')] for tr in trs[1:-1]]

    return pd.DataFrame(rows, columns=header)

In [31]:
# The scraped cells are text, so sort on the numeric value of height rather than
# lexicographically; cells that are not numbers become NaN and are placed last.
get_combine(soup).sort_values('height', key=lambda col: pd.to_numeric(col, errors='coerce'))

Unnamed: 0,year,name,college,pos,height,weight,wonderlic,40_yard,bench_press,vert_leap,broad_jump,shuttle,3cone
518,1998,Chris Mccoy,Navy,QB,69.80,196,,4.80,15,29.5,109,,
59,2019,Kyler Murray,Oklahoma,QB,70.13,207,,,,,,,
475,2000,Joe Hamilton,Georgia Tech,QB,70.30,192,18,4.84,,,106,,
85,2018,Quinton Flowers,South Florida,QB,70.38,214,,4.63,,,112,4.57,6.81
543,1996,Dave Dickenson,Montana,QB,70.40,185,,5.19,,29.0,108,4.51,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
49,2019,Tyree Jackson,Buffalo,QB,79.00,249,,4.59,,34.5,120,4.28,7.09
592,1994,Cree Morris,Saint Mary's (CA),QB,79.00,239,,5.36,,26.5,99,4.73,
224,2013,Mike Glennon,North Carolina State,QB,79.13,225,26,4.94,,26.5,102,4.52,7.49
157,2016,Travis Wilson,Utah,QB,79.25,232,,4.83,,31.5,112,4.25,7.05
