# 06-nfl-college-data

In [1]:
#import relevant packages
import requests as rq
from bs4 import BeautifulSoup as bs
import pandas as pd
import numpy as np
import janitor

In [2]:
def colleges():
    """Scrape the index of schools from Pro Football Reference.

    Fetches the ``/schools/`` page and collects every anchor found inside a
    left-aligned table cell (``<td class="left">``).

    Returns:
        pandas.DataFrame: one row per anchor found, with two columns:
        ``college_link`` (the anchor's ``href``, later appended to the site
        root by ``college_players``) and ``college_name`` (the anchor's text).

    Raises:
        requests.HTTPError: if the page responds with a 4xx/5xx status.
        requests.Timeout: if the server does not answer within the timeout.
    """
    #set the relevant url
    url = 'https://www.pro-football-reference.com/schools/'
    #fetch the page; fail loudly on an error status instead of silently
    #parsing an error page into an empty dataframe
    page = rq.get(url, timeout=30)
    page.raise_for_status()
    #parse the html content of the page
    soup = bs(page.content, 'html.parser')

    #lists the scraped values are accumulated into
    college_link = []
    college_name = []

    #grab the school data from the html content
    for cell in soup.find_all('td', class_='left'):
        #cells without a link carry no school reference, skip them
        anchor = cell.find('a')
        if anchor is not None:
            college_link.append(anchor['href'])
            college_name.append(anchor.get_text())

    #assemble the two parallel lists into the result dataframe
    return pd.DataFrame({
        'college_link': college_link,
        'college_name': college_name,
    })

In [3]:
def college_players(college_df):
    """Scrape the player table of every school listed in ``college_df``.

    For each value in ``college_df.college_link`` the school page is fetched
    and the following cells are collected:

    * ``<td class="left" data-stat="player">``: player link and name
    * ``<th scope="row">``: position
    * ``<td class="right">`` with ``data-stat`` of ``height``, ``weight``,
      ``experience``, ``years_played`` or ``nfl_draft_info``

    Args:
        college_df: object exposing a ``college_link`` attribute that iterates
            over site-relative school links (e.g. the frame returned by
            ``colleges()`` or a filtered slice of it).

    Returns:
        pandas.DataFrame with columns ``college_link``, ``player_link``,
        ``player_name``, ``pos``, ``ht``, ``wt``, ``yrs``, ``yrs_range`` and
        ``nfl_draft``, one row per scraped player, indexed 0..n-1.
        If a school's scraped frame contains null values, scraping stops and
        the string ``'Null values detected in <link>'`` is returned instead
        (kept as-is for backward compatibility with existing callers).

    Raises:
        requests.HTTPError: if a school page responds with a 4xx/5xx status.
        requests.Timeout: if the server does not answer within the timeout.
        ValueError: raised by pandas when the per-column lists scraped from a
            page do not all have the same length.
    """
    columns = ['college_link', 'player_link', 'player_name', 'pos', 'ht', 'wt', 'yrs', 'yrs_range', 'nfl_draft']

    #data-stat value of a right-aligned cell -> output column it feeds
    right_cell_columns = {
        'height': 'ht',
        'weight': 'wt',
        'experience': 'yrs',
        'years_played': 'yrs_range',
        'nfl_draft_info': 'nfl_draft',
    }

    #per-school frames, concatenated once at the end; DataFrame.append was
    #removed in pandas 2.0 and copied the whole frame on every iteration
    school_frames = []

    #iterate over each school link in the list of schools provided
    for c in college_df.college_link:

        #set the relevant url: site root followed by the relative school link
        url = 'https://www.pro-football-reference.com' + str(c)
        #fetch the page; fail loudly on an error status instead of silently
        #treating an error page as a school with no players
        page = rq.get(url, timeout=30)
        page.raise_for_status()
        #parse the html content of the page
        soup = bs(page.content, 'html.parser')

        #one list per scraped column, in output column order
        scraped = {name: [] for name in columns[1:]}

        #grab player link and name data from html content
        for cell in soup.find_all('td', class_='left'):
            if cell.get('data-stat') == 'player':
                anchor = cell.find('a')
                scraped['player_link'].append(anchor['href'])
                scraped['player_name'].append(anchor.get_text())

        #grab player position data from html content
        for header in soup.find_all('th', scope='row'):
            scraped['pos'].append(header.get_text())

        #grab player ht, wt, experience, years, and draft info data from html content
        for cell in soup.find_all('td', class_='right'):
            column = right_cell_columns.get(cell.get('data-stat'))
            if column is not None:
                scraped[column].append(cell.get_text())

        #assemble the parallel lists into this school's dataframe
        player_df = pd.DataFrame(scraped)

        #if there are null values, return this error message
        if player_df.isnull().values.any():
            return 'Null values detected in ' + str(c)

        #a page that yielded no players contributes no rows
        if player_df.empty:
            continue

        #tag every player row with the link of the current school
        player_df.insert(0, 'college_link', c)
        school_frames.append(player_df)

    #nothing scraped: return an empty frame that still has the expected columns
    if not school_frames:
        return pd.DataFrame(columns=columns)

    return pd.concat(school_frames, ignore_index=True)

In [4]:
#scrape the school index once and keep the resulting dataframe
#(columns: college_link, college_name) for the cells that follow
all_colleges = colleges()

In [5]:
#scrape players only for the rows whose college_name is exactly 'Vanderbilt';
#the result is not assigned, so it is only shown as the cell output below
college_players(all_colleges[all_colleges.college_name == 'Vanderbilt'])

Unnamed: 0,college_link,player_link,player_name,pos,ht,wt,yrs,yrs_range,nfl_draft
0,/schools/vanderbilt/,/players/A/AgeeSa20.htm,Sam Agee,FB,6-1,218,2,1938-1939,
1,/schools/vanderbilt/,/players/A/ArnoJi20.htm,Jim Arnold,P,6-2,215,12,1983-1994,Kansas City Chiefs / 5th / 119th pick / 1983
2,/schools/vanderbilt/,/players/A/AsheBo20.htm,Bob Asher,T-G,6-5,250,5,1970-1975,Dallas Cowboys / 2nd / 27th pick / 1970
3,/schools/vanderbilt/,/players/B/BardBr00.htm,Brandon Barden,TE,6-5,240,2,2012-2014,
4,/schools/vanderbilt/,/players/B/BattAi20.htm,Ainsley Battles,DB,5-11,204,4,2000-2004,
...,...,...,...,...,...,...,...,...,...
105,/schools/vanderbilt/,/players/W/WinsDe20.htm,DeMond Winston,LB,6-2,239,4,1990-1994,New Orleans Saints / 4th / 98th pick / 1990
106,/schools/vanderbilt/,/players/W/WolfWi00.htm,Will Wolford,T-G,6-5,294,13,1986-1998,Buffalo Bills / 1st / 20th pick / 1986
107,/schools/vanderbilt/,/players/W/WoodCa20.htm,Carl Woods,RB,5-11,200,1,1987-1987,
108,/schools/vanderbilt/,/players/W/WynnJo00.htm,Jonathan Wynn,DE,6-4,260,1,2019-2019,
