# 06-nfl-college-data

> Data scraping for the colleges of all NFL players (who appeared in a game) since the beginning of the league

The following notebook scrapes Pro Football Reference (PFR) data for all NFL players, and the colleges they attended, dating back to 1920 (wow!). This data will be joined to the NFL player high school data. The goal of this analysis is to determine where NFL talent is coming from across the country and which colleges those players attended. This code takes a while to run if all colleges are included. In the future, multiprocessing will be added as an enhancement to improve scraping time. 

In [1]:
#import relevant packages
import requests as rq
from bs4 import BeautifulSoup as bs
import pandas as pd
import numpy as np

In [2]:
#Define function to scrape each college that has had a player make it to the NFL, no input necessary
def colleges():
    """Scrape the PFR schools index into a dataframe of college links and names.

    Every left-aligned table cell on the schools page that contains an anchor
    contributes one row: the anchor's ``href`` and its text.

    Returns
    -------
    pandas.DataFrame
        Columns ``college_link`` (site-relative link) and ``college_name``.

    Raises
    ------
    requests.HTTPError
        If the schools page responds with an error status. Without this check
        an error page would be parsed and an empty dataframe returned silently.
    """
    #set the relevant url
    url = 'https://www.pro-football-reference.com/schools/'
    #use get to access the url and save the page
    page = rq.get(url)
    page.raise_for_status()
    #save the html content of the page
    soup = bs(page.content, 'html.parser')

    #initialize empty lists for scraped data to be stored
    college_link = []
    college_name = []

    #grab the school data from the html content, skipping cells that hold no link
    for cell in soup.find_all('td', class_ = 'left'):
        anchor = cell.find('a')
        if anchor is not None:
            college_link.append(anchor['href'])
            college_name.append(anchor.get_text())

    #build the dataframe from the scraped lists
    return pd.DataFrame({
        'college_link': college_link,
        'college_name': college_name
    })

In [3]:
#Define function to scrape each college link for list of players, input is college dataframe from previous function
def college_players(college_df):
    """Scrape the player table of every college page listed in ``college_df``.

    Parameters
    ----------
    college_df : pandas.DataFrame
        Must contain a ``college_link`` column of site-relative links, as
        produced by ``colleges()``.

    Returns
    -------
    pandas.DataFrame
        One row per scraped player with the columns ``college_link``,
        ``player_link``, ``player_name``, ``pos``, ``ht``, ``wt``, ``yrs``,
        ``yrs_range`` and ``nfl_draft``. An empty dataframe with these columns
        is returned when no players were scraped.
    str
        For backward compatibility, the message
        ``'Null values detected in <college link>'`` is returned instead of a
        dataframe if a scraped table contains null values.

    Raises
    ------
    requests.HTTPError
        If a college page responds with an error status. Without this check
        the college would silently contribute no players.
    ValueError
        Raised by pandas if the scraped columns of one page differ in length.
    """
    columns = ['college_link', 'player_link', 'player_name', 'pos', 'ht', 'wt', 'yrs', 'yrs_range', 'nfl_draft']

    #maps the data-stat attribute of the right-aligned cells to the output column it fills
    stat_columns = {
        'height': 'ht',
        'weight': 'wt',
        'experience': 'yrs',
        'years_played': 'yrs_range',
        'nfl_draft_info': 'nfl_draft',
    }

    #per-college dataframes are collected here and concatenated once at the end
    #(DataFrame.append was removed in pandas 2.0 and copied the whole table on every call)
    frames = []

    #iterate over each school link in the list of schools provided
    for c in college_df.college_link:

        #set the relevant url
        url = 'https://www.pro-football-reference.com' + str(c)
        #use get to access the url and save the page
        page = rq.get(url)
        page.raise_for_status()
        #save the html content of the page
        soup = bs(page.content, 'html.parser')

        #initialize empty lists for scraped data to be stored, one per player column
        scraped = {name: [] for name in columns[1:]}

        #grab player link and name data from html content
        for cell in soup.find_all('td', class_ = 'left'):
            if cell.get('data-stat') == 'player':
                anchor = cell.find('a')
                scraped['player_link'].append(anchor['href'])
                scraped['player_name'].append(anchor.get_text())

        #grab player position data from html content
        for header in soup.find_all('th', scope = 'row'):
            scraped['pos'].append(header.get_text())

        #grab player ht, wt, experience, years, and draft info data from html content
        for cell in soup.find_all('td', class_ = 'right'):
            column = stat_columns.get(cell.get('data-stat'))
            if column is not None:
                scraped[column].append(cell.get_text())

        #build the dataframe for this college
        player_df = pd.DataFrame(scraped)

        #if there are null values, return this error message
        if player_df.isnull().values.any():
            return('Null values detected in ' + str(c))

        #a page without players adds no rows, so there is nothing to keep
        if player_df.empty:
            continue

        #repeat the link of the current school on every player row
        player_df.insert(0, 'college_link', c)
        frames.append(player_df)

    if not frames:
        return pd.DataFrame(columns = columns)

    return pd.concat(frames, ignore_index = True)

In [160]:
#Define function to remove duplicate players (players which attended multiple colleges), input is college player dataframe from previous function
def college_player_cleaning(college_player_df):
    """Reduce the scraped player table to a single college row per player.

    Exact duplicate rows are dropped first. Players that are still listed
    under more than one college are looked up on their own player page, and
    only the row for the first known college link found on that page is kept.

    Parameters
    ----------
    college_player_df : pandas.DataFrame
        Must contain ``college_link`` and ``player_link`` columns. Any other
        columns are carried through unchanged.

    Returns
    -------
    pandas.DataFrame
        Rows of the single-college players followed by the resolved rows of
        the multi-college players, with a fresh 0..n-1 index.

    Raises
    ------
    requests.HTTPError
        If a player page responds with an error status.

    Warns
    -----
    UserWarning
        If no known college link is found on a multi-college player's page.
        Such players are left out of the result.
    """
    import warnings

    #drop any perfectly duplicate observations (doesn't catch players who transferred, just repeated observations)
    college_player_df = college_player_df.drop_duplicates()

    #count the college rows of each player and flag the players that appear more than once
    rows_per_player = college_player_df.groupby('player_link')['college_link'].count()
    is_duplicated = college_player_df['player_link'].map(rows_per_player) > 1

    #separate the players with single and duplicate observations
    #(a mask works for any number of columns, unlike inserting a marker column at a fixed position)
    single_players = college_player_df[~is_duplicated].reset_index(drop = True)
    duplicated_players = college_player_df[is_duplicated].reset_index(drop = True)

    #nothing to resolve, so no player pages have to be requested
    if duplicated_players.empty:
        return single_players

    #every anchor of every player page is tested against the college links, so build the lookup once
    known_colleges = set(college_player_df['college_link'])

    #(college_link, player_link) pairs are stored together so both columns always stay the same length
    first_school_rows = []
    unresolved_players = []

    #iterate over each duplicated player, we are interested in the first college that each attended
    for p in duplicated_players['player_link'].unique():

        #set the relevant url
        url = 'https://www.pro-football-reference.com' + str(p)
        #use get to access the url and save the page
        page = rq.get(url)
        page.raise_for_status()
        #save the html content of the page
        soup = bs(page.content, 'html.parser')

        #only the first known college link on the page is collected
        #href = True skips anchors without an href attribute, which would raise KeyError when indexed
        first_college = next(
            (anchor['href'] for anchor in soup.find_all('a', href = True) if anchor['href'] in known_colleges),
            None
        )

        if first_college is None:
            unresolved_players.append(p)
        else:
            first_school_rows.append((first_college, p))

    if unresolved_players:
        warnings.warn(
            'No college link found for {} player(s), they are omitted: {}'.format(
                len(unresolved_players), ', '.join(str(p) for p in unresolved_players)
            )
        )

    #create a dataframe from the player and college data
    duplicate_first_school = pd.DataFrame(first_school_rows, columns = ['college_link', 'player_link'])

    #inner join to keep only the first college observations for each duplicated player
    duplicates_cleaned = duplicated_players.merge(duplicate_first_school, how = 'inner', on = ['college_link', 'player_link'])

    #combine single and cleaned duplicated dataframes into dataset to be exported
    export = pd.concat([single_players, duplicates_cleaned], axis = 0).reset_index(drop = True)

    #return the export dataset
    return export

In [4]:
#scrape the schools index and save the dataframe of college links and names
all_colleges = colleges()

In [6]:
#run college_players function on returned dataframe from above cell and save
#one page is requested per college, so this is the slow step of the notebook
#note: college_players returns an error string instead of a dataframe if it detects null values
all_players = college_players(all_colleges)

In [161]:
#run college_player_cleaning function on returned dataframe from above cell and save
#players listed under several colleges are reduced to a single college row
export_df = college_player_cleaning(all_players)

In [162]:
#display the cleaned dataframe
export_df

Unnamed: 0,college_link,ht,nfl_draft,player_link,player_name,pos,wt,yrs,yrs_range
0,/schools/notredame/,6-4,Philadelphia Eagles / 2nd / 57th pick / 2007,/players/A/AbiaVi99.htm,Victor Abiamiri,DE,267,3,2007-2009
1,/schools/notredame/,6-7,Washington Redskins / 3rd / 23rd pick / 1945,/players/A/AdamJo20.htm,John Adams,T,242,5,1945-1949
2,/schools/notredame/,6-2,,/players/A/AdamJo03.htm,Josh Adams,RB,225,3,2018-2020
3,/schools/notredame/,6-2,,/players/A/AdamKe00.htm,Ken Adamson,G,235,3,1960-1962
4,/schools/notredame/,5-10,,/players/A/AlleAr00.htm,Armando Allen,RB,205,2,2011-2012
5,/schools/notredame/,6-6,Houston Oilers / 2nd / 41st pick / 1990,/players/A/AlmxJe20.htm,Jeff Alm,DT,284,4,1990-1993
6,/schools/notredame/,5-10,,/players/A/AndeEd20.htm,Eddie Anderson,E,176,4,1922-1925
7,/schools/notredame/,5-11,,/players/A/AndeHu20.htm,Hunk Anderson,G-C,191,4,1922-1925
8,/schools/notredame/,5-11,Chicago Cardinals / 3rd / 16th pick / 1946,/players/A/AngsEl00.htm,Elmer Angsman,HB,200,7,1946-1952
9,/schools/notredame/,6-1,,/players/A/AtkiGe01.htm,George Atkinson,RB,220,2,2014-2016


In [164]:
#export data to csv (path is relative to the working directory, the dataframe index is not written)
export_df.to_csv('preliminary-data/nfl-player-colleges.csv', index = False)