# 05-nfl-high-school-data

> Data scraping for all NFL players (who appeared in a game) since the beginning of the league

The following notebook scrapes PFR (Pro Football Reference) data for every player who has appeared in the NFL, dating back to 1920 (wow!), together with the high school each player attended. This data will be used for analysis comparing recruit counts and location density to NFL talent counts and location density. This code takes a few hours to run if all states are included. In the future, multiprocessing will be added as an enhancement to improve scraping time. 

## Data Scraping

In [2]:
#import relevant packages
import requests as rq
from bs4 import BeautifulSoup as bs
import pandas as pd
import numpy as np
import janitor

In [3]:
#Define function to scrape each state for list of schools, input is list of state abbreviations
def state_school(state_list):
    """Scrape the PFR high-school index page of every state in ``state_list``.

    Parameters
    ----------
    state_list : iterable
        State abbreviations; each one is converted with ``str`` and appended to the
        ``hs_state=`` query string of the index page url.

    Returns
    -------
    pandas.DataFrame
        One row per scraped school with the columns ``hs_link``, ``hs_name``,
        ``hs_city``, ``hs_state``, ``num_players`` and ``num_active``. Every value is
        kept as the scraped text. Empty (same columns) when nothing was scraped.
    str
        ``'Null values detected in <state>'`` as soon as the scraped columns of one
        state do not line up. Callers must check for this before using the result.

    Raises
    ------
    requests.RequestException
        If a state page cannot be fetched (timeout, connection error or HTTP error
        status). Without this check an error page would be parsed as "no schools"
        and the state would silently be missing from the result.
    """

    #output columns; the last four are also the data-stat names of the <td> cells
    columns = ['hs_link', 'hs_name', 'hs_city', 'hs_state', 'num_players', 'num_active']
    attribute_names = columns[2:]
    base_url = 'https://www.pro-football-reference.com/schools/high_schools.cgi?hs_state='

    #per-state frames are collected here and concatenated once at the end
    #(DataFrame.append was removed in pandas 2.0 and was quadratic inside a loop)
    state_frames = []

    #iterate over each state in the list of states provided
    for state in state_list:

        #fetch the page; the timeout keeps one stalled request from hanging the run
        page = rq.get(base_url + str(state), timeout = 30)
        page.raise_for_status()
        #save the html content of the page
        soup = bs(page.content, 'html.parser')

        #school link and name come from the anchor inside each row header cell
        school_link = []
        school_name = []
        for school in soup.find_all('th', class_ = 'left'):
            anchor = school.find('a')
            school_link.append(anchor['href'])
            school_name.append(anchor.get_text())

        #remaining attributes come from the <td> cells, keyed by their data-stat value
        attributes = {name: [] for name in attribute_names}
        for cell in soup.find_all('td'):
            stat = cell.get('data-stat')
            if stat in attributes:
                attributes[stat].append(cell.get_text())

        #build the state frame column by column; wrapping each list in a Series pads
        #shorter columns with NaN so that misaligned scrapes are caught just below
        scraped = {'hs_link': school_link, 'hs_name': school_name, **attributes}
        state_school_df = pd.DataFrame({
            column: pd.Series(values, dtype = object) for column, values in scraped.items()
        })

        #if there are null values, return this error message
        if state_school_df.isnull().values.any():
            return 'Null values detected in ' + str(state)

        #keep the current state's results (states without any school add nothing)
        if not state_school_df.empty:
            state_frames.append(state_school_df)

    #nothing scraped: return an empty frame that still has the expected columns
    if not state_frames:
        return pd.DataFrame(columns = columns)

    #return finalized dataframe
    return pd.concat(state_frames, ignore_index = True)

In [4]:
#Define function to scrape each school link for list of players, input is school dataframe from previous function
def school_player(schools_df):
    """Scrape the PFR player table of every school link in ``schools_df``.

    Parameters
    ----------
    schools_df : pandas.DataFrame
        Must have an ``hs_link`` column; each value is converted with ``str`` and
        appended to the PFR ``/schools/`` url. Links are scraped in the order given,
        including repeated ones.

    Returns
    -------
    pandas.DataFrame
        One row per scraped player with the columns ``hs_link``, ``player_link``,
        ``player_name``, ``pos``, ``nfl_team``, ``year_min`` and ``year_max``. Every
        scraped value is kept as text. Empty (same columns) when nothing was scraped.
    str
        ``'Null values detected in <hs_link>'`` as soon as the scraped columns of one
        school do not line up. Callers must check for this before using the result.

    Raises
    ------
    requests.RequestException
        If a school page cannot be fetched (timeout, connection error or HTTP error
        status). Without this check an error page would be parsed as "no players"
        and the school would silently be missing from the result.
    """

    columns = ['hs_link', 'player_link', 'player_name', 'pos', 'nfl_team', 'year_min', 'year_max']
    #data-stat value of the <td> cell -> name of the output column it fills
    stat_to_column = {'pos': 'pos', 'teams': 'nfl_team', 'year_min': 'year_min', 'year_max': 'year_max'}
    base_url = 'https://www.pro-football-reference.com/schools/'

    #per-school frames are collected here and concatenated once at the end
    #(DataFrame.append was removed in pandas 2.0 and was quadratic inside a loop)
    school_frames = []

    #iterate over each school link in the list of schools provided
    for hs in schools_df.hs_link:

        #fetch the page; the timeout keeps one stalled request from hanging the run
        page = rq.get(base_url + str(hs), timeout = 30)
        page.raise_for_status()
        #save the html content of the page
        soup = bs(page.content, 'html.parser')

        #player link and name come from the anchor inside each row header cell
        player_link = []
        player_name = []
        for player in soup.find_all('th', scope = 'row'):
            anchor = player.find('a')
            player_link.append(anchor['href'])
            player_name.append(anchor.get_text())

        #remaining attributes come from the <td> cells, keyed by their data-stat value
        #(cells without a data-stat, or with an unrelated one, are ignored)
        attributes = {column: [] for column in stat_to_column.values()}
        for cell in soup.find_all('td'):
            column = stat_to_column.get(cell.get('data-stat'))
            if column is not None:
                attributes[column].append(cell.get_text())

        #build the school frame column by column; wrapping each list in a Series pads
        #shorter columns with NaN so that misaligned scrapes are caught just below
        scraped = {'player_link': player_link, 'player_name': player_name, **attributes}
        school_player_df = pd.DataFrame({
            column: pd.Series(values, dtype = object) for column, values in scraped.items()
        })

        #if there are null values, return this error message
        if school_player_df.isnull().values.any():
            return 'Null values detected in ' + str(hs)

        #tag every player row with the link of the school it was scraped from
        school_player_df.insert(0, 'hs_link', hs)

        #keep the current school's results (schools without any player add nothing)
        if not school_player_df.empty:
            school_frames.append(school_player_df)

    #nothing scraped: return an empty frame that still has the expected columns
    if not school_frames:
        return pd.DataFrame(columns = columns)

    #return finalized dataframe
    return pd.concat(school_frames, ignore_index = True)

In [2]:
#two-letter abbreviations (the states plus DC) to pass to the scraping function
states = [
    'AL', 'AK', 'AZ', 'AR', 'CA', 'CO', 'CT', 'DC', 'DE', 'FL',
    'GA', 'HI', 'ID', 'IL', 'IN', 'IA', 'KS', 'KY', 'LA', 'ME',
    'MD', 'MA', 'MI', 'MN', 'MS', 'MO', 'MT', 'NE', 'NV', 'NH',
    'NJ', 'NM', 'NY', 'NC', 'ND', 'OH', 'OK', 'OR', 'PA', 'RI',
    'SC', 'SD', 'TN', 'TX', 'UT', 'VT', 'VA', 'WA', 'WV', 'WI',
    'WY',
]

In [5]:
#override the full list above: restrict the run to Alaska only
#(presumably a quick test scrape, since all states take a few hours; skip this cell to scrape every state)
states = ['AK']

In [6]:
#run state_school with specified list of states and save
all_schools = state_school(states)

#state_school reports a failed null check by returning a message string instead of a
#dataframe; stop here so the failure is not masked by an unrelated error further down
if isinstance(all_schools, str):
    raise ValueError(all_schools)

In [7]:
#run school_player function on returned dataframe from above cell and save
all_players = school_player(all_schools)

#school_player reports a failed null check by returning a message string instead of a
#dataframe; stop here so the failure is not masked by an unrelated error further down
if isinstance(all_players, str):
    raise ValueError(all_players)

## Data Cleaning

In [8]:
#keep only the first copy of each repeated player row
#(repeats come from one school link being listed twice for separate schools)
all_players = all_players.drop_duplicates()

In [9]:
#inner join: attach each school's details to its players through the shared hs_link
export = pd.merge(all_schools, all_players, how = 'inner', on = 'hs_link')

In [10]:
#keep every row except the ones pairing this school link with MI (the incorrect school
#for that link), drop num_players and num_active because they are often inaccurate,
#then renumber the rows
export = (
    export[(export.hs_link != 'high_schools.cgi?id=93bdb950') | (export.hs_state != 'MI')]
    .drop(columns = ['num_players', 'num_active'])
    .reset_index(drop = True)
)

In [11]:
#display final data (merged school and player rows after the cleaning steps above)
export

Unnamed: 0,hs_link,hs_name,hs_city,hs_state,player_link,player_name,pos,nfl_team,year_min,year_max
0,high_schools.cgi?id=93b98f93,Lathrop,Fairbanks,AK,/players/T/TongRe20.htm,Reggie Tongue,DB,"KAN,SEA,NYJ,OAK",1996,2005
1,high_schools.cgi?id=93b98f93,Lathrop,Fairbanks,AK,/players/B/BonhSh20.htm,Shane Bonham,DT,"DET,SFO,IND",1994,1999
2,high_schools.cgi?id=93c085dc,East Anchorage,Anchorage,AK,/players/T/TosiMa20.htm,Mao Tosi,DT-DE,ARI,2000,2001
3,high_schools.cgi?id=93bf09cb,Ben Eielson,Fairbanks,AK,/players/N/NeviTo20.htm,Tom Neville,G-T,"GNB,SFO",1986,1992
4,high_schools.cgi?id=93baf296,North Pole,North Pole,AK,/players/C/CollDa20.htm,Daryn Colledge,T,"GNB,ARI,MIA",2006,2014
5,high_schools.cgi?id=2a0de876,Service,Anchorage,AK,/players/O/OverJe00.htm,Jeff Overbaugh,LS,"MIN,ATL",2017,2018
6,high_schools.cgi?id=93bfe2a9,Robert Service,Anchorage,AK,/players/S/SchlMa00.htm,Mark Schlereth,G-C,"WAS,DEN",1989,2000
7,high_schools.cgi?id=93bdffc5,A.J. Dimond,Anchorage,AK,/players/K/KupeCh20.htm,Chris Kuper,G,DEN,2006,2013
8,high_schools.cgi?id=93b9a334,Bartlett,Anchorage,AK,/players/B/BowmZa20.htm,Zackary Bowman,DB,"CHI,NYG,MIA",2008,2015
9,high_schools.cgi?id=93bf10ec,Howkan,Howkan,AK,/players/N/NixxGe20.htm,George Nix,G,BUF,1926,1926


## Write CSV

In [53]:
#write the cleaned school/player table to csv, leaving out the dataframe index
export.to_csv('preliminary-data/all-nfl-player.csv', index = False)