# Top College Football Recruits by Location

* Data collected for the top 500 recruits listed on 247Sports' recruiting service

In [134]:
# standard libraries
import os
import pandas as pd
import numpy as np

# web scraping
import requests
import urllib.request
import bs4

# visualization
import altair as alt

In [74]:
# list every entry in the '2020' directory, one name per line
entries = os.listdir('2020')
if entries:
    print(*entries, sep='\n')

cf1.html
cf10.html
cf2.html
cf3.html
cf4.html
cf5.html
cf6.html
cf7.html
cf8.html
cf9.html


In [99]:
recruit_list = []
directory = '2020'

# Parse every saved .html page in `directory` and collect one dict per recruit
# (Rank, Name, Location, Score). sorted() makes the processing order
# deterministic, since os.listdir() returns entries in arbitrary order.
for filename in sorted(os.listdir(directory)):
    if not filename.endswith(".html"):
        continue

    # Open in binary mode so BeautifulSoup detects the page encoding itself,
    # and use `with` so the file handle is closed after parsing.
    with open(os.path.join(directory, filename), 'rb') as recruit_file:
        # Name the parser explicitly: without it bs4 picks whichever parser
        # happens to be installed and emits a warning.
        soup = bs4.BeautifulSoup(recruit_file, 'html.parser')

    rank_box = soup.find_all('div', attrs={'class': 'primary'})
    name_box = soup.find_all('a', attrs={'class': 'rankings-page__name-link'})
    location_box = soup.find_all('span', attrs={'class': 'meta'})
    score_box = soup.find_all('span', attrs={'class': 'score'})

    # The four result lists are matched up by position; the rank list
    # decides how many recruits are read from this page.
    for i, rank_tag in enumerate(rank_box):
        recruit_list.append({
            'Rank': int(rank_tag.text.strip()),
            'Name': name_box[i].text.strip(),
            'Location': location_box[i].text.strip(),
            'Score': float(score_box[i].text.strip()),
        })

In [103]:
# peek at the first five scraped records to sanity-check the parse
recruit_list[:5]

[{'Rank': 1,
  'Name': 'Bryan Bresee',
  'Location': 'Damascus (Damascus, MD)',
  'Score': 0.999},
 {'Rank': 2,
  'Name': 'Zachary Evans',
  'Location': 'North Shore (Houston, TX)',
  'Score': 0.9987},
 {'Rank': 3,
  'Name': 'Justin Flowe',
  'Location': 'Upland (Upland, CA)',
  'Score': 0.9985},
 {'Rank': 4,
  'Name': 'Paris Johnson Jr.',
  'Location': 'St. Xavier (Cincinnati, OH)',
  'Score': 0.998},
 {'Rank': 5,
  'Name': 'Julian Fleming',
  'Location': 'Southern Columbia (Catawissa, PA)',
  'Score': 0.9978}]

In [153]:
# create dataframe with a fixed column order; passing `columns` here also
# yields an empty, correctly-shaped frame when nothing was scraped
df = pd.DataFrame(recruit_list, columns=['Rank', 'Name', 'Location', 'Score'])

# add geographic features
# 'Location' has the form "High School (City, ST)". Drop only the closing
# paren and split on the LAST '(' so that a school name which itself contains
# parentheses is not cut apart.
location = df['Location'].str.strip().str.rstrip(')')
location_parts = location.str.rsplit('(', n=1)

# strip each piece: the school name otherwise keeps the space before '('
df['HS'] = location_parts.str[0].str.strip()
df['CityState'] = location_parts.str[1].str.strip()

city_state_parts = df['CityState'].str.split(',')
df['City'] = city_state_parts.str[0].str.strip()
df['State'] = city_state_parts.str[1].str.strip()

# the raw string is now fully represented by HS / City / State
df = df.drop(columns=['Location'])

# view first 20 rows
df.head(20)

Unnamed: 0,Rank,Name,Score,HS,CityState,City,State
0,1,Bryan Bresee,0.999,Damascus,"Damascus, MD",Damascus,MD
1,2,Zachary Evans,0.9987,North Shore,"Houston, TX",Houston,TX
2,3,Justin Flowe,0.9985,Upland,"Upland, CA",Upland,CA
3,4,Paris Johnson Jr.,0.998,St. Xavier,"Cincinnati, OH",Cincinnati,OH
4,5,Julian Fleming,0.9978,Southern Columbia,"Catawissa, PA",Catawissa,PA
5,6,DJ Uiagalelei,0.9976,St. John Bosco,"Bellflower, CA",Bellflower,CA
6,7,Sav'ell Smalls,0.9972,Kennedy Catholic,"Burien, WA",Burien,WA
7,8,Jordan Burch,0.9971,Hammond School,"Columbia, SC",Columbia,SC
8,9,Myles Murphy,0.9966,Hillgrove,"Powder Springs, GA",Powder Springs,GA
9,10,Elias Ricks,0.9965,Mater Dei,"Santa Ana, CA",Santa Ana,CA


In [133]:
# export to csv
# NOTE(review): to_csv writes the row index as an unnamed first column by
# default; pass index=False if readers of this file do not expect it — confirm
df.to_csv(r'cf_recruits_2020.csv')

# Maps

In [164]:
# load the state-level population table and give its columns fixed names
pop_df = pd.read_csv('US_Population_Density_by_State.csv')
pop_df.columns = ['State', 'State Code', 'FIPS', 'Pop Density', 'Population', 'Area']

# these three columns are read as comma-formatted strings: remove the commas
# and store the values as unsigned 32-bit integers
for numeric_col in ('Pop Density', 'Population', 'Area'):
    without_commas = pop_df[numeric_col].str.replace(',', '')
    pop_df[numeric_col] = without_commas.astype('uint32')

pop_df.head()

Unnamed: 0,State,State Code,FIPS,Pop Density,Population,Area
0,District of Columbia,DC,11,11535,703608,61
1,New Jersey,NJ,34,1228,9032872,7354
2,Rhode Island,RI,44,1027,1061712,1034
3,Massachusetts,MA,25,884,6895917,7800
4,Connecticut,CT,9,741,3588683,4842


In [166]:
# lookup tables for the map: FIPS id -> state code, and the reverse
# (state code -> FIPS id)
state_geo_dict = pop_df.set_index('FIPS')['State Code'].to_dict()
inv_state_geo_dict = dict(zip(state_geo_dict.values(), state_geo_dict.keys()))

In [167]:
# show the state code -> FIPS id lookup table
inv_state_geo_dict

{'DC': 11,
 'NJ': 34,
 'RI': 44,
 'MA': 25,
 'CT': 9,
 'MD': 24,
 'DE': 10,
 'NY': 36,
 'FL': 12,
 'PA': 42,
 'OH': 39,
 'CA': 6,
 'IL': 17,
 'HI': 15,
 'VA': 51,
 'NC': 37,
 'IN': 18,
 'GA': 13,
 'MI': 26,
 'SC': 45,
 'TN': 47,
 'NH': 33,
 'KY': 21,
 'WA': 53,
 'TX': 48,
 'LA': 22,
 'WI': 55,
 'AL': 1,
 'MO': 29,
 'WV': 54,
 'MN': 27,
 'VT': 50,
 'MS': 28,
 'AZ': 4,
 'AR': 5,
 'IA': 19,
 'OK': 40,
 'CO': 8,
 'OR': 41,
 'ME': 23,
 'UT': 49,
 'KS': 20,
 'NV': 32,
 'NE': 31,
 'ID': 16,
 'NM': 35,
 'SD': 46,
 'ND': 38,
 'MT': 30,
 'WY': 56,
 'AK': 2}

In [174]:
# tag each recruit with the FIPS id of their state (the key the map lookup
# joins on); state codes missing from inv_state_geo_dict get 0
fips_id = df['State'].map(inv_state_geo_dict)
df['id'] = fips_id.fillna(0).astype('int')

In [175]:
# confirm the new 'id' column was added alongside the state code
df.head()

Unnamed: 0,Rank,Name,Score,HS,CityState,City,State,id
0,1,Bryan Bresee,0.999,Damascus,"Damascus, MD",Damascus,MD,24
1,2,Zachary Evans,0.9987,North Shore,"Houston, TX",Houston,TX,48
2,3,Justin Flowe,0.9985,Upland,"Upland, CA",Upland,CA,6
3,4,Paris Johnson Jr.,0.998,St. Xavier,"Cincinnati, OH",Cincinnati,OH,39
4,5,Julian Fleming,0.9978,Southern Columbia,"Catawissa, PA",Catawissa,PA,42


In [177]:
# number of recruits per state code, most common first.
# The 'ON' entry in the output has no key in inv_state_geo_dict, so that
# recruit is assigned id 0 (presumably Ontario — verify against the raw page).
df['State'].value_counts()

FL    74
TX    64
GA    50
CA    44
AL    24
LA    24
TN    16
MI    16
MD    16
NC    15
NJ    13
OH    13
AZ    12
VA    11
MO     9
WA     8
MS     8
DC     8
KY     7
CO     7
IL     6
CT     6
SC     6
MA     5
OK     4
NY     4
IN     3
UT     3
HI     3
KS     3
WI     3
MN     2
NV     2
AR     2
PA     2
SD     1
NE     1
IA     1
ON     1
NH     1
ID     1
OR     1
Name: State, dtype: int64

In [None]:
# US state outlines as TopoJSON. vega_datasets is not imported in this
# notebook, so the us-10m file it would normally supply is referenced by URL.
US_STATES_TOPOJSON_URL = 'https://cdn.jsdelivr.net/npm/vega-datasets@v1.29.0/data/us-10m.json'
states = alt.topo_feature(US_STATES_TOPOJSON_URL, 'states')

# One row per state: FIPS id plus the number of recruits from that state.
# id 0 marks recruits whose state code had no FIPS match, so it is excluded.
source = (
    df.loc[df['id'] > 0]
    .groupby('id')
    .size()
    .reset_index(name='recruits')
)

# Choropleth layer: each state shape is coloured by its recruit count, joined
# to the shapes through the shared FIPS 'id'.
foreground = alt.Chart(states).mark_geoshape(stroke='black').encode(
    color=alt.Color(
        'recruits:Q',
        sort="descending",
        scale=alt.Scale(scheme='viridis'),
        legend=alt.Legend(title="Recruits", tickCount=6),
    )
).transform_lookup(
    lookup='id',
    from_=alt.LookupData(source, 'id', ['recruits'])
).project(
    type='albersUsa'
).properties(
    title="Top College Football Recruits by State (2020)",
    width=700,
    height=350
)

# Plain grey base layer so states without any recruits are still drawn.
background = alt.Chart(states).mark_geoshape(
    fill='lightgray',
    stroke='black'
).properties(
    width=700,
    height=350
).project('albersUsa')

config = alt.layer(background, foreground).configure_title(
    fontSize=20, anchor="middle"
).configure_legend(
    titleColor='black', titleFontSize=14
)

config