In [1]:
import requests
import string
from bs4 import BeautifulSoup
import time
from pprint import pprint
import json

# Part one of the Data Acquisition is to crawl through the sports reference players list from A to Z.  Gather every player's
# unique identifier and link to their stats page and store them in a json object.

In [2]:
def test_404(link:str) -> bool:
    '''
    Boolean function that takes input of a URL and returns True if the link is 404 does not exist.
    Else returns False
    '''
    code = requests.get(link).status_code
    if code == 404:
        return True
    else:
        return False

In [3]:
def extract_id(link:str) -> str:
    '''
    Function that takes input of the player URL and returns their Unique ID
    '''
    return (link.split("/cfb/players/")[1]).split(".html")[0]

In [4]:
def get_href(all_hrefs, letter:str) -> list:
    '''
    Function that takes input of all hrefs from Beautiful soup object, and input of the current letter of last name.
    Returns Two lists.  links_list is a list of links to player stat pages.  PlayerIds is a list of unique identifierss of players.
    Corresponding link and player ID match at the same index in each list.
    Dependency on function extract_id
    '''
    
    links_list = []
    playerIds = []
    
    for row in all_hrefs:      
        temp_href = str(row.get("href"))
        if '/cfb/players/' in temp_href and '-index' not in temp_href and temp_href != '/cfb/players/':
            tempId = extract_id(temp_href)
            if tempId.split('-')[1][0] == letter:
                playerIds.append(tempId)
                links_list.append(temp_href)
            
    return links_list, playerIds

In [9]:
def get_player_links(letter:str, pageNum:int) -> list:
    '''
    Function that takes input of current letter and page number.  Returns the two lists of playerIDs and links to their page
    per letter and page number.
    Dependency on functions get_href, extract_id, test_404
    '''

    if pageNum == 1:
        url = f"https://www.sports-reference.com/cfb/players/{letter}-index.html"
    else:
        url = f"https://www.sports-reference.com/cfb/players/{letter}-index-{pageNum}.html"
    if test_404(url):
        return 'final', 'final'
    
    request = requests.get(url)
    time.sleep(.3)
    soup = BeautifulSoup(request.text, "html.parser")
    all_hrefs = soup.find_all("a")
    
    return get_href(all_hrefs, letter)

In [6]:
def addwww(player_dict:dict):
    '''
    Function that takes the input of the player dict and adds the leading http for sports reference if the link does not currently have it.
    '''
    
    leadingUrl = 'https://www.sports-reference.com'
    for letter in player_dict:
        for i in range(len(player_dict[letter]['playerIds'])):
            if 'www.' not in player_dict[letter]['Body'][i]['collegeLink']:
                player_dict[letter]['Body'][i]['collegeLink'] = leadingUrl + player_dict[letter]['Body'][i]['collegeLink']
    return player_dict

In [7]:
'''
Setting up empty dict object for the main code.
'''

player_dict = {}
alphabet = list(string.ascii_lowercase)
for letter in alphabet:
    player_dict[letter] = {}
player_dict

{'a': {},
 'b': {},
 'c': {},
 'd': {},
 'e': {},
 'f': {},
 'g': {},
 'h': {},
 'i': {},
 'j': {},
 'k': {},
 'l': {},
 'm': {},
 'n': {},
 'o': {},
 'p': {},
 'q': {},
 'r': {},
 's': {},
 't': {},
 'u': {},
 'v': {},
 'w': {},
 'x': {},
 'y': {},
 'z': {}}

In [10]:
'''
Main code to be run.  Dependency on functions get_href, extract_id, test_404, get_player_links
This code will crawl through sports reference website and acquire all player unique IDs and their corresponding links to their stats page.
This will populate the player dict object.
'''

pageNum = 1
for letter in player_dict:
    print(f"Starting for letter {letter}")
    while True:
        print(f"Crawling for page number {pageNum}")
        links, playerIds = get_player_links(letter, pageNum)

        if playerIds == 'final':
            print(f"\nMoving to next letter\nTotal links gained: {len(player_dict[letter]['playerIds'])}","\n--------------------------------\n")
            pageNum = 1
            break

        else:
            for i in range(len(playerIds)):
                try:
                    player_dict[letter]['playerIds'].append(playerIds[i])
                    player_dict[letter]['Body'].append({'collegeLink' : links[i]})
                except KeyError:
                    player_dict[letter]['playerIds'] = []
                    player_dict[letter]['Body'] = []
                    
        pageNum += 1


Starting for letter a
Crawling for page number 1
Crawling for page number 2
Crawling for page number 3
Crawling for page number 4
Crawling for page number 5
Crawling for page number 6
Crawling for page number 7
Crawling for page number 8
Crawling for page number 9
Crawling for page number 10
Crawling for page number 11

Moving to next letter
Total links gained: 4219 
--------------------------------

Starting for letter b
Crawling for page number 1
Crawling for page number 2
Crawling for page number 3
Crawling for page number 4
Crawling for page number 5
Crawling for page number 6
Crawling for page number 7
Crawling for page number 8


KeyboardInterrupt: 

In [13]:
'''
Add leading http statement to links.
Dependency on addwww function.
'''

player_dict = addwww(player_dict)

In [12]:
'''
Testing player dict object after running main code.
Validate results.
'''

print(len(player_dict['a']['playerIds']))
print(len(player_dict['a']['Body']))
pprint(player_dict['a']['playerIds'])

4219
4219
['daniel-ahara-1',
 'isaako-aaitui-1',
 'antuan-aaron-1',
 'austin-aaron-1',
 'david-aaron-1',
 'doyle-aaron-1',
 'james-aaron-1',
 'jarell-aaron-1',
 'jeff-aaron-1',
 'larry-aaron-1',
 'montel-aaron-1',
 'oliver-aaron-1',
 'grant-aasen-1',
 'abule-abadi-fitzgerald-1',
 'pete-abadie-1',
 'ulysses-abadie-1',
 'michael-abana-1',
 'israel-abanikanda-1',
 'tokumbo-abanikanda-1',
 'keoki-abasial-1',
 'joe-abate-1',
 'ali-abbas-1',
 'jon-abbate-1',
 'richard-abbe-1',
 'christopher-abbes-1',
 'bill-abbey-1',
 'don-abbey-1',
 'chase-abbington-1',
 'bill-abbot-1',
 'aj-abbott-1',
 'aj-abbott-2',
 'aaron-abbott-1',
 'andrew-abbott-1',
 'blake-abbott-1',
 'bo-abbott-1',
 'boone-abbott-1',
 'britton-abbott-1',
 'brooks-abbott-1',
 'brooks-abbott-2',
 'bryce-abbott-1',
 'cliff-abbott-1',
 'jake-abbott-1',
 'jim-abbott-1',
 'mike-abbott-1',
 'mike-abbott-2',
 'nootie-abbott-1',
 'vince-abbott-1',
 'vincent-abbott-1',
 'jared-abbrederis-1',
 'nick-abbs-1',
 'zelos-abby-1',
 'emmanuel-abdall

In [14]:
'''
Testing player dict object after running main code.
Validate results.
'''

pprint(player_dict['a']['Body'])

[{'collegeLink': 'https://www.sports-reference.com/cfb/players/daniel-ahara-1.html'},
 {'collegeLink': 'https://www.sports-reference.com/cfb/players/isaako-aaitui-1.html'},
 {'collegeLink': 'https://www.sports-reference.com/cfb/players/antuan-aaron-1.html'},
 {'collegeLink': 'https://www.sports-reference.com/cfb/players/austin-aaron-1.html'},
 {'collegeLink': 'https://www.sports-reference.com/cfb/players/david-aaron-1.html'},
 {'collegeLink': 'https://www.sports-reference.com/cfb/players/doyle-aaron-1.html'},
 {'collegeLink': 'https://www.sports-reference.com/cfb/players/james-aaron-1.html'},
 {'collegeLink': 'https://www.sports-reference.com/cfb/players/jarell-aaron-1.html'},
 {'collegeLink': 'https://www.sports-reference.com/cfb/players/jeff-aaron-1.html'},
 {'collegeLink': 'https://www.sports-reference.com/cfb/players/larry-aaron-1.html'},
 {'collegeLink': 'https://www.sports-reference.com/cfb/players/montel-aaron-1.html'},
 {'collegeLink': 'https://www.sports-reference.com/cfb/play

In [13]:
'''
Once main code is run and validated.  Save file to player_dict.json
'''


with open("player_dict.json", 'w') as outfile:
    json.dump(player_dict, outfile)

## End web crawling Part 1

## Further testing and validation of results

In [10]:
'''
Load the saved player_dict.json for testing
'''


with open('player_dict.json') as json_file:
    player_dict = json.load(json_file)

In [11]:
'''
Check playerID counts for all letters.
'''

stop = 0 

for row in player_dict:
    print(row)
    print("PlayerIds", len(player_dict[row]['playerIds']))
    for i in range(len(player_dict[row]['playerIds'])):
        if 'proLink' not in player_dict[row]['Body'][i]:
            print(player_dict[row]['Body'][i])
            stop = 1
            break
    if stop == 1:
        break

a
PlayerIds 4219
b
PlayerIds 12603
c
PlayerIds 9264
d
PlayerIds 6121
e
PlayerIds 2396
f
PlayerIds 4582
g
PlayerIds 6319
h
PlayerIds 10027
i
PlayerIds 569
j
PlayerIds 5741
k
PlayerIds 4045
l
PlayerIds 5567
m
PlayerIds 12067
n
PlayerIds 2167
o
PlayerIds 1892
p
PlayerIds 6121
q
PlayerIds 178
r
PlayerIds 6408
s
PlayerIds 12013
t
PlayerIds 5244
u
PlayerIds 339
v
PlayerIds 1342
w
PlayerIds 9381
x
PlayerIds 3
y
PlayerIds 685
z
PlayerIds 483
