In [2]:
import json
import os
import random
import string
import time
from pprint import pprint

import requests
from bs4 import BeautifulSoup

## Part two loads the results of part one from the player_dict.json file and, for each college player,
## collects the pro link when one is present on that player's college page.
## Each pro link found is saved back into the dictionary object.

In [4]:
'''
Read player_dict.json (the file produced by part one) into the player_dict dictionary.
'''

with open('player_dict.json') as source_file:
    player_dict = json.load(source_file)

In [5]:
def get_hrefPro(all_hrefs) -> str:
    '''
    Look through the anchor elements collected from one player page and pick out the pro link.

    all_hrefs: iterable of objects exposing .get("href") (e.g. the <a> tags found by BeautifulSoup).

    Returns the first href, as a string, that contains 'pro-football-reference.com/players'.
    Returns None when no anchor matches (including when all_hrefs is empty).
    '''

    pro_marker = 'pro-football-reference.com/players'

    # str() keeps the membership test safe for anchors without an href (None becomes 'None').
    candidate_urls = (str(anchor.get("href")) for anchor in all_hrefs)

    # The generator is lazy, so scanning stops at the first match, just like an early return.
    return next((url for url in candidate_urls if pro_marker in url), None)


In [6]:
def get_player_proLink(collegeLink:str, timeout:float = 30) -> str:
    '''
    Takes input of a single college player url link and will return their pro link if it exists.
    Else it will return None.
    Dependency on function get_hrefPro.

    collegeLink: URL of the college player page to download.
    timeout: seconds to wait for the server before giving up (passed to requests.get).

    Raises requests.exceptions.HTTPError when the server answers with an error status
    (4xx/5xx), and requests.exceptions.Timeout when the server does not respond in time.
    Failing loudly matters here: an error page contains no pro link, so parsing it would
    return None and the caller would record "no pro link" for a player that was never
    actually checked.
    '''

    response = requests.get(collegeLink, timeout=timeout)
    # Pause after every request, successful or not, to stay polite to the server.
    time.sleep(.3)
    # Refuse to parse error pages (rate limiting, server errors, missing pages).
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "html.parser")
    all_hrefs = soup.find_all("a")

    return get_hrefPro(all_hrefs)
 


In [7]:
'''
Smoke test: fetch the pro link for one known college player page and print it.
'''

sample_college_url = 'https://www.sports-reference.com/cfb/players/amari-cooper-1.html'
test = get_player_proLink(sample_college_url)
print(test)

https://www.pro-football-reference.com/players/C/CoopAm00.htm


In [76]:
def save_json(number:int, every:int = 500, path:str = "player_dict.json", data=None):
    '''
    Function to be used in main code that will save progress whenever number is a
    multiple of every (by default every 500th index, including index 0).

    number: index of the player that was just processed.
    every: checkpoint interval; nothing is written unless number % every == 0.
    path: JSON file to write the checkpoint to.
    data: object to serialise; when None the notebook-level player_dict is used.

    The checkpoint is written to a temporary file next to path and then moved into
    place with os.replace, so an interruption in the middle of json.dump cannot
    leave path truncated and unreadable.

    Returns None.
    '''

    if number % every != 0:
        return

    if data is None:
        # Fall back to the dictionary loaded at the top of the notebook.
        data = player_dict

    tmp_path = f"{path}.tmp"
    with open(tmp_path, 'w') as outfile:
        json.dump(data, outfile)
    # Swap the finished file in; the previous checkpoint stays intact until this point.
    os.replace(tmp_path, path)
    print("Saved JSON")
    

In [100]:
'''
Main code that goes through all college players in the player_dict object and checks whether there is a pro link on their page.
If there is a pro link on the page it is added to the player_dict object under 'proLink'.  If there is no pro link on the
page, 'proLink' is set to None.  The player_dict object is written to the player_dict.json file after each letter, and also
when a letter is cut short by an error or a manual interrupt, so no gathered links are lost.  Rows where 'proLink' is already
present are skipped, allowing this code to be stopped and then run again later from where it left off.
Dependency on functions save_json, get_player_proLink, get_hrefPro
'''

# Pool of delays (seconds) drawn from at random between requests.
rand_list = [.2,.4,.6,.7,.8,.3,.1,1,.9, 3,.6,.7,.8]
collegeCount = 0
proCount = 0

for letter in player_dict:
    player_ids = player_dict[letter]['playerIds']
    bodies = player_dict[letter]['Body']
    total_in_letter = len(player_ids)

    print(f"Starting for letter {letter}\nTotal in letter {total_in_letter}")
    collegeCount += total_in_letter
    tempProCount = 0
    # Stays 0 only when every row of this letter was already processed (no request made).
    tempRandnum = 0

    try:
        for i, player_id in enumerate(player_ids):
            entry = bodies[i]

            # Already checked on an earlier run: count it if a link was found, then move on.
            if 'proLink' in entry:
                if entry['proLink'] is not None:
                    tempProCount += 1
                continue

            print(f"Checking for player {player_id}\nCount {i} out of {total_in_letter}")
            proLink = get_player_proLink(entry['collegeLink'])
            entry['proLink'] = proLink

            tempRandnum = random.choice(rand_list)

            if proLink is None:
                print(f"No professional stats found.\nWating {tempRandnum}")
            else:
                tempProCount += 1
                print(f"Professional stats found. Added Link\nWating {tempRandnum}")
            print("--------------------------\n")

            time.sleep(tempRandnum)
            save_json(i)
    finally:
        # Write progress at the end of the letter AND when the loop is cut short by an
        # exception or a manual interrupt, so work since the last checkpoint is kept.
        with open("player_dict.json", 'w') as outfile:
            json.dump(player_dict, outfile)

    print(f"Total Professional Links found for letter {letter} {tempProCount}","\n-----------------------------\n")
    proCount += tempProCount

    # Longer pause between letters, but only if this letter actually hit the network.
    if tempRandnum != 0:
        time.sleep(30)

print(f"Total count of all College links {collegeCount}")      
print(f"Total count of all professional links {proCount}")

Starting for letter a
Total in letter 4219
Total Professional Links found for letter a 440 
-----------------------------

Starting for letter b
Total in letter 12603
Total Professional Links found for letter b 1372 
-----------------------------

Starting for letter c
Total in letter 9264
Total Professional Links found for letter c 959 
-----------------------------

Starting for letter d
Total in letter 6121
Total Professional Links found for letter d 623 
-----------------------------

Starting for letter e
Total in letter 2396
Total Professional Links found for letter e 245 
-----------------------------

Starting for letter f
Total in letter 4582
Total Professional Links found for letter f 445 
-----------------------------

Starting for letter g
Total in letter 6319
Total Professional Links found for letter g 658 
-----------------------------

Starting for letter h
Total in letter 10027
Total Professional Links found for letter h 1048 
-----------------------------

Starting for

In [56]:
'''
Spot-check the player_dict object: display one entry under letter 'a' to validate the results.
'''

letter_a_entries = player_dict['a']['Body']
letter_a_entries[2247]

{'collegeLink': 'https://www.sports-reference.com/cfb/players/andre-anderson-2.html',
 'proLink': 'https://www.pro-football-reference.com/players/A/AndeAn23.htm'}

In [83]:
'''
For every letter, print how many entries are in 'playerIds' and in 'Body' so the two counts can be compared.
'''

for letter_key, letter_data in player_dict.items():
    print(letter_key)
    print("PlayerIds", len(letter_data['playerIds']))
    print("Body", len(letter_data['Body']),'\n')

a
PlayerIds 4219
Body 4219 

b
PlayerIds 12603
Body 12603 

c
PlayerIds 9264
Body 9264 

d
PlayerIds 6121
Body 6121 

e
PlayerIds 2396
Body 2396 

f
PlayerIds 4582
Body 4582 

g
PlayerIds 6319
Body 6319 

h
PlayerIds 10027
Body 10027 

i
PlayerIds 569
Body 569 

j
PlayerIds 5741
Body 5741 

k
PlayerIds 4045
Body 4045 

l
PlayerIds 5567
Body 5567 

m
PlayerIds 12067
Body 12067 

n
PlayerIds 2167
Body 2167 

o
PlayerIds 1892
Body 1892 

p
PlayerIds 6121
Body 6121 

q
PlayerIds 178
Body 178 

r
PlayerIds 6408
Body 6408 

s
PlayerIds 12013
Body 12013 

t
PlayerIds 5244
Body 5244 

u
PlayerIds 339
Body 339 

v
PlayerIds 1342
Body 1342 

w
PlayerIds 9381
Body 9381 

x
PlayerIds 3
Body 3 

y
PlayerIds 685
Body 685 

z
PlayerIds 483
Body 483 

