## 1. Set up the scraper
* Download data from nfl.com
* Use internal api with Bearer Key

### 1.1 Get a working example for Bearer token

In [2]:
import time
from browsermobproxy import Server
from selenium import webdriver
import json
import os
BMP_PATH = os.path.abspath('browsermob-proxy-2.1.4/bin/browsermob-proxy.bat')

# Start BrowserMob Proxy
server = Server(BMP_PATH)
server.start()
proxy = server.create_proxy()

# Configure Chrome with the proxy
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument(f'--proxy-server={proxy.proxy}')
chrome_options.add_argument('--ignore-certificate-errors')
chrome_options.add_argument('--headless')
driver = webdriver.Chrome(options=chrome_options)
# Navigate to the website
driver.get("https://nfl.com/scores")

# Start capturing network traffic
proxy.new_har("nfl", options={'captureHeaders': True})

# Wait for the header with the name "Authorization"
timeout = 60  # Adjust the timeout as needed
def get_auth_token():
    start_time = time.time()
    while time.time() - start_time < timeout:
        har = proxy.har
        for entry in har['log']['entries']:
            request_headers = entry['request']['headers']
            for header in request_headers:
                if header['name'] == 'Authorization':
                    return header['value']
        time.sleep(1)
    return None
auth_token = get_auth_token()

# Store the auth token
auth_path = 'auth.json'
auth_json = {
    'time': time.time(),
    'token': auth_token
}
with open(auth_path, 'w') as f:
    json.dump(auth_json, f)

# Stop capturing network traffic
har = proxy.har

# Close the browser and BrowserMob Proxy
driver.quit()
server.stop()

### 1.2 Create a class that manages requests to nfl.com

In [5]:
import time
from browsermobproxy import Server
from selenium import webdriver
from pathlib import Path
import json
import requests
import os

class NFLClient():
    """Manages access to the nfl.com api"""
    BMP_PATH = os.path.abspath('browsermob-proxy-2.1.4/bin/browsermob-proxy.bat')
    AUTH_PATH = Path('auth.json')
    AUTH_ENDPOINT = 'https://nfl.com/scores'
    API_ROOT = 'https://api.nfl.com'
    HAR_DIR = Path('nfl_client_data/har_files')
    URL_JSON_PATH = Path('nfl_client_data/urls.json')
    TOKEN_EXPIRE_RATE = 60*60
    TOKEN_AUTH_TIMEOUT = 60

    # Har download variables
    har = {}
    har_path = None
    server = None
    proxy = None
    driver = None

    # Auth variables
    auth_token = None
    last_auth_download_time = 0
    headers = {}

    def __init__(self):
        os.makedirs(self.HAR_DIR, exist_ok=True)

    def load_auth_token(self):
        """Loads the auth token into the client."""
        if os.path.exists(self.AUTH_PATH):
            with open(self.AUTH_PATH, 'r') as f:
                auth_json = json.load(f)
            self.auth_token = auth_json['token']
            self.last_auth_download_time = auth_json['time']
        if self.last_auth_download_time < time.time()-self.TOKEN_EXPIRE_RATE or not self.auth_token:
            self.download_auth_token()

    def prep_proxy(self, endpoint=None):
        """Prepares the driver and proxy to get the har"""
        if not endpoint:
            endpoint = self.AUTH_ENDPOINT
        print(f'Getting the {endpoint} proxy')
        # Start BrowserMob Proxy
        self.server = Server(self.BMP_PATH)
        self.server.start()
        self.proxy = self.server.create_proxy()

        # Configure Chrome with the proxy
        chrome_options = webdriver.ChromeOptions()
        chrome_options.add_argument(f'--proxy-server={self.proxy.proxy}')
        chrome_options.add_argument('--ignore-certificate-errors')
        chrome_options.add_argument('--headless')
        self.driver = webdriver.Chrome(options=chrome_options)
        # Navigate to the website
        self.driver.get(endpoint)

        # Start capturing network traffic
        self.proxy.new_har("nfl", options={'captureHeaders': True})

        # Create the path for storing har data
        try:
            raw_path = endpoint.split('.com')[1]
            raw_path = raw_path.split('?')[0]
            str_path = raw_path.replace('/', '__')
        except:
            str_path = 'nfl'
        self.har_path = Path(str_path+'.har')

    def close_proxy(self):
        """Closes the driver and proxy"""
        self.server.stop()
        self.driver.quit()

    def wait_for_auth_token(self):
        """Downloads the auth token using the scores page of nfl.com"""
        # Wait for the header with the name "Authorization"
        def get_auth_token():
            start_time = time.time()
            while time.time() - start_time < self.TOKEN_AUTH_TIMEOUT:
                self.har = self.proxy.har
                for entry in self.har['log']['entries']:
                    request_headers = entry['request']['headers']
                    for header in request_headers:
                        if header['name'] == 'Authorization':
                            return header['value']
                time.sleep(1)
            return None
        self.auth_token = get_auth_token()

    def store_auth_token(self):
        """Stores the current auth token"""
        # Store the auth token
        self.time = time.time()
        auth_json = {
            'time': self.time,
            'token': self.auth_token
        }
        with open(self.AUTH_PATH, 'w') as f:
            json.dump(auth_json, f)

    def store_har(self):
        """Stores the current har"""
        # Store the har as well
        with open(self.har_path, 'w') as f:
            json.dump(self.har, f)

    def download_auth_token(self):
        """Downloads the auth token"""
        self.prep_proxy()         
        self.wait_for_auth_token()
        self.store_auth_token()
        self.store_har()

    def download_endpoints(self, endpoint=None):
        """Downloads the endpoints found in har, stores json, updates readme, updates Mixin for the class"""
        if not self.har:
            self.prep_proxy(endpoint)
            self.wait_for_auth_token()
            self.store_auth_token()
            self.store_har()
            self.close_proxy()

        # Gather the list of endpoints
        url_list = []
        for entry in self.har['log']['entries']:
            request = entry['request']
            raw_url = request['url']
            if self.API_ROOT in raw_url:
                url_split = raw_url.split('?')
                full_url = url_split[0]
                new_endpoint = ''
                base_split = full_url.split('.com') 
                if len(base_split) > 1:
                    new_endpoint = base_split[1]
                param_list = []
                if len(url_split) > 1:
                    for param in url_split[1]:
                        param, val = param.split('=')
                        param_list.append({
                            'param': param,
                            'val': val
                        }) 
                url_list.append({
                    'raw_url': raw_url,
                    'endpoint': new_endpoint,
                    'params': param_list
                })

        # Store the url list json
        with open(self.URL_JSON_PATH, 'w') as f:
            json.dump(url_list, f)

    def request(self, endpoint) -> dict:
        """Request an endpoint"""
        self.load_auth_token()
        try:
            return requests.get(endpoint, headers=self.headers)
        except Exception as e:
            return {
                'error': e
            }   

In [6]:
client = NFLClient()
client.load_auth_token()
client.auth_token

Getting the https://nfl.com/scores proxy


'Bearer eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJjbGllbnRJZCI6ImU1MzVjN2MwLTgxN2YtNDc3Ni04OTkwLTU2NTU2ZjhiMTkyOCIsImNsaWVudEtleSI6IjRjRlVXNkRtd0pwelQ5TDdMckczcVJBY0FCRzVzMDRnIiwiaXNzIjoiTkZMIiwiZGV2aWNlSWQiOiJlZDgyNWI5Yi1jNDg3LTQyZTEtYTc5Mi1iNjg0NDQ1NzNhODYiLCJwbGFucyI6W3sicGxhbiI6ImZyZWUiLCJleHBpcmF0aW9uRGF0ZSI6IjIwMjQtMTEtMDQiLCJzb3VyY2UiOiJORkwiLCJzdGFydERhdGUiOiIyMDIzLTExLTA1Iiwic3RhdHVzIjoiQUNUSVZFIiwidHJpYWwiOmZhbHNlfV0sIkRpc3BsYXlOYW1lIjoiV0VCX0RFU0tUT1BfREVTS1RPUCIsIk5vdGVzIjoiIiwiZm9ybUZhY3RvciI6IkRFU0tUT1AiLCJsdXJhQXBwS2V5IjoiU1pzNTdkQkdSeGJMNzI4bFZwN0RZUSIsInBsYXRmb3JtIjoiREVTS1RPUCIsInByb2R1Y3ROYW1lIjoiV0VCIiwiY2l0eSI6ImFwcGxlIHZhbGxleSIsImNvdW50cnlDb2RlIjoiVVMiLCJkbWFDb2RlIjoiODAzIiwiaG1hVGVhbXMiOlsiMTA0MDI1MTAtODkzMS0wZDVmLTk4MTUtNzliYjc5NjQ5YTY1IiwiMTA0MDI1MjAtOTZiZi1lOWYyLTRmNjgtODUyMWNhODk2MDYwIiwiMTA0MDQ0MDAtM2IzNS0wNzNmLTE5N2UtMTk0YmI4MjQwNzIzIl0sInJlZ2lvbiI6IkNBIiwiYnJvd3NlciI6IkNocm9tZSBIZWFkbGVzcyIsImNlbGx1bGFyIjpmYWxzZSwiZW52aXJvbm1lbnQiOiJwcm9kdWN0aW9uIiwicm9sZXM

In [103]:
import json


def get_token():
    with open('nfl.har', 'r') as f:
        har_json = json.load(f)
    for entry in har_json['log']['entries']:
        for header in entry['request']['headers']:
            if header['name'].lower() == 'authorization':
                return header['value']
get_token()


'Bearer eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJjbGllbnRJZCI6ImU1MzVjN2MwLTgxN2YtNDc3Ni04OTkwLTU2NTU2ZjhiMTkyOCIsImNsaWVudEtleSI6IjRjRlVXNkRtd0pwelQ5TDdMckczcVJBY0FCRzVzMDRnIiwiaXNzIjoiTkZMIiwiZGV2aWNlSWQiOiI3ZWRmZDk4Yy03MDY0LTQyMTUtYjFiMi0yOTY4MGFhODgyNzgiLCJwbGFucyI6W3sicGxhbiI6ImZyZWUiLCJleHBpcmF0aW9uRGF0ZSI6IjIwMjQtMTEtMDQiLCJzb3VyY2UiOiJORkwiLCJzdGFydERhdGUiOiIyMDIzLTExLTA0Iiwic3RhdHVzIjoiQUNUSVZFIiwidHJpYWwiOmZhbHNlfV0sIkRpc3BsYXlOYW1lIjoiV0VCX0RFU0tUT1BfREVTS1RPUCIsIk5vdGVzIjoiIiwiZm9ybUZhY3RvciI6IkRFU0tUT1AiLCJsdXJhQXBwS2V5IjoiU1pzNTdkQkdSeGJMNzI4bFZwN0RZUSIsInBsYXRmb3JtIjoiREVTS1RPUCIsInByb2R1Y3ROYW1lIjoiV0VCIiwiY2l0eSI6ImFwcGxlIHZhbGxleSIsImNvdW50cnlDb2RlIjoiVVMiLCJkbWFDb2RlIjoiODAzIiwiaG1hVGVhbXMiOlsiMTA0MDI1MTAtODkzMS0wZDVmLTk4MTUtNzliYjc5NjQ5YTY1IiwiMTA0MDI1MjAtOTZiZi1lOWYyLTRmNjgtODUyMWNhODk2MDYwIiwiMTA0MDQ0MDAtM2IzNS0wNzNmLTE5N2UtMTk0YmI4MjQwNzIzIl0sInJlZ2lvbiI6IkNBIiwiYnJvd3NlciI6IkNocm9tZSIsImNlbGx1bGFyIjpmYWxzZSwiZW52aXJvbm1lbnQiOiJwcm9kdWN0aW9uIiwicm9sZXMiOlsiZnJlZSJ

In [93]:
from browsermobproxy import Server
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Start BrowserMob Proxy
server = Server(BMP_PATH)
server.start()
proxy = server.create_proxy()

# Configure Chrome with the proxy
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument(f'--proxy-server={proxy.proxy}')
chrome_options.add_argument('--ignore-certificate-errors')
driver = webdriver.Chrome(options=chrome_options)

# Navigate to the website
driver.get("https://nfl.com")

# Start capturing network traffic
proxy.new_har("nfl", options={'captureHeaders': True})

# Perform interactions on the website as needed

# Wait for the "Authorization" header to appear in the HAR
def is_authorization_in_har(har):
    for entry in har['log']['entries']:
        for header in entry['request']['headers']:
            if header['name'] == 'Authorization':
                return True
    return False

wait = WebDriverWait(driver, 60)  # Adjust the timeout as needed
try:
    wait.until(lambda driver: is_authorization_in_har(proxy.har))
    print("Authorization header found in the HAR")
except Exception:
    print("Authorization header not found in the HAR")

# Close the browser and BrowserMob Proxy
driver.quit()
server.stop()


Authorization header found in the HAR


In [38]:
from selenium import webdriver


options = webdriver.ChromeOptions()
options.add_argument('--headless')

driver = webdriver.Chrome(options)

root_url = 'https://google.com'
root_url = 'https://nfl.com'
driver.get(root_url)

In [48]:
driver.get_network_conditions()

WebDriverException: Message: unknown error: network conditions must be set before it can be retrieved
  (Session info: headless chrome=118.0.5993.120)
Stacktrace:
	GetHandleVerifier [0x00007FF7D1028EF2+54786]
	(No symbol) [0x00007FF7D0F95612]
	(No symbol) [0x00007FF7D0E4A64B]
	(No symbol) [0x00007FF7D0EB8E10]
	(No symbol) [0x00007FF7D0EABE30]
	(No symbol) [0x00007FF7D0E80941]
	(No symbol) [0x00007FF7D0E81B84]
	GetHandleVerifier [0x00007FF7D1377F52+3524194]
	GetHandleVerifier [0x00007FF7D13CD800+3874576]
	GetHandleVerifier [0x00007FF7D13C5D7F+3843215]
	GetHandleVerifier [0x00007FF7D10C5086+694166]
	(No symbol) [0x00007FF7D0FA0A88]
	(No symbol) [0x00007FF7D0F9CA94]
	(No symbol) [0x00007FF7D0F9CBC2]
	(No symbol) [0x00007FF7D0F8CC83]
	BaseThreadInitThunk [0x00007FFD49F27344+20]
	RtlUserThreadStart [0x00007FFD4A5E26B1+33]


In [50]:
driver.quit()

In [34]:
import requests
import json

# Define the access token
access_token = 'eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJjbGllbnRJZCI6ImU1MzVjN2MwLTgxN2YtNDc3Ni04OTkwLTU2NTU2ZjhiMTkyOCIsImNsaWVudEtleSI6IjRjRlVXNkRtd0pwelQ5TDdMckczcVJBY0FCRzVzMDRnIiwiaXNzIjoiTkZMIiwiZGV2aWNlSWQiOiI5NTIzOGE1YS05MTIxLTQzNTEtOTY4MC0wNzZlN2Y2ZTQ1NmYiLCJwbGFucyI6W3sicGxhbiI6ImZyZWUiLCJleHBpcmF0aW9uRGF0ZSI6IjIwMjQtMTEtMDMiLCJzb3VyY2UiOiJORkwiLCJzdGFydERhdGUiOiIyMDIzLTExLTA0Iiwic3RhdHVzIjoiQUNUSVZFIiwidHJpYWwiOmZhbHNlfV0sIkRpc3BsYXlOYW1lIjoiV0VCX0RFU0tUT1BfREVTS1RPUCIsIk5vdGVzIjoiIiwiZm9ybUZhY3RvciI6IkRFU0tUT1AiLCJsdXJhQXBwS2V5IjoiU1pzNTdkQkdSeGJMNzI4bFZwN0RZUSIsInBsYXRmb3JtIjoiREVTS1RPUCIsInByb2R1Y3ROYW1lIjoiV0VCIiwiY2l0eSI6ImxvcyBhbmdlbGVzIiwiY291bnRyeUNvZGUiOiJVUyIsImRtYUNvZGUiOiI4MDMiLCJobWFUZWFtcyI6WyIxMDQwMjUxMC04OTMxLTBkNWYtOTgxNS03OWJiNzk2NDlhNjUiLCIxMDQwMjUyMC05NmJmLWU5ZjItNGY2OC04NTIxY2E4OTYwNjAiLCIxMDQwNDQwMC0zYjM1LTA3M2YtMTk3ZS0xOTRiYjgyNDA3MjMiXSwicmVnaW9uIjoiQ0EiLCJicm93c2VyIjoiQ2hyb21lIiwiY2VsbHVsYXIiOmZhbHNlLCJlbnZpcm9ubWVudCI6InByb2R1Y3Rpb24iLCJyb2xlcyI6WyJmcmVlIl0sImV4cCI6MTY5OTA2NTUwMH0.3a0m-SwXS5ElV9NHNFy2EXwTUDtDCzZkpdPpYs-eW_g'
headers = {
    'Authorization': access_token
}

def request(api_url):
    return requests.get(api_url, headers={
        'Authorization': access_token
    })

### 1.2 Get team data through the teams endpoint

In [35]:
api_url = 'https://api.nfl.com/football/v2/teams?limit=100&season=2023'
request(api_url)

<Response [200]>