In [154]:
import pandas as pd
from bs4 import BeautifulSoup
import pickle
import time
import random
from dotenv import load_dotenv
import os
import requests
import re

from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

from models import Player, Game, QbGameStats, SkillGameStats

In [125]:
def get_page(url):
    resp = requests.get(url)
    return resp
    
def get_soup(resp):
    soup = BeautifulSoup(resp.text, 'html.parser')
    return soup

def request_soup(url):
    resp = get_page(url)
    return get_soup(resp)

def extract_number(text):
    match = re.search(r'\d+', text)
    if match:
        return int(match.group())
    else:
        return None

In [4]:
#load hrefs
with open('player_hrefs.pkl', 'rb') as f:
    player_hrefs  = pickle.load(f)

1154

In [139]:
soup = request_soup('https://pro-football-reference.com/players/B/BradTo00/gamelog')

In [140]:
player_info = soup.find('div', {'id':'info'})
player_data = {}

player_data['name'] = player_info.find('h1').text.strip()

p_tags = player_info.find_all('p')

#position
pos_tag = player_info.find('strong', string='Position')
if pos_tag:
    pos = pos_tag.find_next_sibling(string=True)
    if next_text:
        player_data['name'] = pos.strip(": ").strip()

#Dimensions
dim_str = p_tags[2].text
cm_start = dim_str.find('(') + 1
cm_end = dim_str.find('c')
kg_start = cm_end + 4
kg_end = dim_str.find("k")

cm_str = dim_str[cm_start:cm_end]
kg_str = dim_str[kg_start:kg_end]

player_data['height_cm'] = int(cm_str)
player_data['weight_kg'] = int(kg_str)

#Date of Birth
bday_start_index = str(p_tags[3]).find('data-birth=') + len('data-birth=') + 1
bday_end_index = bday_start_index + 10
bday_str = str(p_tags[3])[bday_start_index : bday_end_index]
player_data['dob'] = bday_str

#College
player_data['college'] = p_tags[4].find('a').text.strip()

#drafted
overall_start_index = p_tags[7].text.find("(") + 1
overall_end_index = p_tags[7].text.find(")")
overall_str = p_tags[7].text[overall_start_index:overall_end_index]
player_data['drafted_overall'] = extract_number(overall_str)

player_data['drafted_year'] = None
if player_data['drafted_overall']:
    year_str = p_tags[7].text[overall_end_index:]
    try:
        player_data['drafted_year'] = extract_number(year_str)
    except:
        print(f"Issue with drafted year for {player_data['name']}")     

player_data

{'name': 'QB',
 'height_cm': 193,
 'weight_kg': 102,
 'dob': '1977-08-03',
 'college': 'Michigan',
 'drafted_overall': 199,
 'drafted_year': 2000}

In [147]:
table = soup.find('table', {'class': 'stats_table'}).find('tbody')
stats_tables = table.findAll('tr', {'id': re.compile("^stats")})
dnps = table.findAll('tr', {'id': re.compile("^injury")})

player_game_data = []
for data in stats_tables:
    game_stats = {}
    game_stats['year'] = data.find('td', {"data-stat" : "year_id"}).text
    game_stats['date'] = data.find('td', {"data-stat" : "game_date"}).text
    game_stats['week'] = data.find('td', {"data-stat" : "week_num"}).text
    game_stats['age'] = data.find('td', {"data-stat" : "age"}).text
    game_stats['team'] = data.find('td', {"data-stat" : "team"}).text
    game_stats['away_game'] = data.find('td', {"data-stat": "game_location"}).text.strip() == '@'
    game_stats['opp'] = data.find('td', {"data-stat" : "opp"}).text
    game_stats['started'] = data.find('td', {"data-stat" : "gs"}).text
    game_stats['pass_comp'] = data.find('td', {"data-stat" : "pass_cmp"}).text
    game_stats['pass_att'] = data.find('td', {"data-stat" : "pass_att"}).text
    game_stats['pass_cmp_perc'] = data.find('td', {"data-stat" : "pass_cmp_perc"}).text
    game_stats['pass_yds'] = data.find('td', {"data-stat" : "pass_yds"}).text
    game_stats['pass_td'] = data.find('td', {"data-stat" : "pass_td"}).text
    game_stats['pass_int'] = data.find('td', {"data-stat" : "pass_int"}).text
    game_stats['pass_sacked'] = data.find('td', {"data-stat" : "pass_sacked"}).text
    game_stats['pass_yds_per_att'] = data.find('td', {"data-stat" : "pass_yds_per_att"}).text
    game_stats['pass_adj_yds_per_att'] = data.find('td', {"data-stat" : "pass_adj_yds_per_att"}).text
    
    game_stats['rush_att'] = data.find('td', {"data-stat" : "rush_att"}).text
    game_stats['rush_yds'] = data.find('td', {"data-stat" : "rush_yds"}).text
    game_stats['rush_td'] = data.find('td', {"data-stat" : "rush_td"}).text
    
    game_stats['fumbles'] = data.find('td', {"data-stat" : "fumbles"}).text

    player_game_data.append(game_stats)

In [9]:
for href in player_hrefs:
    player_url = f'pro-football-reference.com{href}'
    print(player_url)

pro-football-reference.com/players/A/AndeMi00.htm
pro-football-reference.com/players/G/GoedDa00.htm
pro-football-reference.com/players/H/HarbJi00.htm
pro-football-reference.com/players/T/ThomCh03.htm
pro-football-reference.com/players/W/WarrPe00.htm
pro-football-reference.com/players/L/LemoCl00.htm
pro-football-reference.com/players/W/WashNa00.htm
pro-football-reference.com/players/D/DarkOr00.htm
pro-football-reference.com/players/S/SlatSt00.htm
pro-football-reference.com/players/M/MossRa00.htm
pro-football-reference.com/players/B/BrowMa03.htm
pro-football-reference.com/players/K/KeenCa00.htm
pro-football-reference.com/players/S/SandAc00.htm
pro-football-reference.com/players/M/McElGr00.htm
pro-football-reference.com/players/P/PalmJo00.htm
pro-football-reference.com/players/G/GageJu00.htm
pro-football-reference.com/players/J/JohnBr02.htm
pro-football-reference.com/players/C/CassMa00.htm
pro-football-reference.com/players/F/FishTo00.htm
pro-football-reference.com/players/J/JackBr00.htm


In [141]:
player_data

{'name': 'QB',
 'height_cm': 193,
 'weight_kg': 102,
 'dob': '1977-08-03',
 'college': 'Michigan',
 'drafted_overall': 199,
 'drafted_year': 2000}

In [146]:
player_game_data

[{'year': '2000',
  'date': '2000-11-23',
  'week': '13',
  'age': '23.112',
  'team': 'NWE',
  'away_location': True,
  'opp': 'DET',
  'started': '',
  'pass_comp': '1',
  'pass_att': '3',
  'pass_cmp_perc': '33.33',
  'pass_yds': '6',
  'pass_td': '0',
  'pass_int': '0',
  'pass_sacked': '0',
  'pass_yds_per_att': '2.00',
  'pass_adj_yds_per_att': '2.00',
  'rush_att': '0',
  'rush_yds': '0',
  'rush_td': '0',
  'fumbles': '0'},
 {'year': '2001',
  'date': '2001-09-23',
  'week': '2',
  'age': '24.051',
  'team': 'NWE',
  'away_location': False,
  'opp': 'NYJ',
  'started': '',
  'pass_comp': '5',
  'pass_att': '10',
  'pass_cmp_perc': '50.00',
  'pass_yds': '46',
  'pass_td': '0',
  'pass_int': '0',
  'pass_sacked': '0',
  'pass_yds_per_att': '4.60',
  'pass_adj_yds_per_att': '4.60',
  'rush_att': '1',
  'rush_yds': '9',
  'rush_td': '0',
  'fumbles': '0'},
 {'year': '2001',
  'date': '2001-09-30',
  'week': '3',
  'age': '24.058',
  'team': 'NWE',
  'away_location': False,
  'opp'

In [152]:
load_dotenv()
DB_USERNAME = os.environ['DB_USERNAME']
DB_PASSWORD = os.environ['DB_PASSWORD']
DB_HOST = os.environ['DB_HOST']
DB_PORT = os.environ['DB_PORT']
DB_NAME = os.environ['DB_NAME']

# Construct the database URL
DB_URL = f"postgresql://{DB_USERNAME}:{DB_PASSWORD}@{DB_HOST}:{DB_PORT}/{DB_NAME}"
engine = create_engine(DB_URL)

In [153]:
Session = sessionmaker(bind=engine)
session = Session()

In [156]:
player_data

{'name': 'QB',
 'height_cm': 193,
 'weight_kg': 102,
 'dob': '1977-08-03',
 'college': 'Michigan',
 'drafted_overall': 199,
 'drafted_year': 2000}

In [155]:
test_player = Player(**player_data)

AmbiguousForeignKeysError: Could not determine join condition between parent/child tables on relationship TeamGameStats.game - there are multiple foreign key paths linking the tables.  Specify the 'foreign_keys' argument, providing a list of those columns which should be counted as containing a foreign key reference to the parent table.