In [57]:
import pandas as pd
from bs4 import BeautifulSoup
import pickle
import time
import random
from dotenv import load_dotenv
import os
import requests
import re
from datetime import datetime

from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

from models import Player, QbGameStats, SkillGameStats

In [58]:
def get_page(url):
    resp = requests.get(url)
    return resp
    
def get_soup(resp):
    soup = BeautifulSoup(resp.text, 'html.parser')
    return soup

def request_soup(url):
    resp = get_page(url)
    return get_soup(resp)

def extract_number(text):
    match = re.search(r'\d+', text)
    if match:
        return int(match.group())
    else:
        return None

In [59]:
#load hrefs
with open('player_hrefs.pkl', 'rb') as f:
    player_hrefs  = pickle.load(f)

In [101]:
soup = request_soup('https://www.pro-football-reference.com/players/J/JameEd00/gamelog/')
# soup = request_soup('https://www.pro-football-reference.com/players/B/BradTo00/gamelog/')

In [102]:
player_info = soup.find('div', {'id':'info'})
player_data = {}

player_data['name'] = player_info.find('h1').text.strip()

p_tags = player_info.find_all('p')

#position
pos_tag = player_info.find('strong', string='Position')
if pos_tag:
    pos = pos_tag.find_next_sibling(string=True)
    if pos:
        player_data['pos'] = pos.strip(": ").strip()

#Dimensions
dim_str = p_tags[2].text
cm_start = dim_str.find('(') + 1
cm_end = dim_str.find('c')
kg_start = cm_end + 4
kg_end = dim_str.find("k")

cm_str = dim_str[cm_start:cm_end]
kg_str = dim_str[kg_start:kg_end]

player_data['height_cm'] = int(cm_str)
player_data['weight_kg'] = int(kg_str)

#Date of Birth
bday_start_index = str(p_tags[3]).find('data-birth=') + len('data-birth=') + 1
bday_end_index = bday_start_index + 10
bday_str = str(p_tags[3])[bday_start_index : bday_end_index]
player_data['dob'] = bday_str

#College
player_data['college'] = p_tags[4].find('a').text.strip()

#drafted
overall_start_index = p_tags[7].text.find("(") + 1
overall_end_index = p_tags[7].text.find(")")
overall_str = p_tags[7].text[overall_start_index:overall_end_index]
player_data['drafted_overall'] = extract_number(overall_str)

player_data['drafted_year'] = None
if player_data['drafted_overall']:
    year_str = p_tags[7].text[overall_end_index:]
    try:
        player_data['drafted_year'] = extract_number(year_str)
    except:
        print(f"Issue with drafted year for {player_data['name']}")     

player_data['pfr_id'] = 'BradTo00'

In [103]:
print(player_data)

{'name': 'Edgerrin James', 'pos': 'RB', 'height_cm': 183, 'weight_kg': 99, 'dob': '1978-08-01', 'college': 'Miami (FL)', 'drafted_overall': 4, 'drafted_year': 1999, 'pfr_id': 'BradTo00'}


In [112]:
table = soup.find('table', {'class': 'stats_table'}).find('tbody')
stats_tables = table.findAll('tr', {'id': re.compile("^stats")})
dnps = table.findAll('tr', {'id': re.compile("^injury")})

player_game_data = []

for data in stats_tables:
    game_stats = {}
    game_stats['year'] = int(data.find('td', {"data-stat" : "year_id"}).text.strip())
    game_stats['date'] = datetime.strptime(data.find('td', {"data-stat" : "game_date"}).text.strip(), '%Y-%m-%d')
    game_stats['week'] = int(data.find('td', {"data-stat" : "week_num"}).text.strip())
    game_stats['age'] = float(data.find('td', {"data-stat" : "age"}).text.strip())
    game_stats['team'] = data.find('td', {"data-stat" : "team"}).text.strip()
    game_stats['away_game'] = data.find('td', {"data-stat": "game_location"}).text.strip() == '@'
    game_stats['opp'] = data.find('td', {"data-stat" : "opp"}).text
    game_stats['started'] = data.find('td', {"data-stat" : "gs"}).text.strip() == '*'

    game_stats['rush_att'] = int(data.find('td', {"data-stat" : "rush_att"}).text.strip())
    game_stats['rush_yds'] = int(data.find('td', {"data-stat" : "rush_yds"}).text.strip())
    game_stats['rush_tds'] = int(data.find('td', {"data-stat" : "rush_td"}).text.strip())

    game_stats['fumbles'] = int(data.find('td', {"data-stat" : "fumbles"}).text.strip())
    
    if player_data['pos'] == 'QB':
        game_stats['pass_comp'] = int(data.find('td', {"data-stat" : "pass_cmp"}).text.strip())
        game_stats['pass_att'] = int(data.find('td', {"data-stat" : "pass_att"}).text.strip())
        game_stats['pass_yds'] = int(data.find('td', {"data-stat" : "pass_yds"}).text.strip())
        game_stats['pass_tds'] = int(data.find('td', {"data-stat" : "pass_td"}).text.strip())
        game_stats['pass_ints'] = int(data.find('td', {"data-stat" : "pass_int"}).text.strip())
        game_stats['pass_sacks'] = int(data.find('td', {"data-stat" : "pass_sacked"}).text.strip())
        game_stats['pass_per_att'] = float(data.find('td', {"data-stat" : "pass_yds_per_att"}).text.strip())
        game_stats['pass_adj_per_att'] = float(data.find('td', {"data-stat" : "pass_adj_yds_per_att"}).text.strip())
        game_stats['snaps'] = int(data.find('td', {'data-stat' : 'offense'}).text.strip()) if data.find('td', {'data-stat': 'offense'}).text.strip() else None
        game_stats['snap_perc'] = game_stats['snap_perc'] = float(data.find('td', {'data-stat': 'off_pct'}).text.strip().replace('%', '')) / 100 if data.find('td', {'data-stat': 'off_pct'}).text.strip() else None
        
    else:
        game_stats['rec_rec'] = int(data.find('td', {'data-stat' : 'rec'}).text.strip())
        game_stats['rec_tgt'] = int(data.find('td', {'data-stat' : 'targets'}).text.strip())
        game_stats['rec_yds'] = int(data.find('td', {'data-stat' : 'rec_yds'}).text.strip())
        game_stats['rec_tds'] = int(data.find('td', {'data-stat' : 'rec_td'}).text.strip())
        game_stats['rec_rec'] = int(data.find('td', {'data-stat' : 'rec'}).text.strip())            
   
    player_game_data.append(game_stats)

dnp_game_data =  []

for dnp_data in dnps:
    dnp_data = {}
    dnp_data['year'] = int(data.find('td', {"data-stat" : "year_id"}).text.strip())
    dnp_data['date'] = datetime.strptime(data.find('td', {"data-stat" : "game_date"}).text.strip(), '%Y-%m-%d')
    dnp_data['week'] = int(data.find('td', {"data-stat" : "week_num"}).text.strip())     
    dnp_data['team'] = data.find('td', {"data-stat" : "team"}).text.strip()
    dnp_data['away_game'] = data.find('td', {"data-stat": "game_location"}).text.strip() == '@'
    dnp_data['opp'] = data.find('td', {"data-stat" : "opp"}).text.strip()

    reason = data.find('td', {"data-stat" : "reason"}).text.strip()
    print(reason)
    
    dnp_data['inactive'] = reason == 'Inactive'
    dnp_data['dnp'] = reason == 'Did Not Play'
    dnp_data['player_suspended'] = reason == 'Suspended'
    dnp_data['ir'] = reason == 'Injuried Reserve'

In [113]:
player_game_data[0]

{'year': 1999,
 'date': datetime.datetime(1999, 9, 12, 0, 0),
 'week': 1,
 'age': 21.042,
 'team': 'IND',
 'away_game': False,
 'opp': 'BUF',
 'started': True,
 'rush_att': 26,
 'rush_yds': 112,
 'rush_tds': 1,
 'fumbles': 1,
 'rec_rec': 4,
 'rec_tgt': 4,
 'rec_yds': 14,
 'rec_tds': 0}

In [114]:
for href in player_hrefs:
    player_url = f'pro-football-reference.com{href}'
    print(player_url)

pro-football-reference.com/players/C/ChamBy00.htm
pro-football-reference.com/players/F/FostDe00.htm
pro-football-reference.com/players/M/McCoLu00.htm
pro-football-reference.com/players/B/BrowMa03.htm
pro-football-reference.com/players/B/BrysSh00.htm
pro-football-reference.com/players/P/PalkTy00.htm
pro-football-reference.com/players/T/ThomAn01.htm
pro-football-reference.com/players/E/EngrEv00.htm
pro-football-reference.com/players/W/WardHi00.htm
pro-football-reference.com/players/M/MeyeJa01.htm
pro-football-reference.com/players/H/HarvPe00.htm
pro-football-reference.com/players/M/MiliIt00.htm
pro-football-reference.com/players/B/BrowRe01.htm
pro-football-reference.com/players/M/MoorLa00.htm
pro-football-reference.com/players/J/JeffVa00.htm
pro-football-reference.com/players/H/HuarDa00.htm
pro-football-reference.com/players/S/StreRo00.htm
pro-football-reference.com/players/S/SmitDe07.htm
pro-football-reference.com/players/H/HillRo00.htm
pro-football-reference.com/players/B/BellLe00.htm


In [115]:
player_data

{'name': 'Edgerrin James',
 'pos': 'RB',
 'height_cm': 183,
 'weight_kg': 99,
 'dob': '1978-08-01',
 'college': 'Miami (FL)',
 'drafted_overall': 4,
 'drafted_year': 1999,
 'pfr_id': 'BradTo00'}

In [116]:
load_dotenv()
DB_USERNAME = os.environ['DB_USERNAME']
DB_PASSWORD = os.environ['DB_PASSWORD']
DB_HOST = os.environ['DB_HOST']
DB_PORT = os.environ['DB_PORT']
DB_NAME = os.environ['DB_NAME']

# Construct the database URL
DB_URL = f"postgresql://{DB_USERNAME}:{DB_PASSWORD}@{DB_HOST}:{DB_PORT}/{DB_NAME}"
engine = create_engine(DB_URL)

In [117]:
Session = sessionmaker(bind=engine)
session = Session()

def update_or_create(model, obj_data, **kwargs):
    existing_obj = session.query(model).filter_by(**kwargs).first()
    if existing_obj:
        for key, value in obj_data.items():
            setattr(existing_obj, key, value)
        session.commit()
        return existing_obj
    else:
        new_obj = model(**obj_data)
        session.add(new_obj)
        session.commit()
        return new_obj

In [118]:
game_stats

{'year': 2009,
 'date': datetime.datetime(2009, 11, 1, 0, 0),
 'week': 8,
 'age': 31.092,
 'team': 'SEA',
 'away_game': True,
 'opp': 'DAL',
 'started': False,
 'rush_att': 6,
 'rush_yds': 17,
 'rush_tds': 0,
 'fumbles': 0,
 'rec_rec': 1,
 'rec_tgt': 1,
 'rec_yds': 6,
 'rec_tds': 0}

In [119]:
player = update_or_create(Player, obj_data=player_data, pfr_id=player_data['pfr_id'])

for game_stats in player_game_data:
    game_stats['player_id'] = player.id

    if player.pos == 'QB':
        model_obj = QbGameStats
    else:
        model_obj = SkillGameStats

    game_stat_obj = update_or_create(model_obj, obj_data=game_stats, date=game_stats['date'], player_id=game_stats['player_id'])
    session.add(game_stat_obj)
      
session.commit()

In [120]:
session.close()