In [2]:
import os
import pickle
import random
import re
import time
from datetime import datetime
from urllib.parse import quote

import pandas as pd
import requests
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from IPython.display import clear_output
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from tqdm import tqdm

from models import Player, QbGameStats, SkillGameStats

In [3]:
def get_page(url, timeout=30):
    """Fetch ``url`` with an HTTP GET and return the response object.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the scrape indefinitely.

    Returns:
        The response returned by ``requests.get``; the status code is not
        checked here.
    """
    return requests.get(url, timeout=timeout)
    
def get_soup(resp):
    """Parse the body text of ``resp`` with the built-in ``html.parser`` backend."""
    return BeautifulSoup(resp.text, 'html.parser')

def request_soup(url):
    """Download ``url`` via get_page() and return it parsed by get_soup()."""
    return get_soup(get_page(url))

def extract_number(text):
    """Return the first run of digits in ``text`` as an int, or None if there is none."""
    digits = re.search(r'\d+', text)
    return int(digits.group()) if digits else None

In [4]:
def create_player_data(soup):
    """Extract a player's biographical fields from a parsed player page.

    Args:
        soup: Parsed page that contains a ``<div id="info">`` header block.

    Returns:
        dict with the keys ``name``, ``pos``, ``height_cm``, ``weight_kg``,
        ``current_team``, ``dob``, ``college``, ``drafted_overall`` and
        ``drafted_year``. Every field except ``name`` is ``None`` when it
        cannot be located on the page.

    Raises:
        ValueError: If the page has no ``div#info`` block or no ``<h1>`` in it.
    """
    player_info = soup.find('div', {'id': 'info'})
    if player_info is None:
        raise ValueError("Player info block (div#info) not found in page")

    name_tag = player_info.find('h1')
    if name_tag is None:
        raise ValueError("Player name (<h1>) not found in info block")

    # Start from a complete record so every key exists even when a section is
    # missing from the page.
    player_data = {
        'name': name_tag.text.strip(),
        'pos': None,
        'height_cm': None,
        'weight_kg': None,
        'current_team': None,
        'dob': None,
        'college': None,
        'drafted_overall': None,
        'drafted_year': None,
    }

    # Position: the value is the bare text that follows the <strong> label.
    pos_tag = player_info.find('strong', string='Position')
    if pos_tag is not None:
        pos = pos_tag.find_next_sibling(string=True)
        if pos:
            player_data['pos'] = pos.strip(": ").strip()

    # Dimensions: look for "<n>cm ... <n>kg" in any paragraph instead of
    # relying on a fixed paragraph index and on the first 'c'/'k' characters.
    for p_tag in player_info.find_all('p'):
        dims = re.search(r'(\d+)\s*cm\b.*?(\d+)\s*kg\b', p_tag.text)
        if dims:
            player_data['height_cm'] = int(dims.group(1))
            player_data['weight_kg'] = int(dims.group(2))
            break

    # Current team: the label's next sibling element wraps the team link.
    team_tag = player_info.find('strong', string='Team')
    if team_tag is not None:
        team_wrapper = team_tag.find_next_sibling()
        team_link = team_wrapper.find('a') if team_wrapper is not None else None
        if team_link is not None:
            player_data['current_team'] = team_link.text.strip()

    # Date of birth: read the data-birth attribute directly rather than slicing
    # the tag's string form, which yields garbage when the attribute is absent.
    born_tag = player_info.find('strong', string='Born:')
    if born_tag is not None:
        bday_span = born_tag.find_next_sibling()
        if bday_span is not None:
            player_data['dob'] = bday_span.get('data-birth')

    # College: text of the element that follows the label.
    college_tag = player_info.find('strong', string='College')
    if college_tag is not None:
        college_a = college_tag.find_next_sibling()
        if college_a is not None:
            player_data['college'] = college_a.text.strip()

    # Draft: the parenthesised part of the paragraph is expected to hold the
    # overall pick number, and the first number after it the draft year.
    # Pages without a Draft entry leave both fields as None.
    draft_tag = player_info.find('strong', string='Draft')
    draft_p = draft_tag.find_parent('p') if draft_tag is not None else None
    if draft_p is not None:
        draft_text = draft_p.text
        open_idx = draft_text.find('(')
        close_idx = draft_text.find(')', open_idx + 1)
        if open_idx != -1 and close_idx != -1:
            overall = extract_number(draft_text[open_idx + 1:close_idx])
            player_data['drafted_overall'] = overall
            if overall is not None:
                player_data['drafted_year'] = extract_number(draft_text[close_idx:])

    return player_data

In [5]:
def scrape(data, data_stat, typ=None):
    """Read one stat cell from a table row.

    Args:
        data: Row element; must support ``find(name, attrs)``.
        data_stat: Value of the ``data-stat`` attribute identifying the cell.
        typ: Optional callable (e.g. ``int``, ``float``) applied to the text.

    Returns:
        The stripped cell text (converted by ``typ`` when given), or ``None``
        when the cell is missing from the row or is empty.
    """
    cell = data.find('td', {"data-stat": data_stat})
    if cell is None:
        # Not every row has every column; treat a missing cell like an empty one.
        return None
    text = cell.text.strip()
    if text == '':
        return None
    if typ is not None:
        return typ(text)
    return text

def _scrape_game_context(row):
    """Return the fields shared by played and did-not-play game-log rows."""
    game_date = scrape(row, "game_date")
    return {
        'year': scrape(row, "year_id", int),
        'date': datetime.strptime(game_date, '%Y-%m-%d') if game_date else None,
        'week': scrape(row, "week_num", int),
        'team': scrape(row, "team"),
        'away_game': scrape(row, "game_location") == '@',
        'opp': scrape(row, "opp"),
    }


def create_game_stats_data(soup, pos=None):
    """Build per-game stat records from a parsed game-log page.

    Args:
        soup: Parsed page containing a ``<table class="stats_table">``.
        pos: Player position. ``'QB'`` selects passing columns, anything else
            selects receiving columns, and ``'TE'`` additionally skips the
            rushing columns. When omitted, the value is read from a
            notebook-level ``player_data['pos']`` if one exists, which is how
            earlier callers supplied it.

    Returns:
        ``(player_game_data, dnp_game_data)``: two lists of dicts, one for rows
        whose id starts with ``stats`` and one for rows whose id starts with
        ``injury``. Both lists are empty when the page has no stats table.
    """
    if pos is None:
        # Backward compatibility with callers that relied on the global dict.
        pos = globals().get('player_data', {}).get('pos')

    table = soup.find('table', {'class': 'stats_table'})
    tbody = table.find('tbody') if table is not None else None
    if tbody is None:
        return [], []

    stat_rows = tbody.find_all('tr', {'id': re.compile("^stats")})
    dnp_rows = tbody.find_all('tr', {'id': re.compile("^injury")})

    player_game_data = []
    for data in stat_rows:
        game_stats = _scrape_game_context(data)
        game_stats['age'] = scrape(data, "age", float)
        game_stats['started'] = scrape(data, "gs") == '*'

        if pos != 'TE':
            game_stats['rush_att'] = scrape(data, "rush_att", int)
            game_stats['rush_yds'] = scrape(data, "rush_yds", int)
            game_stats['rush_tds'] = scrape(data, "rush_td", int)
        game_stats['fumbles'] = scrape(data, "fumbles", int)

        if pos == 'QB':
            game_stats['pass_comp'] = scrape(data, "pass_cmp", int)
            game_stats['pass_att'] = scrape(data, "pass_att", int)
            game_stats['pass_yds'] = scrape(data, "pass_yds", int)
            game_stats['pass_tds'] = scrape(data, "pass_td", int)
            game_stats['pass_ints'] = scrape(data, "pass_int", int)
            game_stats['pass_sacks'] = scrape(data, "pass_sacked", int)
            game_stats['pass_per_att'] = scrape(data, "pass_yds_per_att", float)
            game_stats['pass_adj_per_att'] = scrape(data, "pass_adj_yds_per_att", float)
            game_stats['snaps'] = scrape(data, "offense", int)
            # Snap share arrives as text such as "87%"; store it as a 0-1 fraction.
            off_pct = scrape(data, "off_pct")
            game_stats['snap_perc'] = float(off_pct.replace('%', '')) / 100 if off_pct else None
        else:
            game_stats['rec_rec'] = scrape(data, "rec", int)
            game_stats['rec_tgt'] = scrape(data, "targets", int)
            game_stats['rec_yds'] = scrape(data, "rec_yds", int)
            game_stats['rec_tds'] = scrape(data, "rec_td", int)

        player_game_data.append(game_stats)

    dnp_game_data = []
    for dnp_soup in dnp_rows:
        dnp_data = _scrape_game_context(dnp_soup)

        # Exactly one of these flags is set when the reason text is recognised.
        reason = scrape(dnp_soup, "reason")
        dnp_data['inactive'] = reason == 'Inactive'
        dnp_data['dnp'] = reason == 'Did Not Play'
        dnp_data['suspended'] = reason == 'Suspended'
        dnp_data['ir'] = reason == 'Injured Reserve'

        dnp_game_data.append(dnp_data)

    return player_game_data, dnp_game_data

In [6]:
# Read the database settings from the environment; load_dotenv() first copies
# any values from a local .env file into os.environ.
load_dotenv()
DB_USERNAME = os.environ['DB_USERNAME']
DB_PASSWORD = os.environ['DB_PASSWORD']
DB_HOST = os.environ['DB_HOST']
DB_PORT = os.environ['DB_PORT']
DB_NAME = os.environ['DB_NAME']

# Construct the database URL. The credentials are percent-encoded so that
# characters such as '@', ':' or '/' in a password cannot be mistaken for URL
# delimiters.
DB_URL = (
    f"postgresql://{quote(DB_USERNAME, safe='')}:{quote(DB_PASSWORD, safe='')}"
    f"@{DB_HOST}:{DB_PORT}/{DB_NAME}"
)
engine = create_engine(DB_URL)

In [7]:
# Session factory bound to `engine`, plus the single notebook-wide session
# object that update_or_create() and update_player_data() read as a global.
Session = sessionmaker(bind=engine)
session = Session()

def update_or_create(model, obj_data, **kwargs):
    """Upsert one row of ``model`` using the notebook-wide ``session``.

    Args:
        model: Mapped class to query and, if needed, instantiate.
        obj_data: Mapping of attribute name to value to store on the row.
        **kwargs: Equality filters (passed to ``filter_by``) that identify an
            existing row.

    Returns:
        The existing row updated with ``obj_data``, or a newly added row built
        from ``obj_data`` when no row matches the filters.

    Raises:
        Whatever ``session.commit()`` raises; the session is rolled back first
        so it stays usable for subsequent calls.
    """
    obj = session.query(model).filter_by(**kwargs).first()
    if obj is not None:
        for key, value in obj_data.items():
            setattr(obj, key, value)
    else:
        obj = model(**obj_data)
        session.add(obj)

    try:
        session.commit()
    except Exception:
        # A failed commit leaves the session in an error state until rollback.
        session.rollback()
        raise
    return obj

In [8]:
def update_player_data(player_data, player_game_data, dnp_game_data):
    """Upsert a player and all of that player's game-log rows.

    Args:
        player_data: Dict of player attributes; must contain ``pfr_id``, which
            identifies an existing player row.
        player_game_data: List of dicts for games played; each needs ``date``.
        dnp_game_data: List of dicts for games missed; each needs ``date``.

    Each row dict gets a ``player_id`` key added in place. Rows are stored in
    ``QbGameStats`` when the stored player's ``pos`` is ``'QB'`` and in
    ``SkillGameStats`` otherwise, keyed on ``(date, player_id)``.
    """
    player = update_or_create(Player, obj_data=player_data, pfr_id=player_data['pfr_id'])

    # The target table depends only on the player, so pick it once.
    model_obj = QbGameStats if player.pos == 'QB' else SkillGameStats

    # Played games first, then missed games. Each row is matched on its OWN
    # date; the missed-game rows must not reuse the key of the last played game.
    for rows in (player_game_data, dnp_game_data):
        for row in rows:
            row['player_id'] = player.id
            # update_or_create() already adds new rows to the session.
            update_or_create(model_obj, obj_data=row, date=row['date'], player_id=row['player_id'])

    session.commit()

In [9]:
# Load the issue hrefs saved by a previous run. The file is only written near
# the end of this notebook, so on a first run it does not exist yet; start
# with an empty list in that case instead of failing.
# NOTE: pickle.load can execute arbitrary code; only load files written by
# this notebook.
if os.path.exists('issue_player_hrefs.pkl'):
    with open('issue_player_hrefs.pkl', 'rb') as f:
        issues_hrefs_runtime = pickle.load(f)
else:
    issues_hrefs_runtime = []

In [None]:
def show_progress_bar(pct, total_bars=20):
    """Print a one-line text progress bar such as ``|#####---------------|``.

    Args:
        pct: Progress as a percentage; values outside 0-100 are clamped.
        total_bars: Width of the bar in characters (excluding the two pipes).
    """
    pct = min(max(pct, 0), 100)
    # Scale the percentage to the bar width. The previous formula used
    # ``total_bars // 100``, which is 0 for any width below 100, so the bar
    # never filled.
    filled_bars = int(pct * total_bars // 100)
    empty_bars = total_bars - filled_bars

    bar = '|' + '#' * filled_bars + '-' * empty_bars + '|'
    print(bar)
    
# NOTE: pickle.load can execute arbitrary code; only load files written by
# this project.
with open('player_hrefs.pkl', 'rb') as f:
    player_hrefs = pickle.load(f)

# hrefs whose scrape or database update failed during this run.
issues_href = []
total_players = len(player_hrefs)

for i, href in enumerate(player_hrefs, start=1):
    # Remove only a literal ".htm" suffix. str.strip('.htm') strips any of the
    # characters '.', 'h', 't', 'm' from both ends, not the suffix.
    if href.endswith('.htm'):
        href = href[:-len('.htm')]
    # The player id is taken as the last 8 characters of the href.
    pfr_id = href[-8:]

    try:
        player_url = f'https://pro-football-reference.com{href}/gamelog/'

        # Randomised pause (7-15 s) between requests.
        delay = random.uniform(2, 8) + random.uniform(5, 7)
        time.sleep(delay)

        soup = request_soup(player_url)

        player_data = create_player_data(soup)
        player_data['pfr_id'] = pfr_id
        player_game_data, dnp_game_data = create_game_stats_data(soup)

        update_player_data(player_data, player_game_data, dnp_game_data)
        status = f"{player_data['name']} Updated"
    except Exception as exc:
        # Record the failure and keep going; roll back so that a failed
        # database operation does not poison the shared session.
        session.rollback()
        issues_href.append(href)
        status = f"{href} failed: {exc!r}"

    clear_output(wait=True)
    perc_done = (i / total_players) * 100

    show_progress_bar(perc_done)
    print(f"{i}/{len(player_hrefs)} ({perc_done:.2f}%)")
    print(f"Issue Count: {len(issues_href)}")
    print(status)

In [None]:
# Number of issue hrefs loaded from issue_player_hrefs.pkl earlier in the
# notebook (this is not the `issues_href` list built by the scraping loop).
len(issues_hrefs_runtime)

In [None]:
# Persist the issue hrefs collected by the scraping loop, overwriting the
# previous file.
with open('issue_player_hrefs.pkl', mode='wb') as issue_file:
    pickle.dump(issues_href, issue_file)
    

In [None]:
# Close the notebook-wide session created after the engine was set up.
session.close()