In [18]:
from urllib.request import urlopen
from bs4 import BeautifulSoup, Comment
import pandas as pd
from zoneinfo import ZoneInfo
from datetime import datetime

In [19]:
# Upper-case full team name -> three-letter team code.
# four_factors_iter() looks the home team up here to build its box-score URL.
abbr = {
    "PHILADELPHIA 76ERS": "PHI",
    "LOS ANGELES LAKERS": "LAL",
    "NEW ORLEANS PELICANS": "NOP",
    "PORTLAND TRAIL BLAZERS": "POR",
    "PHOENIX SUNS": "PHO",
    "MEMPHIS GRIZZLIES": "MEM",
    "SAN ANTONIO SPURS": "SAS",
    "WASHINGTON WIZARDS": "WAS",
    "CHARLOTTE HORNETS": "CHO",
    "ORLANDO MAGIC": "ORL",
    "BOSTON CELTICS": "BOS",
    "INDIANA PACERS": "IND",
    "UTAH JAZZ": "UTA",
    "LOS ANGELES CLIPPERS": "LAC",
    "CHICAGO BULLS": "CHI",
    "NEW YORK KNICKS": "NYK",
    "BROOKLYN NETS": "BRK",
    "HOUSTON ROCKETS": "HOU",
    "DALLAS MAVERICKS": "DAL",
    "GOLDEN STATE WARRIORS": "GSW",
    "MINNESOTA TIMBERWOLVES": "MIN",
    "ATLANTA HAWKS": "ATL",
    "OKLAHOMA CITY THUNDER": "OKC",
    "MIAMI HEAT": "MIA",
    "DENVER NUGGETS": "DEN",
    "DETROIT PISTONS": "DET",
    "SACRAMENTO KINGS": "SAC",
    "TORONTO RAPTORS": "TOR",
    "CLEVELAND CAVALIERS": "CLE",
    "MILWAUKEE BUCKS": "MIL",
}

# Number of leading rows to keep from each season's CSV, keyed by the
# season's end year (16 -> 2015-16, ..., 20 -> 2019-20).
# 1230 is the row count kept for each full season; for 2019-20 there were
# 971 non-bubble games played.
REGULAR_SEASON_GAMES = {16: 1230, 17: 1230, 18: 1230, 19: 1230, 20: 971}


def _load_regular_season(end_year):
    """Read one season's games CSV and keep only its leading regular-season rows.

    end_year -- two-digit end year of the season; must be a key of
                REGULAR_SEASON_GAMES.

    Returns a new DataFrame holding the first REGULAR_SEASON_GAMES[end_year]
    rows of ./games/allgames20<end_year>.csv.

    Raises ValueError when the file has fewer rows than expected, so a
    truncated download is reported here rather than deep inside the scraper.
    """
    n_games = REGULAR_SEASON_GAMES[end_year]
    games = pd.read_csv("./games/allgames20%d.csv" % end_year)
    if len(games) < n_games:
        raise ValueError(
            "./games/allgames20%d.csv has %d rows; expected at least %d"
            % (end_year, len(games), n_games)
        )
    # cut off the postseason
    # (assumes the CSV lists regular-season games first -- TODO confirm)
    return games.iloc[:n_games].copy()


# end year -> regular-season games for that season
seasons = {year: _load_regular_season(year) for year in REGULAR_SEASON_GAMES}

# The same frames under their original names, for cells that use them directly.
seasons_arr = [seasons[year] for year in range(16, 21)]
allgames2016, allgames2017, allgames2018, allgames2019, allgames2020 = seasons_arr

In [20]:
def _boxscore_url(tipoff_utc, home_abbr):
    """Build the basketball-reference box-score URL for one game.

    tipoff_utc -- tip-off text shaped like "YYYY-MM-DD HH:MM", read as UTC.
    home_abbr  -- the home team's three-letter code from ``abbr``.
    """
    tipoff = datetime.strptime(tipoff_utc, "%Y-%m-%d %H:%M").replace(
        tzinfo=ZoneInfo("UTC")
    )
    # The URL carries the calendar date, so shift from UTC to US Eastern
    # first. "America/New_York" follows daylight saving; the fixed-offset
    # "EST" zone stays at UTC-5 all year.
    game_day = tipoff.astimezone(ZoneInfo("America/New_York"))
    return "https://www.basketball-reference.com/boxscores/%s0%s.html" % (
        game_day.strftime("%Y%m%d"),
        home_abbr,
    )


def four_factors_iter(season):
    """Yield the Four Factors of every game stored for ``season``.

    season -- either 16, 17, 18, 19, or 20: the end year of the desired
              season, used as a key into ``seasons``.

    For each row of the season's frame, one box-score page is downloaded and
    one dict is yielded:

        {"home_team": (home team name, [efg_pct, tov_pct, orb_pct, ft_rate]),
         "away_team": (away team name, [efg_pct, tov_pct, orb_pct, ft_rate])}

    with the four statistics as floats.

    Raises KeyError for an unknown season or team name, and IndexError when a
    page does not contain two rows in its Four Factors table.
    """
    allgames = seasons[season]
    stats = ["efg_pct", "tov_pct", "orb_pct", "ft_rate"]

    # Walk the rows actually present: the 2019-20 frame holds 971 games,
    # not 1230, so a fixed range(1230) would run off its end.
    for index in range(len(allgames)):
        home_name = allgames.iloc[index, 3]
        away_name = allgames.iloc[index, 1]
        # Column 0 is the tip-off timestamp; dropping its last 9 characters
        # leaves "YYYY-MM-DD HH:MM".
        url = _boxscore_url(allgames.iloc[index, 0][:-9], abbr[home_name])

        # parse HTML info on basketball-reference for a given game
        with urlopen(url) as response:  # close the connection after each game
            html = response.read().decode()
        soup = BeautifulSoup(html, 'html.parser')
        # The table is looked for inside the page's HTML comments, so gather
        # every comment and parse their combined text as a second document.
        comments = soup.find_all(string=lambda node: isinstance(node, Comment))
        comment_soup = BeautifulSoup("".join(comments), "html.parser")

        # Build the dictionary, and yield it
        four_factors = {
            "home_team": (home_name, []),
            "away_team": (away_name, []),
        }
        for stat in stats:
            cells = comment_soup.select(
                'table[id="four_factors"] tbody > tr > td[data-stat="%s"]' % stat
            )
            if len(cells) < 2:
                raise IndexError(
                    "four_factors table is missing %r for both teams at %s"
                    % (stat, url)
                )
            # NOTE(review): row 0 is treated as the home team and row 1 as the
            # away team. Box scores often list the visiting team first --
            # confirm the row order against a real page.
            four_factors["home_team"][1].append(float(cells[0].text))
            four_factors["away_team"][1].append(float(cells[1].text))
        yield four_factors
            



