In [5]:
import requests, time, re, os, json, math
import pymysql, pyodbc
import sqlalchemy as sal
from sqlalchemy import create_engine
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup as bs

# widen pandas' display limits so large frames are printed in full
for displayOption in ('display.max_rows', 'display.max_columns'):
    pd.set_option(displayOption, 500)

# importing credentials: the config file is JSON and is parsed straight from the handle
with open('../../../Notes-General/config.txt', 'r') as f:
    creds = json.load(f)

# league key, used later to pick this league's entry out of the credentials
league = 'nfl'

#### PARAMETERS ######
# assign directory and save output path
# the "..\" is because the script and output/data folders are at the same level
# Every backslash is written doubled: a lone "\d" or "\D" inside a normal string is an
# invalid escape sequence, which newer Python versions warn about.

# league history dump that is parsed for teams and games
pathToLh = '..\\data\\League History\\2023-2023_league_history.txt'
# folder to draft htmls
directory = '..\\data\\Draft'
# output paths for the load files
outputPathDraft = '..\\data\\Draft\\draftHistoryLoad.csv'
outputPathTeams = '..\\data\\League History\\teamsLoad.xlsx'
outputPathGames = '..\\data\\League History\\gamesLoad.xlsx'
outputPathPlayers = '..\\data\\League History\\playersLoad.xlsx'

# PROCESS DRAFT PAGE HTML - save html directly from webpage and save to text file

In [114]:
### INPUTS THAT NEED TO BE UPDATED BY THE USER
# number of teams in the league (drives the round number calculation)
nTeams = 12

# column layout of the aggregated draft table: one row per pick, all seasons together
draftCols = ['season', 'pick', 'round', 'overallPick',
             'name', 'playerTeam', 'pos', 'teamName']

# Picks are gathered in a plain list and turned into a DataFrame once at the end;
# appending to a DataFrame one row at a time gets slower with every pick added.
draftRows = []

# iterate over files in that directory
for filename in os.scandir(directory):
    if not filename.is_file():
        continue

    # the file is just the <divs> that contain the draft round data
    # parent div at the time of creation = <div class="ResponsiveTable">
    # details for each pick are <td class=Table__TD>; there are 3 <td>s for each pick
    with open(filename.path, "r", encoding='utf-8') as f:
        data = f.read()

    soup = bs(data, "html.parser")
    picks = soup.find_all("td", class_="Table__TD")

    # the season is the part of the file name before the first "."
    year = filename.name.split(".")[0]

    # o = overall pick counter (restarts at 1 for every file)
    # p = index of the pick-number <td>; p+1 = player name, team, pos; p+2 = fantasy team
    for o, p in enumerate(range(0, len(picks), 3), start=1):

        # The player <td> reads "<name words...> <TEAM>, <POS>". Team and position are
        # always the last two tokens, so everything before them is the name. This keeps
        # suffixes ("Jr.", "III") and any other multi-word name intact, whatever its length.
        playerInfo = picks[p + 1].text.split()
        name = " ".join(playerInfo[:-2])
        team = playerInfo[-2].replace(",", "")
        pos = playerInfo[-1]

        # round details
        n = int(picks[p].text)  # pick number in the round
        r = math.ceil(o / nTeams)  # round number

        # fantasy team making the pick
        fTeam = picks[p + 2].text

        # year is converted here rather than per file, so a file without any picks
        # never has its name parsed as a number
        draftRows.append([int(year), n, r, o, name, team, pos, fTeam])

# this holds the agg for all of the years
drafts = pd.DataFrame(draftRows, columns=draftCols)


# PROCESS END OF SEASON LEAGUE HISTORY DATA

In [46]:
# parse the end-of-season league history dump (JSON) directly from the file handle
with open(pathToLh, 'r') as f:
    js = json.load(f)

##### mTeam

In [115]:
### mTeam - has most team specific, season totals
season = 2023

# output columns; every record below is assembled in exactly this order
cols = ['season', 'teamId', 'teamName', 'abbr', 'wins', 'losses', 'ties', 'pf', 'pa', 'pfAllSeason',
        'rankRegSeason', 'rankFinalCalc', 'streak', 'streakType',
        'budgetSpent', 'acqs', 'drops', 'draftDayProjRank']

allTeamData = []

# teams are dictionaries in a single list; flatten each one into a row for the db load
for t in js['teams']:
    overall = t['record']['overall']      # season-long record of the team
    moves = t['transactionCounter']       # budget / acquisition / drop counters

    allTeamData.append([
        int(season), t['id'], t['name'], t['abbrev'],
        overall['wins'], overall['losses'], overall['ties'],
        overall['pointsFor'], overall['pointsAgainst'], t['points'],
        t['playoffSeed'], t['rankCalculatedFinal'],
        overall['streakLength'], overall['streakType'],
        moves['acquisitionBudgetSpent'], moves['acquisitions'], moves['drops'],
        t['draftDayProjectedRank'],
    ])

teams = pd.DataFrame(allTeamData, columns=cols)

In [None]:
################ MULTIPLE YEARS
### mTeam - has most team specific, season totals 
# Multi-season variant of the team flattening: builds one row per team per season.
# NOTE(review): this cell reads js[<season>]['mTeam']['teams'], i.e. a js keyed by
# season year, while the single-season cell reads js['teams'] directly. The two
# cells therefore need differently shaped input files - confirm which one is loaded.
allTeamData = []

# looping through league history data. outermost keys are season years
for y in js.keys():
    # teams are dictionaries in a single list
    teams = js[y]['mTeam']['teams']
    # looping through team to create df for db load
    for t in teams:
        teamId = t['id']
        # the display name is assembled from two fields here (location + nickname)
        teamName = t['location'] + " " +  t['nickname'] 
        abbr = t['abbrev']
        pfAllSeason = t['points']
        rankRegSeason = t['playoffSeed']
        rankFinalCalc = t['rankCalculatedFinal']
        wins = t['record']['overall']['wins']
        losses = t['record']['overall']['losses']
        ties = t['record']['overall']['ties']
        pf = t['record']['overall']['pointsFor']
        pa = t['record']['overall']['pointsAgainst']
        streak = t['record']['overall']['streakLength']
        streakType = t['record']['overall']['streakType']
        budgetSpent = t['transactionCounter']['acquisitionBudgetSpent']
        acqs = t['transactionCounter']['acquisitions']
        drops = t['transactionCounter']['drops']
        
        # one flat record per team and season, in the same order as cols below
        team = [int(y), teamId, teamName, abbr, wins, losses, ties, pf, pa, pfAllSeason,
                rankRegSeason, rankFinalCalc,  streak, streakType, 
                budgetSpent, acqs, drops ]
        allTeamData.append(team)
        
cols = ['season', 'teamId', 'teamName', 'abbr', 'wins', 'losses', 'ties', 'pf', 'pa', 'pfAllSeason',
        'rankRegSeason', 'rankFinalCalc', 'streak', 'streakType', 
        'budgetSpent', 'acqs', 'drops' ]
# from here on the name teams refers to the DataFrame, no longer to a season's list
teams = pd.DataFrame(allTeamData, columns = cols)

# filling in blank names from season 1
# These are positional writes: rows 10 and 11, column 2 (teamName in cols).
# NOTE(review): they only hit the intended teams if the row order produced by the
# loops above is exactly the one this was written for - verify after any data change.
teams.iloc[10, 2] = "t1"
teams.iloc[11, 2] = "t2"

##### mBoxscore

In [116]:
## mBoxscore - has weekly matchup scores at the team level when looking at league history
# it has player scores too if looking in season during a week.

cols = ['season', 'week', 'gameId', 'teamOneId', 'teamOnePf', 'teamOneTiebreak',
        'teamTwoId', 'teamTwoPf', 'teamTwoTiebreak', 'winner', 'loser', 'tieTeamOne',
        'tieTeamTwo', 'bye']

allGames = []
# season -> week recorded from a bye entry, used further down as the playoff start
playoffStarts = {}

for g in js['schedule']:
    # fields that are read for every schedule entry, bye or not
    week = g['matchupPeriodId']
    gameId = g['id']
    home = g['home']
    teamTwoId = home['teamId']
    teamTwoTiebreak = home['tiebreak']
    teamTwoPf = home['totalPoints']

    # nobody wins, loses or ties unless the scores below say so
    winner = loser = tieTeamOne = tieTeamTwo = 0

    if 'away' not in g:
        # handles playoff bye weeks: there is no opponent, only the home side
        bye = 1
        teamOneId = teamOneTiebreak = teamOnePf = None

        # keep the week that is already stored for the season, if there is one
        if season not in playoffStarts:
            playoffStarts[int(season)] = week
    else:
        # flatten game results for non-bye matchups
        bye = 0
        away = g['away']
        teamOneId = away['teamId']
        teamOneTiebreak = away['tiebreak']
        teamOnePf = away['totalPoints']

        # label winner and loser on points plus tiebreak
        teamOneTotal = teamOnePf + teamOneTiebreak
        teamTwoTotal = teamTwoPf + teamTwoTiebreak
        if teamOneTotal > teamTwoTotal:
            winner, loser = teamOneId, teamTwoId
        elif teamOneTotal < teamTwoTotal:
            winner, loser = teamTwoId, teamOneId
        else:
            tieTeamOne, tieTeamTwo = teamOneId, teamTwoId

    allGames.append([int(season), week, gameId, teamOneId, teamOnePf, teamOneTiebreak,
                     teamTwoId, teamTwoPf, teamTwoTiebreak, winner, loser,
                     tieTeamOne, tieTeamTwo, bye])

games = pd.DataFrame(allGames, columns=cols)

# adding playoff flag: everything starts as regular season (0) ...
games['playoffs'] = 0

# ... and games from the recorded start week onwards are switched to 1
for startSeason, startWeek in playoffStarts.items():
    inPlayoffs = (games['season'] == startSeason) & (games['week'] >= startWeek)
    games.loc[inPlayoffs, 'playoffs'] = 1

In [160]:
###### FOR MULTIPLE YEARS

## mBoxscore - has weekly matchup scores at the team level when looking at league history
# it has player scores too if looking in season during a week.
# This variant expects js to be keyed by season year, each holding ['mBoxscore']['schedule'].

cols = ['season', 'week', 'gameId', 'teamOneId', 'teamOnePf', 'teamOneTiebreak',
        'teamTwoId', 'teamTwoPf', 'teamTwoTiebreak', 'winner', 'loser', 'tieTeamOne',
        'tieTeamTwo', 'bye']

allGames = []
# season (int) -> week of the first bye entry seen for that season
playoffStarts = {}

for y in js.keys():
    seasonYear = int(y)

    for g in js[y]['mBoxscore']['schedule']:
        # fields that are read for every schedule entry, bye or not
        week = g['matchupPeriodId']
        gameId = g['id']
        home = g['home']
        teamTwoId = home['teamId']
        teamTwoTiebreak = home['tiebreak']
        teamTwoPf = home['totalPoints']

        # nobody wins, loses or ties unless the scores below say so
        winner = loser = tieTeamOne = tieTeamTwo = 0

        if 'away' not in g:
            # handles playoff bye weeks: there is no opponent, only the home side
            bye = 1
            teamOneId = teamOneTiebreak = teamOnePf = None

            # Record the week only once per season. The key must be the int season:
            # testing the string key y against int keys never matches, which let
            # every later bye overwrite the first playoff week.
            playoffStarts.setdefault(seasonYear, week)
        else:
            # flatten game results for non-bye matchups
            bye = 0
            away = g['away']
            teamOneId = away['teamId']
            teamOneTiebreak = away['tiebreak']
            teamOnePf = away['totalPoints']

            # label winner and loser on points plus tiebreak
            teamOneTotal = teamOnePf + teamOneTiebreak
            teamTwoTotal = teamTwoPf + teamTwoTiebreak
            if teamOneTotal > teamTwoTotal:
                winner, loser = teamOneId, teamTwoId
            elif teamOneTotal < teamTwoTotal:
                winner, loser = teamTwoId, teamOneId
            else:
                tieTeamOne, tieTeamTwo = teamOneId, teamTwoId

        allGames.append([seasonYear, week, gameId, teamOneId, teamOnePf, teamOneTiebreak,
                         teamTwoId, teamTwoPf, teamTwoTiebreak, winner, loser,
                         tieTeamOne, tieTeamTwo, bye])

games = pd.DataFrame(allGames, columns=cols)

# adding playoff flag: everything starts as regular season (0) ...
games['playoffs'] = 0

# ... and games from each season's recorded start week onwards are switched to 1
for startSeason, startWeek in playoffStarts.items():
    inPlayoffs = (games['season'] == startSeason) & (games['week'] >= startWeek)
    games.loc[inPlayoffs, 'playoffs'] = 1

##### kona_player_info

In [139]:
season = 2023
# from here on js holds the player history dump; it is parsed straight from the handle
with open(r"..\data\League History\2023-2023_player_history.txt", 'r') as f:
    js = json.load(f)
# show the top-level keys of the dump
js.keys()

dict_keys(['players', 'positionAgainstOpponent'])

In [140]:
### kona_player_info
# this has player stats, info, scoring for season total and weekly (when searching in season)

cols = ['season', 'playerId', 'playerName', 'nflTeamId', 'ktbTeamId', 'defaultPositionId',
        'auctionValue', 'adp', 'points', 'pointsAvg', 'positionRank', 'overallRank']

# the season-long stat entry is the one whose id is '00<year>'
seasonStatId = '00' + str(season)

allPlayers = []

for p in js['players']:
    playerId = p['id']
    ktbTeamId = p['onTeamId']
    nflTeamId = p['player']['proTeamId']
    playerName = p['player']['fullName']
    defaultPositionId = p['player']['defaultPositionId']
    # TODO: figure out how to store all eligible position slots
    # (p['player']['eligibleSlots'])

    # players without ownership data or without a stats list are skipped entirely
    try:
        auctionValue = p['player']['ownership']['auctionValueAverage']
        adp = p['player']['ownership']['averageDraftPosition']
        stats = p['player']['stats']
    except (LookupError, TypeError):
        continue

    # they started adding multiple entries to the stats landing, there is final week of the
    # season and 2 others I can't figure out what they are. The season long one has an id
    # of '00<year>'. A single entry is taken as it is.
    if len(stats) == 1:
        seasonStats = stats
    else:
        seasonStats = [s for s in stats if s['id'] == seasonStatId]

    # Reset for every player: without this a player with no season-long entry silently
    # kept the previous player's points, and an empty stats list raised an IndexError.
    points = None
    pointsAvg = None
    for s in seasonStats:
        points = s['appliedTotal']
        pointsAvg = s['appliedAverage']

    # TODO: separate the individual stats, find an ESPN id -> stat name map
    # (p['player']['stats'][...]['stats'])

    # rankings are optional; a player without them gets empty ranks
    try:
        positionRank = p['ratings']['0']['positionalRanking']
        overallRank = p['ratings']['0']['totalRanking']
    except (LookupError, TypeError):
        positionRank = None
        overallRank = None

    allPlayers.append([season, playerId, playerName, nflTeamId, ktbTeamId, defaultPositionId,
                       auctionValue, adp, points, pointsAvg, positionRank, overallRank])

players = pd.DataFrame(allPlayers, columns=cols)

In [161]:
#### MULTIPLE YEARS

### kona_player_info
# this has player stats, info, scoring for season total and weekly (when searching in season)
# This variant expects js to be keyed by season year, each holding
# ['kona_player_info']['players'].

cols = ['season', 'playerId', 'playerName', 'nflTeamId', 'ktbTeamId', 'defaultPositionId',
        'auctionValue', 'adp', 'points', 'pointsAvg', 'positionRank', 'overallRank']

allPlayers = []

for y in js.keys():
    # the season-long stat entry is the one whose id is '00<year>'
    seasonStatId = '00' + str(y)
    # stored as an int so the season column matches the multi-year teams and games tables
    seasonYear = int(y)

    for p in js[y]['kona_player_info']['players']:
        playerId = p['id']
        ktbTeamId = p['onTeamId']
        nflTeamId = p['player']['proTeamId']
        playerName = p['player']['fullName']
        defaultPositionId = p['player']['defaultPositionId']
        # TODO: figure out how to store all eligible position slots
        # (p['player']['eligibleSlots'])

        # players without ownership data or without a stats list are skipped entirely
        try:
            auctionValue = p['player']['ownership']['auctionValueAverage']
            adp = p['player']['ownership']['averageDraftPosition']
            stats = p['player']['stats']
        except (LookupError, TypeError):
            continue

        # they started adding multiple entries to the stats landing, there is final week of
        # the season and 2 others I can't figure out what they are. The season long one has
        # an id of '00<year>'. A single entry is taken as it is.
        if len(stats) == 1:
            seasonStats = stats
        else:
            seasonStats = [s for s in stats if s['id'] == seasonStatId]

        # Reset for every player: without this a player with no season-long entry silently
        # kept the previous player's points, and an empty stats list raised an IndexError.
        points = None
        pointsAvg = None
        for s in seasonStats:
            points = s['appliedTotal']
            pointsAvg = s['appliedAverage']

        # TODO: separate the individual stats, find an ESPN id -> stat name map
        # (p['player']['stats'][...]['stats'])

        # rankings are optional; a player without them gets empty ranks
        try:
            positionRank = p['ratings']['0']['positionalRanking']
            overallRank = p['ratings']['0']['totalRanking']
        except (LookupError, TypeError):
            positionRank = None
            overallRank = None

        allPlayers.append([seasonYear, playerId, playerName, nflTeamId, ktbTeamId,
                           defaultPositionId, auctionValue, adp, points, pointsAvg,
                           positionRank, overallRank])

players = pd.DataFrame(allPlayers, columns=cols)

##### useless endpoints - mSettings, mRoster, player_wl, mSchedule

In [7]:
### mSettings:
# not much use. league settings including scoring schemes

### mRoster:
# each team's roster at the end of the season. Not of much use
# possible items of interest
    # end of season rosters
    # season total pts scored for rostered players
    # adp for the season of rostered players

### player_wl
# not much use. very basic team info  for the season - name, nickname, id, guid

### mSchedule
# no data of interest in league history data dumps

# ADDING TEAMID TO DRAFT DF and DRAFT POSITION TO TEAM DF

In [124]:
# joining team id to draft data
# The draft pages only name the fantasy team, so the id is looked up by name from the
# teams table. (Indexing drafts by 'teamId' cannot work here: that column does not exist
# until this cell creates it.) If a name occurs more than once in teams, the last row wins.
teamIdByName = dict(zip(teams['teamName'], teams['teamId']))
lookedUpIds = drafts['teamName'].map(teamIdByName)

# Manual teamName -> teamId assignments for names the lookup cannot resolve.
# populating current season team ids: league history isn't available until the end of the
# season, so this will be manual for every season until the LH is saved at the EOS
manualTeamIds = {
    "Chubby Rice": 6,
    "Big Baby Nate": 9,
    "John's": 1,
    "DPD DannyDimes": 3,
    "JB got last in 23": 5,
    "CeeDeez Nutz": 10,
    "Poopstained Warriors": 4,
    "Team Chaunce": 7,
    "Team Gomer": 2,
    "The Suavin Scoregasms": 11,
    "Touchdown My Pants": 8,
    "breeced lightning": 12,
}
manualIds = drafts['teamName'].map(manualTeamIds)

# a manual assignment takes precedence over the looked-up id; names found in neither
# source stay missing. The nullable integer dtype keeps ids whole despite missing values.
# Assigning the column (instead of joining) makes the cell safe to re-run.
drafts['teamId'] = manualIds.fillna(lookedUpIds).astype('Int64')

In [None]:
# adding draft pick to team data for the season

# filter drafts to only 1st rounds: one row per team holding its draft slot
# NOTE(review): assumes drafts covers a single season; with several seasons a team has
# several first-round rows and the join below would duplicate team rows - confirm.
draft = drafts.loc[drafts['round'] == 1, ['pick', 'teamId']].set_index('teamId')

# drop the columns this cell adds, so re-running it does not fail on an existing 'pick'
teams = teams.drop(columns=['pick', 'rankFinalKtb'], errors='ignore')

# join draft pick to team data for season (teamId column of teams against draft's index)
teams = teams.join(draft, on='teamId', how='left')
del draft

# calc final KTB rank from the draft pick
# NOTE(review): the mapping is kept exactly as it was (e.g. 10 -> 4 and 9 -> 3); confirm
# it reflects the league's rule. For multiple seasons the rank of season s would come
# from the pick of season s + 1.
pickToFinish = {12:1, 11:2, 10:4, 9:3, 8:5, 7:6,
                6:12, 5:11, 4:10, 3:9, 2:8, 1:7}

# one vectorised lookup for every team row; a team without a pick gets an empty rank
# instead of stopping the cell, and the league size is no longer hard-coded
teams['rankFinalKtb'] = teams['pick'].map(pickToFinish)

# OUTPUT TO EXCEL

In [143]:
# write every load table to its configured path as CSV, without the index column
# NOTE(review): the teams/games/players paths end in .xlsx but are written with to_csv,
# so those files contain CSV text - confirm this is what the downstream load expects.
for frame, outputPath in (
    (drafts, outputPathDraft),
    (teams, outputPathTeams),
    (games, outputPathGames),
    (players, outputPathPlayers),
):
    frame.to_csv(outputPath, index=False)

# LOAD DB

In [144]:
# import credentials: pull the connection details out of the loaded config
mysqlCreds = creds['mysqlSurface']
nflDb = mysqlCreds['dbNFL']
dbUser = mysqlCreds['users'][1]
dbPw = mysqlCreds['creds']['jb']
dbHost = nflDb['host']
dbName = nflDb['database']
# the connection itself is made from the ready-made connection string for this league
dbConnectionString = creds['pymysql'][league]

# connect
sqlEngine = create_engine(dbConnectionString, pool_recycle=3600)
dbConnection = sqlEngine.connect()

### TEAMS HISTORY

In [134]:
# append the team rows to the ktbteams table; the frame index is not written
teams.to_sql(name='ktbteams', con=dbConnection, if_exists='append', index=False)

12

### DRAFT HISTORY

In [37]:
# append the draft rows to the ktbdrafts table; the frame index is not written
drafts.to_sql(name='ktbdrafts', con=dbConnection, if_exists='append', index=False)

156

### PLAYER HISTORY

In [145]:
# append the player rows to the ktbplayers table; the frame index is not written
players.to_sql(name='ktbplayers', con=dbConnection, if_exists='append', index=False)

2667

### GAMES HISTORY

In [136]:
# append the game rows to the ktbgames table; the frame index is not written
games.to_sql(name='ktbgames', con=dbConnection, if_exists='append', index=False)

103

In [182]:
# Closing the connection only hands it back to the engine's pool; disposing of the
# engine afterwards closes the pooled connections so none stay open after the load.
dbConnection.close()
sqlEngine.dispose()