In [36]:
from bs4 import BeautifulSoup
import datetime
import requests
import pandas as pd
import numpy as np


# User-Agent string sent with every scraping request made in this notebook.
header_name = (
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
    'AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/77.0.3865.90 Safari/537.36'
)

# Maps the town label looked up in loading_odds to the team nickname used
# in the spreadsheet's 'team' column.
town_to_nickname_dict = {
    'Atlanta': 'Hawks',
    'Boston': 'Celtics',
    'Brooklyn': 'Nets',
    'Charlotte': 'Hornets',
    'Chicago': 'Bulls',
    'Cleveland': 'Cavaliers',
    'Dallas': 'Mavericks',
    'Denver': 'Nuggets',
    'Detroit': 'Pistons',
    'Golden State': 'Warriors',
    'Houston': 'Rockets',
    'Indiana': 'Pacers',
    'L.A. Clippers': 'Clippers',
    'L.A. Lakers': 'Lakers',
    'Memphis': 'Grizzlies',
    'Miami': 'Heat',
    'Milwaukee': 'Bucks',
    'Minnesota': 'Timberwolves',
    'New Orleans': 'Pelicans',
    'New York': 'Knicks',
    'Oklahoma City': 'Thunder',
    'Orlando': 'Magic',
    'Philadelphia': '76ers',
    'Phoenix': 'Suns',
    'Portland': 'Trail Blazers',
    'Sacramento': 'Kings',
    'San Antonio': 'Spurs',
    'Toronto': 'Raptors',
    'Utah': 'Jazz',
    'Washington': 'Wizards',
}

# Maps a team nickname to the acronym used for the pick column and for the
# CSS class names matched in game_outcomes.
nickname_to_acronym_dict = {
    'Hawks': 'ATL',
    'Celtics': 'BOS',
    'Nets': 'BKN',
    'Hornets': 'CHA',
    'Bulls': 'CHI',
    'Cavaliers': 'CLE',
    'Mavericks': 'DAL',
    'Nuggets': 'DEN',
    'Pistons': 'DET',
    'Warriors': 'GS',
    'Rockets': 'HOU',
    'Pacers': 'IND',
    'Clippers': 'LAC',
    'Lakers': 'LAL',
    'Grizzlies': 'MEM',
    'Heat': 'MIA',
    'Bucks': 'MIL',
    'Timberwolves': 'MIN',
    'Pelicans': 'NO',
    'Knicks': 'NY',
    'Thunder': 'OKC',
    'Magic': 'ORL',
    '76ers': 'PHI',
    'Suns': 'PHX',
    'Trail Blazers': 'POR',
    'Kings': 'SAC',
    'Spurs': 'SA',
    'Raptors': 'TOR',
    'Jazz': 'UTA',
    'Wizards': 'WSH',
}

In [37]:
def initial_538_predictions(game_date):
    """Creates dataframe with initial values from 538
    Args:
        game_date: date of initial games, as an 'mm/dd/yyyy' string
    Returns:
        df: dataframe of 538 predictions with columns
            'date', 'team' and '538 win%' (duplicate rows removed)
        year: year read from the page heading (2019 if none is found)
    """
    df_columns = ['date', 'team', '538 win%']
    year = 2019    # fallback used when the page heading cannot be located
    URL = 'https://projects.fivethirtyeight.com/2020-nba-predictions/games/'
    headers = {'User-Agent': header_name}
    source = requests.get(URL, headers=headers)
    soup = BeautifulSoup(source.content, 'html.parser')

    # Finding year of predictions: first four characters of the intro heading
    for container in soup.findAll('div', attrs={'class': 'container'}):
        for intro in container.findAll('div', attrs={'id': 'intro'}):
            year = int(intro.h1.get_text()[0:4])

    # Gathering data. Rows are collected in a plain list and turned into a
    # dataframe once at the end; DataFrame.append was removed in pandas 2.0
    # and re-copies the whole frame on every call.
    rows = []
    for day_group in soup.findAll('div', attrs={'class': 'day-group'}):
        for day_of_year in day_group.findAll('section', attrs={'class': 'day'}):
            # Heading text is split on ', ' and the second part is reduced to
            # '<3-letter month> <day> <year>' before parsing.
            heading = day_of_year.h3.get_text().strip().split(', ')[1]
            stamp = heading[0:3] + ' ' + heading[-2:].strip() + ' ' + str(year)
            date = datetime.datetime.strptime(stamp, '%b %d %Y').strftime('%m/%d/%Y')

            if date != game_date:
                continue

            # Finding data for each game
            for game in day_of_year.findAll('tbody', attrs={'class': 'ie10up'}):
                for data in game.findAll('tr', attrs={'class': 'tr team'}):
                    team_acronym = data['data-team']
                    squad = data.find('td', attrs={'class': 'td text team ' + team_acronym})
                    win_percentage = data.find('td', attrs={'class': 'td number chance'}).get_text().strip()
                    if squad:
                        rows.append([date, squad.get_text().strip(), win_percentage])

    df = pd.DataFrame(rows, columns=df_columns).drop_duplicates()

    return df, year
    
def getting_spreadsheet(file_name):
    """Reads the spreadsheet from disk
    Args:
        file_name: path of the CSV file (anything pd.read_csv accepts)
    Returns:
        dataframe holding the file's contents
    """
    spreadsheet = pd.read_csv(filepath_or_buffer=file_name)
    return spreadsheet
def separating_games(df):
    """Splits the rows by whether a result has been recorded
    Args:
        df: dataframe of data with a 'result' column
    Returns:
        old_games: rows whose 'result' is not the empty string
        current_games: every remaining row (no result yet)
    """
    has_result = df['result'] != ''
    finished = df[has_result]
    # Dropping by index label mirrors how the pending rows are selected.
    pending = df.drop(finished.index)

    return finished, pending
def predictions_538(game_date):
    """Loads 538 predictions
    Args:
        game_date: date of games, as an 'mm/dd/yyyy' string
    Returns:
        predictions: dataframe with 538 predictions. Columns are 'date',
            'team', '538 win%' followed by three columns named '' that
            hold empty strings.
        year: year of games. Needed for Vegas odds
    """
    year = 2019    # fallback used when the page heading cannot be located
    URL = 'https://projects.fivethirtyeight.com/2020-nba-predictions/games/'
    headers = {'User-Agent': header_name}
    source = requests.get(URL, headers=headers)
    soup = BeautifulSoup(source.content, 'html.parser')
    cols = ['date', 'team', '538 win%']
    blank_rows = ['']*(3)
    cols.extend(blank_rows)

    # Finding year of predictions: first four characters of the intro heading
    for container in soup.findAll('div', attrs={'class': 'container'}):
        for intro in container.findAll('div', attrs={'id': 'intro'}):
            year = int(intro.h1.get_text()[0:4])

    # Gathering data. Records are accumulated in a list and converted once;
    # DataFrame.append was removed in pandas 2.0 and copies the frame per row.
    records = []
    for day_group in soup.findAll('div', attrs={'class': 'day-group'}):
        for day_of_year in day_group.findAll('section', attrs={'class': 'day'}):
            # Heading text is split on ', ' and the second part is reduced to
            # '<3-letter month> <day> <year>' before parsing.
            heading = day_of_year.h3.get_text().strip().split(', ')[1]
            stamp = heading[0:3] + ' ' + heading[-2:].strip() + ' ' + str(year)
            date = datetime.datetime.strptime(stamp, '%b %d %Y').strftime('%m/%d/%Y')

            if date != game_date:
                continue

            # Finding data for each game
            for game in day_of_year.findAll('tbody', attrs={'class': 'ie10up'}):
                for data in game.findAll('tr', attrs={'class': 'tr team'}):
                    team_acronym = data['data-team']
                    squad = data.find('td', attrs={'class': 'td text team ' + team_acronym})
                    win_percentage = data.find('td', attrs={'class': 'td number chance'}).get_text().strip()
                    if squad:
                        team = squad.get_text().strip()
                        records.append([date, team, win_percentage] + blank_rows)

    predictions = pd.DataFrame(records, columns=cols)

    return predictions, year
def _split_moneyline(cell_text):
    """Splits one odds cell into its (away, home) halves.

    Blank cells, the 'XXXX' placeholder and cells too short to hold two
    values yield (nan, nan). Otherwise the split point is index 4 when that
    character is a sign, and index 5 if not.
    """
    text = cell_text.strip()
    if text == '' or text == 'XXXX' or len(text) < 5:
        return np.nan, np.nan
    if text[4] == '+' or text[4] == '-':
        return text[0:4], text[4:]
    return text[0:5], text[5:]


def loading_odds(df, year):
    """Loads odds to spreadsheet
    Args:
        df: dataframe that odds will be written to; needs 'date', 'team'
            and '538 win%' columns
        year: year that will be used to gather the dates of the games
    Returns:
        dataframe of data with odds, restricted to the columns
        'date', 'team', '538 win%' and 'odds'
    """
    names = ['Open', 'odds','Westgate','MGM Mirage', 'betMGM',
             'William Hill', 'CG Technology', 'Circa Sports','Stations']
    odds_cell_classes = ['viCellBg1 cellTextNorm cellBorderL1 center_text nowrap',
                         'viCellBg1 cellTextHot cellBorderL1 center_text nowrap',
                         'viCellBg2 cellTextNorm cellBorderL1 center_text nowrap',
                         'viCellBg2 cellTextHot cellBorderL1 center_text nowrap']
    book_rows = []
    temp_away_list = []
    temp_home_list = []
    teams_list = []
    dates_list = []
    URL = 'http://www.vegasinsider.com/nba/odds/las-vegas/money/'
    headers = {'User-Agent': header_name}
    source = requests.get(URL, headers=headers)
    soup = BeautifulSoup(source.content, 'html.parser')

    for gameboard in soup.findAll('table', attrs={'class': 'viBodyContainerTble'}):
        for games in gameboard.findAll('td', attrs={'class': 'viBodyBorderNorm'}):
            for game_info in games.findAll('td'):
                for stamp in game_info.findAll('span', attrs={'class': 'cellTextHot'}):
                    parts = stamp.get_text().split()[0].split('/')
                    month = parts[0]
                    day = parts[1]

                    # Zero-pad so the text matches the 'mm/dd/yyyy' dates
                    # produced with strftime elsewhere in this notebook.
                    date = month.zfill(2) + '/' + day.zfill(2) + '/' + str(year)
                    # Each date is recorded twice, once per team row.
                    dates_list.extend([date, date])

                for team_name in game_info.findAll('a', attrs={'class': 'tabletext'}):
                    teams_list.append(town_to_nickname_dict[team_name.get_text()])

            for spread in games.findAll('td', attrs={'class': odds_cell_classes}):
                away_spread, home_spread = _split_moneyline(spread.get_text())
                temp_away_list.append(away_spread)
                temp_home_list.append(home_spread)

                # One cell per column in `names` completes a game: flush an
                # away row followed by a home row.
                if len(temp_home_list) == len(names):
                    book_rows.append(temp_away_list)
                    book_rows.append(temp_home_list)
                    temp_away_list = []
                    temp_home_list = []

    # Built once from the collected rows; DataFrame.append was removed in
    # pandas 2.0 and copies the frame on every call.
    book = pd.DataFrame(book_rows, columns=names)
    book['team'] = teams_list
    book['date'] = dates_list
    book = book[book.date == df.date.max()]    # Only keeping odds dated the same as the latest date in df

    odds = book[['team', 'odds']]

    df = pd.merge(odds, df)[['date', 'team', '538 win%', 'odds']]

    return df
def date_formatter(row):
    """Formats dates
    Args:
        row: row of data with a 'date' attribute holding either '' or a
            'month/day/year' string (month and day may be unpadded)
    Returns:
        date in proper form('mm/dd/yyyy'), or '' when the date is blank
    """
    if row.date == '':
        return row.date

    # Parse with the slashes kept in place. Gluing the fields together and
    # parsing with '%m%d%Y' is ambiguous for single-digit months:
    # '1/5/2020' became '1052020', which reads as 10/05/2020.
    parsed = datetime.datetime.strptime(row.date, '%m/%d/%Y')
    return parsed.strftime('%m/%d/%Y')
def date_ffill_formatter(df):
    """Blanks the date on every odd-labelled row, in place
    Args:
        df: df of data; its 'date' column is modified directly
    """
    for label in df.index:
        # Even index labels keep their date; the rest are cleared.
        if label % 2 != 0:
            df.at[label, 'date'] = ''
def combining_data(df_top, df_bottom):
    """Stacks two dataframes and keeps only the top frame's columns
    Args:
        df_top: dataframe of data to go on top
        df_bottom: dataframe of data to go on bottom
    Returns:
        df: combined dataframe with a fresh 0..n-1 index, df_top's columns
            in df_top's order, and NaN replaced by ''
    """
    wanted_columns = df_top.columns
    stacked = pd.concat([df_top, df_bottom], ignore_index=True)
    # Selecting the columns yields a new frame, so replacing without
    # inplace gives the same values.
    return stacked[wanted_columns].replace(np.nan, '')
def game_outcomes(game_dates, teams):
    """Looks up winners and losers of completed games on the 538 page
    Args:
        game_dates: dates of games completed ('mm/dd/yyyy' strings)
        teams: nicknames of teams in games that are completed
    Returns:
        winners: dict with dates as keys and a list of winners as values
        losers: dict with dates as keys and a list of losers as values
    """
    year = 2019    # fallback used when the page heading cannot be located
    URL = 'https://projects.fivethirtyeight.com/2020-nba-predictions/games/'
    headers = {'User-Agent': header_name}
    source = requests.get(URL, headers=headers)
    soup = BeautifulSoup(source.content, 'html.parser')
    winners = {}
    losers = {}
    team_acronyms = [nickname_to_acronym_dict[team] for team in teams]

    # Finding year of predictions: first four characters of the intro heading
    for container in soup.findAll('div', attrs={'class': 'container'}):
        for intro in container.findAll('div', attrs={'id': 'intro'}):
            year = int(intro.h1.get_text()[0:4])

    # Gathering data
    for day_group in soup.findAll('div', attrs={'class': 'completed-day'}):
        for day_of_year in day_group.findAll('section', attrs={'class': 'day'}):
            heading = day_of_year.h3.get_text().strip().split(', ')[1]
            stamp = heading[0:3] + ' ' + heading[-2:].strip() + ' ' + str(year)
            date = datetime.datetime.strptime(stamp, '%b %d %Y').strftime('%m/%d/%Y')

            if date not in game_dates:
                continue

            # All team rows for this day, in page order; collected once and
            # reused for every acronym.
            team_rows = [team_row
                         for game in day_of_year.findAll('tbody', attrs={'class': 'ie10up'})
                         for team_row in game.findAll('tr', attrs={'class': 'tr team'})]

            day_winners = []
            day_losers = []
            for team_acronym in team_acronyms:
                winner_string = 'td text team winner ' + team_acronym
                loser_string = 'td text team loser ' + team_acronym

                for team_row in team_rows:
                    for winner in team_row.findAll('td', attrs={'class': winner_string}):
                        day_winners.append(winner.get_text().strip())
                    for loser in team_row.findAll('td', attrs={'class': loser_string}):
                        day_losers.append(loser.get_text().strip())

            winners[date] = day_winners
            losers[date] = day_losers

    return winners, losers
def implied_probability(row):
    """Converts signed odds text into an implied win probability
    Args:
        row: row of data from dataframe
    Returns:
        the existing 'implied' value when the row already has one;
        otherwise the probability rounded to 2 places, or '' when the
        odds are blank or do not start with '+' or '-'
    """
    if 'implied' in row.index:
        return row['implied']

    odds = row['odds']
    if odds == '':
        return ''

    sign = odds[0]
    if sign == '+':
        line = int(odds[1:])
        return round(100.0/(100+line), 2)
    if sign == '-':
        line = int(odds[1:])
        return round(line/(100.0+line), 2)
    return ''
    
    
def pick(row, predictor):
    """Decides whether the predictor favours this row's team over the odds
    Args:
        row: row of data from dataframe
        predictor: name of predicting column
    Returns:
        acronym of the picked team, or '' when there is no pick
    """
    pick_column = predictor + ' pick'

    # A row that already carries a pick column is translated, not recomputed.
    if pick_column in row.index:
        recorded = row[pick_column]
        if recorded == '':
            return ''
        return nickname_to_acronym_dict[recorded]

    if row['implied'] == '':
        return ''

    # The win% text ends in '%'; strip it and scale to a 0-1 probability.
    predicted = float(row[predictor + ' win%'][:-1])/100.0
    if predicted > row['implied']:
        return nickname_to_acronym_dict[row.team]
    return ''
    
def spreadsheet_formatter(df):
    """Cleans a freshly loaded spreadsheet in place
    Args:
        df: dataframe of data; modified directly, nothing is returned
    """
    leftover_index = 'Unnamed: 0'
    if leftover_index in df.columns:
        df.drop(columns=[leftover_index], inplace=True)

    # Missing cells become empty strings.
    df.replace(to_replace=np.nan, value='', inplace=True)
    
    
def odds_formatter(row):
    """Formats odds
    Args:
        row: row of data from dataframe; row['odds'] may be '', an int,
            a float, or numeric text (with or without a leading sign)
    Returns:
        formatted odds: '' stays '', values of 100 or more get a leading
        '+', everything else is returned as plain integer text
    """
    odds = row['odds']
    if odds == '':
        return odds

    # Normalise through float first so that already-signed text ('+240')
    # and floats produced by a column containing blanks (240.0) come out
    # the same as a plain int; otherwise they turned into '++240' and
    # '+240.0'.
    value = int(float(odds))
    if value >= 100:
        return '+' + str(value)
    return str(value)
    
def team_won_lost(row, winners, losers):
    """Looks up whether the row's team won or lost on the row's date
    Args:
        row: row of data from dataframe
        winners: dict with dates as keys and a list of winners as values
        losers: dict with dates as keys and a list of losers as values
    Returns:
        'w' or 'l' when the team is listed for that date, '' otherwise
    """
    # Blank dates and dates without recorded winners have no outcome.
    if row.date == '' or row.date not in winners:
        return ''
    if row.team in winners[row.date]:
        return 'w'
    if row.team in losers[row.date]:
        return 'l'
    return ''
    
     
def money_won_lost(row, predictor):
    """Works out the amount won or lost on a row's pick
    Args:
        row: row of data from dataframe
        predictor: name of predictor
    Returns:
        amount won/lost; 0 when there was no pick or no 'w'/'l' result
    """
    if row[predictor + ' pick'] == '':
        return 0

    outcome = row['result']
    if outcome != 'w' and outcome != 'l':
        return 0

    plus_odds = row['odds'][0] == '+'
    if outcome == 'w':
        # '+' odds pay the number after the sign; anything else pays 100.
        return int(row['odds'][1:]) if plus_odds else 100
    # A loss costs 100 on '+' odds, otherwise the (signed) odds value itself.
    return -100 if plus_odds else int(row['odds'])
def writing_spreadsheet(df, filename):
    """Saves the dataframe as CSV without its index column
    Args:
        df: data
        filename: path of file (anything DataFrame.to_csv accepts)
    """
    df.to_csv(path_or_buf=filename, index=False)

In [38]:
# Location of the running results spreadsheet.
file_name = 'z:\\python projects\\NBA Game Outcome Spreadsheet.csv'

df = getting_spreadsheet(file_name) # Retrieving Spreadsheet
spreadsheet_formatter(df)
old_games, current_games = separating_games(df)

# Fill in results for the games that had none when the sheet was last saved.
if not current_games.empty:
    # Explicit copy: the assignments below would otherwise target a slice
    # of df (SettingWithCopyWarning, and not guaranteed to take effect).
    current_games = current_games.copy()
    current_games['date'] = current_games['date'].replace('', np.nan).ffill()
    current_games['odds'] = current_games.apply(odds_formatter, axis=1)
    winners, losers = game_outcomes(current_games.date.unique(), current_games.team.unique())
    current_games['result'] = current_games.apply(team_won_lost, axis=1, args=(winners, losers))
    current_games['538 w/l'] = current_games.apply(money_won_lost, axis=1, args=('538',))
    df = combining_data(old_games, current_games)

df['date'] = df.apply(date_formatter, axis=1)

# Compare real dates, not 'mm/dd/yyyy' text: as strings '01/05/2020' sorts
# before '12/31/2019', so new games were never fetched after a year change.
today = datetime.date.today()
latest_game = pd.to_datetime(df['date'], format='%m/%d/%Y', errors='coerce').max()

if pd.isna(latest_game) or today > latest_game.date():
    new_games, year = predictions_538(today.strftime('%m/%d/%Y'))
    new_games = loading_odds(new_games, year)
    new_games['implied'] = new_games.apply(implied_probability, axis=1)
    new_games['538 pick'] = new_games.apply(pick, axis=1, args=('538',))
    new_games['538 w/l'] = 0
    df = combining_data(df, new_games)

# `x != np.nan` is True for every x, so it never filtered anything. Coerce
# to numbers instead so blank ('') entries are skipped by the sum.
df['538 total'] = int(pd.to_numeric(df['538 w/l'], errors='coerce').sum())
date_ffill_formatter(df)

df
#writing_spreadsheet(df, file_name)

Unnamed: 0,date,team,538 win%,odds,implied,538 pick,result,538 w/l,538 total
0,10/22/2019,Pelicans,30%,240,0.29,NO,l,-100,575
1,,Raptors,70%,-300,0.75,,w,0,575
2,10/22/2019,Lakers,43%,-160,0.62,,l,0,575
3,,Clippers,57%,140,0.42,LAC,w,140,575
4,10/23/2019,Pistons,24%,260,0.28,,w,0,575
5,,Pacers,76%,-320,0.76,,l,0,575
6,10/23/2019,Cavaliers,17%,335,0.23,,l,0,575
7,,Magic,83%,-420,0.81,ORL,w,100,575
8,10/23/2019,Bulls,51%,-160,0.62,,l,0,575
9,,Hornets,49%,140,0.42,Cha,w,140,575
