In [5]:
import pandas as pd
import numpy as np
import time
from sklearn.model_selection import train_test_split
from nba_api.stats.endpoints import boxscoreadvancedv2
from nba_api.stats.endpoints import boxscoresummaryv2
from nba_api.stats.endpoints import leaguegamefinder
from nba_api.stats.endpoints import boxscoremiscv2
from nba_api.stats.endpoints import boxscorefourfactorsv2
from nba_api.stats.endpoints import boxscoretraditionalv2
from nba_api.stats.endpoints import boxscoreusagev2
import matplotlib.pyplot as plt
from keras import backend as K
from sklearn import preprocessing
import tensorflow as tf
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import BaggingRegressor,BaggingClassifier, RandomForestRegressor,RandomForestClassifier
from sklearn.metrics import mean_squared_error,r2_score,roc_curve,auc
from sklearn.preprocessing import MinMaxScaler

2022-11-18 15:00:55.833961: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [6]:
nba_teams = ['Cleveland Cavaliers','Boston Celtics','Golden State Warriors','Houston Rockets','Utah Jazz'
             ,'Philadelphia 76ers','Toronto Raptors','New Orleans Pelicans','Washington Wizards','Miami Heat'
            ,'Milwaukee Bucks','Indiana Pacers','Oklahoma City Thunder','San Antonio Spurs','Portland Trail Blazers',
             'Minnesota Timberwolves','Chicago Bulls','Dallas Mavericks','Sacramento Kings','Los Angeles Lakers'
            ,'Orlando Magic','Denver Nuggets','LA Clippers','New York Knicks','Memphis Grizzlies','Detroit Pistons'
            ,'Charlotte Hornets','Atlanta Hawks','Phoenix Suns','Brooklyn Nets']

In [7]:
def feature_clean(features):
    for columnName in features:
        if ((columnName[0:4] == 'TEAM') | (columnName[0:4] == 'GAME')):
            features = features.drop(columns = [columnName])
    return features

In [64]:
def feature_clean_current(features):
    for columnName in features:
        if ((columnName[0:4] == 'TEAM') | (columnName[0:4] == 'GAME')):
            if(columnName != 'TEAM_ID_A'):
                features = features.drop(columns = [columnName])
    return features

In [8]:
def combine_team_games(df, keep_method='home'):
    '''Combine a TEAM_ID-GAME_ID unique table into rows by game. Slow.

        Parameters
        ----------
        df : Input DataFrame.
        keep_method : {'home', 'away', 'winner', 'loser', ``None``}, default 'home'
            - 'home' : Keep rows where TEAM_A is the home team.
            - 'away' : Keep rows where TEAM_A is the away team.
            - 'winner' : Keep rows where TEAM_A is the losing team.
            - 'loser' : Keep rows where TEAM_A is the winning team.
            - ``None`` : Keep all rows. Will result in an output DataFrame the same
                length as the input DataFrame.
                
        Returns
        -------
        result : DataFrame
    '''
    # Join every row to all others with the same game ID.
    joined = pd.merge(df, df, suffixes=['_A', '_B'],
                      on=['GAME_ID'])
    # Filter out any row that is joined to itself.
    
    for columnName in joined:
        if(columnName == 'TEAM_ID_A_x'):
            joined = joined.rename(columns = {'TEAM_ID_A_x':'TEAM_ID_A'})
        
    for columnName in joined:
        if(columnName == 'TEAM_ID_B_x'):
            joined = joined.rename(columns = {'TEAM_ID_B_x':'TEAM_ID_B'})  
        
    result = joined[joined.TEAM_ID_A != joined.TEAM_ID_B]
    # Take action based on the keep_method flag.
    if keep_method is None:
        # Return all the rows.
        pass
    elif keep_method.lower() == 'home':
        # Keep rows where TEAM_A is the home team.
        result = result[result.MATCHUP_A.str.contains(' vs. ')]
    elif keep_method.lower() == 'away':
        # Keep rows where TEAM_A is the away team.
        result = result[result.MATCHUP_A.str.contains(' @ ')]
    elif keep_method.lower() == 'winner':
        result = result[result.WL_A == 'W']
    elif keep_method.lower() == 'loser':
        result = result[result.WL_A == 'L']
    else:
        raise ValueError(f'Invalid keep_method: {keep_method}')
    return result

In [9]:
def combine_vegas_games(df):
    '''Combine a TEAM_ID-GAME_ID unique table into rows by game. Slow.

        Parameters
        ----------
        df : Input DataFrame.
        keep_method : {'home', 'away', 'winner', 'loser', ``None``}, default 'home'
            - 'home' : Keep rows where TEAM_A is the home team.
            - 'away' : Keep rows where TEAM_A is the away team.
            - 'winner' : Keep rows where TEAM_A is the losing team.
            - 'loser' : Keep rows where TEAM_A is the winning team.
            - ``None`` : Keep all rows. Will result in an output DataFrame the same
                length as the input DataFrame.
                
        Returns
        -------
        result : DataFrame
    '''
    # Join every row to all others with the same game ID.
    joined = pd.merge(df, df, suffixes=['_A', '_B'],
                      on=['GameId'])
    # Filter out any row that is joined to itself.
    
    result = joined[joined.TeamId_A != joined.TeamId_B]
    # Take action based on the keep_method flag.
    return result

In [10]:
def remove_unnamed(frame):
    for columnName in frame:
        if(columnName[0:7] == 'Unnamed'):
            frame = frame.drop(columns = [columnName])
    return frame

In [11]:
def pull_reg_season(start_year, start_date, end_date):
    reg = pd.DataFrame()
    for i in team_ids:
        games_base = leaguegamefinder.LeagueGameFinder(team_id_nullable = i)
        games_xxxx = games_base.get_data_frames()[0]
        games_xxxx = games_xxxx[games_xxxx.TEAM_NAME.isin(nba_teams)]
        regular_xxxx = games_xxxx[(games_xxxx.GAME_DATE > start_date) & (games_xxxx.GAME_DATE < end_date)]
        reg = reg.append(regular_xxxx)
        time.sleep(30)
    reg = reg.sort_values('GAME_ID')
    return reg

In [12]:
def pull_curr_reg(start_date):
    reg = pd.DataFrame()
    games_base = leaguegamefinder.LeagueGameFinder()
    games_xxxx = games_base.get_data_frames()[0]
    games_xxxx = games_xxxx[games_xxxx.TEAM_NAME.isin(nba_teams)]
    regular_xxxx = games_xxxx[(games_xxxx.GAME_DATE > start_date) ]
    reg = reg.append(regular_xxxx)
    reg = reg.sort_values('GAME_ID')
    return reg





In [13]:
def pull_stats(api, index, ids):
    box = pd.DataFrame()
    subset1 = ids[0:308]
    subset2 = ids[308:616]
    subset3 = ids[616:924]
    subset4 = ids[924:1231]
    subset_list = [subset1, subset2, subset3, subset4]
    for i in subset_list:
        for j in i:
            print(j)
            row = api(game_id = j)
            row = row.get_data_frames()[index]
            row['GAME_ID'] = j
            row = combine_team_games(row, keep_method = None)
            box = box.append(row)
        time.sleep(60)
    return box
        

            
                
            
    

In [14]:
def pull_stats_slow(api, index, ids):
    box = pd.DataFrame()
    subset1 = ids[0:77]
    subset2 = ids[77:154]
    subset3 = ids[154:231]
    subset4 = ids[231:308]
    subset5 = ids[308:385]
    subset6 = ids[385:462]
    subset7 = ids[462:539]
    subset8 = ids[539:616]
    subset9 = ids[616:693]
    subset10 = ids[693:770]
    subset11 = ids[770:847]
    subset12 = ids[847:924]
    subset13= ids[924:1001]
    subset14 = ids[1001:1078]
    subset15 = ids[1078:1155]
    subset16 = ids[1155:1231]
    subset_list = [subset1, subset2, subset3, subset4,subset5, subset6, subset7, subset8,
                  subset9, subset10, subset11, subset12,subset13, subset14, subset15, subset16]
    counter = 0
    for i in subset_list:
        for j in i:
            try:
                row = api(game_id = j)
                row = row.get_data_frames()[index]
                row['GAME_ID'] = j
                row = combine_team_games(row, keep_method = None)
                box = box.append(row)
                print(counter)
                counter = counter + 1
                time.sleep(1)
            except:
                return box
        time.sleep(150)
    return box

In [15]:
def pull_stats_super_slow(api, index, reg_season):
    box = pd.DataFrame()
    reg_season = reg_season.drop_duplicates(subset = ['GAME_ID'])
    ids = reg_season['GAME_ID']
    subset1 = ids[0:308]
    subset2 = ids[308:616]
    subset3 = ids[616:924]
    subset4 = ids[924:1231]
    subset_list = [subset1, subset2, subset3, subset4]
    for i in subset_list:
        for j in i:
            row = api(game_id = j)
            row = row.get_data_frames()[index]
            row['GAME_ID'] = j
            row = combine_team_games(row, keep_method = None)
            box = box.append(row)
            time.sleep(10)
    return box

In [16]:
def home_away(matchup):
    if(matchup[4] == '@'):
        return 0
    else:
        return 1
    

In [17]:
def reg_clean(reg):
    reg['home'] = reg.apply(lambda row : home_away(row['MATCHUP']), axis = 1)
    if(('GAME_ID')[0] == '0'):
        reg['GAME_ID'] = reg['GAME_ID'].str[2:].astype(int)
    reg = reg.sort_values('GAME_ID')
    reg = combine_team_games(reg,keep_method=None)
    return reg
    
    

In [18]:
def home_win(WL_A):
    if(WL_A == 'W'):
        return([1,0,0])
    else:
        return([0,1,0])

In [19]:
def adv_clean(advanced):
    advanced = advanced.drop(columns = ['TEAM_ID_B','TEAM_ABBREVIATION_A','TEAM_ABBREVIATION_B'
                                   ,'TEAM_CITY_A','TEAM_CITY_B','TEAM_NAME_A','TEAM_NAME_B','MIN_A'])
    advanced = advanced.sort_values('GAME_ID')
    return advanced
    


In [20]:
def traditional_clean(traditional):
    bench = traditional.drop_duplicates(subset = ['GAME_ID','TEAM_NAME_A'], keep = 'last')
    starters = traditional.drop_duplicates(subset = ['GAME_ID','TEAM_NAME_A'], keep = 'first')
    bench = bench.drop(columns = ['TEAM_NAME_A', 'TEAM_ABBREVIATION_A',
       'TEAM_CITY_A', 'STARTERS_BENCH_A', 'MIN_A','TEAM_ID_B', 'TEAM_NAME_B',
       'TEAM_ABBREVIATION_B', 'TEAM_CITY_B', 'STARTERS_BENCH_B', 'MIN_B'])
    starters = starters.drop(columns = ['TEAM_NAME_A', 'TEAM_ABBREVIATION_A',
       'TEAM_CITY_A', 'STARTERS_BENCH_A', 'MIN_A','TEAM_ID_B', 'TEAM_NAME_B',
       'TEAM_ABBREVIATION_B', 'TEAM_CITY_B', 'STARTERS_BENCH_B', 'MIN_B'])
    for columnName in bench:
        if((columnName == 'GAME_ID') | (columnName == 'TEAM_ID_A')):
            continue
        new_col_name = columnName + '_bench'
        bench[new_col_name] = bench[columnName]
        bench = bench.drop(columns = [columnName])
    for columnName in starters:
        if((columnName == 'GAME_ID') | (columnName == 'TEAM_ID_A')):
            continue
        new_col_name = columnName + '_starters'
        starters[new_col_name] = starters[columnName]
        starters = starters.drop(columns = [columnName])
        
    merged = pd.merge(starters,bench,left_on = 'GAME_ID',right_on = 'GAME_ID')
    merged = merged.drop_duplicates(subset = ['GAME_ID','TEAM_ID_A_x'])
    merged = merged.drop(columns = ['TEAM_ID_A_y'])
    merged = merged.rename(columns={"TEAM_ID_A_x": "TEAM_ID_A"})
    return(merged)
    

In [21]:
def four_clean(four):
    four = four.drop(columns = ['TEAM_NAME_A', 'TEAM_ABBREVIATION_A',
       'TEAM_CITY_A', 'MIN_A','TEAM_ABBREVIATION_B',
       'TEAM_CITY_B', 'MIN_B','OREB_PCT_A','TM_TOV_PCT_A','EFG_PCT_A'])
    four = four.sort_values('GAME_ID')
    return four

In [22]:
def usage_clean(usage):
    return usage
    

In [23]:
def misc_clean(misc):
    misc = misc.drop(columns = ['TEAM_NAME_A', 'TEAM_ABBREVIATION_A',
       'TEAM_CITY_A', 'MIN_A'])
    return misc

In [24]:
def get_minutes(matchupmisc):
    matchupmisc = matchupmisc.reset_index(drop=True)
    matchupmisc = matchupmisc.sort_values('GAME_ID')
    matchupmisc['minutes'] = (matchupmisc['MIN_A'].str[0:3].astype(int))/5
    minutes =  matchupmisc['minutes']
    return minutes
    
    

In [25]:
def per_minute_stats(frame, minutes):
    frame = frame.reset_index(drop = True)
    features = frame.drop(columns = ['GAME_ID'])
    game_id = frame['GAME_ID'].reset_index(drop = True)
    team_id = frame['TEAM_ID_A'].reset_index(drop=True)
    for columnName in features:
        if(columnName[-1] == 'B'):
            features = features.drop(columns = [columnName])
    for columnName in features:
        if(columnName[-1] == 'A'):
            features = features.rename(columns={columnName: columnName[0:-2]})
    features['minutes'] = minutes
    cumulative = features.groupby('TEAM_ID').cumsum()
    for columnName in cumulative:
        if((columnName == 'minutes') | (columnName == 'TEAM_ID')):
            continue
        new_col_name = columnName + '_per_minute'
        features[new_col_name] = ((cumulative[columnName] - features[columnName]) / (cumulative['minutes'] - features['minutes']))
        features = features.drop(columns = [columnName])
    features = features.reset_index(drop= True)
    features= features.drop(columns = ['minutes'])
    features['TEAM_ID'] = team_id
    features['GAME_ID'] = game_id
    return combine_team_games(features, keep_method = None)
    
        
    

In [26]:
def cum_stats(frame):
    frame = frame.reset_index(drop = True)
    features = frame.drop(columns = ['GAME_ID'])
    game_id = frame['GAME_ID'].reset_index(drop = True)
    team_id = frame['TEAM_ID_A'].reset_index(drop=True)
    for columnName in features:
        if(columnName[-1] == 'B'):
            features = features.drop(columns = [columnName])
    for columnName in features:
        if(columnName[-1] == 'A'):
            features = features.rename(columns={columnName: columnName[0:-2]})
    features = features.reset_index(drop = True)
    for columnName in features:
        if(columnName == 'TEAM_ID'):
            continue
        new_col_name = columnName + '_median'
        features[new_col_name] = features.groupby('TEAM_ID')[columnName].apply(lambda x: x.shift().expanding().median())
        features = features.drop(columns = [columnName])
    features = features.reset_index(drop= True)
    features['TEAM_ID'] = team_id
    features['GAME_ID'] = game_id
    features = features.reset_index(drop = True)
    return combine_team_games(features, keep_method = None)

In [27]:
def reg_impute(reg_cleaned, minutes):
    per_minute = reg_cleaned[['TEAM_ID_A', 'GAME_ID', 'PTS_A', 'FGM_A', 'FGA_A',
                      'FG3M_A','FG3A_A','FTM_A', 'FTA_A','OREB_A', 'DREB_A',
       'REB_A', 'AST_A', 'STL_A', 'BLK_A', 'TOV_A', 'PF_A', 'PLUS_MINUS_A']]
    
    cum = reg_cleaned[['TEAM_ID_A', 'GAME_ID','FG_PCT_A','FG3_PCT_A','FT_PCT_A']]
    home = reg_cleaned[['TEAM_ID_A','GAME_ID','home_A']]
    
    per_minute = per_minute_stats(per_minute, minutes)
    cum = cum_stats(cum)
    merged = pd.merge(cum,per_minute,left_on = ['GAME_ID','TEAM_ID_A'],right_on=['GAME_ID','TEAM_ID_A'])
    merged = pd.merge(merged,home,left_on = ['GAME_ID','TEAM_ID_A'],right_on=['GAME_ID','TEAM_ID_A'])
    #merged = merged.drop_duplicates(subset = ['TEAM_ID_A_x','GAME_ID','TEAM_ID_B_x'])
    merged = merged.rename(columns = {'TEAM_ID_A_x':'TEAM_ID'})
    for columnName in merged:
        if(columnName[-1] == 'B'):
            merged = merged.drop(columns = [columnName])
    for columnName in merged:
        if(columnName[-1] == 'A'):
            merged = merged.rename(columns={columnName: columnName[0:-2]})
    merged = merged.drop(columns = ['TEAM_ID_B_x','TEAM_ID_B_y'])
    merged = combine_team_games(merged,keep_method = None)
    #return merged
    #merged['GAME_ID'] = merged['GAME_ID'].str[2:].astype(int)
    return merged


In [28]:
def get_62(frame):
    return frame.iloc[600:,:]

In [29]:
def four_impute(four_cleaned):
    cum = cum_stats(four_cleaned)
    #cum['GAME_ID'] = cum['GAME_ID'].str[2:].astype(int)
    return cum

In [30]:
def adv_impute(advanced_cleaned):
    cum = cum_stats(advanced_cleaned)
    return cum

In [31]:
def merger(reg_imputed, advanced_imputed, four_imputed):
    merged = pd.merge(reg_imputed,advanced_imputed, left_on = ['GAME_ID','TEAM_ID_A'],right_on = ['GAME_ID','TEAM_ID_A'])
    merged = pd.merge(merged,four_imputed, left_on = ['GAME_ID','TEAM_ID_A'],right_on = ['GAME_ID','TEAM_ID_A'])
    return get_62(merged)

In [32]:
def current_merger(reg_imputed, advanced_imputed, four_imputed):
    merged = pd.merge(reg_imputed,advanced_imputed, left_on = ['GAME_ID','TEAM_ID_A'],right_on = ['GAME_ID','TEAM_ID_A'])
    merged = pd.merge(merged,four_imputed, left_on = ['GAME_ID','TEAM_ID_A'],right_on = ['GAME_ID','TEAM_ID_A'])
    return merged

In [33]:
def dataload_clean(year,start_date,end_date):
    reg = pull_reg_season(year,start_date,end_date)
    misc = pull_stats_slow(boxscoremiscv2.BoxScoreMiscV2,1,reg)
    minutes = get_minutes(misc)
    advanced = pull_stats_slow(boxscoreadvancedv2.BoxScoreAdvancedV2,1,reg)
    four = pull_stats_slow(boxscorefourfactorsv2.BoxScoreFourFactorsV2,1,reg)
    merged = merger(reg_impute(reg_clean(reg), minutes),advanced_impute(advancedboxscore_clean(advanced)),
                    four_impute(four_factors_clean(four)))
    merged.to_csv(year + '.csv')
    return merged

In [34]:
def dataloader(year,start_date,end_date):
    reg = pull_reg_season(year,start_date,end_date)
    reg.to_csv('reg' + year + '.csv')
    misc = pull_stats(boxscoremiscv2.BoxScoreMiscV2,1,get_ids(reg))
    misc.to_csv('misc' + year + '.csv')
    adv = pull_stats(boxscoreadvancedv2.BoxScoreAdvancedV2,1,get_ids(reg))
    adv.to_csv('adv' + year + '.csv')
    four = pull_stats(boxscorefourfactorsv2.BoxScoreFourFactorsV2,1,get_ids(reg))
    four.to_csv('four' + year + '.csv')
    return[reg,misc,adv,four]
    

In [35]:
def get_ids(reg):
    unique = reg.drop_duplicates('GAME_ID')
    return unique['GAME_ID']

In [36]:
def add_zeros(game_id):
    new_id = '00' + (str(game_id))
    return new_id

In [37]:
def get_A_features(frame):
    new = frame
    for columnName in frame:
        if(columnName[-1] == 'B'):
            new = new.drop(columns=[columnName])
            
    return new

In [38]:
def order_features(frame):
    counter = 1
    new = []
    for columnName in frame:
        if(columnName[-1] == 'A'):
            new.append(columnName)
    for columnName in frame:
        if(columnName[-1] == 'B'):
            new.append(columnName)
    return frame[new]
                
            
            
        
        

In [58]:
def get_month(date):
    months = {
        'jan': 1,
        'feb': 2,
        'mar': 3,
        'apr':4,
         'may':5,
         'jun':6,
         'jul':7,
         'aug':8,
         'sep':9,
         'oct':10,
         'nov':11,
         'dec':12
        }
    return months[date[4:7].lower()]
    

In [56]:
def get_day(date):
    split = date.split(' ')
    return split[2]
    

In [53]:
def month_clean(month,reg):
    month['month'] = month.apply(lambda row : get_month(row['Date']), axis = 1)
    month['day'] = month.apply(lambda row : get_day(row['Date']), axis = 1)
    month['home'] = month.apply(lambda row : get_team_id(row['Home/Neutral'],reg), axis = 1)
    month['away'] = month.apply(lambda row : get_team_id(row['Visitor/Neutral'],reg), axis = 1)
    return month[['home','away','month','day']]

    

In [10]:
def get_team_id(team_name, reg):
    for i in team_ids:
        temp = reg[reg['TEAM_ID'] == i]
        name = temp['TEAM_NAME'].iloc[0]
        if(name == 'LA Clippers'):
            name = 'Los Angeles Clippers' 
        if((name) == team_name):
            return i

In [68]:
def get_team_name(team_id,reg):
    for i in nba_teams:
        temp = reg[reg['TEAM_NAME'] == i]
        if((temp['TEAM_ID'].iloc[0]) == team_id):
            return i


In [44]:
def get_matchup_features(home_id, away_id, features):
    frame1 = pd.DataFrame()
    frame2 = pd.DataFrame()
    for i in features:
        if(i.TEAM_ID_A == home_id):
            stats1 = i
        if(i.TEAM_ID_A == away_id):
            stats2 = i
    frame1 = frame1.append(stats1)
    frame2 = frame2.append(stats2)
    frame1 = get_A_features(frame1)
    frame2 = get_A_features(frame2)
    frame1 = frame1.reset_index(drop=True)
    frame2 = frame2.reset_index(drop=True)
    merged = frame1.join(frame2,lsuffix='1',rsuffix='2')
    return feature_clean(merged)
    #return feature_clean(merged)
    
    


In [45]:
def get_current_features(features):
    feat = []
    for i in team_ids:
        temp = features[features['TEAM_ID_A'] == i]
        feat.append(temp.iloc[-1,:])
    return feat

In [2]:
def combine_features_current(reg,misc,adv,four):
    reg2 = reg_impute(reg_clean(reg),get_minutes(misc))
    adv2 = adv_impute(adv_clean(adv))
    four2 = four_impute(four_clean(four))
    merged = current_merger(reg2,adv2,four2)
    #merged = feature_clean(merged)
    merged = remove_unnamed(merged)
    return merged

In [47]:
def get_features(reg,misc,adv,four):
    reg2 = reg_impute(reg_clean(reg),get_minutes(misc))
    adv2 = adv_impute(adv_clean(adv))
    four2 = four_impute(four_clean(four))
    merged = merger(reg2,adv2,four2)
    #merged = feature_clean(merged)
    merged = remove_unnamed(merged)
    return merged

In [48]:
reg22 = pd.read_csv('reg2022.csv')
misc22 = pd.read_csv('misc2022.csv')
adv22 = pd.read_csv('adv2022.csv')
four22 = pd.read_csv('four2022.csv')

In [52]:
def odds_convert(american_lines):
    odds = []
    for i in american_lines:
        if(i < 0):
            odds.append((i -100)/i)
        else:
            odds.append((i+100)/100)
    return odds

In [54]:
def get_targets_OU(features, targets):
    df = pd.DataFrame()
    for i in range(len(features)):
        year = features[i]
        vegas = targets[i]
        vegas = vegas[['Pinnacle_Line_OU','Pinnacle_Odds_OU','Total','GameId']]
        merged = pd.merge(year,vegas, left_on = 'GAME_ID',right_on = 'GameId')
        merged = merged.drop_duplicates(subset = ['GAME_ID','TEAM_ID_A'])
        merged = merged.dropna()
        df = df.append(merged)
    y = df[['Pinnacle_Line_OU','Total']]
    #odds = y['Pinnacle_Odds_OU']
    #new = odds_convert(odds)
    #y['Pinnacle_Odds_OU'] = new
    x = df.drop(columns = ['Pinnacle_Line_OU','Pinnacle_Odds_OU','Total','GameId'])
    x = feature_clean(x)
    return[x,y]
        

In [55]:
def get_targets_ML(features, targets):
    df = pd.DataFrame()
    for i in range(len(features)):
        year = features[i]
        vegas = targets[i]
        vegas = combine_vegas_games(vegas)
        vegas = vegas[['Pinnacle_ML_A','Pinnacle_ML_B','Result_A','GameId','TeamId_A']]
        merged = pd.merge(year,vegas, left_on = ['GAME_ID','TEAM_ID_A'],right_on = ['GameId','TeamId_A'])
        merged = merged.drop_duplicates(subset = ['GAME_ID','TEAM_ID_A'])
        merged = merged.dropna()
        df = df.append(merged)
    y = df[['Result_A','Pinnacle_ML_A','Pinnacle_ML_B','GameId']]
    y['Odds_A'] = odds_convert(y['Pinnacle_ML_A'])
    y['Odds_B'] = odds_convert(y['Pinnacle_ML_B'])
    y['y_true'] = y.apply(lambda row : get_y_true(row['Result_A'],
                     row['Odds_A'], row['Odds_B']), axis = 1)
    x = df.drop(columns = ['Pinnacle_ML_A','Pinnacle_ML_B','Result_A','GameId','TeamId_A'])
    x = feature_clean(x)
    y = y[['y_true']]
    y2 = pd.DataFrame(y['y_true'].to_list(), columns = ['a_win', 'b_win', 'no_bet','odds_a','odds_b'])
    return[x,y2]
    

In [56]:
def get_y_true(result_a, ml_a,ml_b):
    if(result_a == 'W'):
        return [1,0,0,ml_a,ml_b]
    else:
        return [0,1,0,ml_a,ml_b]
    

In [57]:
def OU_accuracy(X_test, model, y_test,thresh):
    y_test['predictions'] = model.predict(X_test)
    y_test['diff'] = abs(y_test['Pinnacle_Line_OU'] - y_test['predictions'])
    y_test = y_test[y_test['diff'] > thresh]
    counter = 0
    for i in range(y_test.shape[0]):
        ou = y_test.iloc[i,0]
        total = y_test.iloc[i,1]
        predictions = y_test.iloc[i,2]
        if(ou > total):
            if(predictions < ou):
                counter = counter + 1
        if(ou < total):
            if(predictions > ou):
                counter = counter + 1
    return(counter/y_test.shape[0])
            
            
            
    
    
    

In [58]:
def decorrelation_loss(y_true, y_pred):
    p = y_true[:,0:1]
    result = y_true[:,1:2]
    return K.sqrt(K.mean(K.square(result - y_pred) - (0.6 * K.square(p - y_pred))))

In [59]:
def profit_loss(y_true,y_pred):
    home_win = y_true[:,0:1]
    away_win = y_true[:,1:2]
    no_bet = y_true[:,2:3]
    odds_a = y_true[:,3:4]
    odds_b = y_true[:,4:5]
    gain_loss_vector = K.concatenate([(home_win * (odds_a - 1)) + ((1-home_win)* -1),
                                     (away_win * (odds_b - 1)) + ((1-away_win) * -1),
                                     K.zeros_like(odds_a)],axis=1)
    return( -1 * (K.mean(K.sum(gain_loss_vector * y_pred, axis=1))))
    

In [60]:
def rmse(y_true, y_pred):
    return K.sqrt(K.mean(K.square(y_pred - y_true)))
    

In [61]:
def OU_features(features,corr,thresh):
    size = corr.shape[0]-1
    features2 = features.copy(deep=False)
    for columnName in corr:
        if (corr[columnName][size] < thresh):
            features2 = features2.drop(columns = [columnName])
    return features2
            

In [62]:
def get_bets(pred_odds):
    pred_odds['prob_a'] = 1 / (pred_odds['odds_a'])
    pred_odds['bets'] = pred_odds.apply(lambda row : get_bet(row['predictions'],row['prob_a']), axis = 1)
    return pred_odds
    

In [63]:
def get_bet(pred,odds):
    if((pred-odds) > 0):
        return(pred-odds)
    else:
        return 0

In [64]:
def get_profits(preds):
    preds['profit'] = preds.apply(lambda row: get_profit(row['odds_a'],row['bets'],row['a_win']),axis=1)
    return(preds['profit'].sum())

In [65]:
def filter_features(features, features_list):
    features2 = features.copy(deep=False)
    for columnName in features2:
        if(columnName in features_list):
            continue
        else:
            features2 = features2.drop(columns = [columnName])
    return features2
    

In [66]:
def get_profit(odds,bet,result):
    if(result == 1):
        return((bet*odds) - bet)
    if(result == 0):
        return(-1 * bet)

In [67]:
def profit(model, X_test, y_test):
    pred = model.predict(X_test)
    odds = y_test['odds_a']
    result =  y_test['a_win']
    test = pd.DataFrame()
    test['odds_a'] = odds
    test['a_win'] = result
    test['predictions'] = pred
    get_bets(test)
    profit = get_profits(test)
    print('total profit:  ' + str(profit))
    print('total bet:  ' + str(test['bets'].sum()))
    print('percent_return:  ' + str((100 * (profit/test['bets'].sum()))))

In [68]:
def plot_metric(history, metric):
    train_metrics = history.history[metric]
    val_metrics = history.history['val_'+metric]
    epochs = range(1, len(train_metrics) + 1)
    plt.plot(epochs, train_metrics)
    plt.plot(epochs, val_metrics)
    plt.title('Training and validation '+ metric)
    plt.xlabel("Epochs")
    plt.ylabel(metric)
    plt.legend(["train_"+metric, 'val_'+metric])
    plt.show()

In [69]:
def get_current_reg(start_date):
    current_reg = pull_reg_season('2022',start_date,str(date.today()))
    return current_reg

In [43]:
def pull_stats_current(api,index,ids):
    box = pd.DataFrame()
    
    for j in ids:
        print(j)
        row = api(game_id = j)
        row = row.get_data_frames()[index]
        row['GAME_ID'] = j
        row = combine_team_games(row, keep_method = None)
        box = box.append(row)
        
    return box

In [1]:
from datetime import date
def get_current_data(current_reg):
    four = pull_stats_current(boxscorefourfactorsv2.BoxScoreFourFactorsV2,1,get_ids(current_reg))
    adv = pull_stats_current(boxscoreadvancedv2.BoxScoreAdvancedV2,1,get_ids(current_reg))
    misc = pull_stats_current(boxscoremiscv2.BoxScoreMiscV2,1,get_ids(current_reg))
    #totalcurrent = combine_features_current(current_reg,misc,adv,four)
    return [adv,four,misc]
    
    
    

In [71]:
def profit_betting(odds_threshold, prediction_threshold, predictions,odds,betting_function):
    profit = 0
    bets = []
    betting_total = 0
    bets_placed = 0
    #for i in range(len(predictions)):
     #   predictions[i] = np.append(predictions[i],[odds.iloc[i,3]])
      #  predictions[i]= np.append(predictions[i],[odds.iloc[i,4]])
    #bets = []
    for i in range(len(predictions)):
        if((predictions[i][0] > prediction_threshold) & (odds.iloc[i,3] > odds_threshold)):
            bets = np.append(bets,betting_function(predictions[i][0],odds.iloc[i,3]))
            bets_placed += 1
        else:
            bets = np.append(bets,0)
    for i in range(len(bets)):
        if(bets[i] > 0):
            if(odds.iloc[i,0] == 1):
                profit = profit + ((bets[i] * odds.iloc[i,3])- bets[i])
            else:
                profit = profit - bets[i]
    for i in bets:
        betting_total = betting_total + i
        
        
    print('Odds Threshold: ' + str(odds_threshold))
    print('Pred Threshold: ' + str(prediction_threshold))
    print('Number of Bets Placed: ' + str(bets_placed))
    print('Total Bet: ' + str(betting_total))
    print('Profit: ' + str(profit))
    print('Percent Return: ' + str(profit/betting_total))
    return ((profit/betting_total) * (10*bets_placed))
    
            
            
        
        
    

In [72]:
def betting_function(pred_a,odds_a):
    return (pred_a)


In [73]:
def tune_betting_parameters(odds_threshold_range,odds_threshold_increment, prediction_threshold_range,prediction_threshold_increment,betting_function,X_test,y_test,model):
    odds_min = odds_threshold_range[0]
    odds_max = odds_threshold_range[1]
    pred_min = prediction_threshold_range[0]
    pred_max = prediction_threshold_range[1]
    max_return = -100
    max_odds_thresh = 0
    max_pred_thresh = 0
    for x in np.arange(odds_min, odds_max, odds_threshold_increment):
        for y in np.arange(pred_min, pred_max, prediction_threshold_increment):
            if(profit_betting(x,y,model.predict(X_test),y_test,betting_function) > max_return):
                max_return = profit_betting(x,y,model.predict(X_test),y_test,betting_function)
                max_odds_thresh = x
                max_pred_thresh = y
    print('best odds thresh: ' + str(max_odds_thresh))
    print('best pred thresh: ' + str(max_pred_thresh))
    print('max_return: '+ str(max_return))
            
    

In [82]:
#tune_betting_parameters([0,3],0.1,[0.1,0.45],0.05,betting_function,X_test,y_test2,model)

Odds Threshold: 0.0
Pred Threshold: 0.1
Number of Bets Placed: 390
Total Bet: 147.40872795134783
Profit: -5.811542481073463
Percent Return: -0.03942468374729859
Odds Threshold: 0.0
Pred Threshold: 0.15000000000000002
Number of Bets Placed: 313
Total Bet: 137.96753887832165
Profit: -7.025360191147868
Percent Return: -0.05092038495623073
Odds Threshold: 0.0
Pred Threshold: 0.20000000000000004
Number of Bets Placed: 259
Total Bet: 128.79917180538177
Profit: -5.169030237589769
Percent Return: -0.040132480396692934
Odds Threshold: 0.0
Pred Threshold: 0.25000000000000006
Number of Bets Placed: 222
Total Bet: 120.39526098966599
Profit: -4.119609375128482
Percent Return: -0.03421737152496463
Odds Threshold: 0.0
Pred Threshold: 0.25000000000000006
Number of Bets Placed: 222
Total Bet: 120.39526098966599
Profit: -4.119609375128482
Percent Return: -0.03421737152496463
Odds Threshold: 0.0
Pred Threshold: 0.30000000000000004
Number of Bets Placed: 190
Total Bet: 111.64437499642372
Profit: -11.76165

In [435]:
#profit_betting(0,0.1,model.predict(X_test),y_test2,betting_function)

Odds Threshold: 0
Pred Threshold: 0.1
Number of Bets Placed: 90
Total Bet: 55.88256438821554
Profit: 7.6298290774317
Percent Return: 0.1365332668777933


122.87994019001397

In [66]:
#model.save('nba_model.h5')


In [74]:
from keras.layers import Input
from keras.layers import BatchNormalization
from keras.layers import Dropout
from keras.layers import Dense
from keras import Model
def get_model(input_dim, output_dim, base=1000, multiplier=0.25, p=0.2,lr = 0.0001):
    inputs = Input(shape=(input_dim,))
    l = BatchNormalization()(inputs)
    l = Dropout(p)(l)
    n = base
    l = Dense(n, activation='relu')(l)
    l = BatchNormalization()(l)
    l = Dropout(p)(l)
    n = int((n * multiplier))
    l = Dense(n, activation='relu')(l)
    l = BatchNormalization()(l)
    l = Dropout(p)(l)
    n = int(n * multiplier)
    l = Dense(n, activation='relu')(l)
    outputs = Dense(output_dim, activation='softmax')(l)
    model = Model(inputs=inputs, outputs=outputs)
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate = lr), loss=profit_loss)
    return model

In [67]:
from keras.callbacks import EarlyStopping
from keras.callbacks import ModelCheckpoint
#import kerastuner as kt
def train(model,X_train,y_train,val_split,ep,bs):
    #callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=10)
    history = model.fit(X_train,y_train,validation_split = val_split,epochs = ep, batch_size = bs)

In [62]:
#reg15 = pd.read_csv('NBA_DATA/reg15.csv')
#misc15 = pd.read_csv('NBA_DATA/misc15.csv')
#adv15 = pd.read_csv('NBA_DATA/adv15.csv')
#four15 = pd.read_csv('NBA_DATA/four15.csv')
reg16 = pd.read_csv('NBA_DATA/reg16.csv')
misc16 = pd.read_csv('NBA_DATA/misc16.csv')
adv16 = pd.read_csv('NBA_DATA/adv16.csv')
four16 = pd.read_csv('NBA_DATA/four16.csv')
reg17 = pd.read_csv('NBA_DATA/reg17.csv')
misc17 = pd.read_csv('NBA_DATA/misc17.csv')
adv17 = pd.read_csv('NBA_DATA/adv17.csv')
four17 = pd.read_csv('NBA_DATA/four17.csv')
reg18 = pd.read_csv('NBA_DATA/reg18.csv')
misc18 = pd.read_csv('NBA_DATA/misc18.csv')
adv18 = pd.read_csv('NBA_DATA/adv18.csv')
four18 = pd.read_csv('NBA_DATA/four18.csv')
reg19 = pd.read_csv('NBA_DATA/reg19.csv')
misc19 = pd.read_csv('NBA_DATA/misc19.csv')
adv19 = pd.read_csv('NBA_DATA/adv19.csv')
four19 = pd.read_csv('NBA_DATA/four19.csv')

team_ids = reg16.TEAM_ID.value_counts().index

#total15 = get_features(reg15,misc15,adv15,four15)
total16 = get_features(reg16,misc16,adv16,four16)
total17 = get_features(reg17,misc17,adv17,four17)
total18 = get_features(reg18,misc18,adv18,four18)
total19 = get_features(reg19,misc19,adv19,four19)
#total22 = get_features(current,misc22,adv22,four22)

features = [total16,total17,total18]


#vegas15 = pd.read_csv('vegas/2015-16/vegas.txt',sep = ',')
vegas16 = pd.read_csv('vegas/2016-17/vegas.txt',sep = ',')
vegas17 = pd.read_csv('vegas/2017-18/vegas.txt',sep = ',')
vegas18 = pd.read_csv('vegas/2018-19/vegas.txt',sep = ',')

targets = [vegas16,vegas17,vegas18]
#test_targ = [vegas15]

sixteen_eighteen = get_targets_ML(features,targets)
#test = get_targets_ML(test_features,test_targ)

X_train = sixteen_eighteen[0].iloc[0:4500,:]
X_test = sixteen_eighteen[0].iloc[4500:,:]

#sixteen_eighteen[1]['prob_a'] = 1/(sixteen_eighteen[1]['odds_a'])
y = sixteen_eighteen[1][['odds_a','a_win','odds_b']]
y_train1 = y.iloc[0:4500,:]
y_test1 = y.iloc[4500:,:]
y_train2 = y.iloc[0:4500,:]
y_test2 = y.iloc[4500:,:]

#nba_teams = ['Cleveland Cavaliers','Boston Celtics','Golden State Warriors','Houston Rockets','Utah Jazz'
 #            ,'Philadelphia 76ers','Toronto Raptors','New Orleans Pelicans','Washington Wizards','Miami Heat'
  #          ,'Milwaukee Bucks','Indiana Pacers','Oklahoma City Thunder','San Antonio Spurs','Portland Trail Blazers',
   #          'Minnesota Timberwolves','Chicago Bulls','Dallas Mavericks','Sacramento Kings','Los Angeles Lakers'
    ##       ,'Charlotte Hornets','Atlanta Hawks','Phoenix Suns','Brooklyn Nets']

#y_train1['odds_a'] = 1 / (y_train1['prob_a'])
#y_test1['odds_a'] = 1 / (y_test1['prob_a'])

#y_train2['odds_a'] = 1 / (y_train2['prob_a'])
#y_test2['odds_a'] = 1 / (y_test2['prob_a'])

#y_train2['odds_b'] = 1/(1-y_train2['prob_a'])
#y_test2['odds_b'] = 1/(1-y_test2['prob_a'])

y_train2['b_win'] = abs(1-y_train2['a_win'])
y_test2['b_win'] = abs(1-y_test2['a_win'])

y_train2['no_bet'] = 0
y_test2['no_bet'] = 0

X_train = order_features(X_train)
X_test = order_features(X_test)




y_train2 = y_train2.loc[:,['a_win','b_win','no_bet','odds_a','odds_b']]
y_test2 = y_test2.loc[:,['a_win','b_win','no_bet','odds_a','odds_b']]

#y_train2 = y_train2.drop(columns = ['prob_a'])
#y_test2 = y_test2.drop(columns = ['prob_a'])

NameError: name 'get_features' is not defined

In [81]:
model = get_model(96,3,96,0.5,0.5,0.0001)
train(model,X_train,y_train2,0.2,50,50)




Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [368]:
def balance_check(predictions):
    averageA = 0
    averageB = 0
    for i in predictions:
        averageA += i[0]
        averageB += i[1]
    print(str(averageB - averageA ))
        

In [437]:
#balance_check(model.predict(X_test))

56.971939268570594


In [401]:
def check_column(train, column_name):
    column_A = column_name + '_A'
    column_B = column_name + '_B'
    frame = train[[column_A,column_B]]
    sum_A = 0
    sum_B = 0
    for i in range(frame.shape[0]):
        sum_A += frame.iloc[i,0]
        sum_B += frame.iloc[i,1]
    print('sum_A: ' + str(sum_A))
    print('sum_B: ' + str(sum_B))
    return(abs(sum_A - sum_B) < 0.1)
        
        

In [402]:
def check_train(train):
    for columnName in train:
        columnName = columnName[:-2]
        check_column(X_train,columnName)

In [403]:
#check_train(X_train)

sum_A: 2057.760999999994
sum_B: 2057.7609999999945
sum_A: 1602.5265000000168
sum_B: 1602.5265000000168
sum_A: 3478.5705000000285
sum_B: 3478.570500000029
sum_A: 9893.667504642914
sum_B: 9893.667504642912
sum_A: 3662.559211251298
sum_B: 3662.5592112512977
sum_A: 8007.724012970529
sum_B: 8007.724012970528
sum_A: 947.197620806472
sum_B: 947.1976208064718
sum_A: 2642.315248918957
sum_B: 2642.315248918957
sum_A: 1621.351461333878
sum_B: 1621.3514613338782
sum_A: 2112.227041738095
sum_B: 2112.2270417380946
sum_A: 931.7214588045094
sum_B: 931.7214588045094
sum_A: 3137.588932320165
sum_B: 3137.588932320165
sum_A: 4069.310391124672
sum_B: 4069.310391124672
sum_A: 2126.3519750269597
sum_B: 2126.3519750269597
sum_A: 723.6094567685695
sum_B: 723.6094567685695
sum_A: 453.6075905100528
sum_B: 453.6075905100528
sum_A: 1287.1917956435123
sum_B: 1287.191795643512
sum_A: 1895.564179261716
sum_B: 1895.564179261716
sum_A: 0.9238727500617508
sum_B: 0.9238727500617498
sum_A: 2250
sum_B: 2250
sum_A: 476928.6