# Creating a Betting Algorithm for Points

### Importing Packages

In [291]:
import pandas as pd
import numpy as np

In [293]:
import nba_api.stats.endpoints as me
from nba_api.stats.endpoints import playercareerstats
from nba_api.stats.static import players

from datetime import datetime

from sklearn.metrics import mean_squared_error, root_mean_squared_error
from sklearn.model_selection import cross_val_score,train_test_split, TimeSeriesSplit
from sklearn.metrics import mean_squared_error,r2_score
from sklearn.model_selection import KFold
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV, ParameterGrid
from sklearn.linear_model import LassoCV, Lasso, LinearRegression
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor, StackingRegressor, RandomForestClassifier, GradientBoostingClassifier, GradientBoostingRegressor

from sklearn.preprocessing import PolynomialFeatures, StandardScaler

### Prelimary Data Gathering

In [324]:
playercareerstats.PlayerCareerStats(player_id = '1627826').get_data_frames()[0]

Unnamed: 0,PLAYER_ID,SEASON_ID,LEAGUE_ID,TEAM_ID,TEAM_ABBREVIATION,PLAYER_AGE,GP,GS,MIN,FGM,...,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PTS
0,1627826,2016-17,0,1610612747,LAL,20.0,38,11,609.0,126,...,0.653,41,118,159,30,14,33,30,66,284
1,1627826,2017-18,0,1610612747,LAL,21.0,43,0,410.0,61,...,0.765,45,78,123,25,8,15,26,47,161
2,1627826,2018-19,0,1610612747,LAL,22.0,33,12,516.0,112,...,0.864,54,108,162,25,4,27,33,73,281
3,1627826,2018-19,0,1610612746,LAC,22.0,26,25,524.0,100,...,0.733,61,139,200,38,10,24,37,64,244
4,1627826,2018-19,0,0,TOT,22.0,59,37,1039.0,212,...,0.802,115,247,362,63,14,51,70,137,525
5,1627826,2019-20,0,1610612746,LAC,23.0,72,70,1326.0,236,...,0.747,197,346,543,82,16,66,61,168,596
6,1627826,2020-21,0,1610612746,LAC,24.0,72,33,1609.0,257,...,0.789,189,330,519,90,24,62,81,187,650
7,1627826,2021-22,0,1610612746,LAC,25.0,76,76,1852.0,310,...,0.727,217,427,644,120,36,77,114,203,785
8,1627826,2022-23,0,1610612746,LAC,26.0,76,76,2169.0,326,...,0.697,236,520,756,77,29,98,117,219,818
9,1627826,2023-24,0,1610612746,LAC,27.0,68,68,1795.0,337,...,0.723,196,430,626,93,22,83,79,180,794


In [297]:
player_details = me.PlayerIndex(league_id = '00', season = '2024-25').get_data_frames()[0]

def height_to_inches(height_str):
    # Split the height string into feet and inches
    feet, inches = height_str.split('-')
    
    # Convert to integers and calculate total height in inches
    total_inches = (int(feet) * 12) + int(inches)
    
    return total_inches

player_details['HEIGHT_INCHES'] = player_details['HEIGHT'].apply(height_to_inches)
player_details['WEIGHT'] = pd.to_numeric(player_details['WEIGHT'])
player_details['POSITION'] = player_details['POSITION'].apply(lambda x: x[0])
player_details['PLAYER'] = player_details['PLAYER_FIRST_NAME']+ ' '+ player_details['PLAYER_LAST_NAME']
player_details

Unnamed: 0,PERSON_ID,PLAYER_LAST_NAME,PLAYER_FIRST_NAME,PLAYER_SLUG,TEAM_ID,TEAM_SLUG,IS_DEFUNCT,TEAM_CITY,TEAM_NAME,TEAM_ABBREVIATION,...,DRAFT_NUMBER,ROSTER_STATUS,FROM_YEAR,TO_YEAR,PTS,REB,AST,STATS_TIMEFRAME,HEIGHT_INCHES,PLAYER
0,1630173,Achiuwa,Precious,precious-achiuwa,1610612752,knicks,0,New York,Knicks,NYK,...,20.0,1.0,2020,2024,,,,Season,80,Precious Achiuwa
1,203500,Adams,Steven,steven-adams,1610612745,rockets,0,Houston,Rockets,HOU,...,12.0,1.0,2013,2024,3.0,2.0,1.5,Season,83,Steven Adams
2,1628389,Adebayo,Bam,bam-adebayo,1610612748,heat,0,Miami,Heat,MIA,...,14.0,1.0,2017,2024,11.0,7.3,3.0,Season,81,Bam Adebayo
3,1630534,Agbaji,Ochai,ochai-agbaji,1610612761,raptors,0,Toronto,Raptors,TOR,...,14.0,1.0,2022,2024,11.2,4.7,1.7,Season,77,Ochai Agbaji
4,1630583,Aldama,Santi,santi-aldama,1610612763,grizzlies,0,Memphis,Grizzlies,MEM,...,30.0,1.0,2021,2024,13.8,7.0,3.2,Season,84,Santi Aldama
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
521,203469,Zeller,Cody,cody-zeller,1610612737,hawks,0,Atlanta,Hawks,ATL,...,4.0,1.0,2013,2024,,,,Season,83,Cody Zeller
522,1627826,Zubac,Ivica,ivica-zubac,1610612746,clippers,0,LA,Clippers,LAC,...,32.0,1.0,2016,2024,19.2,13.2,3.0,Season,84,Ivica Zubac
523,1641783,da Silva,Tristan,tristan-da-silva,1610612753,magic,0,Orlando,Magic,ORL,...,18.0,1.0,2024,2024,8.5,2.5,1.0,Season,80,Tristan da Silva
524,1628427,Čančar,Vlatko,vlatko-čančar,1610612743,nuggets,0,Denver,Nuggets,DEN,...,49.0,1.0,2019,2024,0.0,1.0,0.0,Season,80,Vlatko Čančar


In [298]:
id_list = [x for x in player_details['PERSON_ID']]


In [299]:

# Nikola Jokić
game_logs =  me.PlayerGameLogs(season_nullable= '2017-18') # pulls every game log for the season

# pandas data frames (optional: pip install pandas)
gdf = game_logs.get_data_frames()[0]
gdf.iloc[:, :20]

Unnamed: 0,SEASON_YEAR,PLAYER_ID,PLAYER_NAME,NICKNAME,TEAM_ID,TEAM_ABBREVIATION,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,MIN,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,FTA
0,2017-18,202323,Evan Turner,Evan,1610612757,POR,Portland Trail Blazers,0021701229,2018-04-11T00:00:00,POR vs. UTA,W,28.400000,2,6,0.333,0,2,0.000,0,0
1,2017-18,1628021,David Nwaba,David,1610612741,CHI,Chicago Bulls,0021701224,2018-04-11T00:00:00,CHI vs. DET,L,29.816667,5,16,0.313,1,3,0.333,2,3
2,2017-18,1626245,Cristiano Felicio,Cristiano,1610612741,CHI,Chicago Bulls,0021701224,2018-04-11T00:00:00,CHI vs. DET,L,29.733333,3,5,0.600,0,0,0.000,4,6
3,2017-18,2544,LeBron James,LeBron,1610612739,CLE,Cleveland Cavaliers,0021701220,2018-04-11T00:00:00,CLE vs. NYK,L,10.550000,4,9,0.444,0,2,0.000,2,3
4,2017-18,201943,Brandon Jennings,Brandon,1610612749,MIL,Milwaukee Bucks,0021701223,2018-04-11T00:00:00,MIL @ PHI,L,18.600000,3,12,0.250,0,4,0.000,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26102,2017-18,2733,Shaun Livingston,Shaun,1610612744,GSW,Golden State Warriors,0021700002,2017-10-17T00:00:00,GSW vs. HOU,L,18.545000,2,5,0.400,0,0,0.000,2,2
26103,2017-18,1626172,Kevon Looney,Kevon,1610612744,GSW,Golden State Warriors,0021700002,2017-10-17T00:00:00,GSW vs. HOU,L,7.766667,0,0,0.000,0,0,0.000,1,2
26104,2017-18,203499,Shane Larkin,Shane,1610612738,BOS,Boston Celtics,0021700001,2017-10-17T00:00:00,BOS @ CLE,L,4.816667,0,1,0.000,0,1,0.000,0,0
26105,2017-18,1628400,Semi Ojeleye,Semi,1610612738,BOS,Boston Celtics,0021700001,2017-10-17T00:00:00,BOS @ CLE,L,8.650000,0,2,0.000,0,1,0.000,0,0


In [300]:
gdf.groupby('PLAYER_NAME')

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000001ADC7E46A90>

In [303]:
data = players.get_players()

In [328]:
df = pd.DataFrame(data)
active_df_24 = df.loc[df['is_active'] == True, :]
active_df_24.loc[active_df_24['first_name'] == 'Miles', :]

Unnamed: 0,id,full_name,first_name,last_name,is_active
508,1628970,Miles Bridges,Miles,Bridges,True
2868,1630540,Miles McBride,Miles,McBride,True


### Pulling Player gamelogs for every player from 2022-23 and 2023-24

In [330]:
game_log_dict_23 = {}
game_log_dict_24 = {}

# Assuming 'df' is a DataFrame that contains the player IDs
#for 2023 season
for id in active_df_24.loc[:, 'id']:
    
    try:
        # Pull every game log for the 2023-24 season
        game_logs = me.PlayerGameLog(season='2023-24', player_id=id, season_type_all_star = 'Regular Season')

        # Get the game log data as a DataFrame
        game_log_df = game_logs.get_data_frames()[0]
        game_log_df['OPP'] = game_log_df['MATCHUP'].apply(lambda x: x[-3:])
        print(game_log_df['Player_ID'].unique()[0])
        # Store the DataFrame in the dictionary with the player ID as the key
        game_log_dict_23[game_log_df['Player_ID'].unique()[0]] = game_log_df
        
    except Exception as e:
        print(f"Error processing player {id}: {e}")


# for 2024 season
for id in active_df_24.loc[:, 'id']:
    
    try:
        # Pull every game log for the 2023-24 season
        game_logs = me.PlayerGameLog(season='2024-25', player_id=id, season_type_all_star = 'Regular Season')

        # Get the game log data as a DataFrame
        game_log_df = game_logs.get_data_frames()[0]
        game_log_df['OPP'] = game_log_df['MATCHUP'].apply(lambda x: x[-3:])
        print(game_log_df['Player_ID'].unique()[0])
        # Store the DataFrame in the dictionary with the player ID as the key
        game_log_dict_24[game_log_df['Player_ID'].unique()[0]] = game_log_df
        
    except Exception as e:
        print(f"Error processing player {id}: {e}")

1630173
1628389
1630534
1630583
1629638
1628960
1628386
1641851
1630631
203937
203507
203648
1630175
1628384
1627853
1630166
1629028
1628962
1630641
1628963
1641735
1631116
1630163
1628964
1631094
1630217
1630625
1631230
203084
1630567
1629628
1629646
1641734
1628966
201587
1629647
203078
1627736
1630699
1631262
202722
201976
1630180
1629048
1641931
202687
1641710
1641778
1631205
203992
202711
1629626
1629716
1626164
1630527
1628449
1630547
1631123
1631103
1631128
1628969
1628970
1629052
1631167
1627763
1629717
1628415
1628971
1627759
1631112
1641738
1629650
1630535
1629718
1628972
1628973
1628418
1641723
203493
202692
1630215
202710
1631288
203484
1641739
203991
1628975
1628976
1630618
1627936
1630658
1630608
1630577
1630551
1631108
1631321
1629634
203903
1629651
1641730
1629599
1628381
1628380
201144
1626192
1641731
1641741
203496
1628470
203109
1630622
1630595
203552
201939
1630700
203076
1631098
1631120
201942
1641926
1628978
1631217
1629603
1628977
1641711
1631172
203915
1629029
1

### Making sure each dictionary has the same y

In [331]:
game_log_dict_23 = {k: v for k, v in game_log_dict_23.items() if k in id_list}
game_log_dict_24 = {k: v for k, v in game_log_dict_24.items() if k in id_list}


## This is the start of creating the data tables for prediction

### First Step gettings some modes

In [332]:
for player in game_log_dict_23:
    # Get the mode of POSITION, WEIGHT, and HEIGHT_INCHES
    print(player)
    position_mode = player_details.loc[player_details['PERSON_ID'] == player, 'POSITION'].mode()[0]
    weight_mode = player_details.loc[player_details['PERSON_ID'] == player, 'WEIGHT'].mode()[0]
    height_mode = player_details.loc[player_details['PERSON_ID'] == player, 'HEIGHT_INCHES'].mode()[0]
    name_mode = player_details.loc[player_details['PERSON_ID'] == player, 'PLAYER'].mode()[0]

    # Assign these values to the entire column for the player
    game_log_dict_23[player]['POSITION'] = position_mode
    game_log_dict_23[player]['WEIGHT'] = weight_mode
    game_log_dict_23[player]['HEIGHT_INCHES'] = height_mode
    game_log_dict_23[player]['PLAYER_NAME'] = name_mode
    game_log_dict_23[player]['GAME_DATE'] = pd.to_datetime(game_log_dict_23[player]['GAME_DATE'], format='%b %d, %Y')

    # Calculate the difference between each date and today's date
    game_log_dict_23[player]['DAYS_SINCE_TODAY'] = (datetime.now() - game_log_dict_23[player]['GAME_DATE']).dt.days


for player in game_log_dict_24:
    # Get the mode of POSITION, WEIGHT, and HEIGHT_INCHES
    position_mode = player_details.loc[player_details['PERSON_ID'] == player, 'POSITION'].mode()[0]
    weight_mode = player_details.loc[player_details['PERSON_ID'] == player, 'WEIGHT'].mode()[0]
    height_mode = player_details.loc[player_details['PERSON_ID'] == player, 'HEIGHT_INCHES'].mode()[0]
    name_mode = player_details.loc[player_details['PERSON_ID'] == player, 'PLAYER'].mode()[0]

    # Assign these values to the entire column for the player
    game_log_dict_24[player]['POSITION'] = position_mode
    game_log_dict_24[player]['WEIGHT'] = weight_mode
    game_log_dict_24[player]['HEIGHT_INCHES'] = height_mode
    game_log_dict_24[player]['PLAYER_NAME'] = name_mode
    game_log_dict_24[player]['GAME_DATE'] = pd.to_datetime(game_log_dict_24[player]['GAME_DATE'], format='%b %d, %Y')

    # Calculate the difference between each date and today's date
    game_log_dict_24[player]['DAYS_SINCE_TODAY'] = (datetime.now() - game_log_dict_24[player]['GAME_DATE']).dt.days

1630173
1628389
1630534
1630583
1629638
1628960
1628386
1630631
203937
203507
1630175
1628384
1630166
1629028
1628963
1631116
1630163
1628964
1631094
1630217
1630625
1631230
203084
1630567
1629628
1629646
1641734
201587
203078
1627736
1630699
1630180
1629048
1641710
203992
202711
1629626
1626164
1630527
1628449
1631103
1631128
1628969
1628970
1627763
1628415
1628971
1627759
1641738
1628973
1628418
1641723
202692
1630215
202710
1631288
203484
1641739
203991
1628975
1628976
1630618
1627936
1630658
1630577
1630551
1631108
1631321
1629634
203903
1629651
1641730
1629599
1628381
1628380
201144
1626192
1641731
1641741
1628470
1630595
203552
201939
1630700
203076
1631098
1631120
201942
1628978
1631217
1641711
1631172
203915
1629029
1629652
1630245
1630288
203083
1630537
1630561
1627739
201142
1631105
1631106
1630162
1630556
1631165
203954
1629234
203957
1628981
1627827
1641745
1631323
1628368
1629655
1629636
1630568
1641718
202331
201959
1630581
1628983
1630264
1631221
203497
203932
201569
203

### Creating new data tables

In [333]:
game_log_23_by_id = {}
game_log_23_by_player = {}
game_log_24_by_id = {}
game_log_24_by_player = {}

for player_id, df in game_log_dict_23.items():
    # Assuming you have a way to get the player's name from the dataframe or a separate source
    player_name = player_details.loc[player_details['PERSON_ID'] == player_id, 'PLAYER'].values[0]  # Get player name

    # Assign the same dataframe to both the player_id and player_name keys
    game_log_23_by_id[player_id] = df[::-1]
    game_log_23_by_player[player_name] = df[::-1]


for player_id, df in game_log_dict_24.items():
    # Assuming you have a way to get the player's name from the dataframe or a separate source
    player_name = player_details.loc[player_details['PERSON_ID'] == player_id, 'PLAYER'].values[0]  # Get player name

    # Assign the same dataframe to both the player_id and player_name keys
    game_log_24_by_id[player_id] = df[::-1]
    game_log_24_by_player[player_name] = df[::-1]

id_list2 = list(game_log_24_by_id.keys())
game_log_23_by_id = {k: v for k, v in game_log_23_by_id.items() if k in id_list2}

### Creating Progressive Average Stats by each game

In [334]:

# Do the same for game_log_23_by_id
for id in game_log_23_by_id:
    game_log_23_by_id[id]['Progressive_Avg_PTS'] = game_log_23_by_id[id]['PTS'].shift().expanding().mean() 
    game_log_23_by_id[id]['Progressive_Avg_AST'] = game_log_23_by_id[id]['AST'].shift().expanding().mean()
    game_log_23_by_id[id]['Progressive_Avg_REB'] = game_log_23_by_id[id]['REB'].shift().expanding().mean()
    game_log_23_by_id[id]['Progressive_Avg_FGA'] = game_log_23_by_id[id]['FGA'].shift().expanding().mean()
    game_log_23_by_id[id]['Progressive_Avg_FGM'] = game_log_23_by_id[id]['FGM'].shift().expanding().mean()
    game_log_23_by_id[id]['Progressive_Avg_FG_PCT'] = game_log_23_by_id[id]['Progressive_Avg_FGM'] / game_log_23_by_id[id]['Progressive_Avg_FGA']
    game_log_23_by_id[id]['Progressive_Avg_FG3A'] = game_log_23_by_id[id]['FG3A'].shift().expanding().mean()
    game_log_23_by_id[id]['Progressive_Avg_FG3M'] = game_log_23_by_id[id]['FG3M'].shift().expanding().mean()
    game_log_23_by_id[id]['Progressive_Avg_FG3_PCT'] = game_log_23_by_id[id]['Progressive_Avg_FG3M'] / game_log_23_by_id[id]['Progressive_Avg_FG3A']
    game_log_23_by_id[id]['Progressive_Avg_FTA'] = game_log_23_by_id[id]['FTA'].shift().expanding().mean()
    game_log_23_by_id[id]['Progressive_Avg_FTM'] = game_log_23_by_id[id]['FTM'].shift().expanding().mean()
    game_log_23_by_id[id]['Progressive_Avg_FT_PCT'] = game_log_23_by_id[id]['Progressive_Avg_FTM'] / game_log_23_by_id[id]['Progressive_Avg_FTA']
    game_log_23_by_id[id]['Progressive_Avg_STL'] = game_log_23_by_id[id]['STL'].shift().expanding().mean()
    game_log_23_by_id[id]['Progressive_Avg_BLK'] = game_log_23_by_id[id]['BLK'].shift().expanding().mean()
    game_log_23_by_id[id]['Progressive_Avg_TOV'] = game_log_23_by_id[id]['TOV'].shift().expanding().mean()
    game_log_23_by_id[id]['Progressive_Avg_MIN'] = game_log_23_by_id[id]['MIN'].shift().expanding().mean()


# 2024
for id in game_log_24_by_id:
    game_log_24_by_id[id]['Progressive_Avg_PTS'] = game_log_24_by_id[id]['PTS'].shift().expanding().mean() 
    game_log_24_by_id[id]['Progressive_Avg_AST'] = game_log_24_by_id[id]['AST'].shift().expanding().mean()
    game_log_24_by_id[id]['Progressive_Avg_REB'] = game_log_24_by_id[id]['REB'].shift().expanding().mean()
    game_log_24_by_id[id]['Progressive_Avg_FGA'] = game_log_24_by_id[id]['FGA'].shift().expanding().mean()
    game_log_24_by_id[id]['Progressive_Avg_FGM'] = game_log_24_by_id[id]['FGM'].shift().expanding().mean()
    game_log_24_by_id[id]['Progressive_Avg_FG_PCT'] = game_log_24_by_id[id]['Progressive_Avg_FGM'] / game_log_24_by_id[id]['Progressive_Avg_FGA']
    game_log_24_by_id[id]['Progressive_Avg_FG3A'] = game_log_24_by_id[id]['FG3A'].shift().expanding().mean()
    game_log_24_by_id[id]['Progressive_Avg_FG3M'] = game_log_24_by_id[id]['FG3M'].shift().expanding().mean()
    game_log_24_by_id[id]['Progressive_Avg_FG3_PCT'] = game_log_24_by_id[id]['Progressive_Avg_FG3M'] / game_log_24_by_id[id]['Progressive_Avg_FG3A']
    game_log_24_by_id[id]['Progressive_Avg_FTA'] = game_log_24_by_id[id]['FTA'].shift().expanding().mean()
    game_log_24_by_id[id]['Progressive_Avg_FTM'] = game_log_24_by_id[id]['FTM'].shift().expanding().mean()
    game_log_24_by_id[id]['Progressive_Avg_FT_PCT'] = game_log_24_by_id[id]['Progressive_Avg_FTM'] / game_log_24_by_id[id]['Progressive_Avg_FTA']
    game_log_24_by_id[id]['Progressive_Avg_STL'] = game_log_24_by_id[id]['STL'].shift().expanding().mean()
    game_log_24_by_id[id]['Progressive_Avg_BLK'] = game_log_24_by_id[id]['BLK'].shift().expanding().mean()
    game_log_24_by_id[id]['Progressive_Avg_TOV'] = game_log_24_by_id[id]['TOV'].shift().expanding().mean()
    game_log_24_by_id[id]['Progressive_Avg_MIN'] = game_log_24_by_id[id]['MIN'].shift().expanding().mean()


    

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  game_log_23_by_id[id]['Progressive_Avg_PTS'] = game_log_23_by_id[id]['PTS'].shift().expanding().mean()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  game_log_23_by_id[id]['Progressive_Avg_AST'] = game_log_23_by_id[id]['AST'].shift().expanding().mean()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
 

### Similar to last, but this time creating 10 game averages

In [None]:
# Function to calculate 10-game averages
def calculate_10_game_avg(df, column):
    return df[column].rolling(window=10, min_periods=1).mean()

# Process 2023 data
for id in game_log_23_by_id:
    df = game_log_23_by_id[id]
    
    for stat in ['PTS', 'AST', 'REB', 'FGA', 'FGM', 'FG3A', 'FG3M', 'FTA', 'FTM', 'STL', 'BLK', 'TOV', 'MIN']:
        df[f'10_Game_Avg_{stat}'] = calculate_10_game_avg(df, stat)
    
    # Calculate percentages
    df['10_Game_Avg_FG_PCT'] = df['10_Game_Avg_FGM'] / df['10_Game_Avg_FGA']
    df['10_Game_Avg_FG3_PCT'] = df['10_Game_Avg_FG3M'] / df['10_Game_Avg_FG3A']
    df['10_Game_Avg_FT_PCT'] = df['10_Game_Avg_FTM'] / df['10_Game_Avg_FTA']
    
    # Set first entry to 0
    for col in df.columns:
        if col.startswith('10_Game_Avg_'):
            df.loc[df.index[0], col] = 0
    
    game_log_23_by_id[id] = df


# Process 2024 data
for id in game_log_24_by_id:
    df_24 = game_log_24_by_id[id]
    
    if id in game_log_23_by_id:
        df_23 = game_log_23_by_id[id]
        # Combine last 9 games of 2022 with 2023 data
        df_combined = pd.concat([df_23.tail(9), df_24])
    else:
        df_combined = df_24
    
    for stat in ['PTS', 'AST', 'REB', 'FGA', 'FGM', 'FG3A', 'FG3M', 'FTA', 'FTM', 'STL', 'BLK', 'TOV', 'MIN']:
        df_24[f'10_Game_Avg_{stat}'] = calculate_10_game_avg(df_combined, stat).tail(len(df_24))
    
    # Calculate percentages
    df_24['10_Game_Avg_FG_PCT'] = df_24['10_Game_Avg_FGM'] / df_24['10_Game_Avg_FGA']
    df_24['10_Game_Avg_FG3_PCT'] = df_24['10_Game_Avg_FG3M'] / df_24['10_Game_Avg_FG3A']
    df_24['10_Game_Avg_FT_PCT'] = df_24['10_Game_Avg_FTM'] / df_24['10_Game_Avg_FTA']
    
    game_log_24_by_id[id] = df_24

# Handle division by zero and NaN values
for id in game_log_23_by_id:
    game_log_23_by_id[id] = game_log_23_by_id[id].replace([np.inf, -np.inf], np.nan).fillna(0)

for id in game_log_24_by_id:
    game_log_24_by_id[id] = game_log_24_by_id[id].replace([np.inf, -np.inf], np.nan).fillna(0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[f'10_Game_Avg_{stat}'] = calculate_10_game_avg(df, stat)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[f'10_Game_Avg_{stat}'] = calculate_10_game_avg(df, stat)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[f'10_Game_Avg_{stat}'] = calculate_10_game_avg(df, stat)
A value is trying to be s

### Sorting to keep players who played over 20 mpg

In [None]:
def calculate_avg_minutes(player_df):
    return player_df['MIN'].mean()

# Step 1 & 2: Calculate average minutes and filter players
players_2023 = {}
players_2024 = {}

for player_id, df in game_log_23_by_id.items():
    avg_minutes = calculate_avg_minutes(df)
    
    if avg_minutes >= 20 and df.shape[0] > 40:
        players_2023[player_id] = avg_minutes

for player_id, df in game_log_24_by_id.items():
    avg_minutes = calculate_avg_minutes(df)
    
    if avg_minutes >= 20 and df.shape[0] > 40:
        players_2024[player_id] = avg_minutes

# Step 3: Keep only players present in both seasons
common_players = set(players_2023.keys()) & set(players_2024.keys())

# Filter the original dictionaries
game_log_23_by_id = {player_id: df for player_id, df in game_log_23_by_id.items() if player_id in common_players}
game_log_24_by_id = {player_id: df for player_id, df in game_log_24_by_id.items() if player_id in common_players}

### this next code creates expected increase or decrease in points based on team played and player position

In [None]:
team_list = list(game_log_23_by_id[203999]['OPP'].unique())
team_list.append('DEN')
position = ['G', 'F', 'C']

team_position_dict_23 = {}
team_position_dict_24 = {}



# Create empty lists for each team and position combination
for team in team_list:
    for pos in position:
        team_position_dict_23[f"{team}_{pos}"] = []
        team_position_dict_24[f"{team}_{pos}"] = []

In [None]:
# Getting average amount above avg based on team played for each position
for id in game_log_23_by_id:
    df = game_log_23_by_id[id].copy()
    position = df['POSITION'][0]
    for index, row in df.iterrows():
        if row['MIN'] > 19:
            team = row['OPP']  # The team the player played against
            points = row['PTS']
    
            key = f"{team}_{position}"
    
            if key in team_position_dict_23:
                team_position_dict_23[key].append(points)

for id in game_log_24_by_id:
    df = game_log_24_by_id[id].copy()
    position = df['POSITION'][0]
    for index, row in df.iterrows():
        if row['MIN'] > 19:
            team = row['OPP']  # The team the player played against
            points = row['PTS']
    
            key = f"{team}_{position}"
    
            if key in team_position_dict_24:
                team_position_dict_24[key].append(points)
    
            
            

In [None]:
points_averages_23 = {}
guard_averages_23 = []
forward_averages_23 = []
center_averages_23 = []

points_averages_24 = {}
guard_averages_24 = []
forward_averages_24 = []
center_averages_24 = []

for id in team_position_dict_23:
    if id[4] == 'G':
        guard_averages_23.append(pd.to_numeric(team_position_dict_23[id]).mean())
    elif id[4] == 'F':
        forward_averages_23.append(pd.to_numeric(team_position_dict_23[id]).mean())
    elif id[4] == 'C':
        center_averages_23.append(pd.to_numeric(team_position_dict_23[id]).mean())
        
guard_mean_23 = pd.to_numeric(guard_averages_23).mean()
forward_mean_23 = pd.to_numeric(forward_averages_23).mean()
center_mean_23 = pd.to_numeric(center_averages_23).mean()
for id in team_position_dict_23:
    if id[4] == 'G':  
        points_averages_23[id] = pd.to_numeric(team_position_dict_23[id]).mean() - guard_mean_23
    if id[4] == 'F':  
        points_averages_23[id] = pd.to_numeric(team_position_dict_23[id]).mean() - forward_mean_23
    if id[4] == 'C':  
        points_averages_23[id] = pd.to_numeric(team_position_dict_23[id]).mean() - center_mean_23



for id in team_position_dict_24:
    if id[4] == 'G':
        guard_averages_24.append(pd.to_numeric(team_position_dict_24[id]).mean())
    elif id[4] == 'F':
        forward_averages_24.append(pd.to_numeric(team_position_dict_24[id]).mean())
    elif id[4] == 'C':
        center_averages_24.append(pd.to_numeric(team_position_dict_24[id]).mean())
        
guard_mean_24 = pd.to_numeric(guard_averages_24).mean()
forward_mean_24 = pd.to_numeric(forward_averages_24).mean()
center_mean_24 = pd.to_numeric(center_averages_24).mean()
for id in team_position_dict_24:
    if id[4] == 'G':  
        points_averages_24[id] = pd.to_numeric(team_position_dict_24[id]).mean() - guard_mean_24
    if id[4] == 'F':  
        points_averages_24[id] = pd.to_numeric(team_position_dict_24[id]).mean() - forward_mean_24
    if id[4] == 'C':  
        points_averages_24[id] = pd.to_numeric(team_position_dict_24[id]).mean() - center_mean_24
        



In [None]:
def add_position_averages_to_gamelog(game_log_df, player_position, points_averages):
    # Create a new column to store the average value from the dictionary
    game_log_df['AVG_ABOVE_NORM_BY_POS_TEAM'] = None
    
    # Iterate over the rows in the game log
    for index, row in game_log_df.iterrows():
        team = row['OPP']  # Assuming 'OPP' contains the opposing team (e.g., 'BOS', 'LAL')
        position = player_position  # The player's position (e.g., 'G', 'F', 'C')

        # Create the key in {team_pos} format
        key = f'{team}_{position}'

        # Look up the corresponding value in the dictionary and add it to the DataFrame
        if key in team_position_dict_23:
            game_log_df.at[index, 'AVG_ABOVE_NORM_BY_POS_TEAM'] = points_averages[key]
        else:
            game_log_df.at[index, 'AVG_ABOVE_NORM_BY_POS_TEAM'] = None  # If no match, you can set this to None or 0

for id in game_log_23_by_id:
    position = df['POSITION'][0]
    add_position_averages_to_gamelog(game_log_23_by_id[id], position, points_averages_23) 

for id in game_log_24_by_id:
    position = df['POSITION'][0]
    add_position_averages_to_gamelog(game_log_24_by_id[id], position, points_averages_24) 

### This calculated Progressive Averages again I guess I messed up in the first code?

In [None]:
# Now, let's process the 2023 data
for player_id in game_log_23_by_id:
    df = game_log_23_by_id[player_id]
    
    # Calculate non-percentage stats
    for stat in ['PTS', 'AST', 'REB', 'FGA', 'FG3A', 'FTA', 'STL', 'BLK', 'TOV', 'MIN']:
        df[f'Progressive_Avg_{stat}'] = df[stat].expanding().mean()
    
    # Calculate percentage stats correctly
    df['Progressive_Avg_FG_PCT'] = df['FGM'].expanding().sum() / df['FGA'].expanding().sum()
    print(df['Progressive_Avg_FG_PCT'], df['FGM'], df['FGA'])
    df['Progressive_Avg_FG3_PCT'] = df['FG3M'].expanding().sum() / df['FG3A'].expanding().sum()
    df['Progressive_Avg_FT_PCT'] = df['FTM'].expanding().sum() / df['FTA'].expanding().sum()
    
    # Replace NaN and inf values with 0
    df = df.replace([np.inf, -np.inf], np.nan).fillna(0)
    
    # Update the dictionary with the modified DataFrame
    game_log_23_by_id[player_id] = df



# Now, let's process the 2024 data
for player_id in game_log_24_by_id:
    df = game_log_24_by_id[player_id]
    first_index = df.index[0]
    
    # Calculate non-percentage stats
    for stat in ['PTS', 'AST', 'REB', 'FGA', 'FG3A', 'FTA', 'STL', 'BLK', 'TOV', 'MIN']:
        df[f'Progressive_Avg_{stat}'] = df[stat].expanding().mean()
        # Fill the first row with the player's average from 2022 if available
        if player_id in game_log_23_by_id:
            df.loc[first_index, f'Progressive_Avg_{stat}'] = game_log_23_by_id[player_id][stat].mean()
    
    # Calculate percentage stats correctly
    df['Progressive_Avg_FG_PCT'] = df['FGM'].expanding().sum() / df['FGA'].expanding().sum()
    df['Progressive_Avg_FG3_PCT'] = df['FG3M'].expanding().sum() / df['FG3A'].expanding().sum()
    df['Progressive_Avg_FT_PCT'] = df['FTM'].expanding().sum() / df['FTA'].expanding().sum()
    
    # Fill the first row of percentage stats with the player's average from 2022 if available
    if player_id in game_log_23_by_id:
        df.loc[first_index, 'Progressive_Avg_FG_PCT'] = game_log_23_by_id[player_id]['FGM'].sum() / game_log_23_by_id[player_id]['FGA'].sum()
        df.loc[first_index, 'Progressive_Avg_FG3_PCT'] = game_log_23_by_id[player_id]['FG3M'].sum() / game_log_23_by_id[player_id]['FG3A'].sum()
        df.loc[first_index, 'Progressive_Avg_FT_PCT'] = game_log_23_by_id[player_id]['FTM'].sum() / game_log_23_by_id[player_id]['FTA'].sum()
    
    # Replace NaN and inf values with 0
    df = df.replace([np.inf, -np.inf], np.nan).fillna(0)
    
    # Update the dictionary with the modified DataFrame
    game_log_24_by_id[player_id] = df


### Makes sure that the ids match in both once again

In [None]:
id_list2 = list(game_log_24_by_id.keys())
game_log_23_by_id = {k: v for k, v in game_log_23_by_id.items() if k in id_list2}


# Test modeling on Jalen Brunson

In [272]:
point_predictions = []
real_points = []
training_set = pd.concat([game_log_23_by_id[1628973].iloc[:20, :], game_log_22_by_id[1628973]])

length = len(game_log_23_by_id[1628973])
for i in range(20, length):
    X_train = training_set.loc[:, ['Progressive_Avg_PTS', 'Progressive_Avg_AST', 'Progressive_Avg_REB',
    'Progressive_Avg_FG_PCT', 'Progressive_Avg_FGA', 'Progressive_Avg_FG3A',
    'Progressive_Avg_FG3_PCT', 'Progressive_Avg_FTA',
    'Progressive_Avg_FT_PCT', 'Progressive_Avg_STL', 'Progressive_Avg_BLK',
    'Progressive_Avg_TOV', 'Progressive_Avg_MIN','10_Game_Avg_PTS',
       '10_Game_Avg_AST', '10_Game_Avg_REB', '10_Game_Avg_FGA',
       '10_Game_Avg_FG3A', '10_Game_Avg_FTA', '10_Game_Avg_STL',
       '10_Game_Avg_BLK', '10_Game_Avg_TOV', '10_Game_Avg_MIN',
       '10_Game_Avg_FG_PCT', '10_Game_Avg_FG3_PCT', '10_Game_Avg_FT_PCT',
    'AVG_ABOVE_NORM_BY_POS_TEAM', 'WEIGHT',
    'HEIGHT_INCHES', 'DAYS_SINCE_TODAY']]
    y = training_set['PTS']

     
    test_set = game_log_23_by_id[1628973].iloc[i, :]
    X_test = test_set.loc[['Progressive_Avg_PTS', 'Progressive_Avg_AST', 'Progressive_Avg_REB',
    'Progressive_Avg_FG_PCT', 'Progressive_Avg_FGA', 'Progressive_Avg_FG3A',
    'Progressive_Avg_FG3_PCT', 'Progressive_Avg_FTA',
    'Progressive_Avg_FT_PCT', 'Progressive_Avg_STL', 'Progressive_Avg_BLK',
    'Progressive_Avg_TOV', 'Progressive_Avg_MIN','10_Game_Avg_PTS',
       '10_Game_Avg_AST', '10_Game_Avg_REB', '10_Game_Avg_FGA',
       '10_Game_Avg_FG3A', '10_Game_Avg_FTA', '10_Game_Avg_STL',
       '10_Game_Avg_BLK', '10_Game_Avg_TOV', '10_Game_Avg_MIN',
       '10_Game_Avg_FG_PCT', '10_Game_Avg_FG3_PCT', '10_Game_Avg_FT_PCT',
    'AVG_ABOVE_NORM_BY_POS_TEAM', 'WEIGHT',
    'HEIGHT_INCHES', 'DAYS_SINCE_TODAY']]
    
    X_test = X_test.to_frame().T
    y_test = test_set['PTS']
    y_test_array = np.array([y_test])
    
    tscv = TimeSeriesSplit(n_splits=5)  # You can adjust the number of splits

    alphas = np.logspace(0,-4,200)
    lassocv = LassoCV(
    cv=tscv,  # Use the TimeSeriesSplit we defined earlier
    max_iter=5000,  # Reduce max iterations
    tol=1e-4,  # Set a tolerance for early stopping
    n_jobs=-1
)
    lassocv.fit(X_train, y)
    
    prediction = lassocv.predict(X_test)
    rmse = root_mean_squared_error(prediction, y_test_array)
    training_set = pd.concat([game_log_23_by_id[1628973].iloc[:i+1, :], game_log_22_by_id[1628973]])

    point_predictions.append(prediction)
    real_points.append(y_test)
    print(i)


20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76


In [274]:
predictions = []
for d in point_predictions:
    predictions.append(d[0])

In [276]:
for i in range(len(predictions)):
    print(predictions[i] - real_points[i])

2.9649888610438992
4.889802293835199
2.7507353934729224
-24.495348397654727
5.210733665305838
-2.5940388840039255
10.59981545066525
-9.245611431569586
-12.813720173176911
3.8268050981207296
6.8887146605394065
-1.7208400177844538
11.032241780893344
-2.2768075207891343
-2.2234117038770265
-6.3762357009082
16.48239217424063
-3.9251857584582552
-4.178970041515029
-13.976085936333448
-9.166178345236943
-1.471200224833673
7.934007573620114
-3.2679645986255323
-2.8706436162608533
0.7336466593133508
-9.843954374251933
-5.099067282310532
4.211232731820765
-7.669495975045962
5.203852792225582
-1.83583355912468
10.079182731745743
-2.6603352708366117
-3.313490023048196
4.319998805999031
29.62490512705772
1.6548263690912144
9.470644491402766
5.8719048436822625
-18.122103577501527
-14.266272778240161
-4.96728074160054
3.0319435320826074
10.87802978777487
-0.6611135159163126
2.6104904792713945
-30.547776331483508
1.4584235759173971
9.925164690537091
-4.78704802246634
-3.2407788600148564
-10.521032352

In [278]:
real_points

[23,
 21,
 23,
 50,
 22,
 29,
 16,
 36,
 38,
 24,
 20,
 28,
 16,
 31,
 29,
 33,
 12,
 30,
 30,
 41,
 38,
 30,
 21,
 32,
 32,
 29,
 40,
 36,
 27,
 39,
 27,
 33,
 21,
 34,
 35,
 27,
 0,
 26,
 19,
 20,
 45,
 42,
 34,
 26,
 17,
 28,
 26,
 61,
 30,
 20,
 35,
 35,
 43,
 45,
 39,
 30,
 40]

In [280]:
predictions

[25.9649888610439,
 25.8898022938352,
 25.750735393472922,
 25.504651602345273,
 27.21073366530584,
 26.405961115996075,
 26.59981545066525,
 26.754388568430414,
 25.18627982682309,
 27.82680509812073,
 26.888714660539407,
 26.279159982215546,
 27.032241780893344,
 28.723192479210866,
 26.776588296122974,
 26.6237642990918,
 28.48239217424063,
 26.074814241541745,
 25.82102995848497,
 27.023914063666552,
 28.833821654763057,
 28.528799775166327,
 28.934007573620114,
 28.732035401374468,
 29.129356383739147,
 29.73364665931335,
 30.156045625748067,
 30.900932717689468,
 31.211232731820765,
 31.33050402495404,
 32.20385279222558,
 31.16416644087532,
 31.079182731745743,
 31.33966472916339,
 31.686509976951804,
 31.31999880599903,
 29.62490512705772,
 27.654826369091214,
 28.470644491402766,
 25.871904843682263,
 26.877896422498473,
 27.73372722175984,
 29.03271925839946,
 29.031943532082607,
 27.87802978777487,
 27.338886484083687,
 28.610490479271395,
 30.452223668516492,
 31.4584235759