In [15]:
# IMPORT STATEMENTS
import pandas as pd
# from sklearn.linear_model import LogisticRegression
import statsmodels.api as sm

In [16]:
# IMPORT DATA
def _strip_if_text(column):
    """Return the column with surrounding whitespace stripped when it is object-typed, else unchanged."""
    return column.str.strip() if column.dtype == "object" else column

raw_game_logs = pd.read_csv('nba_player_game_logs.csv')
# Clean the cell values first, then the header names (remove whitespace)
df = raw_game_logs.apply(_strip_if_text).rename(columns=str.strip)


In [17]:
# df.head()

In [18]:
# GET REGULAR SEASON DATA, BEFORE AND AFTER ALL-STAR

# Inclusive bounds of the two windows (regular season only)
start_of_regular_season_date = '2021-10-19'
all_star_break_date = '2022-02-17'
after_break_date = '2022-02-24'
end_of_regular_season_date = '2022-04-10' # just use regular season data really

# Compare real dates rather than raw strings. A string comparison is lexicographic,
# so a value such as '2022-02-17T00:00:00' sorts AFTER '2022-02-17' and the last day
# of each window would be dropped silently. normalize() discards any time-of-day part
# so both bounds stay inclusive. Only the mask uses the parsed dates; df is untouched.
# NOTE(review): assumes every game_date value is parseable by pd.to_datetime in one
# consistent, timezone-naive format -- confirm against the CSV.
game_days = pd.to_datetime(df['game_date']).dt.normalize()

in_first_window = game_days.between(
    pd.Timestamp(start_of_regular_season_date), pd.Timestamp(all_star_break_date)
)
in_second_window = game_days.between(
    pd.Timestamp(after_break_date), pd.Timestamp(end_of_regular_season_date)
)

# .copy() so later cells can modify these frames without touching df
before_break_values = df.loc[in_first_window].copy()
after_break_values = df.loc[in_second_window].copy()


In [19]:
# GROUP AND FILTER DATA

def _regular_contributors(game_logs):
    """Summarise each PLAYER_ID and keep only those above both activity thresholds.

    Groups game_logs by PLAYER_ID, counting rows as games_played and averaging
    MIN as min_mean, then keeps the players with games_played > 15 and
    min_mean > 15. Returns a frame with columns PLAYER_ID, games_played, min_mean.
    """
    per_player = (
        game_logs
        .groupby('PLAYER_ID')
        .agg(games_played=('PLAYER_ID', 'count'), min_mean=('MIN', 'mean'))
        .reset_index()
    )
    enough_games = per_player['games_played'] > 15
    enough_minutes = per_player['min_mean'] > 15
    return per_player[enough_games & enough_minutes]

# Players with > 15 games and > 15 MPG BEFORE the All-Star Break
before_break_players = _regular_contributors(before_break_values)

# Players with > 15 games and > 15 MPG AFTER the All-Star Break
after_break_players = _regular_contributors(after_break_values)

# Players who cleared both thresholds on BOTH SIDES of the All-Star Break
final_players_list = list(
    set(before_break_players['PLAYER_ID']) & set(after_break_players['PLAYER_ID'])
)

In [20]:
# Per-game columns to average for each player; every one is aggregated with 'mean'
variable_list = ['PTS', 'MIN', 'FG_PCT', 'FG3A', 'FTA', 'AST', 'TOV', 'OREB']
variable_dict = {stat_name: 'mean' for stat_name in variable_list}

In [21]:
# ORGANIZE PLAYER DATA

def _mean_stats_for_final_players(game_logs):
    """Average the variable_dict columns per PLAYER_ID, keeping only IDs in final_players_list.

    PLAYER_ID is returned as a regular column rather than as the index.
    """
    per_player_means = game_logs.groupby('PLAYER_ID').agg(variable_dict).reset_index()
    is_final_player = per_player_means['PLAYER_ID'].isin(final_players_list)
    return per_player_means[is_final_player]

# Per-player averages from BEFORE the All-Star Break
final_players_before_df = _mean_stats_for_final_players(before_break_values)
# PLAYER_ID and MIN are not used as regression inputs
test_df = final_players_before_df.drop(columns=['PLAYER_ID', 'MIN'])

# Per-player averages from AFTER the All-Star Break
final_players_after_df = _mean_stats_for_final_players(after_break_values)
# PLAYER_ID and MIN are not used as regression inputs
test_after_df = final_players_after_df.drop(columns=['PLAYER_ID', 'MIN'])

In [22]:
# LOGISTIC REGRESSION TIME
# ENCODE TARGET

def _add_high_scorer_flag(frame):
    """Return frame with a trailing categorical 'More than 15?' column derived from PTS.

    pd.cut with bins [0, 15, 100] uses right-closed intervals, so PTS in (0, 15]
    is labelled 0 and PTS in (15, 100] is labelled 1; values outside (0, 100]
    become NaN. An existing 'More than 15?' column is dropped first, so the cell
    can be re-run without a duplicate-column error.
    """
    flag = pd.cut(frame['PTS'], bins=[0, 15, 100], labels=[0, 1])
    if 'More than 15?' in frame.columns:
        frame = frame.drop(columns=['More than 15?'])
    # Insert after the current last column instead of at a hard-coded position 7,
    # which only means "end" while the frame has exactly seven columns and raises
    # IndexError when it has fewer.
    frame.insert(len(frame.columns), 'More than 15?', flag)
    return frame

# Encode continuous PTS data as categorical (i.e. Yes/No for avg. more than 15 PPG)
test_df = _add_high_scorer_flag(test_df)
test_after_df = _add_high_scorer_flag(test_after_df)


In [23]:
# DEFINE MODEL
#log_reg = LogisticRegression() # Create logistic regression
# Create regression model based on whether a player scored more than 15 PPG
#log_reg.fit(X=test_df.drop(columns=['PTS','More than 15?']), y=test_df['More than 15?'])

In [24]:
# TEST MODEL
# Predict if players score more than 15 PPG after all-star break
# preds = log_reg.predict(X=test_after_df.drop(columns=['PTS', 'More than 15?']))
# print(pd.crosstab(preds, test_after_df['More than 15?'])) # Print confusion matrix for model


In [25]:
# TEST FOR P VALUES
# Fit a logistic regression of the 'More than 15?' flag on the remaining averaged
# stats so that per-predictor p-values are available. PTS is excluded because the
# flag is derived from it.
x = sm.add_constant(test_df.drop(columns=['PTS','More than 15?']))
# The flag is produced by pd.cut and is categorical; Logit needs a numeric 0/1 target.
# astype(float) (rather than int) leaves any NaN flag as NaN instead of raising here.
y = test_df['More than 15?'].astype(float)
# This was sm.OLS, which fits a linear probability model, not a logistic regression,
# so the exported p-values did not belong to the model this notebook describes.
log_reg = sm.Logit(y, x) # logistic regression
# disp=False silences the optimizer's convergence printout.
# NOTE(review): Logit can fail or warn on perfectly separated data -- check model.summary().
model = log_reg.fit(disp=False) # fit model
# print(model.summary())


In [26]:
# WRITE TO FILE

# Skip the first entry of model.pvalues, label the two columns, and keep only the
# predictors below the 0.05 significance threshold.
p_values = (
    model.pvalues
    .iloc[1:]
    .rename_axis('PREDICTOR')
    .reset_index(name='P_VALUE')
    .loc[lambda table: table['P_VALUE'] < 0.05]
)
p_values.to_csv('logistic_results.csv', index=False)
