# Script to Model NBA Player Performance in Python

In [1]:
import requests
import json
import pandas as pd
import numpy as np
import time
import math

from functools import reduce

from nba_api.stats.endpoints import playercareerstats, commonplayerinfo
from nba_api.stats.static import players

from sklearn.multioutput import MultiOutputRegressor
from sklearn.linear_model import LinearRegression, SGDRegressor, Ridge, RidgeCV, Lars
from sklearn.ensemble import AdaBoostRegressor
from sklearn.model_selection import train_test_split, cross_val_score, RepeatedKFold
from sklearn.metrics import accuracy_score

## Getting the Active Players for the Current Season

Use the NBA API to get the list of currently active players.

In [2]:
# Ask the NBA API for the active players and keep just their ids.
print('-----------API Call to get Active Players-----------')
player_list = players.get_active_players()

# Every entry in player_list is indexed by the 'id' key to obtain the player id.
active_player_ids = [entry['id'] for entry in player_list]
print('-----------DONE Getting the Active Players-----------')

-----------API Call to get Active Players-----------
-----------DONE Getting the Active Players-----------


Creating a DataFrame of the players' data. This cell takes a long time to run because it deliberately pauses between API calls to avoid connection time-out errors.

There are around 500 active players in the NBA for this season.

In [None]:
print('Giving some rest between API Calls')
time.sleep(60)

# Generating a DF of the player stats of the people that were active in the 2019-2020 season
player_df = list()
career_df = list()
print('-----------API Call to get Player Information-----------')
for i, data in enumerate(active_player_ids):
    print('Getting Data for Player {}'.format(i + 1))
    player_id = data[0]
    career = playercareerstats.PlayerCareerStats(player_id = player_id)
    career_df.append(career.get_data_frames()[0])
    print('Sleep 1 for Player {}'.format(i))
    time.sleep(5)

    player_info = commonplayerinfo.CommonPlayerInfo(player_id = player_id)
    player_df.append(player_info.get_data_frames()[0])
    print('Sleep 2 for Player {}'.format(i))
    time.sleep(5)

print('-----------DONE Getting the Players INFO-----------')
player_df = pd.concat(player_df)
career_df = pd.concat(career_df)

player_df.to_csv('../data/input/active_player_df.csv')
career_df.to_csv('../data/input/career_df_master.csv')

## Preprocessing the Data

Steps in Preprocessing
1) Cleans out data for players that are not currently active
2) Gets rid of season data for seasons where a player played less than 500 minutes (might get rid of this)
3) Gets the columns needed for input and output for the model
4) Changes the height from string to int in inches
5) Changes position to a number

In [3]:
print('-----------Preprocessing the data-----------')
# Reload the player info and career stats that the download cell saved.
active_player_df = pd.read_csv('../data/input/active_player_df.csv')
career_df = pd.read_csv('../data/input/career_df_master.csv')

# Use the same key name as the other frames and keep only the columns joined below.
career_df = career_df.rename(columns = {'PLAYER_ID': 'PERSON_ID'})
career_df = career_df[['PERSON_ID', 'SEASON_ID', 'PLAYER_AGE']]

# DROPPING PLAYERS WHO WERE NOT ACTIVE THIS SEASON
multi_season_df = pd.read_csv('../data/input/players_stats_multi_season_2010_2020.csv')
active_data = multi_season_df[multi_season_df['PERSON_ID'].isin(active_player_ids)]

# Exclude Seasons where they played less than 500 minutes
exclude_mins = False

if exclude_mins:
    active_data = active_data[active_data['MIN'] >= 500]

# Deliberately NOT called `players`: that name is the nba_api module imported
# at the top of the notebook, and rebinding it breaks a re-run of the
# "get active players" cell.
active_person_ids = active_data['PERSON_ID'].unique()

merged = pd.merge(active_data, active_player_df, on = 'PERSON_ID')

# GROUP_VALUE (left) is matched against SEASON_ID (right) to attach PLAYER_AGE
# for each player-season; the left join keeps rows that have no match.
merged = pd.merge(merged, career_df, how = 'left',
                  left_on = ['PERSON_ID', 'GROUP_VALUE'],
                  right_on = ['PERSON_ID', 'SEASON_ID'])

# Model inputs and targets used by the modeling cells further down.
features = ['GP', 'W', 'L', 'W_PCT', 'PLUS_MINUS', 'MIN', 'DD2', 'TD3']

outputs = ['FGM', 'FGA', 'FG3M', 'FTM', 'FTA', 'REB', 'AST', 'TOV', 'STL', 'BLK', 'PTS']

needed = ['PERSON_ID', 'PLAYER_AGE', 'SEASON_ID', 'GP', 'W', 'L', 'W_PCT', 'MIN', 'PLUS_MINUS',
          'DD2', 'TD3', 'HEIGHT', 'WEIGHT', 'SEASON_EXP', 'POSITION',
          'FGM', 'FGA', 'FG3M', 'FTM', 'FTA', 'REB', 'AST', 'TOV', 'STL', 'BLK', 'PTS']

# .copy() makes input_data an independent frame, so the column rewrites below
# act on it directly instead of on a slice of `merged`.
input_data = merged[needed].copy()
# Saved before HEIGHT and POSITION are converted, so the CSV holds the raw values.
input_data.to_csv('../data/input/input_data.csv')

# Changing positions to values (int) with dictionary below
# Changing the height of the player to inches
positions = {'Center': 3, 'Center-Forward': 3, 'Forward': 2,
             'Forward-Guard': 2, 'Guard': 1, 'Guard-Forward': 1, 'Forward-Center': 2}


def _height_to_inches(height):
    """Convert a 'feet-inches' string (e.g. '6-7') into total inches."""
    parts = height.split('-')
    return (int(parts[0]) * 12) + int(parts[1])


input_data['HEIGHT'] = input_data['HEIGHT'].map(_height_to_inches)

# Fail loudly on a label missing from `positions` (the row-by-row dictionary
# lookup this replaces also raised KeyError) rather than letting .map() turn
# it into NaN silently.
unknown_positions = set(input_data['POSITION'].unique()) - set(positions)
if unknown_positions:
    raise KeyError('Unmapped POSITION value(s): {}'.format(sorted(map(str, unknown_positions))))
input_data['POSITION'] = input_data['POSITION'].map(positions)
print('-----------DONE Preprocessing the data-----------')

-----------Preprocessing the data-----------
-----------DONE Preprocessing the data-----------


## Data Statistics

Gathering Initial Statistics from the preprocessed data.

In [4]:
# Number of players under each raw POSITION label
grouped_unproc = active_player_df.groupby('POSITION')['POSITION'].count()

# Same count after the labels are replaced by the numeric codes in `positions`
active_proc = active_player_df.replace({"POSITION": positions})
grouped_proc = active_proc.groupby('POSITION')['POSITION'].count()

for position_counts in (grouped_unproc, grouped_proc):
    print(position_counts)


POSITION
Center             34
Center-Forward     29
Forward           142
Forward-Center     38
Forward-Guard      27
Guard             182
Guard-Forward      67
Name: POSITION, dtype: int64
POSITION
1    249
2    207
3     63
Name: POSITION, dtype: int64


## Modeling using SciKit Learn Package in Python

SKLearn Linear Regression and Ridge are utilized to build a prediction model

### Running a Linear Model to Predict the Features that will be inserted

The features that are eventually fed into the performance model must themselves be predicted first, because we do not yet have the stats for the upcoming season.

The data that we have include stats like Games Played (GP), Wins (W), Losses (L), Position (POSITION) and so on. For the upcoming season, we do not have this data, so we will have to predict it first. 

Below is a Linear Regression Model that is fitted on the 'Pre-Features', which are ['PLAYER_AGE', 'POSITION', 'HEIGHT', 'WEIGHT']. The important stat here is Plus/Minus, which rates the impact a player has while they are on the court.

It will produce an outcome of 'GP' = Games Played, 'W', 'L', 'W_PCT', 'PLUS_MINUS', 'MIN', 'DD2' = Double Doubles, 'TD3' = Triple Doubles, which is used as the input, i.e. the 2020-2021 features.

In [10]:
# Wider candidate feature list; nothing in this cell reads it.
add_features = ['GP', 'W', 'L', 'W_PCT', 'MIN',
                'DD2', 'TD3', 'PLUS_MINUS', 'HEIGHT', 'WEIGHT', 'SEASON_EXP', 'POSITION']

# Inputs of the first-stage model ...
pre_features = ['PLAYER_AGE', 'POSITION', 'HEIGHT', 'WEIGHT']

# ... and the stats it predicts from them.
feature_predict = ['GP', 'W', 'L', 'W_PCT', 'PLUS_MINUS', 'MIN', 'DD2', 'TD3']

position_list = [1, 2, 3]


def _fit_pre_feature_model(position_rows):
    """Fit one LinearRegression mapping pre_features to feature_predict."""
    inputs = position_rows[pre_features].to_numpy()
    targets = position_rows[feature_predict].to_numpy()
    return LinearRegression().fit(inputs, targets)


# One fitted model per entry of position_list, in the same order.
clf_list = [
    _fit_pre_feature_model(input_data[input_data['POSITION'] == pos])
    for pos in position_list
]



### Evaluation of the Pre-Feature Fit Model

Seeing how accurately the model can predict the features that will be used for the 2019-2020 season according to the modeling method above

In [14]:
# Hold-out evaluation of the pre-feature models: fit on every season except
# 2019-20, then compare the predictions for 2019-20 with the recorded values.
old_data = input_data[input_data['SEASON_ID'] != '2019-20']

season_2019 = input_data[input_data['SEASON_ID'] == '2019-20']

position_list = [1, 2, 3]

# One hold-out model per position, fitted only on that position's earlier rows.
test_clf_list = list()
for pos in position_list:
    train_rows = old_data[old_data['POSITION'] == pos]
    test_clf_list.append(
        LinearRegression().fit(
            train_rows[pre_features].to_numpy(dtype = float),
            train_rows[feature_predict].to_numpy(dtype = float),
        )
    )

rmse_list = list()
error_list = list()
n_targets = len(feature_predict)

# Score with the hold-out models (test_clf_list). clf_list was fitted on data
# that includes 2019-20, so using it here would leak the answers.
for pos, clf in zip(position_list, test_clf_list):
    # Every 2019-20 row of an active player at this position is scored once.
    # NOTE(review): assumes one row per player-season - confirm against the data.
    eval_rows = season_2019[
        (season_2019['POSITION'] == pos)
        & season_2019['PERSON_ID'].isin(active_player_ids)
    ]

    if eval_rows.empty:
        # Nothing to score for this position: report NaN rather than dividing by zero.
        error = np.full(n_targets, np.nan)
        rmse = np.full(n_targets, np.nan)
    else:
        actual = eval_rows[feature_predict].to_numpy(dtype = float)
        predicted = clf.predict(eval_rows[pre_features].to_numpy(dtype = float))
        diff = predicted - actual

        # Mean relative error per stat. Entries whose actual value is 0 are
        # left out of the mean, because the ratio is undefined there.
        nonzero = actual != 0
        relative = np.divide(diff, actual, out = np.zeros_like(diff), where = nonzero)
        n_valid = nonzero.sum(axis = 0)
        error = np.divide(relative.sum(axis = 0), n_valid,
                          out = np.full(n_targets, np.nan), where = n_valid > 0)

        rmse = np.sqrt(np.mean(np.power(diff, 2), axis = 0))

    # Each output row is the position code followed by one value per predicted stat.
    error_list.append([pos] + error.tolist())
    rmse_list.append([pos] + rmse.tolist())

# Same column order as feature_predict, prefixed by the position code.
rmse_cols = ['POSITION'] + feature_predict


rmse_df = pd.DataFrame(rmse_list, columns = rmse_cols)
rmse_df.to_csv('../data/output/RMSE Values for Pre-Features.csv')

error_df = pd.DataFrame(error_list, columns = rmse_cols)
error_df.to_csv('../data/output/Error Values for Pre-Features.csv')


### Modeling with SKLearn

The model fit above is utilized to generate 2020-2021 features in the form of ['GP', 'W', 'L', 'W_PCT', 'PLUS_MINUS', 'MIN', 'DD2', 'TD3'] and used as the x_test in predicting y_pred. The previous season data is used to fit a Ridge Regression Model. For players who do not have more than one season of data, the model was fitted using the X, y data available for all in the same position as that player.

In [11]:
# Predict next-season box-score stats for every active player.
# Stage 1 (pre_lr, from clf_list): PLAYER_AGE/POSITION/HEIGHT/WEIGHT -> `features`.
# Stage 2 (Ridge): `features` -> `outputs`.
curr = list()

print('-----------Modeling the data after sorting by position-----------')
input_data = input_data.sort_values(by = ['POSITION'])

position_list = [1, 2, 3]


def _new_ridge():
    """Return an unfitted Ridge regressor with the settings used in this notebook."""
    # NOTE(review): the `normalize` argument was deprecated in scikit-learn 1.0
    # and removed in 1.2; on newer versions this call fails and needs an
    # explicit scaling step instead - confirm the installed version.
    return Ridge(normalize = True, fit_intercept = True, solver = 'lsqr')


# clf_list holds one pre-feature model per entry of position_list, in order.
for i, pre_lr in zip(position_list, clf_list):
    current_df = input_data[input_data['POSITION'] == i]
    position_player_ids = set(current_df['PERSON_ID'])

    # Refitted on each multi-season player's own history.
    clf = _new_ridge()
    # Set up new Regressor for each position to help players who were rookies or only had one season
    # but produced enough to warrant a prediction next season. It is fitted
    # lazily and only once, since its training data is the same for all of them.
    position_clf = None

    for player in active_player_ids:
        if player not in position_player_ids:
            continue

        # Order the player's rows by season so that the last row really is the
        # most recent one; the frame as a whole is only sorted by POSITION.
        # Rows without a SEASON_ID go first so they are never picked as "latest".
        temp_df = input_data[input_data['PERSON_ID'] == player].sort_values(
            by = 'SEASON_ID', na_position = 'first')

        if temp_df.shape[0] > 1:
            model = clf.fit(temp_df[features].to_numpy(), temp_df[outputs].to_numpy())
        else:
            if position_clf is None:
                # Fit on the whole data lot available for that position
                position_clf = _new_ridge().fit(current_df[features].to_numpy(),
                                                current_df[outputs].to_numpy())
            model = position_clf

        # Latest pre-feature row, with the age moved forward one year.
        # PLAYER_AGE is the first entry of pre_features, hence index [0][0].
        x_train = temp_df[pre_features].to_numpy()[-1].reshape(1, -1)
        x_train[0][0] = x_train[0][0] + 1

        x_test = pre_lr.predict(x_train)
        y_pred = model.predict(x_test)

        # One output row per player: id followed by the rounded predictions.
        curr.append([player] + np.round(y_pred[0]).tolist())

print('-----------DONE Modeling the data-----------')

-----------Modeling the data after sorting by position-----------
The exact solution is  x = 0                              
The exact solution is  x = 0                              
The exact solution is  x = 0                              
The exact solution is  x = 0                              
The exact solution is  x = 0                              
The exact solution is  x = 0                              
The exact solution is  x = 0                              
The exact solution is  x = 0                              
The exact solution is  x = 0                              
The exact solution is  x = 0                              
The exact solution is  x = 0                              
The exact solution is  x = 0                              
The exact solution is  x = 0                              
The exact solution is  x = 0                              
The exact solution is  x = 0                              
The exact solution is  x = 0                     

## Turning the data into DF and creating a CSV

Take the prediction data and turn it into CSV File

In [13]:
# Combine the predictions with the historical stats and write one CSV.
out_cols = ['PERSON_ID', 'FGM', 'FGA', 'FG3M', 'FTM', 'FTA', 'REB', 'AST', 'TOV', 'STL', 'BLK', 'PTS']

final_cols = ['PERSON_ID', 'DISPLAY_FIRST_LAST', 'HEIGHT', 'WEIGHT', 'POSITION', 'TEAM_NAME', 'GROUP_VALUE',
              'FGM', 'FGA', 'FG3M', 'FTM', 'FTA', 'REB', 'AST', 'TOV', 'STL', 'BLK', 'PTS']

in_cols = ['PERSON_ID', 'FGM', 'FGA', 'FG3M', 'FTM', 'FTA', 'REB', 'AST', 'TOV', 'STL', 'BLK', 'PTS', 'GROUP_VALUE']
# Historical seasons, joined with the player info columns.
previous_data = active_data[in_cols]
in_merged = pd.merge(active_player_df, previous_data, on = 'PERSON_ID')


print('-----------PRINTING FINAL RESULTS-----------')
#
# Turning outputs from the LR model into a pandas DF and forming into a CSV file
output_df = pd.DataFrame(curr, columns = out_cols)
# A negative prediction for a counting stat is floored at 0; taking the
# absolute value would turn it into a positive stat. The trailing .abs() only
# normalises a rounded -0.0 to 0.0 and leaves every clipped value unchanged.
stat_cols = out_cols[1:]
output_df[stat_cols] = output_df[stat_cols].clip(lower = 0).abs()
# Label the predicted rows with the season they refer to.
output_df['GROUP_VALUE'] = '2020-21'
output_merged = pd.merge(active_player_df, output_df, on = 'PERSON_ID')
final_output = pd.concat([output_merged, in_merged], ignore_index = True)
final_output = final_output[final_cols]
final_output = final_output.sort_values(by = ['DISPLAY_FIRST_LAST', 'GROUP_VALUE'])
final_output.to_csv('../data/output/MODEL_OUTPUT_2020-2021.csv')

print('-----------DONE PRINTING FINAL RESULTS-----------')


-----------PRINTING FINAL RESULTS-----------
-----------DONE PRINTING FINAL RESULTS-----------
