# Final Project
## Predicting Defensive or Offensive NBA Players
Tyler Nicholson, Jesus Fernandez, Rebecca Samouha, and Maria Carmen

## Set Up

In [None]:
# Importing necessary packages
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from ipywidgets import interact, FloatSlider, Dropdown, Button
import statsmodels.api as sm
import statsmodels.formula.api as smf

def get_nba_data(endpt, params, return_url=False, timeout=30):
    """Download one stats.nba.com endpoint and return its first result set.

    The request is made with the standard library instead of shelling out to
    ``wget`` through IPython's ``!`` escape, so the function also works outside
    a notebook and on machines without ``wget``, and network failures surface
    as exceptions instead of an ``IndexError`` on empty output.

    Parameters
    ----------
    endpt : str
        Endpoint name appended to ``http://stats.nba.com/stats/``; see
        https://github.com/seemethere/nba_py/wiki/stats.nba.com-Endpoint-Documentation
    params : dict
        Query parameters, e.g. ``{'LeagueID': '00'}``. They are URL-encoded in
        the order given.
    return_url : bool, optional
        For debugging: when true, no request is made and the URL is returned
        wrapped in double quotes (the form this function has always returned).
    timeout : float, optional
        Seconds to wait for the server before giving up.

    Returns
    -------
    pandas.DataFrame or str
        ``resultSets[0]`` of the JSON response as a frame (``rowSet`` rows under
        ``headers`` columns), or the quoted URL when ``return_url`` is true.

    Raises
    ------
    urllib.error.URLError
        If the request fails (``HTTPError`` for an error status).
    ValueError
        If the response body is not valid JSON.
    """
    from pandas import DataFrame
    from urllib.parse import urlencode
    from urllib.request import Request, urlopen
    import json

    useragent = ("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) "
                 "AppleWebKit/601.3.9 (KHTML, like Gecko) "
                 "Version/9.0.2 Safari/601.3.9")

    url = "http://stats.nba.com/stats/" + endpt + "?" + urlencode(params)

    # For debugging: just return the url (quoted, as before)
    if return_url:
        return "\"" + url + "\""

    request = Request(url, headers={'User-Agent': useragent})
    with urlopen(request, timeout=timeout) as response:
        # Read the whole body; the old code only parsed the first output line
        payload = response.read().decode('utf-8')

    data = json.loads(payload)

    result_set = data['resultSets'][0]
    return DataFrame(result_set['rowSet'], columns=result_set['headers'])

## Collecting Data

In [None]:
# Season labels in the API's 'YYYY-YY' form, e.g. '2013-14'.
# Only these 5 seasons are used because the 'leaguedashptdefend' endpoint only has data
# for them.
season_list = [f"{start}-{str(start + 1)[2:]}" for start in range(2013, 2018)]
season_list

In [None]:
# Player directory from the 'commonallplayers' endpoint (2017-18 request parameters)
params = dict(LeagueID='00', Season='2017-18', IsOnlyCurrentSeason='0')
players = get_nba_data('commonallplayers', params)

In [None]:
# One 'leaguedashplayerptshot' frame per season, in season_list order.
# SeasonType options: (Regular Season)|(Pre Season)|(Playoffs)|(All Star)
# PerMode options: (Totals)|(PerGame)
player_shot = [
    get_nba_data('leaguedashplayerptshot',
                 dict(LeagueID='00',
                      PerMode='Totals',
                      Season=season,
                      SeasonType='Regular Season'))
    for season in season_list
]

In [None]:
# One 'leaguedashplayerbiostats' frame per season, in season_list order
bio_stats = [
    get_nba_data('leaguedashplayerbiostats',
                 dict(LeagueID='00',
                      PerMode='Totals',
                      Season=season,
                      SeasonType='Regular Season'))
    for season in season_list
]

In [None]:
# One 'leaguedashptdefend' frame per season, in season_list order.
# DefenseCategory options: (Overall)|(3 Pointers)|(2 Pointers)|(Less Than 6Ft)
#                          |(Less Than 10Ft)|(Greater Than 15Ft)
pt_defended = [
    get_nba_data('leaguedashptdefend',
                 dict(LeagueID='00',
                      PerMode='Totals',
                      Season=season,
                      SeasonType='Regular Season',
                      DefenseCategory='Overall'))
    for season in season_list
]

In [None]:
# Finding Net Rating for all players: one 'leaguedashplayerstats' (Advanced) frame per season.
# The request template is built once; only 'Season' changes per request. Its slot is
# reserved here so the query-string order stays the same as the key order below.
net_rating_template = {
    'College': '', 'Conference': '', 'Country': '',
    'DateFrom': '', 'DateTo': '',
    'Division': '', 'DraftPick': '', 'DraftYear': '',
    'GameScope': '', 'GameSegment': '',
    'Height': '',
    'LastNGames': '0',
    'LeagueID': '00',
    'Location': '',
    'MeasureType': 'Advanced',
    'Month': '0',
    'OpponentTeamID': '0',
    'Outcome': '',
    'PORound': '0',
    'PaceAdjust': 'N',
    'PerMode': 'Totals',
    'Period': '0',
    'PlayerExperience': '', 'PlayerPosition': '',
    'PlusMinus': 'N',
    'Rank': 'N',
    'Season': None,
    'SeasonSegment': '',
    'SeasonType': 'Regular Season',
    'ShotClockRange': '', 'StarterBench': '',
    'TeamID': '0',
    'VsConference': '', 'VsDivision': '',
    'Weight': '',
}
net_rating = []
for season in season_list:
    # Overwriting an existing key keeps its position in the dict
    params = dict(net_rating_template, Season=season)
    net_rating.append(get_nba_data('leaguedashplayerstats', params))

## Cleaning up data into a single dataframe

In [None]:
# Tag every per-season frame with the season it was fetched for, so the season survives
# concatenation and can be used as a merge key. Iterating the lists in lockstep (instead
# of a hard-coded range of 5) keeps this correct if season_list ever changes length.
for season, *season_frames in zip(season_list, player_shot, bio_stats,
                                  pt_defended, net_rating):
    for frame in season_frames:
        frame['SEASON'] = season

In [None]:
# Stack the per-season frames of each endpoint into one frame per endpoint
player_shot_df, bio_stats_df, pt_defended_df, net_rating_df = (
    pd.concat(season_frames)
    for season_frames in (player_shot, bio_stats, pt_defended, net_rating)
)

In [None]:
# (rows, columns) of each stacked frame, in the order: shots, bio, defended, net rating
for stacked in (player_shot_df, bio_stats_df, pt_defended_df, net_rating_df):
    print(stacked.shape)

In [None]:
# Trim each frame before merging.
# NOTE(review): several dropped columns (e.g. GP, NET_RATING, FGM) are selected again
# after the merge, so they presumably come from one of the other frames -- confirm.
player_shot_df = player_shot_df.drop(columns=[
    'PLAYER_NAME', 'PLAYER_LAST_TEAM_ID', 'GP', 'G',
    'PLAYER_LAST_TEAM_ABBREVIATION', 'AGE',
])

bio_stats_df = bio_stats_df.drop(columns=[
    'PLAYER_NAME', 'TEAM_ID', 'TEAM_ABBREVIATION', 'AGE',
    'PLAYER_HEIGHT', 'COLLEGE', 'COUNTRY', 'DRAFT_YEAR',
    'DRAFT_ROUND', 'DRAFT_NUMBER',
])

pt_defended_df = pt_defended_df.drop(columns=[
    'PLAYER_NAME', 'PLAYER_LAST_TEAM_ID',
    'PLAYER_LAST_TEAM_ABBREVIATION', 'PLAYER_POSITION',
    'AGE', 'GP', 'G',
])

# PLAYER_NAME is kept only in this frame
net_rating_df = net_rating_df.drop(columns=[
    'TEAM_ID', 'TEAM_ABBREVIATION', 'AGE', 'W', 'L', 'W_PCT',
    'OFF_RATING', 'DEF_RATING', 'GP_RANK', 'W_RANK', 'L_RANK',
    'W_PCT_RANK', 'MIN_RANK', 'OFF_RATING_RANK',
    'DEF_RATING_RANK', 'NET_RATING_RANK', 'AST_PCT_RANK',
    'AST_TO_RANK', 'AST_RATIO_RANK', 'OREB_PCT_RANK',
    'DREB_PCT_RANK', 'REB_PCT_RANK', 'TM_TOV_PCT_RANK',
    'EFG_PCT_RANK', 'TS_PCT_RANK', 'USG_PCT_RANK', 'PACE_RANK',
    'PIE_RANK', 'FGM_RANK', 'FGA_RANK', 'FGM_PG_RANK',
    'FGA_PG_RANK', 'FG_PCT_RANK', 'CFID', 'CFPARAMS', 'GP',
    'OREB_PCT', 'DREB_PCT', 'NET_RATING', 'EFG_PCT', 'TS_PCT',
    'USG_PCT', 'FGM', 'FGA', 'AST_PCT', 'FG_PCT',
])

In [None]:
# The defended-shots frame keys players by CLOSE_DEF_PERSON_ID; rename it to PLAYER_ID
# so all four frames share the same key column name.
pt_defended_df = pt_defended_df.rename(columns={'CLOSE_DEF_PERSON_ID': 'PLAYER_ID'})

# Show the remaining columns of each frame, separated by a blank-looking line
labelled_frames = [
    ("Player shot: ", player_shot_df),
    ("Bio stats: ", bio_stats_df),
    ("Pt defended: ", pt_defended_df),
    ("Net rating: ", net_rating_df),
]
for position, (label, frame) in enumerate(labelled_frames):
    if position:
        print("       ")
    print(label, frame.keys())

In [None]:
# Order every frame by player, then season
player_shot_df = player_shot_df.sort_values(by=['PLAYER_ID', 'SEASON'])
bio_stats_df = bio_stats_df.sort_values(by=['PLAYER_ID', 'SEASON'])
pt_defended_df = pt_defended_df.sort_values(by=['PLAYER_ID', 'SEASON'])
net_rating_df = net_rating_df.sort_values(by=['PLAYER_ID', 'SEASON'])

In [None]:
# Inner join (pandas default): shot data + bio stats, one row per player-season
merge1 = player_shot_df.merge(bio_stats_df, on=['PLAYER_ID', 'SEASON'])

In [None]:
# Inner join (pandas default): add the defended-shots columns
merge2 = merge1.merge(pt_defended_df, on=['PLAYER_ID', 'SEASON'])

In [None]:
# Inner join (pandas default): add the advanced-stats columns to get the full dataset
final_df = merge2.merge(net_rating_df, on=['PLAYER_ID', 'SEASON'])

In [None]:
# Column labels of the merged frame
final_df.columns

In [None]:
# Columns to keep from the merged frame, in display order: identifiers first, then the
# NET_RATING column that is later recoded to 0/1, then the remaining stats.
cols = ['SEASON', 'PLAYER_NAME', 'PLAYER_ID',  'PLAYER_HEIGHT_INCHES', 'PLAYER_WEIGHT',
        'GP', 'MIN','NET_RATING', 'PIE', 'PACE', 'PTS', 'TS_PCT','FGA_FREQUENCY', 'FGA_PG', 
        'FGM', 'FGA', 'FG_PCT', 'FGM_PG', 'EFG_PCT', 'FG2A_FREQUENCY', 'FG2M', 'FG2A', 
        'FG2_PCT', 'FG3A_FREQUENCY', 'FG3M', 'FG3A', 'FG3_PCT', 'AST', 'AST_PCT', 'AST_TO', 
        'AST_RATIO', 'REB', 'OREB_PCT', 'DREB_PCT', 'REB_PCT', 'D_FGM', 'D_FGA', 'D_FG_PCT', 
        'FREQ', 'USG_PCT', 'NORMAL_FG_PCT', 'PCT_PLUSMINUS', 'TM_TOV_PCT']

In [None]:
# Keep only the selected columns. The explicit copy makes the result an independent
# frame, so the in-place `.loc` assignments that follow do not raise
# SettingWithCopyWarning.
final_df = final_df[cols].copy()

In [None]:
# Recode NET_RATING by sign: positive -> 1 (Offensive Player), negative -> 0 (Defensive
# Player). Values that are exactly 0 or missing are left as they are.
final_df['NET_RATING'] = (
    final_df['NET_RATING']
    .mask(final_df['NET_RATING'] > 0, 1)
    .mask(final_df['NET_RATING'] < 0, 0)
)

In [None]:
# NET_RATING becomes an integer label; PLAYER_ID is an identifier, so store it as text
final_df = final_df.astype({'NET_RATING': int, 'PLAYER_ID': str})

In [None]:
# Same value for all obs, so these columns carry no information
final_df = final_df.drop(columns=['FREQ', 'FGA_FREQUENCY'])

In [None]:
# Preview the first five rows
final_df.head(n=5)

In [None]:
# Pearson correlation matrix of the numeric columns. The frame also holds text columns
# (SEASON, PLAYER_NAME, PLAYER_ID), which make DataFrame.corr raise on pandas >= 2.0
# where non-numeric columns are no longer dropped silently, so select the numeric
# columns explicitly.
cor = final_df.select_dtypes(include='number').corr(method='pearson')
cor

## Dealing with missing values

In [None]:
# Names of the columns that contain at least one missing value
column_has_missing = final_df.isna().any()
column_has_missing[column_has_missing].index.tolist()

In [None]:
# These two columns have missing values and we can use other variables to capture
# that information
final_df = final_df.drop(columns=['FG2_PCT', 'FG3_PCT'])

In [None]:
# Rows that still contain at least one missing value
final_df[final_df.isnull().any(axis=1)]

In [None]:
# Inspect the row(s) for this player
final_df[final_df['PLAYER_NAME'] == 'Elliot Williams']

In [None]:
# Fill in the height and weight that are missing for Elliot Williams.
# Rows are selected by player name, and only where the value is absent, instead of by
# the hard-coded row label 1070: that label depends on the merge order, and `.loc`
# with a label that does not exist would silently append a new row.
# NOTE(review): assumes label 1070 was this player's row, as the lookup just before
# this cell suggests -- confirm.
is_elliot_williams = final_df['PLAYER_NAME'] == 'Elliot Williams'
final_df.loc[is_elliot_williams & final_df['PLAYER_HEIGHT_INCHES'].isnull(),
             'PLAYER_HEIGHT_INCHES'] = 77.0
final_df.loc[is_elliot_williams & final_df['PLAYER_WEIGHT'].isnull(),
             'PLAYER_WEIGHT'] = 185

In [None]:
# Inspect the row(s) for this player
final_df[final_df['PLAYER_NAME'] == 'Jeff Adrien']

In [None]:
# Fill in the height and weight that are missing for Jeff Adrien.
# Rows are selected by player name, and only where the value is absent, instead of by
# the hard-coded row label 1136: that label depends on the merge order, and `.loc`
# with a label that does not exist would silently append a new row.
# NOTE(review): assumes label 1136 was this player's row, as the lookup just before
# this cell suggests -- confirm.
is_jeff_adrien = final_df['PLAYER_NAME'] == 'Jeff Adrien'
final_df.loc[is_jeff_adrien & final_df['PLAYER_HEIGHT_INCHES'].isnull(),
             'PLAYER_HEIGHT_INCHES'] = 79.0
final_df.loc[is_jeff_adrien & final_df['PLAYER_WEIGHT'].isnull(),
             'PLAYER_WEIGHT'] = 245

In [None]:
# Inspect the row(s) for this player
final_df[final_df['PLAYER_NAME'] == 'Patrick Christopher']

In [None]:
# Fill in the height and weight that are missing for Patrick Christopher.
# Rows are selected by player name, and only where the value is absent, instead of by
# the hard-coded row label 1816: that label depends on the merge order, and `.loc`
# with a label that does not exist would silently append a new row.
# NOTE(review): assumes label 1816 was this player's row, as the lookup just before
# this cell suggests -- confirm.
is_patrick_christopher = final_df['PLAYER_NAME'] == 'Patrick Christopher'
final_df.loc[is_patrick_christopher & final_df['PLAYER_HEIGHT_INCHES'].isnull(),
             'PLAYER_HEIGHT_INCHES'] = 77.0
final_df.loc[is_patrick_christopher & final_df['PLAYER_WEIGHT'].isnull(),
             'PLAYER_WEIGHT'] = 209

In [None]:
# Inspect the row(s) for this player
final_df[final_df['PLAYER_NAME'] == 'Will Cherry']

In [None]:
# Fill in the height and weight that are missing for Will Cherry.
# Rows are selected by player name, and only where the value is absent, instead of by
# the hard-coded row label 1835: that label depends on the merge order, and `.loc`
# with a label that does not exist would silently append a new row.
# NOTE(review): assumes label 1835 was this player's row, as the lookup just before
# this cell suggests -- confirm.
is_will_cherry = final_df['PLAYER_NAME'] == 'Will Cherry'
final_df.loc[is_will_cherry & final_df['PLAYER_HEIGHT_INCHES'].isnull(),
             'PLAYER_HEIGHT_INCHES'] = 73.0
final_df.loc[is_will_cherry & final_df['PLAYER_WEIGHT'].isnull(),
             'PLAYER_WEIGHT'] = 181

In [None]:
# Show the dtype of every column (checked before casting PLAYER_WEIGHT to int)
final_df.dtypes

In [None]:
# Store player weight as an integer; this fails if any weight is still missing
final_df = final_df.astype({'PLAYER_WEIGHT': int})

In [None]:
# True if any cell in the frame is still missing
final_df.isnull().values.any()

In [None]:
# Work on an independent (deep) copy so the cleaned frame stays untouched
final_df_copy = final_df.copy(deep=True)

In [None]:
# Preview the first five rows of the working copy
final_df_copy.head(n=5)

## Getting Training set and Test set

In [None]:
# Got code from Evidation Health Notebooks

# Histogram of the MIN column (title typo "Minutest" corrected)
final_df_copy.MIN.hist()
plt.title('Minutes Played Distribution')
plt.xlabel('Minutes')

In [None]:
# Histogram of the GP (games played) column
games_ax = final_df_copy['GP'].hist()
games_ax.set_title('Games Played Distribution')
games_ax.set_xlabel('Games Played')

Because a player's season totals grow with the number of games and minutes played, we group games played and minutes played into bins and take those bins into account when building our training and test sets.

In [None]:
# Bin edges for minutes played: 0, 10, ..., 50
min_bins = list(range(0, 60, 10))
# New column assigning each row's MIN value to its bin; pd.cut leaves values outside
# the edges as NaN
final_df_copy['min_bin'] = pd.cut(final_df_copy['MIN'], bins=min_bins)

# Bin edges for games played: 0, 10, ..., 90
game_bins = list(range(0, 100, 10))
# New column assigning each row's GP value to its bin
final_df_copy['game_bin'] = pd.cut(final_df_copy['GP'], bins=game_bins)

In [None]:
# Number of rows per games-played bin, in bin order
final_df_copy['game_bin'].value_counts().sort_index().plot.barh()

In [None]:
# Number of rows per minutes-played bin, in bin order
final_df_copy['min_bin'].value_counts().sort_index().plot.barh()

In [None]:
# Specify our test proportion
test_frac = 0.2

# Random stratified sampling: draw test_frac of the rows from every populated
# (game_bin, min_bin) stratum and collect their row labels.
#
# Changes from the earlier version:
# * Raw GP is no longer a grouping key. game_bin is derived from GP, so grouping on
#   both produced one tiny group per distinct GP value, and a 20% sample of a group
#   of one or two rows is empty.
# * group_keys=False keeps the original row labels as the result's index, so the
#   labels no longer have to be unpacked from a MultiIndex. The old unpacking loop
#   assigned into a pandas Index, which is immutable and raises TypeError.
# observed=True only visits bin combinations that actually occur. Rows whose bin is
# NaN are left out by groupby and therefore never land in the test set.
test_sample = (
    final_df_copy
    .groupby(['game_bin', 'min_bin'], group_keys=False, observed=True)
    .apply(lambda stratum: stratum.sample(frac=test_frac, random_state=2))
)
test_indices = test_sample.index.unique()

In [None]:
# Check everything worked out: share of distinct row labels that went to the test set
len(test_indices) / final_df_copy.index.nunique()

In [None]:
# Next, extract train ids: every row label that was not drawn for the test set
drawn_for_test = final_df_copy.index.isin(test_indices)
train_indices = final_df_copy.index[~drawn_for_test].unique()

# QC check: share of distinct row labels that went to the training set
len(train_indices) / final_df_copy.index.nunique()

In [None]:
# True when no row label appears in both the training and the test ids
set(train_indices).isdisjoint(test_indices)

In [None]:
# Materialise the two subsets from their row labels
train_set = final_df_copy.loc[train_indices, :]  # Train set
test_set = final_df_copy.loc[test_indices, :]    # Test set

## GLM

In [None]:
# Binomial GLM (logistic regression) of the 0/1 NET_RATING label on the player stats.
# The model is fitted on train_set only. Fitting on the full final_df_copy, as before,
# would let the model see the held-out test rows and make evaluation on test_set
# meaningless.
glm_predictors = [
    'PLAYER_HEIGHT_INCHES', 'PLAYER_WEIGHT', 'GP', 'MIN', 'PIE', 'PACE', 'PTS',
    'TS_PCT', 'FGA_PG', 'FGM', 'FGA', 'FG_PCT', 'FGM_PG', 'EFG_PCT',
    'FG2A_FREQUENCY', 'FG2M', 'FG2A', 'FG3A_FREQUENCY', 'FG3M', 'FG3A',
    'AST', 'AST_PCT', 'AST_TO', 'AST_RATIO', 'REB', 'OREB_PCT', 'DREB_PCT',
    'REB_PCT', 'D_FGM', 'D_FGA', 'D_FG_PCT', 'USG_PCT', 'NORMAL_FG_PCT',
    'PCT_PLUSMINUS', 'TM_TOV_PCT',
]
# Same formula string as before, assembled from the list above
glm_formula = 'NET_RATING ~ ' + ' + '.join(glm_predictors)

fitted_model = smf.glm(formula=glm_formula,
                       data=train_set,
                       family=sm.families.Binomial()).fit()

In [None]:
# Display the fitted model's summary table
fitted_model.summary()