### Basic Imports

In [17]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [18]:
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

### Modeling Imports

In [19]:
from sklearn.model_selection import train_test_split, cross_val_score

## Read in the data

---
Data is provided by [Kaggle](https://www.kaggle.com/c/kobe-bryant-shot-selection/data?select=sample_submission.csv.zip).

The data comes in together. However, the test data is any `NaN` value in the `shot_made_flags` column. Everything else is considered train data. We will determine 5000 of Kobe's shot to see if it went in. 

In [20]:
df = pd.read_csv('kobe/kobe.csv')

In [21]:
df.head()

Unnamed: 0,action_type,combined_shot_type,game_event_id,game_id,lat,loc_x,loc_y,lon,minutes_remaining,period,playoffs,season,seconds_remaining,shot_distance,shot_made_flag,shot_type,shot_zone_area,shot_zone_basic,shot_zone_range,team_id,team_name,game_date,matchup,opponent,shot_id
0,Jump Shot,Jump Shot,10,20000012,33.9723,167,72,-118.1028,10,1,0,2000-01,27,18,,2PT Field Goal,Right Side(R),Mid-Range,16-24 ft.,1610612747,Los Angeles Lakers,2000-10-31,LAL @ POR,POR,1
1,Jump Shot,Jump Shot,12,20000012,34.0443,-157,0,-118.4268,10,1,0,2000-01,22,15,0.0,2PT Field Goal,Left Side(L),Mid-Range,8-16 ft.,1610612747,Los Angeles Lakers,2000-10-31,LAL @ POR,POR,2
2,Jump Shot,Jump Shot,35,20000012,33.9093,-101,135,-118.3708,7,1,0,2000-01,45,16,1.0,2PT Field Goal,Left Side Center(LC),Mid-Range,16-24 ft.,1610612747,Los Angeles Lakers,2000-10-31,LAL @ POR,POR,3
3,Jump Shot,Jump Shot,43,20000012,33.8693,138,175,-118.1318,6,1,0,2000-01,52,22,0.0,2PT Field Goal,Right Side Center(RC),Mid-Range,16-24 ft.,1610612747,Los Angeles Lakers,2000-10-31,LAL @ POR,POR,4
4,Driving Dunk Shot,Dunk,155,20000012,34.0443,0,0,-118.2698,6,2,0,2000-01,19,0,1.0,2PT Field Goal,Center(C),Restricted Area,Less Than 8 ft.,1610612747,Los Angeles Lakers,2000-10-31,LAL @ POR,POR,5


In [22]:
df.describe()

Unnamed: 0,game_event_id,game_id,lat,loc_x,loc_y,lon,minutes_remaining,period,playoffs,seconds_remaining,shot_distance,shot_made_flag,team_id,shot_id
count,30697.0,30697.0,30697.0,30697.0,30697.0,30697.0,30697.0,30697.0,30697.0,30697.0,30697.0,25697.0,30697.0,30697.0
mean,249.1908,24764070.0,33.953192,7.110499,91.107535,-118.26269,4.885624,2.519432,0.146562,28.365085,13.437437,0.446161,1610613000.0,15349.0
std,150.003712,7755175.0,0.087791,110.124578,87.791361,0.110125,3.449897,1.153665,0.353674,17.478949,9.374189,0.497103,0.0,8861.604943
min,2.0,20000010.0,33.2533,-250.0,-44.0,-118.5198,0.0,1.0,0.0,0.0,0.0,0.0,1610613000.0,1.0
25%,110.0,20500080.0,33.8843,-68.0,4.0,-118.3378,2.0,1.0,0.0,13.0,5.0,0.0,1610613000.0,7675.0
50%,253.0,20900350.0,33.9703,0.0,74.0,-118.2698,5.0,3.0,0.0,28.0,15.0,0.0,1610613000.0,15349.0
75%,368.0,29600470.0,34.0403,95.0,160.0,-118.1748,8.0,3.0,0.0,43.0,21.0,1.0,1610613000.0,23023.0
max,659.0,49900090.0,34.0883,248.0,791.0,-118.0218,11.0,7.0,1.0,59.0,79.0,1.0,1610613000.0,30697.0


In [23]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30697 entries, 0 to 30696
Data columns (total 25 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   action_type         30697 non-null  object 
 1   combined_shot_type  30697 non-null  object 
 2   game_event_id       30697 non-null  int64  
 3   game_id             30697 non-null  int64  
 4   lat                 30697 non-null  float64
 5   loc_x               30697 non-null  int64  
 6   loc_y               30697 non-null  int64  
 7   lon                 30697 non-null  float64
 8   minutes_remaining   30697 non-null  int64  
 9   period              30697 non-null  int64  
 10  playoffs            30697 non-null  int64  
 11  season              30697 non-null  object 
 12  seconds_remaining   30697 non-null  int64  
 13  shot_distance       30697 non-null  int64  
 14  shot_made_flag      25697 non-null  float64
 15  shot_type           30697 non-null  object 
 16  shot

## Make a Train and Test data

---


In [24]:
train = df[(df['shot_made_flag']==1) | (df['shot_made_flag']==0)]
test = df[(df['shot_made_flag']!=1) & (df['shot_made_flag']!=0)]

In [25]:
train.head()

Unnamed: 0,action_type,combined_shot_type,game_event_id,game_id,lat,loc_x,loc_y,lon,minutes_remaining,period,playoffs,season,seconds_remaining,shot_distance,shot_made_flag,shot_type,shot_zone_area,shot_zone_basic,shot_zone_range,team_id,team_name,game_date,matchup,opponent,shot_id
1,Jump Shot,Jump Shot,12,20000012,34.0443,-157,0,-118.4268,10,1,0,2000-01,22,15,0.0,2PT Field Goal,Left Side(L),Mid-Range,8-16 ft.,1610612747,Los Angeles Lakers,2000-10-31,LAL @ POR,POR,2
2,Jump Shot,Jump Shot,35,20000012,33.9093,-101,135,-118.3708,7,1,0,2000-01,45,16,1.0,2PT Field Goal,Left Side Center(LC),Mid-Range,16-24 ft.,1610612747,Los Angeles Lakers,2000-10-31,LAL @ POR,POR,3
3,Jump Shot,Jump Shot,43,20000012,33.8693,138,175,-118.1318,6,1,0,2000-01,52,22,0.0,2PT Field Goal,Right Side Center(RC),Mid-Range,16-24 ft.,1610612747,Los Angeles Lakers,2000-10-31,LAL @ POR,POR,4
4,Driving Dunk Shot,Dunk,155,20000012,34.0443,0,0,-118.2698,6,2,0,2000-01,19,0,1.0,2PT Field Goal,Center(C),Restricted Area,Less Than 8 ft.,1610612747,Los Angeles Lakers,2000-10-31,LAL @ POR,POR,5
5,Jump Shot,Jump Shot,244,20000012,34.0553,-145,-11,-118.4148,9,3,0,2000-01,32,14,0.0,2PT Field Goal,Left Side(L),Mid-Range,8-16 ft.,1610612747,Los Angeles Lakers,2000-10-31,LAL @ POR,POR,6


In [26]:
test.head()

Unnamed: 0,action_type,combined_shot_type,game_event_id,game_id,lat,loc_x,loc_y,lon,minutes_remaining,period,playoffs,season,seconds_remaining,shot_distance,shot_made_flag,shot_type,shot_zone_area,shot_zone_basic,shot_zone_range,team_id,team_name,game_date,matchup,opponent,shot_id
0,Jump Shot,Jump Shot,10,20000012,33.9723,167,72,-118.1028,10,1,0,2000-01,27,18,,2PT Field Goal,Right Side(R),Mid-Range,16-24 ft.,1610612747,Los Angeles Lakers,2000-10-31,LAL @ POR,POR,1
7,Jump Shot,Jump Shot,254,20000012,34.0163,1,28,-118.2688,8,3,0,2000-01,5,2,,2PT Field Goal,Center(C),Restricted Area,Less Than 8 ft.,1610612747,Los Angeles Lakers,2000-10-31,LAL @ POR,POR,8
16,Driving Layup Shot,Layup,100,20000019,34.0443,0,0,-118.2698,0,1,0,2000-01,1,0,,2PT Field Goal,Center(C),Restricted Area,Less Than 8 ft.,1610612747,Los Angeles Lakers,2000-11-01,LAL vs. UTA,UTA,17
19,Driving Layup Shot,Layup,249,20000019,34.0443,0,0,-118.2698,10,3,0,2000-01,46,0,,2PT Field Goal,Center(C),Restricted Area,Less Than 8 ft.,1610612747,Los Angeles Lakers,2000-11-01,LAL vs. UTA,UTA,20
32,Jump Shot,Jump Shot,4,20000047,33.9683,163,76,-118.1068,11,1,0,2000-01,26,17,,2PT Field Goal,Right Side(R),Mid-Range,16-24 ft.,1610612747,Los Angeles Lakers,2000-11-04,LAL @ VAN,VAN,33


In [27]:
train.shape, test.shape

((25697, 25), (5000, 25))

In [28]:
# check the empty values of test dataframe
test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5000 entries, 0 to 30693
Data columns (total 25 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   action_type         5000 non-null   object 
 1   combined_shot_type  5000 non-null   object 
 2   game_event_id       5000 non-null   int64  
 3   game_id             5000 non-null   int64  
 4   lat                 5000 non-null   float64
 5   loc_x               5000 non-null   int64  
 6   loc_y               5000 non-null   int64  
 7   lon                 5000 non-null   float64
 8   minutes_remaining   5000 non-null   int64  
 9   period              5000 non-null   int64  
 10  playoffs            5000 non-null   int64  
 11  season              5000 non-null   object 
 12  seconds_remaining   5000 non-null   int64  
 13  shot_distance       5000 non-null   int64  
 14  shot_made_flag      0 non-null      float64
 15  shot_type           5000 non-null   object 
 16  shot_

We will drop the `shot_made_flag` column in `test`. This is what we will feed into our model to make predictions.

In [29]:
test = test.drop(columns='shot_made_flag')
test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5000 entries, 0 to 30693
Data columns (total 24 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   action_type         5000 non-null   object 
 1   combined_shot_type  5000 non-null   object 
 2   game_event_id       5000 non-null   int64  
 3   game_id             5000 non-null   int64  
 4   lat                 5000 non-null   float64
 5   loc_x               5000 non-null   int64  
 6   loc_y               5000 non-null   int64  
 7   lon                 5000 non-null   float64
 8   minutes_remaining   5000 non-null   int64  
 9   period              5000 non-null   int64  
 10  playoffs            5000 non-null   int64  
 11  season              5000 non-null   object 
 12  seconds_remaining   5000 non-null   int64  
 13  shot_distance       5000 non-null   int64  
 14  shot_type           5000 non-null   object 
 15  shot_zone_area      5000 non-null   object 
 16  shot_

# Preprocessing


---
We will check for nulls. Since the only nulls are the values we are trying to predict, there is no need to do any imputing of values. However, this does not excuse remapping.

In [49]:
train.isnull().sum()

action_type               0
combined_shot_type        0
game_event_id             0
game_id                   0
lat                       0
loc_x                     0
loc_y                     0
lon                       0
minutes_remaining         0
period                    0
playoffs                  0
season                    0
seconds_remaining         0
shot_distance             0
shot_made_flag            0
shot_type                 0
shot_zone_area            0
shot_zone_basic           0
shot_zone_range           0
team_id                   0
team_name                 0
game_date                 0
matchup                   0
opponent                  0
shot_id                   0
action_type_NUM           0
combined_shot_type_NUM    0
shot_type_NUM             0
shot_zone_area_NUM        0
shot_zone_basic_NUM       0
shot_zone_range_NUM       0
dtype: int64

In [48]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 25697 entries, 1 to 30696
Data columns (total 31 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   action_type             25697 non-null  object 
 1   combined_shot_type      25697 non-null  object 
 2   game_event_id           25697 non-null  int64  
 3   game_id                 25697 non-null  int64  
 4   lat                     25697 non-null  float64
 5   loc_x                   25697 non-null  int64  
 6   loc_y                   25697 non-null  int64  
 7   lon                     25697 non-null  float64
 8   minutes_remaining       25697 non-null  int64  
 9   period                  25697 non-null  int64  
 10  playoffs                25697 non-null  int64  
 11  season                  25697 non-null  object 
 12  seconds_remaining       25697 non-null  int64  
 13  shot_distance           25697 non-null  int64  
 14  shot_made_flag          25697 non-null

In [37]:
train.sample(5)

Unnamed: 0,action_type,combined_shot_type,game_event_id,game_id,lat,loc_x,loc_y,lon,minutes_remaining,period,playoffs,season,seconds_remaining,shot_distance,shot_made_flag,shot_type,shot_zone_area,shot_zone_basic,shot_zone_range,team_id,team_name,game_date,matchup,opponent,shot_id
337,Jump Shot,Jump Shot,61,20000230,33.8793,-86,165,-118.3558,5,1,0,2000-01,30,18,1.0,2PT Field Goal,Left Side Center(LC),Mid-Range,16-24 ft.,1610612747,Los Angeles Lakers,2000-12-01,LAL vs. SAS,SAS,338
26888,Turnaround Jump Shot,Jump Shot,394,40100316,33.9813,91,63,-118.1788,9,4,1,2001-02,2,11,1.0,2PT Field Goal,Right Side(R),Mid-Range,8-16 ft.,1610612747,Los Angeles Lakers,2002-05-31,LAL vs. SAC,SAC,26889
17768,Jump Shot,Jump Shot,334,21000999,33.8633,48,181,-118.2218,1,3,0,2010-11,32,18,0.0,2PT Field Goal,Center(C),Mid-Range,16-24 ft.,1610612747,Los Angeles Lakers,2011-03-14,LAL vs. ORL,ORL,17769
27073,Jump Shot,Jump Shot,11,40200174,33.9933,-163,51,-118.4328,10,1,1,2002-03,10,17,0.0,2PT Field Goal,Left Side(L),Mid-Range,16-24 ft.,1610612747,Los Angeles Lakers,2003-04-27,LAL vs. MIN,MIN,27074
28236,Running Jump Shot,Jump Shot,80,40700226,34.0083,57,36,-118.2128,3,1,1,2007-08,6,6,1.0,2PT Field Goal,Center(C),In The Paint (Non-RA),Less Than 8 ft.,1610612747,Los Angeles Lakers,2008-05-16,LAL @ UTA,UTA,28237


Look at some unique values for some categorical variables.

In [36]:
cats = ['action_type', 'combined_shot_type', 'shot_type','shot_zone_area', 'shot_zone_basic','shot_zone_range']

for c in cats:
    print(f'There are {len(train[c].unique())} unique values in {c.title()}.')
    print()
    print(train[c].unique())
    print('--'*35)

There are 55 unique values in Action_Type.

['Jump Shot' 'Driving Dunk Shot' 'Layup Shot' 'Running Jump Shot'
 'Reverse Dunk Shot' 'Slam Dunk Shot' 'Driving Layup Shot'
 'Turnaround Jump Shot' 'Reverse Layup Shot' 'Tip Shot'
 'Running Hook Shot' 'Alley Oop Dunk Shot' 'Dunk Shot'
 'Alley Oop Layup shot' 'Running Dunk Shot' 'Driving Finger Roll Shot'
 'Running Layup Shot' 'Finger Roll Shot' 'Fadeaway Jump Shot'
 'Follow Up Dunk Shot' 'Hook Shot' 'Turnaround Hook Shot' 'Jump Hook Shot'
 'Running Finger Roll Shot' 'Jump Bank Shot' 'Turnaround Finger Roll Shot'
 'Hook Bank Shot' 'Driving Hook Shot' 'Running Tip Shot'
 'Running Reverse Layup Shot' 'Driving Finger Roll Layup Shot'
 'Fadeaway Bank shot' 'Pullup Jump shot' 'Finger Roll Layup Shot'
 'Turnaround Fadeaway shot' 'Driving Reverse Layup Shot'
 'Driving Slam Dunk Shot' 'Step Back Jump shot' 'Turnaround Bank shot'
 'Reverse Slam Dunk Shot' 'Floating Jump shot' 'Putback Slam Dunk Shot'
 'Running Bank shot' 'Driving Bank shot' 'Driving J

### Engineer the categorical columns into numeric values to use for classification

---
We will retain the orignal columns but make new columns that we can convert into numerics. This will be applied to the `test` dataframe as well.

In [40]:
# Train dataframe
train['action_type_NUM'] = train['action_type']
train['combined_shot_type_NUM'] = train['combined_shot_type']
train['shot_type_NUM'] = train['shot_type']
train['shot_zone_area_NUM'] = train['shot_zone_area']
train['shot_zone_basic_NUM'] = train['shot_zone_basic']
train['shot_zone_range_NUM'] = train['shot_zone_range']
# Test dataframe
test['action_type_NUM'] = test['action_type']
test['combined_shot_type_NUM'] = test['combined_shot_type']
test['shot_type_NUM'] = test['shot_type']
test['shot_zone_area_NUM'] = test['shot_zone_area']
test['shot_zone_basic_NUM'] = test['shot_zone_basic']
test['shot_zone_range_NUM'] = test['shot_zone_range']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_

In [45]:
print(test.columns) 
print()
print(train.columns)

Index(['action_type', 'combined_shot_type', 'game_event_id', 'game_id', 'lat', 'loc_x', 'loc_y', 'lon', 'minutes_remaining', 'period', 'playoffs', 'season', 'seconds_remaining', 'shot_distance', 'shot_type', 'shot_zone_area', 'shot_zone_basic', 'shot_zone_range', 'team_id', 'team_name', 'game_date', 'matchup', 'opponent', 'shot_id', 'action_type_NUM', 'combined_shot_type_NUM', 'shot_type_NUM', 'shot_zone_area_NUM', 'shot_zone_basic_NUM', 'shot_zone_range_NUM'], dtype='object')

Index(['action_type', 'combined_shot_type', 'game_event_id', 'game_id', 'lat', 'loc_x', 'loc_y', 'lon', 'minutes_remaining', 'period', 'playoffs', 'season', 'seconds_remaining', 'shot_distance', 'shot_made_flag', 'shot_type', 'shot_zone_area', 'shot_zone_basic', 'shot_zone_range', 'team_id', 'team_name', 'game_date', 'matchup', 'opponent', 'shot_id', 'action_type_NUM', 'combined_shot_type_NUM', 'shot_type_NUM', 'shot_zone_area_NUM', 'shot_zone_basic_NUM', 'shot_zone_range_NUM'], dtype='object')


## Mapping the `NUM` columns

---
Convert the numbers into some numeric value that seems plausible for each column.

In [47]:
train['shot_type_NUM'] = train['shot_type_NUM'].map({'2PT Field Goal':2,
                            '3PT Field Goal':3})
test['shot_type_NUM'] = test['shot_type_NUM'].map({'2PT Field Goal':2,
                            '3PT Field Goal':3})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


# Create X and y from `train` dataframe

---
We will create `X` and `y`. From there, convert them into X_train, X_val, y_train, and y_val.

In [26]:
X = train.drop(columns='shot_made_flag')
y = train['shot_made_flag']

In [28]:
X.head()

Unnamed: 0,action_type,combined_shot_type,game_event_id,game_id,lat,loc_x,loc_y,lon,minutes_remaining,period,playoffs,season,seconds_remaining,shot_distance,shot_type,shot_zone_area,shot_zone_basic,shot_zone_range,team_id,team_name,game_date,matchup,opponent,shot_id
1,Jump Shot,Jump Shot,12,20000012,34.0443,-157,0,-118.4268,10,1,0,2000-01,22,15,2PT Field Goal,Left Side(L),Mid-Range,8-16 ft.,1610612747,Los Angeles Lakers,2000-10-31,LAL @ POR,POR,2
2,Jump Shot,Jump Shot,35,20000012,33.9093,-101,135,-118.3708,7,1,0,2000-01,45,16,2PT Field Goal,Left Side Center(LC),Mid-Range,16-24 ft.,1610612747,Los Angeles Lakers,2000-10-31,LAL @ POR,POR,3
3,Jump Shot,Jump Shot,43,20000012,33.8693,138,175,-118.1318,6,1,0,2000-01,52,22,2PT Field Goal,Right Side Center(RC),Mid-Range,16-24 ft.,1610612747,Los Angeles Lakers,2000-10-31,LAL @ POR,POR,4
4,Driving Dunk Shot,Dunk,155,20000012,34.0443,0,0,-118.2698,6,2,0,2000-01,19,0,2PT Field Goal,Center(C),Restricted Area,Less Than 8 ft.,1610612747,Los Angeles Lakers,2000-10-31,LAL @ POR,POR,5
5,Jump Shot,Jump Shot,244,20000012,34.0553,-145,-11,-118.4148,9,3,0,2000-01,32,14,2PT Field Goal,Left Side(L),Mid-Range,8-16 ft.,1610612747,Los Angeles Lakers,2000-10-31,LAL @ POR,POR,6


In [29]:
y.head()

1    0.0
2    1.0
3    0.0
4    1.0
5    0.0
Name: shot_made_flag, dtype: float64

In [34]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=.33, random_state=24)

In [35]:
X_train.shape, y_train.shape

((17216, 24), (17216,))

In [36]:
X_val.shape, y_val.shape

((8481, 24), (8481,))