# Preprocessing and Modeling

### Basic Imports

In [56]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [57]:
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

### Modeling Imports

In [181]:
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.metrics import f1_score

from sklearn.linear_model import LogisticRegression
from sklearn.tree import plot_tree, DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.svm import LinearSVC, SVC

import pickle

In [89]:
np.random.seed(824)

## Read in the data

---
Data is provided by [Kaggle](https://www.kaggle.com/c/kobe-bryant-shot-selection/data?select=sample_submission.csv.zip).

The data comes in together. However, the test data is any `NaN` value in the `shot_made_flags` column. Everything else is considered train data. We will determine 5000 of Kobe's shot to see if it went in. 

In [59]:
df = pd.read_csv('kobe/kobe.csv')

In [60]:
df.head()

Unnamed: 0,action_type,combined_shot_type,game_event_id,game_id,lat,loc_x,loc_y,lon,minutes_remaining,period,playoffs,season,seconds_remaining,shot_distance,shot_made_flag,shot_type,shot_zone_area,shot_zone_basic,shot_zone_range,team_id,team_name,game_date,matchup,opponent,shot_id
0,Jump Shot,Jump Shot,10,20000012,33.9723,167,72,-118.1028,10,1,0,2000-01,27,18,,2PT Field Goal,Right Side(R),Mid-Range,16-24 ft.,1610612747,Los Angeles Lakers,2000-10-31,LAL @ POR,POR,1
1,Jump Shot,Jump Shot,12,20000012,34.0443,-157,0,-118.4268,10,1,0,2000-01,22,15,0.0,2PT Field Goal,Left Side(L),Mid-Range,8-16 ft.,1610612747,Los Angeles Lakers,2000-10-31,LAL @ POR,POR,2
2,Jump Shot,Jump Shot,35,20000012,33.9093,-101,135,-118.3708,7,1,0,2000-01,45,16,1.0,2PT Field Goal,Left Side Center(LC),Mid-Range,16-24 ft.,1610612747,Los Angeles Lakers,2000-10-31,LAL @ POR,POR,3
3,Jump Shot,Jump Shot,43,20000012,33.8693,138,175,-118.1318,6,1,0,2000-01,52,22,0.0,2PT Field Goal,Right Side Center(RC),Mid-Range,16-24 ft.,1610612747,Los Angeles Lakers,2000-10-31,LAL @ POR,POR,4
4,Driving Dunk Shot,Dunk,155,20000012,34.0443,0,0,-118.2698,6,2,0,2000-01,19,0,1.0,2PT Field Goal,Center(C),Restricted Area,Less Than 8 ft.,1610612747,Los Angeles Lakers,2000-10-31,LAL @ POR,POR,5


In [61]:
df.describe()

Unnamed: 0,game_event_id,game_id,lat,loc_x,loc_y,lon,minutes_remaining,period,playoffs,seconds_remaining,shot_distance,shot_made_flag,team_id,shot_id
count,30697.0,30697.0,30697.0,30697.0,30697.0,30697.0,30697.0,30697.0,30697.0,30697.0,30697.0,25697.0,30697.0,30697.0
mean,249.1908,24764070.0,33.953192,7.110499,91.107535,-118.26269,4.885624,2.519432,0.146562,28.365085,13.437437,0.446161,1610613000.0,15349.0
std,150.003712,7755175.0,0.087791,110.124578,87.791361,0.110125,3.449897,1.153665,0.353674,17.478949,9.374189,0.497103,0.0,8861.604943
min,2.0,20000010.0,33.2533,-250.0,-44.0,-118.5198,0.0,1.0,0.0,0.0,0.0,0.0,1610613000.0,1.0
25%,110.0,20500080.0,33.8843,-68.0,4.0,-118.3378,2.0,1.0,0.0,13.0,5.0,0.0,1610613000.0,7675.0
50%,253.0,20900350.0,33.9703,0.0,74.0,-118.2698,5.0,3.0,0.0,28.0,15.0,0.0,1610613000.0,15349.0
75%,368.0,29600470.0,34.0403,95.0,160.0,-118.1748,8.0,3.0,0.0,43.0,21.0,1.0,1610613000.0,23023.0
max,659.0,49900090.0,34.0883,248.0,791.0,-118.0218,11.0,7.0,1.0,59.0,79.0,1.0,1610613000.0,30697.0


In [62]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30697 entries, 0 to 30696
Data columns (total 25 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   action_type         30697 non-null  object 
 1   combined_shot_type  30697 non-null  object 
 2   game_event_id       30697 non-null  int64  
 3   game_id             30697 non-null  int64  
 4   lat                 30697 non-null  float64
 5   loc_x               30697 non-null  int64  
 6   loc_y               30697 non-null  int64  
 7   lon                 30697 non-null  float64
 8   minutes_remaining   30697 non-null  int64  
 9   period              30697 non-null  int64  
 10  playoffs            30697 non-null  int64  
 11  season              30697 non-null  object 
 12  seconds_remaining   30697 non-null  int64  
 13  shot_distance       30697 non-null  int64  
 14  shot_made_flag      25697 non-null  float64
 15  shot_type           30697 non-null  object 
 16  shot

## Make a Train and Test data

---


In [63]:
train = df[(df['shot_made_flag']==1) | (df['shot_made_flag']==0)].copy()
test = df[(df['shot_made_flag']!=1) & (df['shot_made_flag']!=0)].copy()

In [64]:
train.head()

Unnamed: 0,action_type,combined_shot_type,game_event_id,game_id,lat,loc_x,loc_y,lon,minutes_remaining,period,playoffs,season,seconds_remaining,shot_distance,shot_made_flag,shot_type,shot_zone_area,shot_zone_basic,shot_zone_range,team_id,team_name,game_date,matchup,opponent,shot_id
1,Jump Shot,Jump Shot,12,20000012,34.0443,-157,0,-118.4268,10,1,0,2000-01,22,15,0.0,2PT Field Goal,Left Side(L),Mid-Range,8-16 ft.,1610612747,Los Angeles Lakers,2000-10-31,LAL @ POR,POR,2
2,Jump Shot,Jump Shot,35,20000012,33.9093,-101,135,-118.3708,7,1,0,2000-01,45,16,1.0,2PT Field Goal,Left Side Center(LC),Mid-Range,16-24 ft.,1610612747,Los Angeles Lakers,2000-10-31,LAL @ POR,POR,3
3,Jump Shot,Jump Shot,43,20000012,33.8693,138,175,-118.1318,6,1,0,2000-01,52,22,0.0,2PT Field Goal,Right Side Center(RC),Mid-Range,16-24 ft.,1610612747,Los Angeles Lakers,2000-10-31,LAL @ POR,POR,4
4,Driving Dunk Shot,Dunk,155,20000012,34.0443,0,0,-118.2698,6,2,0,2000-01,19,0,1.0,2PT Field Goal,Center(C),Restricted Area,Less Than 8 ft.,1610612747,Los Angeles Lakers,2000-10-31,LAL @ POR,POR,5
5,Jump Shot,Jump Shot,244,20000012,34.0553,-145,-11,-118.4148,9,3,0,2000-01,32,14,0.0,2PT Field Goal,Left Side(L),Mid-Range,8-16 ft.,1610612747,Los Angeles Lakers,2000-10-31,LAL @ POR,POR,6


In [65]:
test.head()

Unnamed: 0,action_type,combined_shot_type,game_event_id,game_id,lat,loc_x,loc_y,lon,minutes_remaining,period,playoffs,season,seconds_remaining,shot_distance,shot_made_flag,shot_type,shot_zone_area,shot_zone_basic,shot_zone_range,team_id,team_name,game_date,matchup,opponent,shot_id
0,Jump Shot,Jump Shot,10,20000012,33.9723,167,72,-118.1028,10,1,0,2000-01,27,18,,2PT Field Goal,Right Side(R),Mid-Range,16-24 ft.,1610612747,Los Angeles Lakers,2000-10-31,LAL @ POR,POR,1
7,Jump Shot,Jump Shot,254,20000012,34.0163,1,28,-118.2688,8,3,0,2000-01,5,2,,2PT Field Goal,Center(C),Restricted Area,Less Than 8 ft.,1610612747,Los Angeles Lakers,2000-10-31,LAL @ POR,POR,8
16,Driving Layup Shot,Layup,100,20000019,34.0443,0,0,-118.2698,0,1,0,2000-01,1,0,,2PT Field Goal,Center(C),Restricted Area,Less Than 8 ft.,1610612747,Los Angeles Lakers,2000-11-01,LAL vs. UTA,UTA,17
19,Driving Layup Shot,Layup,249,20000019,34.0443,0,0,-118.2698,10,3,0,2000-01,46,0,,2PT Field Goal,Center(C),Restricted Area,Less Than 8 ft.,1610612747,Los Angeles Lakers,2000-11-01,LAL vs. UTA,UTA,20
32,Jump Shot,Jump Shot,4,20000047,33.9683,163,76,-118.1068,11,1,0,2000-01,26,17,,2PT Field Goal,Right Side(R),Mid-Range,16-24 ft.,1610612747,Los Angeles Lakers,2000-11-04,LAL @ VAN,VAN,33


In [66]:
train.shape, test.shape

((25697, 25), (5000, 25))

In [67]:
# check the empty values of test dataframe
test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5000 entries, 0 to 30693
Data columns (total 25 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   action_type         5000 non-null   object 
 1   combined_shot_type  5000 non-null   object 
 2   game_event_id       5000 non-null   int64  
 3   game_id             5000 non-null   int64  
 4   lat                 5000 non-null   float64
 5   loc_x               5000 non-null   int64  
 6   loc_y               5000 non-null   int64  
 7   lon                 5000 non-null   float64
 8   minutes_remaining   5000 non-null   int64  
 9   period              5000 non-null   int64  
 10  playoffs            5000 non-null   int64  
 11  season              5000 non-null   object 
 12  seconds_remaining   5000 non-null   int64  
 13  shot_distance       5000 non-null   int64  
 14  shot_made_flag      0 non-null      float64
 15  shot_type           5000 non-null   object 
 16  shot_

We will drop the `shot_made_flag` column in `test`. This is what we will feed into our model to make predictions.

In [68]:
test = test.drop(columns='shot_made_flag')
test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5000 entries, 0 to 30693
Data columns (total 24 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   action_type         5000 non-null   object 
 1   combined_shot_type  5000 non-null   object 
 2   game_event_id       5000 non-null   int64  
 3   game_id             5000 non-null   int64  
 4   lat                 5000 non-null   float64
 5   loc_x               5000 non-null   int64  
 6   loc_y               5000 non-null   int64  
 7   lon                 5000 non-null   float64
 8   minutes_remaining   5000 non-null   int64  
 9   period              5000 non-null   int64  
 10  playoffs            5000 non-null   int64  
 11  season              5000 non-null   object 
 12  seconds_remaining   5000 non-null   int64  
 13  shot_distance       5000 non-null   int64  
 14  shot_type           5000 non-null   object 
 15  shot_zone_area      5000 non-null   object 
 16  shot_

# Preprocessing


---
We will check for nulls. Since the only nulls are the values we are trying to predict, there is no need to do any imputing of values. However, this does not excuse remapping.

In [69]:
train.isnull().sum()

action_type           0
combined_shot_type    0
game_event_id         0
game_id               0
lat                   0
loc_x                 0
loc_y                 0
lon                   0
minutes_remaining     0
period                0
playoffs              0
season                0
seconds_remaining     0
shot_distance         0
shot_made_flag        0
shot_type             0
shot_zone_area        0
shot_zone_basic       0
shot_zone_range       0
team_id               0
team_name             0
game_date             0
matchup               0
opponent              0
shot_id               0
dtype: int64

In [70]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 25697 entries, 1 to 30696
Data columns (total 25 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   action_type         25697 non-null  object 
 1   combined_shot_type  25697 non-null  object 
 2   game_event_id       25697 non-null  int64  
 3   game_id             25697 non-null  int64  
 4   lat                 25697 non-null  float64
 5   loc_x               25697 non-null  int64  
 6   loc_y               25697 non-null  int64  
 7   lon                 25697 non-null  float64
 8   minutes_remaining   25697 non-null  int64  
 9   period              25697 non-null  int64  
 10  playoffs            25697 non-null  int64  
 11  season              25697 non-null  object 
 12  seconds_remaining   25697 non-null  int64  
 13  shot_distance       25697 non-null  int64  
 14  shot_made_flag      25697 non-null  float64
 15  shot_type           25697 non-null  object 
 16  shot

In [71]:
train.sample(5)

Unnamed: 0,action_type,combined_shot_type,game_event_id,game_id,lat,loc_x,loc_y,lon,minutes_remaining,period,playoffs,season,seconds_remaining,shot_distance,shot_made_flag,shot_type,shot_zone_area,shot_zone_basic,shot_zone_range,team_id,team_name,game_date,matchup,opponent,shot_id
15150,Jump Shot,Jump Shot,25,20900197,33.8613,-106,183,-118.3758,8,1,0,2009-10,58,21,0.0,2PT Field Goal,Left Side Center(LC),Mid-Range,16-24 ft.,1610612747,Los Angeles Lakers,2009-11-22,LAL vs. OKC,OKC,15151
19775,Jump Shot,Jump Shot,15,21200280,33.8653,-168,179,-118.4378,9,1,0,2012-13,42,24,1.0,3PT Field Goal,Left Side Center(LC),Above the Break 3,24+ ft.,1610612747,Los Angeles Lakers,2012-12-07,LAL @ OKC,OKC,19776
10193,Driving Layup Shot,Layup,223,20600418,34.0443,0,0,-118.2698,2,2,0,2006-07,18,0,1.0,2PT Field Goal,Center(C),Restricted Area,Less Than 8 ft.,1610612747,Los Angeles Lakers,2006-12-27,LAL @ ORL,ORL,10194
17095,Jump Shot,Jump Shot,308,21000498,33.7923,108,252,-118.1618,6,3,0,2010-11,15,27,0.0,3PT Field Goal,Right Side Center(RC),Above the Break 3,24+ ft.,1610612747,Los Angeles Lakers,2011-01-02,LAL vs. MEM,MEM,17096
1763,Jump Shot,Jump Shot,433,20100204,33.9513,-141,93,-118.4108,4,4,0,2001-02,29,16,0.0,2PT Field Goal,Left Side(L),Mid-Range,16-24 ft.,1610612747,Los Angeles Lakers,2001-11-27,LAL vs. MIL,MIL,1764


Look at some unique values for some categorical variables.

In [72]:
cats = ['action_type', 'combined_shot_type', 'shot_type','shot_zone_area', 'shot_zone_basic','shot_zone_range']

for c in cats:
    print(f'There are {len(train[c].unique())} unique values in {c.title()}.')
    print()
    print(train[c].unique())
    print('--'*35)

There are 55 unique values in Action_Type.

['Jump Shot' 'Driving Dunk Shot' 'Layup Shot' 'Running Jump Shot'
 'Reverse Dunk Shot' 'Slam Dunk Shot' 'Driving Layup Shot'
 'Turnaround Jump Shot' 'Reverse Layup Shot' 'Tip Shot'
 'Running Hook Shot' 'Alley Oop Dunk Shot' 'Dunk Shot'
 'Alley Oop Layup shot' 'Running Dunk Shot' 'Driving Finger Roll Shot'
 'Running Layup Shot' 'Finger Roll Shot' 'Fadeaway Jump Shot'
 'Follow Up Dunk Shot' 'Hook Shot' 'Turnaround Hook Shot' 'Jump Hook Shot'
 'Running Finger Roll Shot' 'Jump Bank Shot' 'Turnaround Finger Roll Shot'
 'Hook Bank Shot' 'Driving Hook Shot' 'Running Tip Shot'
 'Running Reverse Layup Shot' 'Driving Finger Roll Layup Shot'
 'Fadeaway Bank shot' 'Pullup Jump shot' 'Finger Roll Layup Shot'
 'Turnaround Fadeaway shot' 'Driving Reverse Layup Shot'
 'Driving Slam Dunk Shot' 'Step Back Jump shot' 'Turnaround Bank shot'
 'Reverse Slam Dunk Shot' 'Floating Jump shot' 'Putback Slam Dunk Shot'
 'Running Bank shot' 'Driving Bank shot' 'Driving J

### Engineer the categorical columns into numeric values to use for classification

---
We will retain the orignal columns but make new columns that we can convert into numerics. This will be applied to the `test` dataframe as well.

In [73]:
# Train dataframe
train['action_type_NUM'] = train['action_type']
train['combined_shot_type_NUM'] = train['combined_shot_type']
train['shot_type_NUM'] = train['shot_type']
train['shot_zone_area_NUM'] = train['shot_zone_area']
train['shot_zone_basic_NUM'] = train['shot_zone_basic']
train['shot_zone_range_NUM'] = train['shot_zone_range']
# Test dataframe
test['action_type_NUM'] = test['action_type']
test['combined_shot_type_NUM'] = test['combined_shot_type']
test['shot_type_NUM'] = test['shot_type']
test['shot_zone_area_NUM'] = test['shot_zone_area']
test['shot_zone_basic_NUM'] = test['shot_zone_basic']
test['shot_zone_range_NUM'] = test['shot_zone_range']

In [74]:
print(test.columns) 
print()
print(train.columns)

Index(['action_type', 'combined_shot_type', 'game_event_id', 'game_id', 'lat', 'loc_x', 'loc_y', 'lon', 'minutes_remaining', 'period', 'playoffs', 'season', 'seconds_remaining', 'shot_distance', 'shot_type', 'shot_zone_area', 'shot_zone_basic', 'shot_zone_range', 'team_id', 'team_name', 'game_date', 'matchup', 'opponent', 'shot_id', 'action_type_NUM', 'combined_shot_type_NUM', 'shot_type_NUM', 'shot_zone_area_NUM', 'shot_zone_basic_NUM', 'shot_zone_range_NUM'], dtype='object')

Index(['action_type', 'combined_shot_type', 'game_event_id', 'game_id', 'lat', 'loc_x', 'loc_y', 'lon', 'minutes_remaining', 'period', 'playoffs', 'season', 'seconds_remaining', 'shot_distance', 'shot_made_flag', 'shot_type', 'shot_zone_area', 'shot_zone_basic', 'shot_zone_range', 'team_id', 'team_name', 'game_date', 'matchup', 'opponent', 'shot_id', 'action_type_NUM', 'combined_shot_type_NUM', 'shot_type_NUM', 'shot_zone_area_NUM', 'shot_zone_basic_NUM', 'shot_zone_range_NUM'], dtype='object')


## Mapping the `NUM` columns

---
Convert the numbers into some numeric value that seems plausible for each column.

In [75]:
# Shot Type
train['shot_type_NUM'] = train['shot_type_NUM'].map({'2PT Field Goal':2,
                            '3PT Field Goal':3})
test['shot_type_NUM'] = test['shot_type_NUM'].map({'2PT Field Goal':2,
                            '3PT Field Goal':3})

In [76]:
# Shot Zone Range
train['shot_zone_range_NUM'] = train['shot_zone_range_NUM'].map({
   '8-16 ft.':12,
   '16-24 ft.':20,
   'Less Than 8 ft.':8, 
   '24+ ft.':24,
   'Back Court Shot':40
})
test['shot_zone_range_NUM'] = test['shot_zone_range_NUM'].map({
   '8-16 ft.':12,
   '16-24 ft.':20,
   'Less Than 8 ft.':8, 
   '24+ ft.':24, 
   'Back Court Shot':40
})

To determine a value for `combined_shot_type`, I will group and then look at the normalized value counts.

We will use the scoring percentage for Kobe in a particular shot. The higher the percentage, the lower the mapping value will be.

1. Dunk ~ 92%
2. Bank Shot ~ 79%
3. Layup ~ 56%
4. Hook Shot ~ 53%
5. Jump Shot ~ 39%
6. Tip Shot ~ 34%

In [77]:
train.groupby('combined_shot_type')['shot_made_flag'].value_counts(normalize=True)

combined_shot_type  shot_made_flag
Bank Shot           1.0               0.791667
                    0.0               0.208333
Dunk                1.0               0.928030
                    0.0               0.071970
Hook Shot           1.0               0.535433
                    0.0               0.464567
Jump Shot           0.0               0.608929
                    1.0               0.391071
Layup               1.0               0.565093
                    0.0               0.434907
Tip Shot            0.0               0.651316
                    1.0               0.348684
Name: shot_made_flag, dtype: float64

In [78]:
train['combined_shot_type_NUM'] = train['combined_shot_type_NUM'].map({
    'Dunk':1,
    'Bank Shot':2,
    'Layup':3,
    'Hook Shot':4,
    'Jump Shot':5,
    'Tip Shot':6
})
test['combined_shot_type_NUM'] = test['combined_shot_type_NUM'].map({
    'Dunk':1,
    'Bank Shot':2,
    'Layup':3,
    'Hook Shot':4,
    'Jump Shot':5,
    'Tip Shot':6
})

In [79]:
# Remapped them but will most likely use pd.get_dummies on them
# This is to lessen overfitting and try to map things to a more precise shot

# May or may not copy the same numbers I used for the combined_shot_type but there might be a lot of overlap causing high correlation

train['action_type_NUM']=train['action_type_NUM'].map({'Jump Shot':'Jump',
                              'Driving Dunk Shot':'Dunk',
                              'Layup Shot':'Layup',
                              'Running Jump Shot':'Jump',
                              'Reverse Dunk Shot':'Dunk',
                              'Slam Dunk Shot':'Dunk',
                              'Driving Layup Shot':'Layup',
                              'Turnaround Jump Shot':'Jump',
                              'Reverse Layup Shot':'Layup',
                              'Tip Shot':'Tip',
                              'Running Hook Shot':'Hook',
                              'Alley Oop Dunk Shot':'Dunk',
                              'Dunk Shot':'Dunk',
                              'Alley Oop Layup shot':'Layup',
                              'Running Dunk Shot':'Dunk',
                              'Driving Finger Roll Shot':'Layup',
                              'Running Layup Shot':'Layup',
                              'Finger Roll Shot':'Layup',
                              'Fadeaway Jump Shot':'Jump',
                              'Follow Up Dunk Shot':'Dunk',
                              'Hook Shot':'Hook',
                              'Turnaround Hook Shot':'Hook',
                              'Jump Hook Shot':'Hook',
                              'Running Finger Roll Shot':'Layup',
                              'Jump Bank Shot':'Bank',
                              'Turnaround Finger Roll Shot':'Layup',
                              'Hook Bank Shot':'Hook',
                              'Driving Hook Shot':'Hook',
                              'Running Tip Shot':'Tip',
                              'Running Reverse Layup Shot':'Layup',
                              'Driving Finger Roll Layup Shot':'Layup',
                              'Fadeaway Bank shot':'Bank',
                              'Pullup Jump shot':'Jump',
                              'Finger Roll Layup Shot':'Layup',
                              'Turnaround Fadeaway shot':'Jump',
                              'Driving Reverse Layup Shot':'Layup',
                              'Driving Slam Dunk Shot':'Dunk',
                              'Step Back Jump shot':'Jump',
                              'Turnaround Bank shot':'Bank',
                              'Reverse Slam Dunk Shot':'Dunk',
                              'Floating Jump shot':'Jump',
                              'Putback Slam Dunk Shot':'Tip',
                              'Running Bank shot':'Bank',
                              'Driving Bank shot':'Bank',
                              'Driving Jump shot':'Jump',
                              'Putback Layup Shot':'Tip',
                              'Putback Dunk Shot':'Tip',
                              'Running Finger Roll Layup Shot':'Layup',
                              'Pullup Bank shot':'Bank',
                              'Running Slam Dunk Shot':'Dunk',
                              'Cutting Layup Shot':'Layup',
                              'Driving Floating Jump Shot':'Jump',
                              'Running Pull-Up Jump Shot':'Jump',
                              'Tip Layup Shot':'Tip',
                              'Driving Floating Bank Jump Shot':'Jump'})
test['action_type_NUM'] = test['action_type_NUM'].map({'Jump Shot':'Jump',
                              'Driving Dunk Shot':'Dunk',
                              'Layup Shot':'Layup',
                              'Running Jump Shot':'Jump',
                              'Reverse Dunk Shot':'Dunk',
                              'Slam Dunk Shot':'Dunk',
                              'Driving Layup Shot':'Layup',
                              'Turnaround Jump Shot':'Jump',
                              'Reverse Layup Shot':'Layup',
                              'Tip Shot':'Tip',
                              'Running Hook Shot':'Hook',
                              'Alley Oop Dunk Shot':'Dunk',
                              'Dunk Shot':'Dunk',
                              'Alley Oop Layup shot':'Layup',
                              'Running Dunk Shot':'Dunk',
                              'Driving Finger Roll Shot':'Layup',
                              'Running Layup Shot':'Layup',
                              'Finger Roll Shot':'Layup',
                              'Fadeaway Jump Shot':'Jump',
                              'Follow Up Dunk Shot':'Dunk',
                              'Hook Shot':'Hook',
                              'Turnaround Hook Shot':'Hook',
                              'Jump Hook Shot':'Hook',
                              'Running Finger Roll Shot':'Layup',
                              'Jump Bank Shot':'Bank',
                              'Turnaround Finger Roll Shot':'Layup',
                              'Hook Bank Shot':'Hook',
                              'Driving Hook Shot':'Hook',
                              'Running Tip Shot':'Tip',
                              'Running Reverse Layup Shot':'Layup',
                              'Driving Finger Roll Layup Shot':'Layup',
                              'Fadeaway Bank shot':'Bank',
                              'Pullup Jump shot':'Jump',
                              'Finger Roll Layup Shot':'Layup',
                              'Turnaround Fadeaway shot':'Jump',
                              'Driving Reverse Layup Shot':'Layup',
                              'Driving Slam Dunk Shot':'Dunk',
                              'Step Back Jump shot':'Jump',
                              'Turnaround Bank shot':'Bank',
                              'Reverse Slam Dunk Shot':'Dunk',
                              'Floating Jump shot':'Jump',
                              'Putback Slam Dunk Shot':'Tip',
                              'Running Bank shot':'Bank',
                              'Driving Bank shot':'Bank',
                              'Driving Jump shot':'Jump',
                              'Putback Layup Shot':'Tip',
                              'Putback Dunk Shot':'Tip',
                              'Running Finger Roll Layup Shot':'Layup',
                              'Pullup Bank shot':'Bank',
                              'Running Slam Dunk Shot':'Dunk',
                              'Cutting Layup Shot':'Layup',
                              'Driving Floating Jump Shot':'Jump',
                              'Running Pull-Up Jump Shot':'Jump',
                              'Tip Layup Shot':'Tip',
                              'Driving Floating Bank Jump Shot':'Jump'})

In [80]:
train['action_type_NUM'].unique()

array(['Jump', 'Dunk', 'Layup', 'Tip', 'Hook', 'Bank'], dtype=object)

In [102]:
train.sample(3)

Unnamed: 0,action_type,combined_shot_type,game_event_id,game_id,lat,loc_x,loc_y,lon,minutes_remaining,period,playoffs,season,seconds_remaining,shot_distance,shot_made_flag,shot_type,shot_zone_area,shot_zone_basic,shot_zone_range,team_id,team_name,game_date,matchup,opponent,shot_id,action_type_NUM,combined_shot_type_NUM,shot_type_NUM,shot_zone_area_NUM,shot_zone_basic_NUM,shot_zone_range_NUM
11037,Running Layup Shot,Layup,98,20601016,34.0443,0,0,-118.2698,0,1,0,2006-07,50,0,1.0,2PT Field Goal,Center(C),Restricted Area,Less Than 8 ft.,1610612747,Los Angeles Lakers,2007-03-22,LAL @ MEM,MEM,11038,Layup,3,2,Center(C),Restricted Area,8
29524,Jump Shot,Jump Shot,97,40900407,33.7983,97,246,-118.1728,2,1,1,2009-10,0,26,0.0,3PT Field Goal,Right Side Center(RC),Above the Break 3,24+ ft.,1610612747,Los Angeles Lakers,2010-06-17,LAL vs. BOS,BOS,29525,Jump,5,3,Right Side Center(RC),Above the Break 3,24
13152,Jump Shot,Jump Shot,361,20701216,34.0493,212,-5,-118.0578,2,3,0,2007-08,56,21,0.0,2PT Field Goal,Right Side(R),Mid-Range,16-24 ft.,1610612747,Los Angeles Lakers,2008-04-15,LAL vs. SAC,SAC,13153,Jump,5,2,Right Side(R),Mid-Range,20


In [81]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 25697 entries, 1 to 30696
Data columns (total 31 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   action_type             25697 non-null  object 
 1   combined_shot_type      25697 non-null  object 
 2   game_event_id           25697 non-null  int64  
 3   game_id                 25697 non-null  int64  
 4   lat                     25697 non-null  float64
 5   loc_x                   25697 non-null  int64  
 6   loc_y                   25697 non-null  int64  
 7   lon                     25697 non-null  float64
 8   minutes_remaining       25697 non-null  int64  
 9   period                  25697 non-null  int64  
 10  playoffs                25697 non-null  int64  
 11  season                  25697 non-null  object 
 12  seconds_remaining       25697 non-null  int64  
 13  shot_distance           25697 non-null  int64  
 14  shot_made_flag          25697 non-null

In [100]:
# Save them to CSV files for EDA
# train.to_csv('preprocessed-data/pre_train.csv', index=False)
# test.to_csv('preprocessed-data/pre_test.csv',index=False)

# Feature Selection

---
I will select my features and make an archive containing them.

In [228]:
feature1 = ['loc_x','loc_y','minutes_remaining','period','playoffs','seconds_remaining','shot_distance',
            'opponent','action_type_NUM','shot_type_NUM','combined_shot_type_NUM','shot_zone_range_NUM',
            'shot_zone_area','shot_zone_basic']
feature2 = ['loc_x','loc_y','minutes_remaining','period','playoffs','seconds_remaining','shot_distance',
            'opponent', 'shot_zone_area','shot_zone_basic']
feature3 = ['action_type','combined_shot_type','loc_x','loc_y','period','playoffs','season','shot_type_NUM',
            'matchup','shot_zone_area','shot_zone_basic','shot_zone_range','opponent']

In [229]:
# This function will help select columns that need to be dummified
to_dummy = list(train[feature3].select_dtypes(include='object'))
print(to_dummy)
to_ss = [name for name in feature3 if name not in to_dummy]
print(to_ss)

['action_type', 'combined_shot_type', 'season', 'matchup', 'shot_zone_area', 'shot_zone_basic', 'shot_zone_range', 'opponent']
['loc_x', 'loc_y', 'period', 'playoffs', 'shot_type_NUM']


# Create X and y from `train` dataframe

---
We will create `X` and `y`. From there, convert them into X_train, X_val, y_train, and y_val.

In [230]:
y = train['shot_made_flag'].map({0.0:0, 1.0:1})

# <font color='green'>These cells below will constantly be ran after every feature selection</font>

In [253]:
X = train[feature3]

train1 = pd.get_dummies(X,columns=to_dummy,drop_first=True)
train1.head()

Unnamed: 0,loc_x,loc_y,period,playoffs,shot_type_NUM,action_type_Alley Oop Layup shot,action_type_Cutting Layup Shot,action_type_Driving Bank shot,action_type_Driving Dunk Shot,action_type_Driving Finger Roll Layup Shot,action_type_Driving Finger Roll Shot,action_type_Driving Floating Bank Jump Shot,action_type_Driving Floating Jump Shot,action_type_Driving Hook Shot,action_type_Driving Jump shot,action_type_Driving Layup Shot,action_type_Driving Reverse Layup Shot,action_type_Driving Slam Dunk Shot,action_type_Dunk Shot,action_type_Fadeaway Bank shot,action_type_Fadeaway Jump Shot,action_type_Finger Roll Layup Shot,action_type_Finger Roll Shot,action_type_Floating Jump shot,action_type_Follow Up Dunk Shot,action_type_Hook Bank Shot,action_type_Hook Shot,action_type_Jump Bank Shot,action_type_Jump Hook Shot,action_type_Jump Shot,action_type_Layup Shot,action_type_Pullup Bank shot,action_type_Pullup Jump shot,action_type_Putback Dunk Shot,action_type_Putback Layup Shot,action_type_Putback Slam Dunk Shot,action_type_Reverse Dunk Shot,action_type_Reverse Layup Shot,action_type_Reverse Slam Dunk Shot,action_type_Running Bank shot,action_type_Running Dunk Shot,action_type_Running Finger Roll Layup Shot,action_type_Running Finger Roll Shot,action_type_Running Hook Shot,action_type_Running Jump Shot,action_type_Running Layup Shot,action_type_Running Pull-Up Jump Shot,action_type_Running Reverse Layup Shot,action_type_Running Slam Dunk Shot,action_type_Running Tip Shot,action_type_Slam Dunk Shot,action_type_Step Back Jump shot,action_type_Tip Layup Shot,action_type_Tip Shot,action_type_Turnaround Bank shot,action_type_Turnaround Fadeaway shot,action_type_Turnaround Finger Roll Shot,action_type_Turnaround Hook Shot,action_type_Turnaround Jump Shot,combined_shot_type_Dunk,combined_shot_type_Hook Shot,combined_shot_type_Jump Shot,combined_shot_type_Layup,combined_shot_type_Tip Shot,season_1997-98,season_1998-99,season_1999-00,season_2000-01,season_2001-02,season_2002-03,season_2003-04,season_2004-05,season_2005-06,season_2006-07,season_2007-08,season_2008-09,season_2009-10,season_2010-11,season_2011-12,season_2012-13,season_2013-14,season_2014-15,season_2015-16,matchup_LAL @ BKN,matchup_LAL @ BOS,matchup_LAL @ CHA,matchup_LAL @ CHH,matchup_LAL @ CHI,matchup_LAL @ CLE,matchup_LAL @ DAL,matchup_LAL @ DEN,matchup_LAL @ DET,matchup_LAL @ GSW,matchup_LAL @ HOU,matchup_LAL @ IND,matchup_LAL @ LAC,matchup_LAL @ MEM,matchup_LAL @ MIA,matchup_LAL @ MIL,matchup_LAL @ MIN,matchup_LAL @ NJN,matchup_LAL @ NOH,matchup_LAL @ NOK,matchup_LAL @ NOP,matchup_LAL @ NYK,matchup_LAL @ OKC,matchup_LAL @ ORL,matchup_LAL @ PHI,matchup_LAL @ PHO,matchup_LAL @ PHX,matchup_LAL @ POR,matchup_LAL @ SAC,matchup_LAL @ SAS,matchup_LAL @ SEA,matchup_LAL @ TOR,matchup_LAL @ UTA,matchup_LAL @ UTH,matchup_LAL @ VAN,matchup_LAL @ WAS,matchup_LAL vs. ATL,matchup_LAL vs. BKN,matchup_LAL vs. BOS,matchup_LAL vs. CHA,matchup_LAL vs. CHH,matchup_LAL vs. CHI,matchup_LAL vs. CLE,matchup_LAL vs. DAL,matchup_LAL vs. DEN,matchup_LAL vs. DET,matchup_LAL vs. GSW,matchup_LAL vs. HOU,matchup_LAL vs. IND,matchup_LAL vs. LAC,matchup_LAL vs. MEM,matchup_LAL vs. MIA,matchup_LAL vs. MIL,matchup_LAL vs. MIN,matchup_LAL vs. NJN,matchup_LAL vs. NOH,matchup_LAL vs. NOK,matchup_LAL vs. NOP,matchup_LAL vs. NYK,matchup_LAL vs. OKC,matchup_LAL vs. ORL,matchup_LAL vs. PHI,matchup_LAL vs. PHO,matchup_LAL vs. PHX,matchup_LAL vs. POR,matchup_LAL vs. SAC,matchup_LAL vs. SAN,matchup_LAL vs. SAS,matchup_LAL vs. SEA,matchup_LAL vs. TOR,matchup_LAL vs. UTA,matchup_LAL vs. VAN,matchup_LAL vs. WAS,shot_zone_area_Center(C),shot_zone_area_Left Side Center(LC),shot_zone_area_Left Side(L),shot_zone_area_Right Side Center(RC),shot_zone_area_Right Side(R),shot_zone_basic_Backcourt,shot_zone_basic_In The Paint (Non-RA),shot_zone_basic_Left Corner 3,shot_zone_basic_Mid-Range,shot_zone_basic_Restricted Area,shot_zone_basic_Right Corner 3,shot_zone_range_24+ ft.,shot_zone_range_8-16 ft.,shot_zone_range_Back Court Shot,shot_zone_range_Less Than 8 ft.,opponent_BKN,opponent_BOS,opponent_CHA,opponent_CHI,opponent_CLE,opponent_DAL,opponent_DEN,opponent_DET,opponent_GSW,opponent_HOU,opponent_IND,opponent_LAC,opponent_MEM,opponent_MIA,opponent_MIL,opponent_MIN,opponent_NJN,opponent_NOH,opponent_NOP,opponent_NYK,opponent_OKC,opponent_ORL,opponent_PHI,opponent_PHX,opponent_POR,opponent_SAC,opponent_SAS,opponent_SEA,opponent_TOR,opponent_UTA,opponent_VAN,opponent_WAS
1,-157,0,1,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
2,-101,135,1,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
3,138,175,1,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
4,0,0,2,0,2,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
5,-145,-11,3,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0


In [254]:
test1 = pd.get_dummies(test[feature3], columns=to_dummy, drop_first=True)
test1.head()

Unnamed: 0,loc_x,loc_y,period,playoffs,shot_type_NUM,action_type_Alley Oop Layup shot,action_type_Cutting Finger Roll Layup Shot,action_type_Driving Bank shot,action_type_Driving Dunk Shot,action_type_Driving Finger Roll Layup Shot,action_type_Driving Finger Roll Shot,action_type_Driving Floating Jump Shot,action_type_Driving Hook Shot,action_type_Driving Jump shot,action_type_Driving Layup Shot,action_type_Driving Reverse Layup Shot,action_type_Driving Slam Dunk Shot,action_type_Dunk Shot,action_type_Fadeaway Bank shot,action_type_Fadeaway Jump Shot,action_type_Finger Roll Layup Shot,action_type_Finger Roll Shot,action_type_Floating Jump shot,action_type_Follow Up Dunk Shot,action_type_Hook Shot,action_type_Jump Bank Shot,action_type_Jump Hook Shot,action_type_Jump Shot,action_type_Layup Shot,action_type_Pullup Bank shot,action_type_Pullup Jump shot,action_type_Putback Dunk Shot,action_type_Putback Layup Shot,action_type_Reverse Dunk Shot,action_type_Reverse Layup Shot,action_type_Reverse Slam Dunk Shot,action_type_Running Bank shot,action_type_Running Dunk Shot,action_type_Running Finger Roll Layup Shot,action_type_Running Hook Shot,action_type_Running Jump Shot,action_type_Running Layup Shot,action_type_Running Pull-Up Jump Shot,action_type_Running Reverse Layup Shot,action_type_Running Tip Shot,action_type_Slam Dunk Shot,action_type_Step Back Jump shot,action_type_Tip Shot,action_type_Turnaround Bank shot,action_type_Turnaround Fadeaway Bank Jump Shot,action_type_Turnaround Fadeaway shot,action_type_Turnaround Hook Shot,action_type_Turnaround Jump Shot,combined_shot_type_Dunk,combined_shot_type_Hook Shot,combined_shot_type_Jump Shot,combined_shot_type_Layup,combined_shot_type_Tip Shot,season_1997-98,season_1998-99,season_1999-00,season_2000-01,season_2001-02,season_2002-03,season_2003-04,season_2004-05,season_2005-06,season_2006-07,season_2007-08,season_2008-09,season_2009-10,season_2010-11,season_2011-12,season_2012-13,season_2013-14,season_2014-15,season_2015-16,matchup_LAL @ BKN,matchup_LAL @ BOS,matchup_LAL @ CHA,matchup_LAL @ CHH,matchup_LAL @ CHI,matchup_LAL @ CLE,matchup_LAL @ DAL,matchup_LAL @ DEN,matchup_LAL @ DET,matchup_LAL @ GSW,matchup_LAL @ HOU,matchup_LAL @ IND,matchup_LAL @ LAC,matchup_LAL @ MEM,matchup_LAL @ MIA,matchup_LAL @ MIL,matchup_LAL @ MIN,matchup_LAL @ NJN,matchup_LAL @ NOH,matchup_LAL @ NOK,matchup_LAL @ NOP,matchup_LAL @ NYK,matchup_LAL @ OKC,matchup_LAL @ ORL,matchup_LAL @ PHI,matchup_LAL @ PHO,matchup_LAL @ PHX,matchup_LAL @ POR,matchup_LAL @ SAC,matchup_LAL @ SAS,matchup_LAL @ SEA,matchup_LAL @ TOR,matchup_LAL @ UTA,matchup_LAL @ UTH,matchup_LAL @ VAN,matchup_LAL @ WAS,matchup_LAL vs. ATL,matchup_LAL vs. BKN,matchup_LAL vs. BOS,matchup_LAL vs. CHA,matchup_LAL vs. CHH,matchup_LAL vs. CHI,matchup_LAL vs. CLE,matchup_LAL vs. DAL,matchup_LAL vs. DEN,matchup_LAL vs. DET,matchup_LAL vs. GSW,matchup_LAL vs. HOU,matchup_LAL vs. IND,matchup_LAL vs. LAC,matchup_LAL vs. MEM,matchup_LAL vs. MIA,matchup_LAL vs. MIL,matchup_LAL vs. MIN,matchup_LAL vs. NJN,matchup_LAL vs. NOH,matchup_LAL vs. NOK,matchup_LAL vs. NOP,matchup_LAL vs. NYK,matchup_LAL vs. OKC,matchup_LAL vs. ORL,matchup_LAL vs. PHI,matchup_LAL vs. PHO,matchup_LAL vs. PHX,matchup_LAL vs. POR,matchup_LAL vs. SAC,matchup_LAL vs. SAN,matchup_LAL vs. SAS,matchup_LAL vs. SEA,matchup_LAL vs. TOR,matchup_LAL vs. UTA,matchup_LAL vs. VAN,matchup_LAL vs. WAS,shot_zone_area_Center(C),shot_zone_area_Left Side Center(LC),shot_zone_area_Left Side(L),shot_zone_area_Right Side Center(RC),shot_zone_area_Right Side(R),shot_zone_basic_Backcourt,shot_zone_basic_In The Paint (Non-RA),shot_zone_basic_Left Corner 3,shot_zone_basic_Mid-Range,shot_zone_basic_Restricted Area,shot_zone_basic_Right Corner 3,shot_zone_range_24+ ft.,shot_zone_range_8-16 ft.,shot_zone_range_Back Court Shot,shot_zone_range_Less Than 8 ft.,opponent_BKN,opponent_BOS,opponent_CHA,opponent_CHI,opponent_CLE,opponent_DAL,opponent_DEN,opponent_DET,opponent_GSW,opponent_HOU,opponent_IND,opponent_LAC,opponent_MEM,opponent_MIA,opponent_MIL,opponent_MIN,opponent_NJN,opponent_NOH,opponent_NOP,opponent_NYK,opponent_OKC,opponent_ORL,opponent_PHI,opponent_PHX,opponent_POR,opponent_SAC,opponent_SAS,opponent_SEA,opponent_TOR,opponent_UTA,opponent_VAN,opponent_WAS
0,167,72,1,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
7,1,28,3,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
16,0,0,1,0,2,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
19,0,0,3,0,2,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
32,163,76,1,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0


In [255]:
# Check what is in the train not in test
[name for name in train1.columns if name not in test1.columns]

['action_type_Cutting Layup Shot',
 'action_type_Driving Floating Bank Jump Shot',
 'action_type_Hook Bank Shot',
 'action_type_Putback Slam Dunk Shot',
 'action_type_Running Finger Roll Shot',
 'action_type_Running Slam Dunk Shot',
 'action_type_Tip Layup Shot',
 'action_type_Turnaround Finger Roll Shot']

In [256]:
# Check what is in test not in train
[name for name in test1.columns if name not in train1.columns]

['action_type_Cutting Finger Roll Layup Shot',
 'action_type_Turnaround Fadeaway Bank Jump Shot']

In [257]:
# Run if necessary
test1.drop(columns=[name for name in test1.columns if name not in train1.columns],inplace=True)
train1.drop(columns=[name for name in train1.columns if name not in test1.columns],inplace=True)

In [258]:
X_train, X_val, y_train, y_val = train_test_split(train1, y, test_size=.33, random_state=824)

In [259]:
X_train.shape, y_train.shape

((17216, 195), (17216,))

In [260]:
X_val.shape, y_val.shape

((8481, 195), (8481,))

# <font color='green'>These cells above will constantly be ran after every feature selection</font>

# Models

### Logistic Regression

In [261]:
params = {
    'penalty':['l1','l2','elasticnet'],
    'max_iter':[10_000,20_000,30_000]
}

gs = GridSearchCV(LogisticRegression(), param_grid=params, n_jobs=4, cv=5)
gs.fit(X_train,y_train)

print(gs.best_params_)
print('Train Score', gs.score(X_train, y_train))
print('Val Score', gs.score(X_val, y_val))

{'max_iter': 10000, 'penalty': 'l2'}
Train Score 0.6861640334572491
Val Score 0.677514444051409


In [262]:
val_preds = gs.predict(X_val)

In [263]:
f1_score(y_val, val_preds)

0.556797925781883

In [264]:
# Calculate preds on the real test set
test_preds = gs.predict(test1)

In [265]:
test['shot_made_flag'] = test_preds

In [266]:
sub = test[['shot_id','shot_made_flag']]
sub.to_csv('submissions/sub3.csv',index=False)

### Random Forest

In [273]:
# Feature set 3
rf = RandomForestClassifier(random_state=824)

params2 = {
    'n_estimators':[100, 125,150],
    'max_depth': [None,3,4, 5, 6],
    'max_features':['auto','log2'],
    'min_samples_leaf':[1,2]
}

gs2 = GridSearchCV(rf,param_grid=params2, n_jobs=4, cv=5, verbose=1)
gs2.fit(X_train, y_train)

print(gs2.best_params_)
print('Train Score', gs2.score(X_train, y_train))
print('Val Score', gs2.score(X_val, y_val))

Fitting 5 folds for each of 60 candidates, totalling 300 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   49.7s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:  1.4min
[Parallel(n_jobs=4)]: Done 300 out of 300 | elapsed:  1.8min finished


{'max_depth': None, 'max_features': 'log2', 'min_samples_leaf': 2, 'n_estimators': 150}
Train Score 0.7527881040892194
Val Score 0.678103997170145


In [272]:
val_preds2 = gs2.predict(X_val)
f1_score(y_val, val_preds2)

0.5685840707964602

In [274]:
# Calculate preds on the real test set
test_preds2 = gs.predict(test1)

In [275]:
test['shot_made_flag'] = test_preds

In [276]:
sub = test[['shot_id','shot_made_flag']]
sub.to_csv('submissions/sub4.csv',index=False)

### AdaBoosting

In [None]:
ada = AdaBoostClassifier(base_estimator=DecisionTreeClassifier())

ada_params = {
    'n_estimators':[50,100,150,],
    'learning_rate':[0.9,0.1,.95,.85],
    'base_estimator__max_depth': [1,2,3]
}

gs3 = GridSearchCV(ada, param_grid=ada_params, cv=3)
gs3.fit(X_train, y_train)

# Scores
print(gs3.best_params_)
print('Train Score', gs3.score(X_train, y_train))
print('Val Score', gs3.score(X_val, y_val))