## Kobe Bryant Shot Selection Project

### Team Members: Kit Ha, Vinh Ngo, Johnathan Tahod, Anthony Collado

#### Credit to Kaggle for data and idea. Link to webpage below
#### https://www.kaggle.com/c/kobe-bryant-shot-selection
#### Github: https://github.com/johntahod/KobeBryant_ShotSelection/tree/master

##### *Markdown conventions just to be consistent
# Title
## Headers
### Sections
##### Notes/Comments

### Imports and Setting up the data frame + first look at data

In [121]:
import numpy as np
import pandas as pd

kobe_data = pd.read_csv('data.csv')
display(kobe_data.head(5))

full_features = [kobe_data.columns]
print(full_features)

Unnamed: 0,action_type,combined_shot_type,game_event_id,game_id,lat,loc_x,loc_y,lon,minutes_remaining,period,...,shot_type,shot_zone_area,shot_zone_basic,shot_zone_range,team_id,team_name,game_date,matchup,opponent,shot_id
0,Jump Shot,Jump Shot,10,20000012,33.9723,167,72,-118.1028,10,1,...,2PT Field Goal,Right Side(R),Mid-Range,16-24 ft.,1610612747,Los Angeles Lakers,2000-10-31,LAL @ POR,POR,1
1,Jump Shot,Jump Shot,12,20000012,34.0443,-157,0,-118.4268,10,1,...,2PT Field Goal,Left Side(L),Mid-Range,8-16 ft.,1610612747,Los Angeles Lakers,2000-10-31,LAL @ POR,POR,2
2,Jump Shot,Jump Shot,35,20000012,33.9093,-101,135,-118.3708,7,1,...,2PT Field Goal,Left Side Center(LC),Mid-Range,16-24 ft.,1610612747,Los Angeles Lakers,2000-10-31,LAL @ POR,POR,3
3,Jump Shot,Jump Shot,43,20000012,33.8693,138,175,-118.1318,6,1,...,2PT Field Goal,Right Side Center(RC),Mid-Range,16-24 ft.,1610612747,Los Angeles Lakers,2000-10-31,LAL @ POR,POR,4
4,Driving Dunk Shot,Dunk,155,20000012,34.0443,0,0,-118.2698,6,2,...,2PT Field Goal,Center(C),Restricted Area,Less Than 8 ft.,1610612747,Los Angeles Lakers,2000-10-31,LAL @ POR,POR,5


[Index(['action_type', 'combined_shot_type', 'game_event_id', 'game_id', 'lat',
       'loc_x', 'loc_y', 'lon', 'minutes_remaining', 'period', 'playoffs',
       'season', 'seconds_remaining', 'shot_distance', 'shot_made_flag',
       'shot_type', 'shot_zone_area', 'shot_zone_basic', 'shot_zone_range',
       'team_id', 'team_name', 'game_date', 'matchup', 'opponent', 'shot_id'],
      dtype='object')]


### Setting up Lists

In [122]:
# Target List (this is our y)
target = ['shot_made_flag']

#### These are features we will not be using.
NO NEW DATA:
team_name: This feature will not be used because Kobe has only ever played for the Lakers so this provides no new data.
team_id: This is the same as team_name
matchup: This tells the same information as opponent (eg. Lakers vs Portland)

Little Data:
game_event_id: Don't need to id the game
game_id: Same reason as above

Arguably Little Data:
season: Season shouldn't have an effect on the shot
game_date: Dates shouldn't effect the shot much

In [90]:
# Features that we will not be using.
unused_features = ['team_name', 'team_id', 'matchup', 'game_event_id', 'game_id', 'season', 'game_date']

#### Removing unused features

In [126]:
removed_features = unused_features
# print(removed_features)

feature_cols = [feature for feature in kobe_data.columns if feature not in removed_features]
# print(feature_cols)

display(kobe_data[feature_cols].head(5))

Unnamed: 0,action_type,combined_shot_type,lat,loc_x,loc_y,lon,minutes_remaining,period,playoffs,seconds_remaining,shot_distance,shot_made_flag,shot_type,shot_zone_area,shot_zone_basic,shot_zone_range,opponent,shot_id
0,Jump Shot,Jump Shot,33.9723,167,72,-118.1028,10,1,0,27,18,,2PT Field Goal,Right Side(R),Mid-Range,16-24 ft.,POR,1
1,Jump Shot,Jump Shot,34.0443,-157,0,-118.4268,10,1,0,22,15,0.0,2PT Field Goal,Left Side(L),Mid-Range,8-16 ft.,POR,2
2,Jump Shot,Jump Shot,33.9093,-101,135,-118.3708,7,1,0,45,16,1.0,2PT Field Goal,Left Side Center(LC),Mid-Range,16-24 ft.,POR,3
3,Jump Shot,Jump Shot,33.8693,138,175,-118.1318,6,1,0,52,22,0.0,2PT Field Goal,Right Side Center(RC),Mid-Range,16-24 ft.,POR,4
4,Driving Dunk Shot,Dunk,34.0443,0,0,-118.2698,6,2,0,19,0,1.0,2PT Field Goal,Center(C),Restricted Area,Less Than 8 ft.,POR,5


### Implement One-Hot-Encoding

In [93]:
# Features for One-Hot-Encoding
OHE_features = ['action_type', 'combined_shot_type', 'shot_type', 'shot_zone_area', 'shot_zone_basic', 'shot_zone_range', 'opponent']

In [125]:
# import for one hot encoding
from sklearn.preprocessing import OneHotEncoder
dummies = pd.DataFrame()

for feature in OHE_features:
    dummies = pd.concat([dummies, pd.get_dummies(kobe_data[feature])], axis = 'columns')
    dummies.drop(dummies.columns[[-1,]], axis=1, inplace=True)
    
display(dummies.head(5))

Unnamed: 0,Alley Oop Dunk Shot,Alley Oop Layup shot,Cutting Finger Roll Layup Shot,Cutting Layup Shot,Driving Bank shot,Driving Dunk Shot,Driving Finger Roll Layup Shot,Driving Finger Roll Shot,Driving Floating Bank Jump Shot,Driving Floating Jump Shot,...,ORL,PHI,PHX,POR,SAC,SAS,SEA,TOR,UTA,VAN
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,0,0,0,0,0,1,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0


In [124]:
merged = pd.concat([kobe_data[feature_cols], dummies], axis='columns')

display(merged.head(5))

Unnamed: 0,action_type,combined_shot_type,lat,loc_x,loc_y,lon,minutes_remaining,period,playoffs,seconds_remaining,...,ORL,PHI,PHX,POR,SAC,SAS,SEA,TOR,UTA,VAN
0,Jump Shot,Jump Shot,33.9723,167,72,-118.1028,10,1,0,27,...,0,0,0,1,0,0,0,0,0,0
1,Jump Shot,Jump Shot,34.0443,-157,0,-118.4268,10,1,0,22,...,0,0,0,1,0,0,0,0,0,0
2,Jump Shot,Jump Shot,33.9093,-101,135,-118.3708,7,1,0,45,...,0,0,0,1,0,0,0,0,0,0
3,Jump Shot,Jump Shot,33.8693,138,175,-118.1318,6,1,0,52,...,0,0,0,1,0,0,0,0,0,0
4,Driving Dunk Shot,Dunk,34.0443,0,0,-118.2698,6,2,0,19,...,0,0,0,1,0,0,0,0,0,0


In [127]:
final_OHE = merged.drop(OHE_features, axis='columns')
final_features = [final.columns]
display(final.head(5))

Unnamed: 0,lat,loc_x,loc_y,lon,minutes_remaining,period,playoffs,seconds_remaining,shot_distance,shot_made_flag,...,ORL,PHI,PHX,POR,SAC,SAS,SEA,TOR,UTA,VAN
0,33.9723,167,72,-118.1028,10,1,0,27,18,,...,0,0,0,1,0,0,0,0,0,0
1,34.0443,-157,0,-118.4268,10,1,0,22,15,0.0,...,0,0,0,1,0,0,0,0,0,0
2,33.9093,-101,135,-118.3708,7,1,0,45,16,1.0,...,0,0,0,1,0,0,0,0,0,0
3,33.8693,138,175,-118.1318,6,1,0,52,22,0.0,...,0,0,0,1,0,0,0,0,0,0
4,34.0443,0,0,-118.2698,6,2,0,19,0,1.0,...,0,0,0,1,0,0,0,0,0,0


#### Y value splits

In [128]:
# Data frame but with the prediction rows removed (rows with a Nan value in the target column)
testing_data = final_OHE.dropna()
y = testing_data[target]
y.shape

(25697, 1)

#### X value splits

In [98]:
# Data with all prediction data dropped (so that we don't learn from it)
X = testing_data.drop(['shot_id', 'shot_made_flag'], axis=1)
X.shape


(25697, 117)

#### Prediction data split.
#### DO NOT USE IN TRAINING!
#### This is for submission

In [129]:
prediction_data = final_OHE[final_OHE['shot_made_flag'].isnull()]
final_prediction_data = prediction_data.drop(['shot_id', 'shot_made_flag'], axis=1)
print(final_prediction_data)

# Second way to write this
# nan_rows = kobe_data[kobe_data[target].isnull().any(1)]
# print(nan_rows)

           lat  loc_x  loc_y       lon  minutes_remaining  period  playoffs  \
0      33.9723    167     72 -118.1028                 10       1         0   
7      34.0163      1     28 -118.2688                  8       3         0   
16     34.0443      0      0 -118.2698                  0       1         0   
19     34.0443      0      0 -118.2698                 10       3         0   
32     33.9683    163     76 -118.1068                 11       1         0   
...        ...    ...    ...       ...                ...     ...       ...   
30668  33.8223    -23    222 -118.2928                  7       4         1   
30680  34.0443      0      0 -118.2698                  0       2         1   
30682  33.9963    -68     48 -118.3378                 11       3         1   
30686  33.9513     16     93 -118.2538                  5       3         1   
30693  34.0443      0      0 -118.2698                  6       4         1   

       seconds_remaining  shot_distance  Alley Oop 

### Finding the accuracy of KNN using all features after OHE

In [136]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split

In [137]:
X_train, X_test, y_train, y_test = train_test_split(X, y.values.ravel(), test_size=0.4, random_state=10)

In [138]:
k = 3
knn = KNeighborsClassifier(n_neighbors=k)

In [139]:
knn.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=3, p=2,
                     weights='uniform')

In [140]:
y_predict = knn.predict(X_test)

In [141]:
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(y_test, y_predict)

print(accuracy)

0.5584200797742971


### Finding the best accuracy using Random Forest using OHE data

In [143]:
from sklearn.ensemble import RandomForestClassifier

my_RandomForest = RandomForestClassifier(n_estimators = 19, bootstrap = True, random_state=3)

my_RandomForest.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=19,
                       n_jobs=None, oob_score=False, random_state=3, verbose=0,
                       warm_start=False)

In [144]:
y_predict_rf = my_RandomForest.predict(X_test)
display(y_predict_rf)

array([0., 0., 0., ..., 1., 0., 0.])

In [145]:
rf_accuracy = accuracy_score(y_test, y_predict_rf)

print(rf_accuracy)

0.638194376884911


#### The accuracy of Random Forest Model using all features after OHE is 63%

### Set up final Prediction

In [146]:
final_predict_rf = my_RandomForest.predict(final_prediction_data)
display(final_predict_rf)

array([0., 1., 1., ..., 1., 1., 0.])

### Store prediction into CSV

In [147]:
rf_submission = pd.DataFrame({'shot_id':prediction_data['shot_id'],'shot_made_flag':final_predict_rf})
display(rf_submission)

Unnamed: 0,shot_id,shot_made_flag
0,1,0.0
7,8,1.0
16,17,1.0
19,20,1.0
32,33,0.0
...,...,...
30668,30669,1.0
30680,30681,0.0
30682,30683,1.0
30686,30687,1.0


In [148]:
filename = 'RF_KobePrediction1.csv'

rf_submission.to_csv(filename, index=False)

print('Saved file: ' + filename)

Saved file: RF_KobePrediction1.csv


### The score of the first submission is 12.33045

### This attempt will be for Random Forest model without OHE

In [155]:
NO_OHE_data = kobe_data.dropna()
display(NO_OHE_data)
# y = NO_OHE_data['shot_made_flag', 'shot_id']
# X = NO_OHE_data.drop(['shot_made_flag', 'shot_id'], axis=1)
# NO_OHE_RandomForest = RandomForestClassifier(n_estimators = 19, bootstrap = True, random_state=3)
# #No_OHE_RandomForest.fit(X_train, y_train)

# y.shape
# X.shape

Unnamed: 0,action_type,combined_shot_type,game_event_id,game_id,lat,loc_x,loc_y,lon,minutes_remaining,period,...,shot_type,shot_zone_area,shot_zone_basic,shot_zone_range,team_id,team_name,game_date,matchup,opponent,shot_id
1,Jump Shot,Jump Shot,12,20000012,34.0443,-157,0,-118.4268,10,1,...,2PT Field Goal,Left Side(L),Mid-Range,8-16 ft.,1610612747,Los Angeles Lakers,2000-10-31,LAL @ POR,POR,2
2,Jump Shot,Jump Shot,35,20000012,33.9093,-101,135,-118.3708,7,1,...,2PT Field Goal,Left Side Center(LC),Mid-Range,16-24 ft.,1610612747,Los Angeles Lakers,2000-10-31,LAL @ POR,POR,3
3,Jump Shot,Jump Shot,43,20000012,33.8693,138,175,-118.1318,6,1,...,2PT Field Goal,Right Side Center(RC),Mid-Range,16-24 ft.,1610612747,Los Angeles Lakers,2000-10-31,LAL @ POR,POR,4
4,Driving Dunk Shot,Dunk,155,20000012,34.0443,0,0,-118.2698,6,2,...,2PT Field Goal,Center(C),Restricted Area,Less Than 8 ft.,1610612747,Los Angeles Lakers,2000-10-31,LAL @ POR,POR,5
5,Jump Shot,Jump Shot,244,20000012,34.0553,-145,-11,-118.4148,9,3,...,2PT Field Goal,Left Side(L),Mid-Range,8-16 ft.,1610612747,Los Angeles Lakers,2000-10-31,LAL @ POR,POR,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30691,Driving Layup Shot,Layup,382,49900088,34.0443,0,0,-118.2698,7,4,...,2PT Field Goal,Center(C),Restricted Area,Less Than 8 ft.,1610612747,Los Angeles Lakers,2000-06-19,LAL vs. IND,IND,30692
30692,Jump Shot,Jump Shot,397,49900088,33.9963,1,48,-118.2688,6,4,...,2PT Field Goal,Center(C),In The Paint (Non-RA),Less Than 8 ft.,1610612747,Los Angeles Lakers,2000-06-19,LAL vs. IND,IND,30693
30694,Running Jump Shot,Jump Shot,426,49900088,33.8783,-134,166,-118.4038,3,4,...,2PT Field Goal,Left Side Center(LC),Mid-Range,16-24 ft.,1610612747,Los Angeles Lakers,2000-06-19,LAL vs. IND,IND,30695
30695,Jump Shot,Jump Shot,448,49900088,33.7773,31,267,-118.2388,2,4,...,3PT Field Goal,Center(C),Above the Break 3,24+ ft.,1610612747,Los Angeles Lakers,2000-06-19,LAL vs. IND,IND,30696
