# Problem Statement
# Using the K-nearest neighbors algorithm to predict how many points NBA players scored in the 2013-2014 season

In [1]:
# importing the required libraries

import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler,OneHotEncoder
import numpy as np
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.neighbors import KNeighborsRegressor

In [2]:
# load the 2013-2014 NBA player statistics from CSV into a DataFrame

data = pd.read_csv(filepath_or_buffer='nba_2013.csv')

In [3]:
# keep the points column ('pts') aside as the regression target

target = data['pts']

In [4]:
# inspecting the dtype of every column to tell numeric features
# (int64 / float64) apart from string-valued ones (object)

data.dtypes

player           object
pos              object
age               int64
bref_team_id     object
g                 int64
gs                int64
mp                int64
fg                int64
fga               int64
fg.             float64
x3p               int64
x3pa              int64
x3p.            float64
x2p               int64
x2pa              int64
x2p.            float64
efg.            float64
ft                int64
fta               int64
ft.             float64
orb               int64
drb               int64
trb               int64
ast               int64
stl               int64
blk               int64
tov               int64
pf                int64
pts               int64
season           object
season_end        int64
dtype: object

We can say that 'season', 'season_end' and 'player' columns are insignificant for model training.

Apart from those, 'pos' and 'bref_team_id' are categorical variables.

In [5]:
# flag, per column, whether at least one value is missing

data.isnull().any()

player          False
pos             False
age             False
bref_team_id    False
g               False
gs              False
mp              False
fg              False
fga             False
fg.              True
x3p             False
x3pa            False
x3p.             True
x2p             False
x2pa            False
x2p.             True
efg.             True
ft              False
fta             False
ft.              True
orb             False
drb             False
trb             False
ast             False
stl             False
blk             False
tov             False
pf              False
pts             False
season          False
season_end      False
dtype: bool

Clearly some columns have null values, and all such columns are continuous variables, so we can impute them with their mean value.

In [6]:
# removing 'pts' (the target variable) from the features data.
# The frame is rebound instead of mutated in place, and errors='ignore' makes
# the cell idempotent: re-running it after 'pts' is already gone no longer
# raises a KeyError.

data = data.drop(columns=['pts'], errors='ignore')

In [7]:
# keep only the continuous features: discard the identifier/season columns
# and the two categorical columns

data_cont = data.drop(
    columns=['season', 'season_end', 'player', 'pos', 'bref_team_id']
)

In [8]:
# replace missing values in each continuous column with that column's mean

impute = SimpleImputer(strategy='mean')
data_imp = impute.fit_transform(data_cont)

In [9]:
# standardise the imputed continuous features (zero mean, unit variance)

scale = StandardScaler()
data_scale = scale.fit_transform(data_imp)

In [10]:
# pick out the two categorical columns (position and team id)

data_cat = data.loc[:, ['pos', 'bref_team_id']]

In [11]:
# doing a One Hot Encoding transformation on categorical variables.
# The `sparse=False` argument was deprecated in scikit-learn 1.2 and removed
# in 1.4, so the encoder is left at its default (sparse output) and the result
# is densified explicitly; this yields the same dense array on every version.

encode = OneHotEncoder()
encode = encode.fit(data_cat)
data_enc = encode.transform(data_cat).toarray()

In [12]:
# build the final feature matrix: one-hot encoded columns first,
# followed by the scaled continuous columns

data_f = np.hstack([data_enc, data_scale])

In [13]:
# splitting into train and test sets; random_state is pinned so the split
# (and every score computed from it) is reproducible across kernel restarts.
# NOTE(review): the imputer, scaler and encoder were fit on the full dataset
# before this split, so test-set statistics leak into the preprocessing —
# consider splitting first and fitting the transformers on training rows only.
X_train,X_test,y_train,y_test = train_test_split(data_f,target,random_state=42)

In [14]:
# exhaustive 5-fold cross-validated search over the number of neighbours
# and the neighbour weighting scheme for the KNN regressor

grid_params = {
    'n_neighbors': [1, 2, 3, 4, 5, 10],
    'weights': ['uniform', 'distance'],
}

grid_search = GridSearchCV(
    estimator=KNeighborsRegressor(),
    param_grid=grid_params,
    cv=5,
    verbose=3,
)
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV] n_neighbors=1, weights=uniform ..................................
[CV] ...... n_neighbors=1, weights=uniform, score=0.933, total=   0.0s
[CV] n_neighbors=1, weights=uniform ..................................
[CV] ...... n_neighbors=1, weights=uniform, score=0.947, total=   0.0s
[CV] n_neighbors=1, weights=uniform ..................................
[CV] ...... n_neighbors=1, weights=uniform, score=0.918, total=   0.0s
[CV] n_neighbors=1, weights=uniform ..................................
[CV] ...... n_neighbors=1, weights=uniform, score=0.910, total=   0.0s
[CV] n_neighbors=1, weights=uniform ..................................
[CV] ...... n_neighbors=1, weights=uniform, score=0.921, total=   0.0s
[CV] n_neighbors=1, weights=distance .................................
[CV] ..... n_neighbors=1, weights=distance, score=0.933, total=   0.0s
[CV] n_neighbors=1, weights=distance .................................
[CV] ..... n_nei

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s



[CV] n_neighbors=2, weights=uniform ..................................
[CV] ...... n_neighbors=2, weights=uniform, score=0.946, total=   0.0s
[CV] n_neighbors=2, weights=uniform ..................................
[CV] ...... n_neighbors=2, weights=uniform, score=0.959, total=   0.0s
[CV] n_neighbors=2, weights=distance .................................
[CV] ..... n_neighbors=2, weights=distance, score=0.937, total=   0.0s
[CV] n_neighbors=2, weights=distance .................................
[CV] ..... n_neighbors=2, weights=distance, score=0.961, total=   0.0s
[CV] n_neighbors=2, weights=distance .................................
[CV] ..... n_neighbors=2, weights=distance, score=0.919, total=   0.0s
[CV] n_neighbors=2, weights=distance .................................
[CV] ..... n_neighbors=2, weights=distance, score=0.947, total=   0.0s
[CV] n_neighbors=2, weights=distance .................................
[CV] ..... n_neighbors=2, weights=distance, score=0.959, total=   0.0s
[CV] 

[Parallel(n_jobs=1)]: Done  60 out of  60 | elapsed:    0.8s finished


GridSearchCV(cv=5, estimator=KNeighborsRegressor(),
             param_grid={'n_neighbors': [1, 2, 3, 4, 5, 10],
                         'weights': ['uniform', 'distance']},
             verbose=3)

In [15]:
# showing the estimator configured with the best-scoring parameter combination
grid_search.best_estimator_

KNeighborsRegressor(weights='distance')

In [16]:
# fitting model with best parameters.
# The hyper-parameters are taken from the grid search result instead of being
# copied by hand from its printed output, so this cell stays in sync with
# whatever combination the search actually selects on a given run.

model = KNeighborsRegressor(**grid_search.best_params_)
model.fit(X_train,y_train)
print('R2 score on test data is %.2f'%model.score(X_test,y_test))

R2 score on test data is 0.97
