In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import Imputer, StandardScaler
from sklearn.model_selection import GridSearchCV, ShuffleSplit, train_test_split
from sklearn.feature_selection import SelectFromModel
from sklearn.pipeline import make_pipeline

In [2]:
#loading the dataset
with open("nba_2013.csv", 'r') as csvfile:
    nba = pd.read_csv(csvfile)

## Data Exploration :

In [3]:
#shape of dataset
nba.shape

(481, 31)

In [4]:
nba.head()

Unnamed: 0,player,pos,age,bref_team_id,g,gs,mp,fg,fga,fg.,...,drb,trb,ast,stl,blk,tov,pf,pts,season,season_end
0,Quincy Acy,SF,23,TOT,63,0,847,66,141,0.468,...,144,216,28,23,26,30,122,171,2013-2014,2013
1,Steven Adams,C,20,OKC,81,20,1197,93,185,0.503,...,190,332,43,40,57,71,203,265,2013-2014,2013
2,Jeff Adrien,PF,27,TOT,53,12,961,143,275,0.52,...,204,306,38,24,36,39,108,362,2013-2014,2013
3,Arron Afflalo,SG,28,ORL,73,73,2552,464,1011,0.459,...,230,262,248,35,3,146,136,1330,2013-2014,2013
4,Alexis Ajinca,C,25,NOP,56,30,951,136,249,0.546,...,183,277,40,23,46,63,187,328,2013-2014,2013


In [5]:
#seperating target data from rest of the dataset
target = nba.pop('pts')

In [6]:
target.head()

0     171
1     265
2     362
3    1330
4     328
Name: pts, dtype: int64

In [7]:
nba.head()

Unnamed: 0,player,pos,age,bref_team_id,g,gs,mp,fg,fga,fg.,...,orb,drb,trb,ast,stl,blk,tov,pf,season,season_end
0,Quincy Acy,SF,23,TOT,63,0,847,66,141,0.468,...,72,144,216,28,23,26,30,122,2013-2014,2013
1,Steven Adams,C,20,OKC,81,20,1197,93,185,0.503,...,142,190,332,43,40,57,71,203,2013-2014,2013
2,Jeff Adrien,PF,27,TOT,53,12,961,143,275,0.52,...,102,204,306,38,24,36,39,108,2013-2014,2013
3,Arron Afflalo,SG,28,ORL,73,73,2552,464,1011,0.459,...,32,230,262,248,35,3,146,136,2013-2014,2013
4,Alexis Ajinca,C,25,NOP,56,30,951,136,249,0.546,...,94,183,277,40,23,46,63,187,2013-2014,2013


'season' and 'season_end' both values remain the same for all rows. so that means they are not adding any value to the dataset which have a effect on target. also 'player' column contains all the different name of the players. so it would be better if we romove all these three columns.

In [8]:
nba = nba.drop(columns=['player', 'season', 'season_end'], axis=1)
nba.shape

(481, 27)

In [9]:
#checking for the type of data each feature possessed
nba.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 481 entries, 0 to 480
Data columns (total 27 columns):
pos             481 non-null object
age             481 non-null int64
bref_team_id    481 non-null object
g               481 non-null int64
gs              481 non-null int64
mp              481 non-null int64
fg              481 non-null int64
fga             481 non-null int64
fg.             479 non-null float64
x3p             481 non-null int64
x3pa            481 non-null int64
x3p.            414 non-null float64
x2p             481 non-null int64
x2pa            481 non-null int64
x2p.            478 non-null float64
efg.            479 non-null float64
ft              481 non-null int64
fta             481 non-null int64
ft.             461 non-null float64
orb             481 non-null int64
drb             481 non-null int64
trb             481 non-null int64
ast             481 non-null int64
stl             481 non-null int64
blk             481 non-null int64
tov     

Two columns named 'pos' and 'bref_team_id' do not have numerical data type. so first we have to encode it in numerical form.

In [10]:
nba['pos'].unique()

array(['SF', 'C', 'PF', 'SG', 'PG', 'G', 'F'], dtype=object)

In [11]:
nba['bref_team_id'].unique()

array(['TOT', 'OKC', 'ORL', 'NOP', 'NYK', 'POR', 'MIA', 'MEM', 'BRK',
       'PHI', 'MIL', 'ATL', 'WAS', 'GSW', 'DEN', 'HOU', 'SAS', 'BOS',
       'PHO', 'MIN', 'LAC', 'CLE', 'UTA', 'DET', 'CHA', 'DAL', 'CHI',
       'LAL', 'IND', 'TOR', 'SAC'], dtype=object)

In [12]:
nba_dummies = pd.get_dummies(nba, columns=['pos', 'bref_team_id'])
nba_dummies.head()

Unnamed: 0,age,g,gs,mp,fg,fga,fg.,x3p,x3pa,x3p.,...,bref_team_id_ORL,bref_team_id_PHI,bref_team_id_PHO,bref_team_id_POR,bref_team_id_SAC,bref_team_id_SAS,bref_team_id_TOR,bref_team_id_TOT,bref_team_id_UTA,bref_team_id_WAS
0,23,63,0,847,66,141,0.468,4,15,0.266667,...,0,0,0,0,0,0,0,1,0,0
1,20,81,20,1197,93,185,0.503,0,0,,...,0,0,0,0,0,0,0,0,0,0
2,27,53,12,961,143,275,0.52,0,0,,...,0,0,0,0,0,0,0,1,0,0
3,28,73,73,2552,464,1011,0.459,128,300,0.426667,...,1,0,0,0,0,0,0,0,0,0
4,25,56,30,951,136,249,0.546,0,1,0.0,...,0,0,0,0,0,0,0,0,0,0


## Model training :

In [13]:
#Splitting the dataset into train and test
x_train, x_test, y_train, y_test = train_test_split(nba_dummies, target,
                                                    test_size=0.25,
                                                    random_state=0)

#defining the type of cross validation we want to perform
cv = ShuffleSplit(n_splits=10, random_state=42)

#defining the pipeline process i.e. how all different processes should progress 
pipe = make_pipeline(Imputer(),               #Imputing i.e. replacing NaN value by the mean of the column
                     StandardScaler(),        #scaling the data
                     SelectFromModel(RandomForestClassifier(n_estimators=100)),       #defining the model by which we want to do feature selection
                     KNeighborsRegressor())    #estimator

#defining all the hyperparameter we want to tune
param_grid = {'kneighborsregressor__n_neighbors': [3, 5, 7]}

#performing gridsearch
grid = GridSearchCV(pipe, param_grid=param_grid, cv=cv)

In [14]:
#training the model
grid.fit(x_train, y_train)

GridSearchCV(cv=ShuffleSplit(n_splits=10, random_state=42, test_size='default',
       train_size=None),
       error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('imputer', Imputer(axis=0, copy=True, missing_values='NaN', strategy='mean', verbose=0)), ('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('selectfrommodel', SelectFromModel(estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
  ...nkowski',
          metric_params=None, n_jobs=1, n_neighbors=5, p=2,
          weights='uniform'))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'kneighborsregressor__n_neighbors': [3, 5, 7]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [15]:
#predicting the model
grid.score(x_test, y_test)

0.9610329873904839

## finding which features have been selected by feature selection process :

In [16]:
final_pipeline = grid.best_estimator_
final_pipeline

Pipeline(memory=None,
     steps=[('imputer', Imputer(axis=0, copy=True, missing_values='NaN', strategy='mean', verbose=0)), ('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('selectfrommodel', SelectFromModel(estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
  ...nkowski',
          metric_params=None, n_jobs=1, n_neighbors=7, p=2,
          weights='uniform'))])

In [17]:
final_classifier = final_pipeline.named_steps
final_classifier

{'imputer': Imputer(axis=0, copy=True, missing_values='NaN', strategy='mean', verbose=0),
 'kneighborsregressor': KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=7, p=2,
           weights='uniform'),
 'selectfrommodel': SelectFromModel(estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
             max_depth=None, max_features='auto', max_leaf_nodes=None,
             min_impurity_decrease=0.0, min_impurity_split=None,
             min_samples_leaf=1, min_samples_split=2,
             min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
             oob_score=False, random_state=None, verbose=0,
             warm_start=False),
         norm_order=1, prefit=False, threshold=None),
 'standardscaler': StandardScaler(copy=True, with_mean=True, with_std=True)}

In [18]:
len(x_train.columns)     #no. of featurs before the feature selection

63

In [19]:
select_indices = final_pipeline.named_steps.selectfrommodel.transform(np.arange(len(x_train.columns)).reshape(1, -1))
select_indices        #indexes of features that have been selected after feature selection

array([[ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
        16, 17, 18, 19, 20, 21, 22, 23, 24]])

In [20]:
feature_names = x_train.columns[select_indices]
feature_names        #name of such features

Index([['age', 'g', 'gs', 'mp', 'fg', 'fga', 'fg.', 'x3p', 'x3pa', 'x3p.', 'x2p', 'x2pa', 'x2p.', 'efg.', 'ft', 'fta', 'ft.', 'orb', 'drb', 'trb', 'ast', 'stl', 'blk', 'tov', 'pf']], dtype='object')

In [21]:
x_train.columns     #name of all feaures present in the dataset before feature selection

Index(['age', 'g', 'gs', 'mp', 'fg', 'fga', 'fg.', 'x3p', 'x3pa', 'x3p.',
       'x2p', 'x2pa', 'x2p.', 'efg.', 'ft', 'fta', 'ft.', 'orb', 'drb', 'trb',
       'ast', 'stl', 'blk', 'tov', 'pf', 'pos_C', 'pos_F', 'pos_G', 'pos_PF',
       'pos_PG', 'pos_SF', 'pos_SG', 'bref_team_id_ATL', 'bref_team_id_BOS',
       'bref_team_id_BRK', 'bref_team_id_CHA', 'bref_team_id_CHI',
       'bref_team_id_CLE', 'bref_team_id_DAL', 'bref_team_id_DEN',
       'bref_team_id_DET', 'bref_team_id_GSW', 'bref_team_id_HOU',
       'bref_team_id_IND', 'bref_team_id_LAC', 'bref_team_id_LAL',
       'bref_team_id_MEM', 'bref_team_id_MIA', 'bref_team_id_MIL',
       'bref_team_id_MIN', 'bref_team_id_NOP', 'bref_team_id_NYK',
       'bref_team_id_OKC', 'bref_team_id_ORL', 'bref_team_id_PHI',
       'bref_team_id_PHO', 'bref_team_id_POR', 'bref_team_id_SAC',
       'bref_team_id_SAS', 'bref_team_id_TOR', 'bref_team_id_TOT',
       'bref_team_id_UTA', 'bref_team_id_WAS'],
      dtype='object')

In [22]:
#best hyperparameters on which model performs well
grid.best_params_

{'kneighborsregressor__n_neighbors': 7}