In [316]:
import pandas as pd
import numpy as np
from scipy.spatial import distance
from numpy.random import permutation
import math
import random
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neighbors import NearestNeighbors

In [317]:
nba = pd.read_csv("datasets/nba_2013.csv")

In [318]:
nba.head(10).transpose()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
player,Quincy Acy,Steven Adams,Jeff Adrien,Arron Afflalo,Alexis Ajinca,Cole Aldrich,LaMarcus Aldridge,Lavoy Allen,Ray Allen,Tony Allen
pos,SF,C,PF,SG,C,C,PF,PF,SG,SG
age,23,20,27,28,25,25,28,24,38,32
bref_team_id,TOT,OKC,TOT,ORL,NOP,NYK,POR,TOT,MIA,MEM
g,63,81,53,73,56,46,69,65,73,55
gs,0,20,12,73,30,2,69,2,9,28
mp,847,1197,961,2552,951,330,2498,1072,1936,1278
fg,66,93,143,464,136,33,652,134,240,204
fga,141,185,275,1011,249,61,1423,300,543,413
fg.,0.468,0.503,0.52,0.459,0.546,0.541,0.458,0.447,0.442,0.494


## Find the Euclidean Distance

In [319]:
selected_player = nba[nba["player"] == "Manu Ginobili"]

In [320]:
selected_player

Unnamed: 0,player,pos,age,bref_team_id,g,gs,mp,fg,fga,fg.,...,drb,trb,ast,stl,blk,tov,pf,pts,season,season_end
164,Manu Ginobili,SG,36,SAS,68,3,1550,294,627,0.469,...,172,202,293,70,17,139,128,838,2013-2014,2013


In [321]:
# Choose only the numeric columns (we'll use these to compute euclidean distance)

distance_columns = ['age', 'g', 'gs', 'mp', 'fg', 'fga', \
                    'fg.', 'x3p', 'x3pa', 'x3p.', 'x2p', \
                    'x2pa', 'x2p.', 'efg.', 'ft', 'fta', \
                    'ft.', 'orb', 'drb', 'trb', 'ast', 'stl', \
                    'blk', 'tov', 'pf', 'pts']

np.array(distance_columns)

array(['age', 'g', 'gs', 'mp', 'fg', 'fga', 'fg.', 'x3p', 'x3pa', 'x3p.',
       'x2p', 'x2pa', 'x2p.', 'efg.', 'ft', 'fta', 'ft.', 'orb', 'drb',
       'trb', 'ast', 'stl', 'blk', 'tov', 'pf', 'pts'],
      dtype='<U4')

In [322]:
def euclidean_distance(row):
    inner_value = 0
    for k in distance_columns:
        inner_value += (row[k] - selected_player[k]) ** 2
    return math.sqrt(inner_value)

In [323]:
# Find the distance from each player in the dataset to lebron.
player_distance = nba.apply(euclidean_distance, axis=1)

In [324]:
player_distance.head(10)

0    1222.360442
1            NaN
2            NaN
3    1269.029556
4     996.379510
5            NaN
6    2043.687852
7     918.220612
8     490.002046
9     596.252518
dtype: float64

In [325]:
# Select only the numeric columns from the NBA dataset
nba_numeric = nba[distance_columns]

In [326]:
nba_numeric.head().transpose()

Unnamed: 0,0,1,2,3,4
age,23.0,20.0,27.0,28.0,25.0
g,63.0,81.0,53.0,73.0,56.0
gs,0.0,20.0,12.0,73.0,30.0
mp,847.0,1197.0,961.0,2552.0,951.0
fg,66.0,93.0,143.0,464.0,136.0
fga,141.0,185.0,275.0,1011.0,249.0
fg.,0.468,0.503,0.52,0.459,0.546
x3p,4.0,0.0,0.0,128.0,0.0
x3pa,15.0,0.0,0.0,300.0,1.0
x3p.,0.266667,,,0.426667,0.0


In [327]:
# Normalize all of the numeric columns

nba_normalized = (nba_numeric - nba_numeric.mean()) / nba_numeric.std()

In [328]:
nba_normalized.head()

Unnamed: 0,age,g,gs,mp,fg,fga,fg.,x3p,x3pa,x3p.,...,ft.,orb,drb,trb,ast,stl,blk,tov,pf,pts
0,-0.835906,0.384886,-0.862207,-0.435088,-0.738401,-0.768505,0.319884,-0.700282,-0.716608,-0.117009,...,-0.389712,0.26069,-0.129462,-0.013116,-0.64522,-0.468056,0.06141,-0.66765,0.226515,-0.734621
1,-1.550487,1.095711,-0.187863,-0.045011,-0.581271,-0.649215,0.674593,-0.778936,-0.829601,,...,-0.88295,1.387883,0.18702,0.565852,-0.530733,0.02068,1.065446,-0.01376,1.363938,-0.534801
2,0.116868,-0.010016,-0.4576,-0.308035,-0.290291,-0.405214,0.84688,-0.778936,-0.829601,,...,-0.520826,0.743773,0.28334,0.436083,-0.568895,-0.439307,0.385292,-0.524113,0.029924,-0.328603
3,0.355062,0.779789,1.599148,1.465144,1.577804,1.590172,0.228673,1.737992,1.430256,0.898007,...,0.578033,-0.38342,0.462221,0.216475,1.033919,-0.123066,-0.68352,1.18238,0.423107,1.729123
4,-0.359519,0.108454,0.149309,-0.31918,-0.331028,-0.475703,1.110379,-0.778936,-0.822068,-1.808704,...,0.709147,0.614951,0.138859,0.291341,-0.55363,-0.468056,0.709175,-0.141348,1.139262,-0.400878


In [329]:
# fill in NA values in nba_normalized
nba_normalized.fillna(0, inplace = True)

In [330]:
nba_normalized.head()

Unnamed: 0,age,g,gs,mp,fg,fga,fg.,x3p,x3pa,x3p.,...,ft.,orb,drb,trb,ast,stl,blk,tov,pf,pts
0,-0.835906,0.384886,-0.862207,-0.435088,-0.738401,-0.768505,0.319884,-0.700282,-0.716608,-0.117009,...,-0.389712,0.26069,-0.129462,-0.013116,-0.64522,-0.468056,0.06141,-0.66765,0.226515,-0.734621
1,-1.550487,1.095711,-0.187863,-0.045011,-0.581271,-0.649215,0.674593,-0.778936,-0.829601,0.0,...,-0.88295,1.387883,0.18702,0.565852,-0.530733,0.02068,1.065446,-0.01376,1.363938,-0.534801
2,0.116868,-0.010016,-0.4576,-0.308035,-0.290291,-0.405214,0.84688,-0.778936,-0.829601,0.0,...,-0.520826,0.743773,0.28334,0.436083,-0.568895,-0.439307,0.385292,-0.524113,0.029924,-0.328603
3,0.355062,0.779789,1.599148,1.465144,1.577804,1.590172,0.228673,1.737992,1.430256,0.898007,...,0.578033,-0.38342,0.462221,0.216475,1.033919,-0.123066,-0.68352,1.18238,0.423107,1.729123
4,-0.359519,0.108454,0.149309,-0.31918,-0.331028,-0.475703,1.110379,-0.778936,-0.822068,-1.808704,...,0.709147,0.614951,0.138859,0.291341,-0.55363,-0.468056,0.709175,-0.141348,1.139262,-0.400878


In [331]:
# find the normalized vector for Manu Ginobili

ginobili_normalized = nba_normalized[nba["player"] == "Manu Ginobili"]
ginobili_normalized.head()

Unnamed: 0,age,g,gs,mp,fg,fga,fg.,x3p,x3pa,x3p.,...,ft.,orb,drb,trb,ast,stl,blk,tov,pf,pts
164,2.260611,0.582337,-0.761055,0.348409,0.58847,0.549101,0.330018,0.990779,1.113876,0.404269,...,0.8028,-0.415625,0.063179,-0.082991,1.377379,0.883156,-0.230085,1.07074,0.310769,0.683254


In [332]:
# find the distance between ginobili and everyone else

euclidean_distances = nba_normalized.apply(lambda row: distance.euclidean(row, ginobili_normalized), axis = 1)

euclidean_distances.head()

0    6.165023
1    6.613089
2    5.368253
3    4.634847
4    5.989447
dtype: float64

In [333]:
distance_frame = pd.DataFrame(data = {"dist": euclidean_distances, "idx": euclidean_distances.index})

distance_frame.head()

Unnamed: 0,dist,idx
0,6.165023,0
1,6.613089,1
2,5.368253,2
3,4.634847,3
4,5.989447,4


In [334]:
distance_frame.sort_values("dist", inplace = True)

In [335]:
distance_frame.head()

Unnamed: 0,dist,idx
164,0.0,164
8,2.218999,8
467,2.531433,467
220,2.756342,220
84,2.872591,84


In [336]:
# find the most similar player to ginobili
# the lowest distance to ginobili is ginobili
# the second lowest is the most similar to ginobili

second_smallest = distance_frame.iloc[1]["idx"]
most_similar_to_ginobili = nba.loc[int(second_smallest)]["player"]

most_similar_to_ginobili

'Ray Allen'

## Generating training and testing sets

Let's make predictions on a test set. We'll try to predict how many points a player scored using the 5 closest neighbors. We'll find neighbors by using all the numeric columns in the dataset to generate similarity scores. 

First, we have to generate test and train sets.

In [337]:
# shuffle

nba.fillna(0, inplace = True)

random_indices = permutation(nba.index)

test_cutoff = math.floor(len(nba)/3)

test = nba.loc[random_indices[1:test_cutoff]]

train = nba.loc[random_indices[test_cutoff:]]

In [338]:
len(test)

159

In [339]:
len(train)

321

In [340]:
len(nba)

481

In [341]:
train.head()

Unnamed: 0,player,pos,age,bref_team_id,g,gs,mp,fg,fga,fg.,...,drb,trb,ast,stl,blk,tov,pf,pts,season,season_end
366,J.J. Redick,SG,29,LAC,35,34,987,181,398,0.455,...,65,74,78,28,3,42,65,532,2013-2014,2013
318,Timofey Mozgov,C,27,DEN,82,30,1770,285,545,0.523,...,357,528,62,27,100,122,213,770,2013-2014,2013
459,Russell Westbrook,PG,25,OKC,46,46,1412,346,791,0.437,...,208,263,319,88,7,177,104,1002,2013-2014,2013
271,DeAndre Liggins,SG,25,MIA,1,0,1,1,1,1.0,...,0,1,0,0,0,0,0,2,2013-2014,2013
29,Leandro Barbosa,PG,31,PHO,20,0,368,56,131,0.427,...,32,37,32,7,4,19,30,150,2013-2014,2013


In [342]:
test.head()

Unnamed: 0,player,pos,age,bref_team_id,g,gs,mp,fg,fga,fg.,...,drb,trb,ast,stl,blk,tov,pf,pts,season,season_end
227,Antawn Jamison,PF,37,LAC,22,0,248,29,92,0.315,...,46,55,7,9,3,7,29,84,2013-2014,2013
289,Jason Maxiell,PF,30,ORL,34,13,488,47,105,0.448,...,58,86,9,8,20,14,47,109,2013-2014,2013
97,Darren Collison,PG,26,LAC,80,35,2069,324,694,0.467,...,141,188,297,93,15,132,150,911,2013-2014,2013
201,Roy Hibbert,C,27,IND,81,81,2409,331,754,0.439,...,336,538,91,29,182,148,269,871,2013-2014,2013
426,Hollis Thompson,SF,22,PHI,77,41,1742,171,372,0.46,...,174,247,73,53,12,60,144,461,2013-2014,2013


## Using sklearn for k nearest neighbors

Documentation: http://scikit-learn.org/stable/modules/neighbors.html

We're using regressor because we have continuous values to predict on. Sklean performs the normalization and distance finding automatically. It also lets us specify how many neighbors we want to look at. 

In [343]:
# columns that we'd be making predictions with. 

x_columns = ['age', 'g', 'gs', 'mp', 'fg', 'fga', 'fg.', \
             'x3p', 'x3pa', 'x3p.', 'x2p', 'x2pa', 'x2p.', \
             'efg.', 'ft', 'fta', 'ft.', 'orb', 'drb', 'trb', \
             'ast', 'stl', 'blk', 'tov', 'pf']

In [344]:
# column that we want to predict.

y_column = ['pts']

In [345]:
# create the knn model
# look at the 5 closest neighbors

k = 5
knn = KNeighborsRegressor(n_neighbors = 5)
knn

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
          metric_params=None, n_jobs=1, n_neighbors=5, p=2,
          weights='uniform')

In [346]:
# fit the model on the training data
knn.fit(train[x_columns], train[y_column])

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
          metric_params=None, n_jobs=1, n_neighbors=5, p=2,
          weights='uniform')

In [347]:
# make point predictions on the test set using the fit model

predictions = knn.predict(test[x_columns])

predictions[0:5]

array([[  82.2],
       [ 141. ],
       [ 779.4],
       [ 972.2],
       [ 548.6]])

In [348]:
test.head()

Unnamed: 0,player,pos,age,bref_team_id,g,gs,mp,fg,fga,fg.,...,drb,trb,ast,stl,blk,tov,pf,pts,season,season_end
227,Antawn Jamison,PF,37,LAC,22,0,248,29,92,0.315,...,46,55,7,9,3,7,29,84,2013-2014,2013
289,Jason Maxiell,PF,30,ORL,34,13,488,47,105,0.448,...,58,86,9,8,20,14,47,109,2013-2014,2013
97,Darren Collison,PG,26,LAC,80,35,2069,324,694,0.467,...,141,188,297,93,15,132,150,911,2013-2014,2013
201,Roy Hibbert,C,27,IND,81,81,2409,331,754,0.439,...,336,538,91,29,182,148,269,871,2013-2014,2013
426,Hollis Thompson,SF,22,PHI,77,41,1742,171,372,0.46,...,174,247,73,53,12,60,144,461,2013-2014,2013


## Compute error

Let's compute the error involved in our predictions. We can compute the Mean Squared Error (MSE).

In [349]:
# get actual values for the test set

actual = test[y_column]

In [350]:
# compute the MSE 

mse = (((predictions - actual) ** 2).sum()) / len(predictions)

mse

pts    5869.010314
dtype: float64

## Get the nearest neighbors

In [351]:
k = 6

In [352]:
neigh = NearestNeighbors(n_neighbors = k)
neigh.fit(nba_normalized)
neighbors = neigh.kneighbors(ginobili_normalized, return_distance = False)
neighbors = neighbors.tolist()[0]

In [353]:
res = pd.DataFrame(nba.iloc[neighbors, :])
res.transpose()

Unnamed: 0,164,8,467,220,84,64
player,Manu Ginobili,Ray Allen,Mo Williams,Jarrett Jack,Vince Carter,Aaron Brooks
pos,SG,SG,PG,PG,SG,PG
age,36,38,31,30,37,29
bref_team_id,SAS,MIA,POR,CLE,DAL,TOT
g,68,73,74,80,81,72
gs,3,9,0,31,0,12
mp,1550,1936,1834,2252,1973,1557
fg,294,240,280,286,330,233
fga,627,543,672,698,811,581
fg.,0.469,0.442,0.417,0.41,0.407,0.401
