# kNN для произвольных метрик близости

### Сколько очков наберет игрок в сезоне?

In [1]:
import pandas as pd
import numpy as np

- player – имя игрока
- pos – роль игрока на поле
- g – количество матчей, в которых участвовал игрок
- gs – количество матчей, в которых игрок был на поле с самого начала
- pts – общее количество очков

pos:
- SF small forward
- C center
- PF power forward
- SG shooting guard
- PG point guard
- G guard
- F forward

In [2]:
# Load the per-player season statistics and preview the first rows.
data = pd.read_csv( 'nba_2013.csv' )
data.head()

Unnamed: 0,player,pos,age,bref_team_id,g,gs,mp,fg,fga,fg.,...,drb,trb,ast,stl,blk,tov,pf,pts,season,season_end
0,Quincy Acy,SF,23,TOT,63,0,847,66,141,0.468,...,144,216,28,23,26,30,122,171,2013-2014,2013
1,Steven Adams,C,20,OKC,81,20,1197,93,185,0.503,...,190,332,43,40,57,71,203,265,2013-2014,2013
2,Jeff Adrien,PF,27,TOT,53,12,961,143,275,0.52,...,204,306,38,24,36,39,108,362,2013-2014,2013
3,Arron Afflalo,SG,28,ORL,73,73,2552,464,1011,0.459,...,230,262,248,35,3,146,136,1330,2013-2014,2013
4,Alexis Ajinca,C,25,NOP,56,30,951,136,249,0.546,...,183,277,40,23,46,63,187,328,2013-2014,2013


Для построения модели возьмем несколько столбцов

In [3]:
# Columns used as model features: position, age, games played, games started.
predictionColumns = ['pos', 'age', 'g', 'gs']

In [4]:
# Keep only the feature columns and preview the result.
dataForPrediction = data[ predictionColumns ]
dataForPrediction.head()

Unnamed: 0,pos,age,g,gs
0,SF,23,63,0
1,C,20,81,20
2,PF,27,53,12
3,SG,28,73,73
4,C,25,56,30


pos - категориальная переменная, надо перевести в количественную

In [5]:
# One-hot encode the categorical 'pos' column into pos_* indicator columns;
# the numeric columns (age, g, gs) are kept as they are.
dataForPrediction = pd.get_dummies( dataForPrediction )
dataForPrediction.head()

Unnamed: 0,age,g,gs,pos_C,pos_F,pos_G,pos_PF,pos_PG,pos_SF,pos_SG
0,23,63,0,0,0,0,0,0,1,0
1,20,81,20,1,0,0,0,0,0,0
2,27,53,12,0,0,0,1,0,0,0
3,28,73,73,0,0,0,0,0,0,1
4,25,56,30,1,0,0,0,0,0,0


Заметим, что названия колонок нашего dataframe изменились

In [6]:
# Refresh the feature list: it now holds the encoded column names
# (pos_* indicators instead of 'pos'), as a numpy array of strings.
predictionColumns = dataForPrediction.columns.values
predictionColumns

array(['age', 'g', 'gs', 'pos_C', 'pos_F', 'pos_G', 'pos_PF', 'pos_PG',
       'pos_SF', 'pos_SG'], dtype=object)

Проведем нормализацию данных

In [7]:
# Standardise every column: subtract the column mean and divide by the
# column standard deviation, so all features share a comparable scale.
dataNormalized = ( dataForPrediction - dataForPrediction.mean() ) / dataForPrediction.std()
dataNormalized.head()

Unnamed: 0,age,g,gs,pos_C,pos_F,pos_G,pos_PF,pos_PG,pos_SF,pos_SG
0,-0.835906,0.384886,-0.862207,-0.479271,-0.045596,-0.045596,-0.498831,-0.462818,1.962285,-0.540742
1,-1.550487,1.095711,-0.187863,2.082165,-0.045596,-0.045596,-0.498831,-0.462818,-0.50855,-0.540742
2,0.116868,-0.010016,-0.4576,-0.479271,-0.045596,-0.045596,2.00052,-0.462818,-0.50855,-0.540742
3,0.355062,0.779789,1.599148,-0.479271,-0.045596,-0.045596,-0.498831,-0.462818,-0.50855,1.845467
4,-0.359519,0.108454,0.149309,2.082165,-0.045596,-0.045596,-0.498831,-0.462818,-0.50855,-0.540742


Проверим, что у всех столбцов среднее 0 и среднеквадратичное отклонение 1

In [8]:
# Summary statistics per column; after standardisation the mean should be
# (numerically) zero and the standard deviation one.
dataNormalized.describe()

Unnamed: 0,age,g,gs,pos_C,pos_F,pos_G,pos_PF,pos_PG,pos_SF,pos_SG
count,481.0,481.0,481.0,481.0,481.0,481.0,481.0,481.0,481.0,481.0
mean,2.658996e-16,1.551081e-16,-7.386099e-18,-2.585135e-17,0.0,7.386099e-18,2.4004820000000003e-17,7.386099e-18,-1.107915e-16,1.846525e-18
std,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
min,-1.78868,-2.063509,-0.8622067,-0.4792708,-0.045596,-0.04559608,-0.4988309,-0.4628177,-0.5085504,-0.5407417
25%,-0.835906,-0.8393113,-0.8622067,-0.4792708,-0.045596,-0.04559608,-0.4988309,-0.4628177,-0.5085504,-0.5407417
50%,-0.1213252,0.3059057,-0.5250348,-0.4792708,-0.045596,-0.04559608,-0.4988309,-0.4628177,-0.5085504,-0.5407417
75%,0.5932556,0.8982593,0.9585214,-0.4792708,-0.045596,-0.04559608,-0.4988309,-0.4628177,-0.5085504,-0.5407417
max,2.975192,1.174691,1.902603,2.082165,21.886116,21.88612,2.00052,2.156186,1.962285,1.845467


Выберем игрока, для которого будем искать наиболее близких

In [9]:
# Reference player: the row with index label 0 of the normalised table.
selectedPlayer = dataNormalized.loc[ 0 ]
selectedPlayer

age      -0.835906
g         0.384886
gs       -0.862207
pos_C    -0.479271
pos_F    -0.045596
pos_G    -0.045596
pos_PF   -0.498831
pos_PG   -0.462818
pos_SF    1.962285
pos_SG   -0.540742
Name: 0, dtype: float64

In [10]:
def euclidean_distance( player, selectedPlayer, columns = None ):
    """
    Return the Euclidean distance between ``player`` and ``selectedPlayer``.

    Both arguments must be indexable by column name (for example, pandas
    Series holding one row each, or plain dicts).

    Parameters
    ----------
    player
        The current player whose distance is being measured.
    selectedPlayer
        The reference player the distance is measured to.
    columns
        Optional iterable of column names to compare. When omitted, the
        module-level ``predictionColumns`` is used, so existing calls such as
        ``DataFrame.apply( euclidean_distance, axis = 1, args = (selectedPlayer,) )``
        keep working unchanged.

    Returns
    -------
    The square root of the sum of squared per-column differences
    (0.0 when ``columns`` is empty).
    """
    # Compare with None explicitly: a numpy array of names (which is what
    # predictionColumns holds) has no unambiguous truth value.
    if columns is None:
        columns = predictionColumns

    squared_diffs = ( ( player[ name ] - selectedPlayer[ name ] )**2 for name in columns )
    return sum( squared_diffs )**0.5

In [11]:
def euclidean_distance2( x, y ):
    """
    Return the Euclidean distance between the vectors ``x`` and ``y``.

    Vectorised counterpart of ``euclidean_distance``, written in the
    two-argument form that can be passed as the ``metric`` callable of
    KNeighborsRegressor.
    """
    delta = x - y
    sum_of_squares = np.sum( delta**2 )
    return sum_of_squares**0.5

Введем новый столбец "близости" игроков к выбранному

In [12]:
# Apply euclidean_distance to every row (axis = 1), passing selectedPlayer as
# the extra argument, and store the result in a new 'distance' column.
dataNormalized['distance'] = dataNormalized.apply( euclidean_distance, axis = 1, args = (selectedPlayer,) )
dataNormalized.head()

Unnamed: 0,age,g,gs,pos_C,pos_F,pos_G,pos_PF,pos_PG,pos_SF,pos_SG,distance
0,-0.835906,0.384886,-0.862207,-0.479271,-0.045596,-0.045596,-0.498831,-0.462818,1.962285,-0.540742,0.0
1,-1.550487,1.095711,-0.187863,2.082165,-0.045596,-0.045596,-0.498831,-0.462818,-0.50855,-0.540742,3.75987
2,0.116868,-0.010016,-0.4576,-0.479271,-0.045596,-0.045596,2.00052,-0.462818,-0.50855,-0.540742,3.684999
3,0.355062,0.779789,1.599148,-0.479271,-0.045596,-0.045596,-0.498831,-0.462818,-0.50855,1.845467,4.408134
4,-0.359519,0.108454,0.149309,2.082165,-0.045596,-0.045596,-0.498831,-0.462818,-0.50855,-0.540742,3.740656


Объединяем вычисления с исходными данными

In [13]:
# Attach the computed 'distance' column to the original (un-normalised) table;
# join matches rows by index.
dataWithDistance = data.join( dataNormalized['distance'] )
dataWithDistance.head()

Unnamed: 0,player,pos,age,bref_team_id,g,gs,mp,fg,fga,fg.,...,trb,ast,stl,blk,tov,pf,pts,season,season_end,distance
0,Quincy Acy,SF,23,TOT,63,0,847,66,141,0.468,...,216,28,23,26,30,122,171,2013-2014,2013,0.0
1,Steven Adams,C,20,OKC,81,20,1197,93,185,0.503,...,332,43,40,57,71,203,265,2013-2014,2013,3.75987
2,Jeff Adrien,PF,27,TOT,53,12,961,143,275,0.52,...,306,38,24,36,39,108,362,2013-2014,2013,3.684999
3,Arron Afflalo,SG,28,ORL,73,73,2552,464,1011,0.459,...,262,248,35,3,146,136,1330,2013-2014,2013,4.408134
4,Alexis Ajinca,C,25,NOP,56,30,951,136,249,0.546,...,277,40,23,46,63,187,328,2013-2014,2013,3.740656


In [14]:
# Six closest rows: the selected player itself (distance 0) plus the five
# nearest other players.
dataWithDistance.sort_values( by = 'distance', ascending = True ).head(6)

Unnamed: 0,player,pos,age,bref_team_id,g,gs,mp,fg,fga,fg.,...,trb,ast,stl,blk,tov,pf,pts,season,season_end,distance
0,Quincy Acy,SF,23,TOT,63,0,847,66,141,0.468,...,216,28,23,26,30,122,171,2013-2014,2013,0.0
181,Jordan Hamilton,SF,23,TOT,60,12,1019,147,376,0.391,...,194,52,44,20,46,67,404,2013-2014,2013,0.421594
213,Robbie Hummel,SF,24,MIN,53,5,655,67,177,0.379,...,132,23,16,2,10,57,181,2013-2014,2013,0.491025
22,Darrell Arthur,SF,25,DEN,68,1,1161,162,410,0.395,...,210,61,39,47,58,185,401,2013-2014,2013,0.516787
42,Michael Beasley,SF,25,MIA,55,2,831,177,355,0.499,...,172,42,23,21,57,93,436,2013-2014,2013,0.575586
106,Jae Crowder,SF,23,DAL,78,8,1254,130,296,0.439,...,194,60,59,21,41,100,356,2013-2014,2013,0.650877


Первую строчку не учитываем (это и есть выбранный игрок selectedPlayer)

Считаем среднее количество очков для пяти самых близких игроков

In [15]:
# Mean 'pts' of the five nearest players. Position 0 after sorting is the
# selected player (distance 0), so the slice starts at position 1.
dataWithDistance.sort_values( by = 'distance', ascending = True ).iloc[1:6]['pts'].mean()

355.6

### То же самое с помощью KNeighborsRegressor

In [23]:
from sklearn.neighbors import KNeighborsRegressor

In [24]:
# kNN regressor with 5 neighbours and otherwise default settings.
knn = KNeighborsRegressor( n_neighbors = 5 )

In [8]:
?sklearn.neighbors.KNeighborsRegressor

Object `sklearn.KNeighborsRegressor` not found.


In [25]:
# Training features: every row except the selected player (index label 0).
# Columns are chosen by name instead of positionally (`:-1`), so the selection
# does not depend on 'distance' being the last column.
dataNormalized.loc[1:, predictionColumns].head()

Unnamed: 0,age,g,gs,pos_C,pos_F,pos_G,pos_PF,pos_PG,pos_SF,pos_SG
1,-1.550487,1.095711,-0.187863,2.082165,-0.045596,-0.045596,-0.498831,-0.462818,-0.50855,-0.540742
2,0.116868,-0.010016,-0.4576,-0.479271,-0.045596,-0.045596,2.00052,-0.462818,-0.50855,-0.540742
3,0.355062,0.779789,1.599148,-0.479271,-0.045596,-0.045596,-0.498831,-0.462818,-0.50855,1.845467
4,-0.359519,0.108454,0.149309,2.082165,-0.045596,-0.045596,-0.498831,-0.462818,-0.50855,-0.540742
5,-0.359519,-0.286448,-0.794772,2.082165,-0.045596,-0.045596,-0.498831,-0.462818,-0.50855,-0.540742


In [26]:
# Training target: 'pts' for every row except the selected player (label 0).
data.loc[1:, 'pts'].head()

1     265
2     362
3    1330
4     328
5      92
Name: pts, dtype: int64

In [27]:
# Fit on all players except the selected one (index label 0). Features are
# selected by name rather than with `:-1`, so a missing or reordered
# 'distance' column cannot silently drop a real feature.
knn.fit( dataNormalized.loc[1:, predictionColumns], data.loc[1:, 'pts'] )

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
          metric_params=None, n_jobs=1, n_neighbors=5, p=2,
          weights='uniform')

Предсказание количества очков для выбранного игрока selectedPlayer (столбец distance в признаки не входит)

In [30]:
# Predict for the selected player (index label 0) as a one-row frame holding
# exactly the feature columns, selected by name instead of positionally.
knn.predict( dataNormalized.loc[[0], predictionColumns] )

array([ 355.6])

### И еще раз с помощью KNeighborsRegressor, передавая метрику как функцию

In [31]:
# Same regressor, but with the distance supplied as a Python callable.
knn = KNeighborsRegressor( n_neighbors = 5, metric = euclidean_distance2 )

In [32]:
# Fit on all players except the selected one (index label 0). Features are
# selected by name rather than with `:-1`, so a missing or reordered
# 'distance' column cannot silently drop a real feature.
knn.fit( dataNormalized.loc[1:, predictionColumns], data.loc[1:, 'pts'] )

KNeighborsRegressor(algorithm='auto', leaf_size=30,
          metric=<function euclidean_distance2 at 0x000000ECEEAC4C80>,
          metric_params=None, n_jobs=1, n_neighbors=5, p=2,
          weights='uniform')

In [33]:
# Predict for the selected player (index label 0) as a one-row frame holding
# exactly the feature columns, selected by name instead of positionally.
knn.predict( dataNormalized.loc[[0], predictionColumns] )

array([ 355.6])