In [1]:
def apk(actual, predicted, k=10):
    """
    Computes the average precision at k.
    This function computes the average precision at k between two lists of
    items.
    Parameters
    ----------
    actual : list
             A list of elements that are to be predicted (order doesn't matter).
             Any sized container supporting ``in`` is accepted, including a
             NumPy array.
    predicted : list
                A list of predicted elements (order does matter)
    k : int, optional
        The maximum number of predicted elements
    Returns
    -------
    score : double
            The average precision at k over the input lists. 0.0 is returned
            when ``actual`` is None/empty or when ``k`` is not positive.
    """
    # Use an explicit length test rather than truthiness: `not actual` raises
    # for multi-element NumPy arrays and is True for a one-element array
    # holding 0. A non-positive k would otherwise divide by zero (or by a
    # negative number) at the end.
    if actual is None or len(actual) == 0 or k <= 0:
        return 0.0

    # Only the first k predictions are scored.
    predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i, p in enumerate(predicted):
        # A repeated prediction is only credited the first time it appears.
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i + 1.0)

    # Normalise by the best achievable number of hits within the cut-off.
    return score / min(len(actual), k)

def mapk(actual, predicted, k=10):
    """
    Computes the mean average precision at k.
    Each entry of `actual` is paired with the entry of `predicted` at the same
    position, scored with `apk`, and the scores are averaged.
    Parameters
    ----------
    actual : list
             A list of lists of elements that are to be predicted
             (order doesn't matter in the lists)
    predicted : list
                A list of lists of predicted elements
                (order matters in the lists)
    k : int, optional
        The maximum number of predicted elements
    Returns
    -------
    score : double
            The mean average precision at k over the input lists
    """
    per_query_scores = []
    # zip stops at the shorter input, so unmatched trailing entries are ignored.
    for truth, ranked in zip(actual, predicted):
        per_query_scores.append(apk(truth, ranked, k))
    return np.mean(per_query_scores)

In [2]:
import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
# import ml_metrics as metrics
%matplotlib inline

In [3]:
# Load the training set; later cells read its 'x', 'y' and 'place_id' columns.
df = pd.read_csv('train.csv')

In [4]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the split repeatable.
train, test = train_test_split(df, test_size=0.2, random_state=41)

In [5]:
# Features are every column except the target; the target is the place_id column.
X_train, X_test = (part.drop(['place_id'], axis=1) for part in (train, test))
y_train, y_test = train['place_id'], test['place_id']

In [6]:
# Mean (x, y) of the training check-ins for each place_id, indexed by place_id.
places = train.groupby('place_id')[['x', 'y']].mean()

In [7]:
# Turn the place_id index back into a regular column. Rebinding instead of
# mutating in place, and guarding on the column, keeps this cell safe to re-run
# (a second in-place reset would insert a stray 'index' column).
if 'place_id' not in places.columns:
    places = places.reset_index()

In [9]:
# 1-nearest-neighbour model fitted on the per-place mean coordinates, so a
# prediction is the place_id of the closest mean point.
clf = KNeighborsClassifier(n_neighbors=1)
clf.fit(X=places[['x', 'y']], y=places['place_id'])

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=1, p=2,
           weights='uniform')

In [11]:
# One predicted place_id per test row, using only the x/y coordinates.
preds = clf.predict(X_test[['x', 'y']])

In [22]:
# NOTE(review): apk scores ONE query, so this treats the whole test set as a
# single query. With k=3 the 1.0 shown below only says that the first three
# predicted ids each occur somewhere in y_test; it is not a per-row precision.
# mapk over per-row lists is what measures that.
apk(list(y_test.values), list(preds), k=3)

1.0

In [18]:
# Number of exact top-1 hits. count_nonzero runs vectorised, whereas the
# builtin sum() walks the boolean array one element at a time in Python.
np.count_nonzero(y_test.values == preds)

628201

In [20]:
# Top-1 accuracy, computed from the data instead of a hit count pasted from an
# earlier output, so it stays correct if the split or the model changes.
np.mean(y_test.values == preds)

0.10787149883963627

In [24]:
# Scratch check: reshape((3, 1)) turns a flat array into a single-column 2-D array.
np.array([1,2,3]).reshape((3,1))

array([[1],
       [2],
       [3]])

In [27]:
# One single-element row per test sample: the list-of-lists layout mapk iterates over.
preds_list = preds.reshape(-1, 1)
# Series.reshape is no longer available in current pandas; reshape the
# underlying ndarray instead.
y_test_list = y_test.values.reshape(-1, 1)

In [28]:
# With one truth and one prediction per row, each apk is 1.0 or 0.0, so the
# mean equals the top-1 accuracy computed above (the two outputs match).
mapk(y_test_list, preds_list)

0.10787149883963627

In [38]:
# Distances to, and row positions in `places` of, the 10 nearest place
# centroids for the first three test points.
clf.kneighbors(X_test[['x', 'y']].iloc[0:3], 10)

(array([[ 0.01500046,  0.01851305,  0.0215449 ,  0.02573857,  0.04323547,
          0.04525076,  0.04959852,  0.04969417,  0.05160699,  0.05458975],
        [ 0.01134724,  0.0227848 ,  0.05186886,  0.07082409,  0.07945386,
          0.09598644,  0.10464145,  0.12435496,  0.12603294,  0.13197125],
        [ 0.01226417,  0.02663006,  0.02802244,  0.03368451,  0.03401723,
          0.0345607 ,  0.03903916,  0.04194861,  0.0419905 ,  0.04403477]]),
 array([[ 25275,  96552,  80211,  72756,   2359,  90883,  88121,  18329,
          56529,  48043],
        [ 80850,  95254,   8562,  55765,  57063, 104832,  40637,  66684,
          72281,  32687],
        [ 65946,  19041,  98035, 102729,  98895,  34785,  96843,  90503,
          45194,  26482]], dtype=int64))

In [32]:
# Coordinates of the first test point, for comparison with its neighbours.
X_test[['x', 'y']].iloc[0]

x    4.7021
y    2.5319
Name: 18890777, dtype: float64

In [39]:
# Centroid at row position 80211, the third neighbour listed for the first test point.
places[['x', 'y']].iloc[80211]

x    4.699514
y    2.510511
Name: 80211, dtype: float64

In [42]:
# kneighbors returns row positions, not ids; look up the place_id stored at those positions.
places.place_id.iloc[[80211, 80212]]

80211    7678992636
80212    7679053823
Name: place_id, dtype: int64

In [45]:
# Scratch check: reshape(-1, 1) gives the same column layout with the row count inferred.
np.array([1, 2,3]).reshape(-1, 1)

array([[1],
       [2],
       [3]])

In [60]:
# Row positions of the 3 nearest centroids for the first test point.
# .iloc[[0]] keeps the query 2-D (one row, two columns); passing a 1-D Series
# is deprecated and rejected by newer scikit-learn releases.
clf.kneighbors(X_test[['x', 'y']].iloc[[0]], 3)[1].flatten()



array([25275, 96552, 80211], dtype=int64)

In [84]:
# Find the 3 nearest place centroids for every test row in one batched query
# instead of one kneighbors call per row. Passing the 2-D frame also avoids the
# 1-D-input deprecation, so no blanket DeprecationWarning filter is needed.
neighbor_idx = clf.kneighbors(X_test[['x', 'y']], n_neighbors=3, return_distance=False)
# Translate row positions in `places` into place ids: one array of three
# candidate ids per test row, nearest first.
preds_3 = list(places.place_id.values[neighbor_idx])
# preds_3

In [72]:
# Print the first two test rows (fewer if the frame is shorter), one Series each.
for pos in range(len(X_test.head(2))):
    print(X_test.iloc[pos])

row_id      1.889078e+07
x           4.702100e+00
y           2.531900e+00
accuracy    1.330000e+02
time        3.571550e+05
Name: 18890777, dtype: float64
row_id      2.481919e+07
x           5.130000e-02
y           4.369700e+00
accuracy    2.890000e+02
time        1.888390e+05
Name: 24819186, dtype: float64


In [64]:
# Scratch: iterrows() is lazy, so this cell only displays the generator object; no rows are iterated.
X_test[:10].iterrows()

<generator object DataFrame.iterrows at 0x000000001A03EA98>

In [79]:
# NOTE(review): the output below has row positions as its index, so this ran
# while preds_3 still held neighbour row positions. The In [84] cell stores
# place ids in preds_3, and those are not valid positions for .iloc; confirm
# before re-running.
places.place_id.iloc[preds_3[0]]

25275    3079940754
96552    9022505352
80211    7678992636
Name: place_id, dtype: int64

In [85]:
# MAP with three ranked candidates per row. k stays at its default of 10, which
# exceeds the three candidates supplied, so nothing is truncated.
mapk(y_test_list, preds_3)

0.16184817365417697