In [13]:
def apk(actual, predicted, k=10):
    """
    Computes the average precision at k.

    This function computes the average precision at k between two lists of
    items.

    Parameters
    ----------
    actual : list or array-like
        Elements that are to be predicted (order doesn't matter).
    predicted : list or array-like
        Predicted elements (order does matter).
    k : int, optional
        The maximum number of predicted elements.

    Returns
    -------
    score : float
        The average precision at k over the input lists; 0.0 when
        ``actual`` is empty or None.
    """
    # Use len() rather than truthiness: `not actual` raises on NumPy arrays
    # with several elements and is True for a one-element array holding a
    # falsy value (e.g. array([0])), which would wrongly score 0.0.
    if actual is None or len(actual) == 0:
        return 0.0

    if len(predicted) > k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i, p in enumerate(predicted):
        # A relevant item only counts the first time it is predicted;
        # repeated predictions of the same item earn no extra credit.
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i + 1.0)

    # Normalise by the best achievable number of hits within the top k.
    return score / min(len(actual), k)

def mapk(actual, predicted, k=10):
    """
    Mean average precision at k.

    Pairs each entry of ``actual`` with the entry of ``predicted`` at the
    same position, scores the pair with ``apk`` and averages the scores.

    Parameters
    ----------
    actual : list
        A list of lists of elements that are to be predicted
        (order doesn't matter in the inner lists).
    predicted : list
        A list of lists of predicted elements
        (order matters in the inner lists).
    k : int, optional
        The maximum number of predicted elements considered per pair.

    Returns
    -------
    score : float
        The mean of the per-pair average precision at k values.
    """
    per_query_scores = []
    for relevant, ranked in zip(actual, predicted):
        per_query_scores.append(apk(relevant, ranked, k))
    return np.mean(per_query_scores)

In [10]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as pt
from sklearn.cross_validation import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
%matplotlib inline

In [2]:
# Load the training data from a path relative to the notebook's working directory.
# NOTE(review): the preview output shows an 'Unnamed: 0' column, so the CSV
# appears to carry a saved index; `index_col=0` would absorb it — TODO confirm
# before changing, since later cells use every remaining column as a feature.
df = pd.read_csv('train.csv')

In [3]:
# Preview the first rows to check the loaded columns.
df.head()

Unnamed: 0,row_id,x,y,accuracy,time,place_id
0,0,0.7941,9.0809,54,470702,8523065625
1,1,5.9567,4.7968,13,186555,1757726713
2,2,8.3078,7.0407,74,322648,1137537235
3,3,7.3665,2.5165,65,704587,6567393236
4,4,4.0961,1.1307,31,472130,7440663949


In [4]:
# Number of grid cells along each axis.
n_cell_x = 20
n_cell_y = 40

# Cell width/height when a 10-unit extent is split evenly along each axis.
# NOTE(review): assumes x and y lie in [0, 10] — confirm against the data.
size_x = 10. / n_cell_x
size_y = 10. / n_cell_y

# Shift coordinates down by eps so a point sitting exactly on the upper edge
# (coordinate 10.0) lands in the last cell instead of one past it; values
# already below eps are clamped to 0 so nothing goes negative.
eps = 0.00001
xs = np.where(df.x.values < eps, 0, df.x.values - eps)
ys = np.where(df.y.values < eps, 0, df.y.values - eps)

# Integer cell coordinates. Builtin `int` replaces the `np.int` alias, which
# was deprecated in NumPy 1.20 and removed in 1.24 (same resulting dtype).
pos_x = (xs / size_x).astype(int)
pos_y = (ys / size_y).astype(int)

# Row-major cell id: one id per (pos_y, pos_x) pair.
df['grid_cell'] = pos_y * n_cell_x + pos_x

In [106]:
def process_one_cell(df_train, grid_id, th, df_test=None):
    """
    Classification inside one grid cell.

    Rows of ``df_train`` belonging to ``grid_id`` are kept only if their
    ``place_id`` occurs at least ``th`` times in that cell. A distance-weighted
    k-nearest-neighbours classifier is then fitted and the three most probable
    places are predicted for every test row.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Training rows; must contain ``grid_cell``, ``place_id``, ``x``, ``y``.
    grid_id : int
        Grid cell to process.
    th : int
        Minimum number of rows a place needs in the cell to be kept.
    df_test : pandas.DataFrame, optional
        Rows to predict. When given (and holding more than one row) the
        function runs in production mode; otherwise 20% of the cell's training
        rows are held out for validation.

    Returns
    -------
    float or (numpy.ndarray, pandas.Index)
        Validation mode: MAP@3 on the held-out rows.
        Production mode: ``(pred_labels, row_ids)`` with the top-3 place ids
        per test row and the index of those rows.
    """
    # Working on df_train: keep only sufficiently frequent places of this cell.
    df_cell_train = df_train.loc[df_train.grid_cell == grid_id]
    place_counts = df_cell_train.place_id.value_counts()
    mask = (place_counts[df_cell_train.place_id.values] >= th).values
    df_cell_train = df_cell_train.loc[mask]

    # `is not None` instead of `!= None`: comparing a DataFrame with `!=` is
    # elementwise and makes the following `and` raise.
    production = (df_test is not None) and len(df_test) > 1

    le = LabelEncoder()
    if production:
        # Preparing train data.
        # NOTE(review): every remaining column is used as a feature and cast
        # to int, which truncates fractional coordinates — confirm intended.
        y = le.fit_transform(df_cell_train.place_id.values)
        X = df_cell_train.drop(['place_id', 'grid_cell'], axis=1).values.astype(int)

        # Preparing test data.
        df_cell_test = df_test.loc[df_test.grid_cell == grid_id]
        row_ids = df_cell_test.index
        X_test = df_cell_test.drop(['grid_cell'], axis=1).values.astype(int)
        y_true = None
    else:
        # Validation: hold out 20% of the filtered cell rows (unseeded split,
        # so the score varies from run to run).
        train, test = train_test_split(df_cell_train, test_size=0.2)
        y = le.fit_transform(train.place_id.values)
        X = train[['x', 'y']]

        row_ids = test.index
        # Keep the raw place ids; encoding and decoding them again would only
        # add a failure mode for places absent from the training split.
        y_true = test.place_id.values
        X_test = test[['x', 'y']]

    # Applying the classifier
    clf = KNeighborsClassifier(n_neighbors=25, weights='distance',
                               metric='manhattan')
    clf.fit(X, y)
    y_pred = clf.predict_proba(X_test)

    # Column j of y_pred belongs to encoded label j, i.e. le.classes_[j].
    # Indexing classes_ directly handles the 2-D index array, which
    # LabelEncoder.inverse_transform does not accept.
    top3 = np.argsort(y_pred, axis=1)[:, ::-1][:, :3]
    pred_labels = le.classes_[top3]

    if production:
        # No ground truth is available here, so return the predictions.
        return pred_labels, row_ids

    return mapk(y_true.reshape(-1, 1), pred_labels, 3)


In [110]:
# Validation-mode run (no df_test) on grid cell 101 with th=60.
# The printed score changes between runs because the train/test split is unseeded.
process_one_cell(df, 101, 60)

0.63005436629289835

In [179]:
def process_one_cell(df_train, grid_id, th, df_test=None):
    """
    Classification inside one grid cell (validation only).

    The rows of ``grid_id`` are split 80/20; places with fewer than ``th``
    rows in the training part are dropped from training, a distance-weighted
    k-nearest-neighbours classifier is fitted on ``x``/``y`` and scored with
    MAP@3 on the held-out part.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Rows with at least ``grid_cell``, ``place_id``, ``x`` and ``y``.
    grid_id : int
        Grid cell to process.
    th : int
        Minimum number of training rows a place needs to be kept.
    df_test : pandas.DataFrame, optional
        Unused; kept so the signature stays compatible with existing calls.

    Returns
    -------
    score : float
        MAP@3 of the top-3 predictions on the held-out rows.
    """
    df_cell_train = df_train.loc[df_train.grid_cell == grid_id]

    # Split first, then filter rare places on the training part only, so the
    # held-out rows still contain places the model never saw.
    train, test = train_test_split(df_cell_train, test_size=0.2)
    place_counts = train.place_id.value_counts()
    mask = (place_counts[train.place_id.values] >= th).values
    train = train.loc[mask]

    y = train.place_id
    X = train[['x', 'y']]

    y_test = test.place_id
    X_test = test[['x', 'y']]

    # Applying the classifier
    clf = KNeighborsClassifier(n_neighbors=25, weights='distance',
                               metric='manhattan')
    clf.fit(X, y)
    y_pred = clf.predict_proba(X_test)

    # Columns of y_pred follow clf.classes_, so the indices of the three
    # largest probabilities map straight to place ids.
    pred_labels = clf.classes_[np.argsort(y_pred, axis=1)[:, ::-1][:, :3]]

    # Series.reshape no longer exists in pandas; reshape the underlying array
    # so each held-out row becomes a one-element list of true labels.
    score = mapk(y_test.values.reshape(-1, 1), pred_labels, 3)
    return score


In [1]:
# Run on grid cell 444 with th=16.
# NOTE(review): the recorded NameError and the execution count In [1] suggest
# this cell was run on a fresh kernel before the defining cells — re-run the
# notebook top to bottom to confirm.
process_one_cell(df, 444, 16)

NameError: name 'process_one_cell' is not defined

In [129]:
# Column indices of the three largest values in each row of `ans`, largest first.
# NOTE(review): `ans` is not defined in any visible cell; presumably it is a
# predict_proba matrix returned by an earlier version of process_one_cell —
# TODO confirm or define it explicitly.
np.argsort(ans, axis=1)[:,::-1][:,:3]

array([[ 23,  75,  51],
       [ 50, 107,  35],
       [ 65,  34, 108],
       ..., 
       [ 66,  11, 118],
       [ 46,  63,  76],
       [ 18,  43, 118]], dtype=int64)

In [131]:
# Map the top-3 column indices of `ans` to class labels through clf.classes_.
# NOTE(review): neither `clf` nor `ans` is defined at notebook scope in the
# visible cells; this relies on leftover kernel state — TODO confirm.
clf.classes_[np.argsort(ans, axis=1)[:,::-1][:,:3]]

array([[3121762659, 6357183957, 4828787478],
       [4723615516, 9010860669, 3800031667],
       [5909711863, 3785985025, 9047888137],
       ..., 
       [5950612350, 1957375901, 9912237323],
       [4366753786, 5780196127, 6358870572],
       [2552774242, 4280735325, 9912237323]], dtype=int64)