In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
import time

In [2]:
feature_list = ['x','y','w0','w1','w2','w3','w4','w5','w6','day','month','year', 'accuracy']
for i in range(0,24):
    feature_list.append('h' + str(i))

In [3]:
def prepare_data(df):    
    #Feature engineering
    df.x = df.x.values * fw[0]
    df.y = df.y.values * fw[1]
    initial_date = np.datetime64('2014-01-02T01:01', dtype='datetime64[m]') 
    d_times = pd.DatetimeIndex(initial_date + np.timedelta64(int(mn), 'm') 
                               for mn in df.time.values)    
    #df['hour'] = (d_times.hour+ d_times.minute/60) * fw[2]
    for i in range(0,24):
        df['h' + str(i)] = (((d_times.hour+ d_times.minute/60) + i) % 24) * fw[2]
    
    df['w0'] = ((d_times.weekday + 0) % 7) * fw[3]
    df['w1'] = ((d_times.weekday + 1) % 7) * fw[3]
    df['w2'] = ((d_times.weekday + 2) % 7) * fw[3]
    df['w3'] = ((d_times.weekday + 3) % 7) * fw[3]
    df['w4'] = ((d_times.weekday + 4) % 7) * fw[3]
    df['w5'] = ((d_times.weekday + 5) % 7) * fw[3]
    df['w6'] = ((d_times.weekday + 6) % 7) * fw[3]
    
    df['day'] = (d_times.dayofyear * fw[4]).astype(int)
    df['month'] = d_times.month * fw[5]
    df['year'] = (d_times.year - 2013) * fw[6]
    #df.accuracy = df.accuracy.values * fw[7]
    df['accuracy'] = np.log10(df.accuracy) * fw[7]
    df['log_month'] = np.log10(3+df.time/(60 * 24 * 30)) * fw[8]
    df = df.drop(['time'], axis=1)
    
    return df

In [4]:
def process_one_cell(df_train, df_test, th):    
    place_counts = df_train.place_id.value_counts()
    mask = (place_counts[df_train.place_id.values] >= th).values
    df_train = df_train.loc[mask]
    
    row_ids = df_test.row_id
    
    #Applying the classifier
    clf = KNeighborsClassifier(n_neighbors=35, weights='distance', 
                               metric='manhattan')
    clf.fit(df_train[feature_list], df_train.place_id)
    predictions = clf.predict_proba(df_test[feature_list])
    result_index = np.argsort(predictions, axis=1)[:,::-1][:,:3]
    result = pd.DataFrame(df_test.row_id)
    result['p1'] = clf.classes_[result_index][:,:1]
    result['p2'] = clf.classes_[result_index][:,1:2]
    result['p3'] = clf.classes_[result_index][:,2:3]
    
    return result

In [5]:
def run_prediction(df_train, df_test):
    df_train = prepare_data(df_train)
    df_test = prepare_data(df_test)
    df_test['p1'] = np.nan
    df_test['p2'] = np.nan
    df_test['p3'] = np.nan
    
    prediction_result = process_one_cell(df_train, df_test, 5)
    prediction_result.sort_index(inplace=True)
    return prediction_result

In [6]:
def run_validation():
    df = pd.read_csv('../../train.csv',
                               usecols=['row_id','x','y','accuracy','time','place_id'])
    
    n_cell_x = 10
    n_cell_y = 20
    x_length = 10 / n_cell_x
    y_length = 10 / n_cell_y
    total_score = 0
    score_count = 0
    for x_index in range(0, n_cell_x):
        start_time = time.time()
        for y_index in range(0, n_cell_y):
            min_x = x_index * x_length
            max_x = (x_index + 1) * x_length
            min_y = y_index * y_length
            max_y = (y_index + 1) * y_length
            
            # include the edge
            if(y_index + 1 == n_cell_y):
                max_y += 0.1
            if(x_index + 1 == n_cell_x):
                max_x += 0.1

            df_train_cell = df[(df.time <= 786239 * 0.875) & \
                               (df.x >= min_x - 0.1) & \
                               (df.x < max_x + 0.1) & \
                               (df.y >= min_y - 0.1) & \
                               (df.y < max_y + 0.1)].copy()
            
            df_validation_cell = df[(df.time > 786239 * 0.875) & \
                                    (df.x >= min_x) & \
                                    (df.x < max_x) & \
                                    (df.y >= min_y) & \
                                    (df.y < max_y)].copy()
            
            prediction_result = run_prediction(df_train_cell, df_validation_cell)
            
            # Calculate score
            prediction_result.sort_index(inplace=True)
            prediction_result['score'] = (prediction_result.p1 == df_validation_cell.place_id) * 1
            prediction_result['score'] += (prediction_result.p2 == df_validation_cell.place_id) * 0.5
            prediction_result['score'] += (prediction_result.p3 == df_validation_cell.place_id) * 0.33

            score = prediction_result.score.mean()
            #print('s1:', score, flush=True)
            total_score += score
            score_count += 1
            
        print("Elapsed time overall: %s seconds" % (time.time() - start_time), x_index, flush = True)
    print("Final:", total_score/score_count, flush=True)


In [7]:
fw = [400, 1000, 1/10.5, 1/2.0, 1./22., 2, 9, 8, 4.5]
run_validation()

fw = [400, 1000, 1/10.5, 1/2.0, 1./22., 2, 9, 9, 4.5]
run_validation()

fw = [400, 1000, 1/10.5, 1/2.0, 1./22., 2, 9, 10, 4.5]
run_validation()

fw = [400, 1000, 1/10.5, 1/2.0, 1./22., 2, 9, 11, 4.5]
run_validation()

fw = [400, 1000, 1/10.5, 1/2.0, 1./22., 2, 9, 12, 4.5]
run_validation()

Elapsed time overall: 350.60431838035583 seconds 0
Elapsed time overall: 381.93614196777344 seconds 1
Elapsed time overall: 411.40785002708435 seconds 2
Elapsed time overall: 387.68547654151917 seconds 3
Elapsed time overall: 391.1362729072571 seconds 4
Elapsed time overall: 403.5235643386841 seconds 5
Elapsed time overall: 388.27496337890625 seconds 6
Elapsed time overall: 405.81954860687256 seconds 7
Elapsed time overall: 386.27262258529663 seconds 8
Elapsed time overall: 343.6947133541107 seconds 9
Final: 0.5445766529228884
Elapsed time overall: 343.97839426994324 seconds 0
Elapsed time overall: 400.8737576007843 seconds 1
Elapsed time overall: 436.34270000457764 seconds 2
Elapsed time overall: 411.1556420326233 seconds 3
Elapsed time overall: 443.52632784843445 seconds 4
Elapsed time overall: 466.95709586143494 seconds 5
Elapsed time overall: 442.18439531326294 seconds 6
Elapsed time overall: 454.8173727989197 seconds 7
Elapsed time overall: 433.1864244937897 seconds 8
Elapsed time

In [8]:
fw = [400, 1000, 1/10.5, 1/2.0, 1./22., 2, 9, 13, 4.5]
run_validation()

fw = [400, 1000, 1/10.5, 1/2.0, 1./22., 2, 9, 14, 4.5]
run_validation()

fw = [400, 1000, 1/10.5, 1/2.0, 1./22., 2, 9, 15, 4.5]
run_validation()

fw = [400, 1000, 1/10.5, 1/2.0, 1./22., 2, 9, 16, 4.5]
run_validation()

Elapsed time overall: 386.725248336792 seconds 0
Elapsed time overall: 420.13635778427124 seconds 1
Elapsed time overall: 449.48781991004944 seconds 2
Elapsed time overall: 425.0692808628082 seconds 3
Elapsed time overall: 427.0748414993286 seconds 4
Elapsed time overall: 423.0595848560333 seconds 5
Elapsed time overall: 518.0359590053558 seconds 6
Elapsed time overall: 532.0525288581848 seconds 7
Elapsed time overall: 427.21014976501465 seconds 8
Elapsed time overall: 380.37018609046936 seconds 9
Final: 0.5457701005518969
Elapsed time overall: 407.3051564693451 seconds 0
Elapsed time overall: 470.6080973148346 seconds 1
Elapsed time overall: 530.6711823940277 seconds 2
Elapsed time overall: 605.7313392162323 seconds 3
Elapsed time overall: 600.7104864120483 seconds 4
Elapsed time overall: 452.99732065200806 seconds 5
Elapsed time overall: 419.70831179618835 seconds 6
Elapsed time overall: 440.8181960582733 seconds 7
Elapsed time overall: 420.1555495262146 seconds 8
Elapsed time overal

In [9]:
fw = [400, 1000, 1/10.5, 1/2.0, 1./22., 2, 9, 16, 4.5]
run_validation()

fw = [400, 1000, 1/10.5, 1/2.0, 1./22., 2, 9, 17, 4.5]
run_validation()

fw = [400, 1000, 1/10.5, 1/2.0, 1./22., 2, 9, 18, 4.5]
run_validation()

fw = [400, 1000, 1/10.5, 1/2.0, 1./22., 2, 9, 19, 4.5]
run_validation()

fw = [400, 1000, 1/10.5, 1/2.0, 1./22., 2, 9, 20, 4.5]
run_validation()

Elapsed time overall: 380.39176511764526 seconds 0
Elapsed time overall: 426.1389844417572 seconds 1
Elapsed time overall: 466.8970158100128 seconds 2
Elapsed time overall: 434.6646800041199 seconds 3
Elapsed time overall: 425.76096987724304 seconds 4
Elapsed time overall: 442.7030129432678 seconds 5
Elapsed time overall: 424.13830304145813 seconds 6
Elapsed time overall: 418.2466218471527 seconds 7
Elapsed time overall: 399.59033966064453 seconds 8
Elapsed time overall: 354.22032833099365 seconds 9
Final: 0.5461806785765002
Elapsed time overall: 362.0649924278259 seconds 0
Elapsed time overall: 391.95437026023865 seconds 1
Elapsed time overall: 420.95700001716614 seconds 2
Elapsed time overall: 401.4008445739746 seconds 3
Elapsed time overall: 401.4277126789093 seconds 4
Elapsed time overall: 407.45340609550476 seconds 5
Elapsed time overall: 400.38613510131836 seconds 6
Elapsed time overall: 419.4959156513214 seconds 7
Elapsed time overall: 398.5061106681824 seconds 8
Elapsed time ov

In [10]:
fw = [400, 1000, 1/10.5, 1/2.0, 1./22., 2, 9, 21, 4.5]
run_validation()

fw = [400, 1000, 1/10.5, 1/2.0, 1./22., 2, 9, 22, 4.5]
run_validation()

fw = [400, 1000, 1/10.5, 1/2.0, 1./22., 2, 9, 23, 4.5]
run_validation()

fw = [400, 1000, 1/10.5, 1/2.0, 1./22., 2, 9, 24, 4.5]
run_validation()

Elapsed time overall: 372.519095659256 seconds 0
Elapsed time overall: 454.2793607711792 seconds 1
Elapsed time overall: 569.2755527496338 seconds 2
Elapsed time overall: 577.0361332893372 seconds 3
Elapsed time overall: 714.8435182571411 seconds 4
Elapsed time overall: 668.3598656654358 seconds 5
Elapsed time overall: 636.4655358791351 seconds 6
Elapsed time overall: 679.43155169487 seconds 7
Elapsed time overall: 534.8516805171967 seconds 8
Elapsed time overall: 449.3351695537567 seconds 9
Final: 0.5464483210449825
Elapsed time overall: 465.2892961502075 seconds 0
Elapsed time overall: 517.0016515254974 seconds 1
Elapsed time overall: 541.0508635044098 seconds 2
Elapsed time overall: 516.7580959796906 seconds 3
Elapsed time overall: 513.4534049034119 seconds 4
Elapsed time overall: 544.7498416900635 seconds 5
Elapsed time overall: 519.0423657894135 seconds 6
Elapsed time overall: 514.6770749092102 seconds 7
Elapsed time overall: 518.5650312900543 seconds 8
Elapsed time overall: 470.3

In [None]:
# Write to file
prediction_result['place_id'] = prediction_result.p1.astype(str) + " " + \
                                 prediction_result.p2.astype(str) + " " + \
                                 prediction_result.p3.astype(str)
prediction_result[['row_id', 'place_id']].to_csv('new_baseline_validation_result.csv', index=False)

In [None]:
# Run test
df_train = pd.read_csv('../train.csv',
                       usecols=['row_id','x','y','accuracy','time','place_id'])
df_test = pd.read_csv('../test.csv',
                       usecols=['row_id','x','y','accuracy','time'])
n_cell_x = 30
n_cell_y = 60 

df_train = prepare_data(df_train, n_cell_x, n_cell_y)
df_test = prepare_data(df_test, n_cell_x, n_cell_y)
df_test['p1'] = np.nan
df_test['p2'] = np.nan
df_test['p3'] = np.nan
prediction_result = run_prediction(df_train, df_test, 5)

prediction_result.sort_index(inplace=True)
prediction_result['place_id'] = prediction_result.p1.astype(str) + " " + \
                                 prediction_result.p2.astype(str) + " " + \
                                 prediction_result.p3.astype(str)
prediction_result[['row_id', 'place_id']].to_csv('new_baseline_result.csv', index=False)