In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
import time

In [2]:
# Columns handed to the KNN classifier, in the order they are sliced from each cell's frame.
feature_list = (
    ['x', 'y', 'hour']
    + ['w%d' % shift for shift in range(7)]  # w0 .. w6: the shifted weekday features
    + ['day', 'month', 'year', 'accuracy']
)

In [3]:
def prepare_data(df, n_cell_x, n_cell_y, feature_weights=None):
    """Assign every row to a grid cell and build the weighted feature columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``x``, ``y``, ``accuracy`` and ``time``.
        ``time`` is interpreted as whole minutes since 2014-01-01 01:01.
        The frame is NOT modified; a new frame is returned.
    n_cell_x, n_cell_y : int
        Number of grid cells along x and y. The cell size is ``10 / n_cell``,
        i.e. coordinates are treated as lying in [0, 10].
    feature_weights : sequence of 8 numbers, optional
        Multipliers for, in order: x, y, hour, weekday (w0..w6), day of year,
        month, year, accuracy. When omitted, the module-level ``fw`` is used,
        which is how the rest of the notebook drives this function.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` without ``time``, with ``x``/``y``/``accuracy`` scaled
        and the columns ``grid_x``, ``grid_y``, ``hour``, ``w0``..``w6``,
        ``day``, ``month``, ``year`` added.
    """
    weights = fw if feature_weights is None else feature_weights

    # Take the raw minutes first, then work on a private copy so the caller's
    # frame (often a boolean-mask slice) is never written to.
    time_minutes = df.time.values.astype(np.int64)
    out = df.drop(['time'], axis=1).copy()

    # Creating the grid (uses the unscaled coordinates).
    size_x = 10. / n_cell_x
    size_y = 10. / n_cell_y
    eps = 0.00001
    # Shift by eps so a coordinate exactly on the upper edge (10.0) falls into
    # the last cell instead of one past it; values below eps are clamped to 0.
    xs = np.where(out.x.values < eps, 0, out.x.values - eps)
    ys = np.where(out.y.values < eps, 0, out.y.values - eps)
    # Builtin int replaces the np.int alias, which NumPy 1.24 removed.
    out['grid_x'] = (xs / size_x).astype(int)
    out['grid_y'] = (ys / size_y).astype(int)

    # Feature engineering
    out['x'] = out.x.values * weights[0]
    out['y'] = out.y.values * weights[1]

    # Vectorised timestamp construction (no per-row Python loop).
    initial_date = pd.Timestamp('2014-01-01 01:01')
    d_times = pd.DatetimeIndex(initial_date + pd.to_timedelta(time_minutes, unit='min'))
    hour = np.asarray(d_times.hour)
    minute = np.asarray(d_times.minute)
    weekday = np.asarray(d_times.weekday)

    out['hour'] = (hour + minute / 60) * weights[2]
    # Seven copies of the weekday, each rotated by a different offset.
    for shift in range(7):
        out['w%d' % shift] = ((weekday + shift) % 7) * weights[3]
    out['day'] = (np.asarray(d_times.dayofyear) * weights[4]).astype(int)
    out['month'] = np.asarray(d_times.month) * weights[5]
    out['year'] = (np.asarray(d_times.year) - 2013) * weights[6]
    out['accuracy'] = out.accuracy.values * weights[7]

    return out

In [4]:
def run_prediction(df_train, df_test, n_cell_x, n_cell_y, th):
    """Run the per-cell classifier over the whole grid and stack the results.

    Parameters
    ----------
    df_train, df_test : pandas.DataFrame
        Prepared frames; passed through unchanged to ``process_one_cell``.
    n_cell_x, n_cell_y : int
        Grid dimensions; every (x_grid, y_grid) pair in
        ``range(n_cell_x) x range(n_cell_y)`` is processed.
    th : int
        Minimum-occurrence threshold forwarded to ``process_one_cell``.

    Returns
    -------
    pandas.DataFrame
        The per-cell result frames concatenated in processing order, each
        keeping its own index. Empty when the grid has no cells.
    """
    # Collect the pieces and concatenate once: DataFrame.append was removed in
    # pandas 2.0 and re-copied the accumulated frame on every call.
    cell_results = []
    for x_grid in range(n_cell_x):
        start_time = time.time()
        for y_grid in range(n_cell_y):
            cell_results.append(process_one_cell(df_train, df_test, x_grid, y_grid, th))
        # Reports the time spent on one full column of cells.
        print("Elapsed time overall: %s seconds" % (time.time() - start_time), x_grid, flush=True)

    if not cell_results:
        # pd.concat raises on an empty list; keep the "empty frame" result.
        return pd.DataFrame()
    return pd.concat(cell_results)

In [5]:
def process_one_cell(df_train, df_test, grid_x, grid_y, th):
    """Fit a KNN classifier on one grid cell and return its top-3 predictions.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Needs ``grid_x``, ``grid_y``, ``place_id`` and every column named in
        the module-level ``feature_list``.
    df_test : pandas.DataFrame
        Needs ``grid_x``, ``grid_y``, ``row_id`` and the ``feature_list`` columns.
    grid_x, grid_y : int
        Coordinates of the cell to process.
    th : int
        Training rows whose ``place_id`` occurs fewer than ``th`` times inside
        the cell are discarded before fitting.

    Returns
    -------
    pandas.DataFrame
        One row per test row of the cell (original index preserved) with the
        columns ``row_id``, ``p1``, ``p2``, ``p3`` holding the three most
        probable place ids, best first. A rank that cannot be filled (no test
        rows, no training rows, or fewer than three classes in the cell) is NaN.
    """
    prediction_columns = ['p1', 'p2', 'p3']

    # Only the cell itself is used for training. (An earlier experiment, kept
    # in the original as a disabled string, also pulled in the 8 neighbours.)
    in_train_cell = (df_train.grid_x == grid_x) & (df_train.grid_y == grid_y)
    df_cell_train = df_train[in_train_cell]

    # Keep only places seen at least `th` times in this cell.
    place_counts = df_cell_train.place_id.value_counts()
    mask = (df_cell_train.place_id.map(place_counts) >= th).values
    df_cell_train = df_cell_train.loc[mask]

    in_test_cell = (df_test.grid_x == grid_x) & (df_test.grid_y == grid_y)
    df_cell_test = df_test[in_test_cell]
    result = df_cell_test[['row_id']].copy()

    # Nothing to predict, or nothing to learn from: the classifier would raise,
    # so return the (possibly empty) frame with NaN predictions instead.
    if len(df_cell_test) == 0 or len(df_cell_train) == 0:
        for column in prediction_columns:
            result[column] = np.nan
        return result

    # Applying the classifier. n_neighbors is capped at the training size
    # because asking for more neighbours than samples is an error.
    clf = KNeighborsClassifier(n_neighbors=min(35, len(df_cell_train)),
                               weights='distance',
                               metric='manhattan')
    clf.fit(df_cell_train[feature_list], df_cell_train.place_id)
    predictions = clf.predict_proba(df_cell_test[feature_list])

    # Column indices of the (up to) three highest probabilities, best first.
    result_index = np.argsort(predictions, axis=1)[:, ::-1][:, :3]
    top_places = clf.classes_[result_index]
    for rank, column in enumerate(prediction_columns):
        if rank < top_places.shape[1]:
            result[column] = top_places[:, rank]
        else:
            # Fewer than three distinct places in this cell.
            result[column] = np.nan

    return result

In [6]:
def run_validation(train_path='../../train.csv'):
    """Hold out the latest part of the training data, predict it and score it.

    Rows with ``time`` above ``786239 * 0.875`` form the validation set and the
    rest is used for training. Both parts go through ``prepare_data`` (which
    reads the notebook-level feature weights ``fw``) on a 10 x 20 grid, and
    ``run_prediction`` is called with a threshold of 5. The summary of the
    per-row score is printed.

    Parameters
    ----------
    train_path : str, optional
        CSV file to read; defaults to the path the notebook has always used.

    Returns
    -------
    pandas.DataFrame
        The predictions (``row_id``, ``p1``, ``p2``, ``p3``) sorted by index,
        with the per-row ``score`` column added, so callers can keep or save
        the result instead of losing it when the function exits.
    """
    # Run validation
    df_all = pd.read_csv(train_path,
                         usecols=['row_id', 'x', 'y', 'accuracy', 'time', 'place_id'])

    # NOTE(review): 786239 is presumably the largest `time` in train.csv, which
    # would make this a "last 12.5% of the timeline" hold-out — confirm.
    split_time = 786239 * 0.875
    df_validation = df_all[df_all.time > split_time]
    df_train = df_all[df_all.time <= split_time]

    n_cell_x = 10
    n_cell_y = 20

    print('Preparing train data')
    df_train = prepare_data(df_train, n_cell_x, n_cell_y)
    df_validation = prepare_data(df_validation, n_cell_x, n_cell_y)

    prediction_result = run_prediction(df_train, df_validation, n_cell_x, n_cell_y, 5)

    # Calculate score
    prediction_result = prediction_result.sort_index()
    # Align the ground truth on the prediction index explicitly; comparing two
    # Series with `==` raises unless their labels are identical.
    truth = df_validation.place_id.reindex(prediction_result.index)
    # NOTE(review): if the target metric is MAP@3, the third weight should be
    # 1/3 rather than 0.33. Kept as-is so scores stay comparable with the
    # values already recorded in this notebook.
    prediction_result['score'] = (prediction_result.p1 == truth) * 1
    prediction_result['score'] += (prediction_result.p2 == truth) * 0.5
    prediction_result['score'] += (prediction_result.p3 == truth) * 0.33

    print(prediction_result.score.describe(), flush=True)
    return prediction_result

In [7]:
# Sweep of the weekday weight fw[3] = 1/divisor; all other weights stay fixed.
# Score noted next to each run in the original cell:
#   divisor 3.0  -> 0.540527
#   divisor 3.25 -> 0.540175
#   divisor 3.5  -> 0.539839
#   divisor 3.75 -> 0.539545
#   divisor 4    -> 0.539281
for weekday_divisor in (3.0, 3.25, 3.5, 3.75, 4):
    fw = [400, 1000, 4, 1/weekday_divisor, 1./22., 2, 9, 0.09] #feature weights
    run_validation()

Preparing train data
Elapsed time overall: 153.33544898033142 seconds 0
Elapsed time overall: 157.68558359146118 seconds 1
Elapsed time overall: 168.27563905715942 seconds 2
Elapsed time overall: 167.3252203464508 seconds 3
Elapsed time overall: 166.82676887512207 seconds 4
Elapsed time overall: 167.42661571502686 seconds 5
Elapsed time overall: 166.576744556427 seconds 6
Elapsed time overall: 167.7895691394806 seconds 7
Elapsed time overall: 168.15053248405457 seconds 8
Elapsed time overall: 146.8195583820343 seconds 9
count    4417684.000000
mean           0.540527
std            0.450974
min            0.000000
25%            0.000000
50%            0.500000
75%            1.000000
max            1.000000
Name: score, dtype: float64
Preparing train data
Elapsed time overall: 154.91449904441833 seconds 0
Elapsed time overall: 157.3971664905548 seconds 1
Elapsed time overall: 164.65860533714294 seconds 2
Elapsed time overall: 164.63047242164612 seconds 3
Elapsed time overall: 164.6305

In [8]:
# Sweep of the weekday weight fw[3] = 1/divisor; all other weights stay fixed.
# Score noted next to each run in the original cell:
#   divisor 2.0 -> 0.541842
#   divisor 2.2 -> 0.541678
#   divisor 2.4 -> 0.541397
#   divisor 2.6 -> 0.541131
#   divisor 2.7 -> 0.540975
#   divisor 2.8 -> 0.540813
#   divisor 2.9 -> 0.540671
for weekday_divisor in (2.0, 2.2, 2.4, 2.6, 2.7, 2.8, 2.9):
    fw = [400, 1000, 4, 1/weekday_divisor, 1./22., 2, 9, 0.09] #feature weights
    run_validation()

Preparing train data
Elapsed time overall: 161.44288086891174 seconds 0
Elapsed time overall: 160.96458292007446 seconds 1
Elapsed time overall: 132.72215056419373 seconds 2
Elapsed time overall: 122.23920798301697 seconds 3
Elapsed time overall: 118.27327680587769 seconds 4
Elapsed time overall: 123.7080147266388 seconds 5
Elapsed time overall: 122.01746201515198 seconds 6
Elapsed time overall: 121.9701669216156 seconds 7
Elapsed time overall: 123.06318259239197 seconds 8
Elapsed time overall: 110.28337097167969 seconds 9
count    4417684.000000
mean           0.541842
std            0.451131
min            0.000000
25%            0.000000
50%            0.500000
75%            1.000000
max            1.000000
Name: score, dtype: float64
Preparing train data
Elapsed time overall: 110.50450801849365 seconds 0
Elapsed time overall: 116.24114561080933 seconds 1
Elapsed time overall: 121.31278705596924 seconds 2
Elapsed time overall: 117.1578426361084 seconds 3
Elapsed time overall: 119.7

In [9]:
# Sweep of the weekday weight fw[3] = 1/divisor; all other weights stay fixed.
# Score noted next to each run in the original cell:
#   divisor 1.5 -> 0.541393
#   divisor 1.6 -> 0.541662
#   divisor 1.7 -> 0.541825
#   divisor 1.8 -> 0.541897
#   divisor 1.9 -> 0.541874
#   divisor 2.1 -> 0.541766
for weekday_divisor in (1.5, 1.6, 1.7, 1.8, 1.9, 2.1):
    fw = [400, 1000, 4, 1/weekday_divisor, 1./22., 2, 9, 0.09] #feature weights
    run_validation()

Preparing train data
Elapsed time overall: 124.9779462814331 seconds 0
Elapsed time overall: 135.5708990097046 seconds 1
Elapsed time overall: 136.78050088882446 seconds 2
Elapsed time overall: 139.19478607177734 seconds 3
Elapsed time overall: 147.48574709892273 seconds 4
Elapsed time overall: 166.846529006958 seconds 5
Elapsed time overall: 159.15909051895142 seconds 6
Elapsed time overall: 167.52445888519287 seconds 7
Elapsed time overall: 161.08399772644043 seconds 8
Elapsed time overall: 138.91574096679688 seconds 9
count    4417684.000000
mean           0.541393
std            0.451350
min            0.000000
25%            0.000000
50%            0.500000
75%            1.000000
max            1.000000
Name: score, dtype: float64
Preparing train data
Elapsed time overall: 142.62879157066345 seconds 0
Elapsed time overall: 158.2632656097412 seconds 1
Elapsed time overall: 154.6650276184082 seconds 2
Elapsed time overall: 153.44291520118713 seconds 3
Elapsed time overall: 153.4810

Validation result: 0.539376

In [None]:
# Write to file: one row per row_id, the three predicted places joined by spaces.
# NOTE(review): no earlier cell assigns `prediction_result` at notebook scope, so on a
# fresh kernel this cell raises NameError — confirm where this frame should come from.
prediction_result['place_id'] = prediction_result.p1.astype(str).str.cat(
    [prediction_result.p2.astype(str), prediction_result.p3.astype(str)],
    sep=" ",
)
prediction_result[['row_id', 'place_id']].to_csv('new_baseline_validation_result.csv', index=False)

In [None]:
# Run test: train on the full training set, predict the test set, write the submission file.
# NOTE(review): prepare_data reads the notebook-level `fw`, so the weights used here are
# whatever the last executed sweep cell left behind — set `fw` explicitly before running.
# NOTE(review): run_validation reads '../../train.csv' while this cell reads '../train.csv';
# confirm which relative path is correct.
df_train = pd.read_csv('../train.csv',
                       usecols=['row_id','x','y','accuracy','time','place_id'])
df_test = pd.read_csv('../test.csv',
                       usecols=['row_id','x','y','accuracy','time'])
n_cell_x = 30
n_cell_y = 60

df_train = prepare_data(df_train, n_cell_x, n_cell_y)
df_test = prepare_data(df_test, n_cell_x, n_cell_y)

# run_prediction is declared as (df_train, df_test, n_cell_x, n_cell_y, th); the grid
# dimensions were missing from this call, which made it fail with a TypeError.
prediction_result = run_prediction(df_train, df_test, n_cell_x, n_cell_y, 5)

# Restore the original row order before writing.
prediction_result = prediction_result.sort_index()
prediction_result['place_id'] = prediction_result.p1.astype(str) + " " + \
                                 prediction_result.p2.astype(str) + " " + \
                                 prediction_result.p3.astype(str)
prediction_result[['row_id', 'place_id']].to_csv('new_baseline_result.csv', index=False)