In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
import time

In [2]:
feature_list = ['x','y','weekday','day','month','year', 'accuracy']
for i in range(0,24):
    feature_list.append('h' + str(i))

In [3]:
def prepare_data(df):
    #Creating the grid
    #size_x = 10. / n_cell_x
    #size_y = 10. / n_cell_y
    #eps = 0.00001  
    #xs = np.where(df.x.values < eps, 0, df.x.values - eps)
    #ys = np.where(df.y.values < eps, 0, df.y.values - eps)
    #pos_x = (xs / size_x).astype(np.int)
    #pos_y = (ys / size_y).astype(np.int)
    #df['grid_x'] = pos_x
    #df['grid_y'] = pos_y
    
    #Feature engineering
    df.x = df.x.values * fw[0]
    df.y = df.y.values * fw[1]
    initial_date = np.datetime64('2014-01-01T01:01', dtype='datetime64[m]') 
    d_times = pd.DatetimeIndex(initial_date + np.timedelta64(int(mn), 'm') 
                               for mn in df.time.values)    
    #df['hour'] = (d_times.hour+ d_times.minute/60) * fw[2]
    for i in range(0,24):
        df['h' + str(i)] = (((d_times.hour+ d_times.minute/60) + i) % 24) * fw[2]
    df['weekday'] = d_times.weekday * fw[3]
    df['day'] = (d_times.dayofyear * fw[4]).astype(int)
    df['month'] = d_times.month * fw[5]
    df['year'] = (d_times.year - 2013) * fw[6]
    df.accuracy = df.accuracy.values * fw[7]
    df = df.drop(['time'], axis=1)
    
    return df

In [4]:
def process_one_cell(df_train, df_test, th):
    start_time = time.time()
    
    place_counts = df_train.place_id.value_counts()
    mask = (place_counts[df_train.place_id.values] >= th).values
    df_train = df_train.loc[mask]
    
    row_ids = df_test.row_id
    
    #Applying the classifier
    clf = KNeighborsClassifier(n_neighbors=35, weights='distance', 
                               metric='manhattan')
    clf.fit(df_train[feature_list], df_train.place_id)
    predictions = clf.predict_proba(df_test[feature_list])
    result_index = np.argsort(predictions, axis=1)[:,::-1][:,:3]
    result = pd.DataFrame(df_test.row_id)
    result['p1'] = clf.classes_[result_index][:,:1]
    result['p2'] = clf.classes_[result_index][:,1:2]
    result['p3'] = clf.classes_[result_index][:,2:3]
    
    return result

In [5]:
def run_region_validation(min_x, max_x, min_y, max_y):
    df_train = pd.read_csv('../../train.csv',
                               usecols=['row_id','x','y','accuracy','time','place_id'])
    
    df_train = df_train[(df_train.x >= min_x) & (df_train.x < max_x) & (df_train.y >= min_y) & (df_train.y < max_y)]
    
    df_valiation = df_train[df_train.time > 786239 * 0.875]
    
    df_train = df_train[df_train.time <= 786239 * 0.875]
    
    #print('Preparing train data')
    df_train = prepare_data(df_train)
    df_valiation = prepare_data(df_valiation)
    df_valiation['p1'] = np.nan
    df_valiation['p2'] = np.nan
    df_valiation['p3'] = np.nan
    
    prediction_result = process_one_cell(df_train, df_valiation, 5)
    
    # Calculate score
    prediction_result.sort_index(inplace=True)
    prediction_result['score'] = (prediction_result.p1 == df_valiation.place_id) * 1
    prediction_result['score'] += (prediction_result.p2 == df_valiation.place_id) * 0.5
    prediction_result['score'] += (prediction_result.p3 == df_valiation.place_id) * 0.33
    
    return prediction_result.score.mean()


In [6]:
def run_validation():
    n_cell_x = 10
    n_cell_y = 20
    x_length = 10 / n_cell_x
    y_length = 10 / n_cell_y
    total_score = 0
    score_count = 0
    for x_index in range(0, n_cell_x):
        start_time = time.time()
        for y_index in range(0, n_cell_y):
            min_x = x_index * x_length
            max_x = (x_index + 1) * x_length
            min_y = y_index * y_length
            max_y = (y_index + 1) * y_length
            if(y_index + 1 == n_cell_y):
                max_y += 0.1
            if(x_index + 1 == n_cell_x):
                max_x += 0.1
            score = run_region_validation(min_x, max_x, min_y, max_y)
            #print('s1:', score)
            total_score += score
            score_count += 1
        print("Elapsed time overall: %s seconds" % (time.time() - start_time), x_index, flush = True)
    print("Final:", total_score/score_count)

In [7]:
fw = [400, 1000, 1/10.0, 3.5, 1./22., 2, 9, 0.09] #feature weights
run_validation()
#0.542362786229369

fw = [400, 1000, 1/10.5, 3.5, 1./22., 2, 9, 0.09] #feature weights
run_validation()
#0.5424086954637467 <----

fw = [400, 1000, 1/11.0, 3.5, 1./22., 2, 9, 0.09] #feature weights
run_validation()
#0.5423804441981194

fw = [400, 1000, 1/11.5, 3.5, 1./22., 2, 9, 0.09] #feature weights
run_validation()
#0.5422700307196554

fw = [400, 1000, 1/12.0, 3.5, 1./22., 2, 9, 0.09] #feature weights
run_validation()
#0.5421211518819139

fw = [400, 1000, 1/12.5, 3.5, 1./22., 2, 9, 0.09] #feature weights
run_validation()
#0.5418923864810622

fw = [400, 1000, 1/13.0, 3.5, 1./22., 2, 9, 0.09] #feature weights
run_validation()
#0.5416266444204475

Elapsed time overall: 608.2859528064728 seconds 0
Elapsed time overall: 620.0639081001282 seconds 1
Elapsed time overall: 631.4439911842346 seconds 2
Elapsed time overall: 624.6600425243378 seconds 3
Elapsed time overall: 624.167683839798 seconds 4
Elapsed time overall: 682.7129120826721 seconds 5
Elapsed time overall: 670.4940638542175 seconds 6
Elapsed time overall: 667.9048676490784 seconds 7
Elapsed time overall: 626.9168150424957 seconds 8
Elapsed time overall: 608.68798661232 seconds 9
Final: 0.542362786229369
Elapsed time overall: 609.8104159832001 seconds 0
Elapsed time overall: 618.6822545528412 seconds 1
Elapsed time overall: 631.2685248851776 seconds 2
Elapsed time overall: 619.5251092910767 seconds 3
Elapsed time overall: 620.7041544914246 seconds 4
Elapsed time overall: 629.8055839538574 seconds 5
Elapsed time overall: 620.5512566566467 seconds 6
Elapsed time overall: 630.1552188396454 seconds 7
Elapsed time overall: 623.9426128864288 seconds 8
Elapsed time overall: 604.53

Validation result: 0.539376

In [None]:
# Write to file
prediction_result['place_id'] = prediction_result.p1.astype(str) + " " + \
                                 prediction_result.p2.astype(str) + " " + \
                                 prediction_result.p3.astype(str)
prediction_result[['row_id', 'place_id']].to_csv('new_baseline_validation_result.csv', index=False)

In [None]:
# Run test
df_train = pd.read_csv('../train.csv',
                       usecols=['row_id','x','y','accuracy','time','place_id'])
df_test = pd.read_csv('../test.csv',
                       usecols=['row_id','x','y','accuracy','time'])
n_cell_x = 30
n_cell_y = 60 

df_train = prepare_data(df_train, n_cell_x, n_cell_y)
df_test = prepare_data(df_test, n_cell_x, n_cell_y)
df_test['p1'] = np.nan
df_test['p2'] = np.nan
df_test['p3'] = np.nan
prediction_result = run_prediction(df_train, df_test, 5)

prediction_result.sort_index(inplace=True)
prediction_result['place_id'] = prediction_result.p1.astype(str) + " " + \
                                 prediction_result.p2.astype(str) + " " + \
                                 prediction_result.p3.astype(str)
prediction_result[['row_id', 'place_id']].to_csv('new_baseline_result.csv', index=False)