New improvement including duplicate data instead of using dimension expansion on periodical data


In [9]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
import time

In [10]:
feature_list = ['x','y','hour','weekday','day','month','year', 'accuracy']

In [11]:
def prepare_data(df):    
    #Feature engineering
    df.x = df.x.values * fw[0]
    df.y = df.y.values * fw[1]
    initial_date = np.datetime64('2014-01-02T01:01', dtype='datetime64[m]') 
    d_times = pd.DatetimeIndex(initial_date + np.timedelta64(int(mn), 'm') 
                               for mn in df.time.values)
    df['hour'] = (d_times.hour+ d_times.minute/60) * fw[2]
    df['weekday'] = d_times.weekday* fw[3]
    df['day'] = (d_times.dayofyear * fw[4]).astype(int)
    df['month'] = d_times.month * fw[5]
    df['year'] = (d_times.year - 2013) * fw[6]
    #df.accuracy = df.accuracy.values * fw[7]
    df['accuracy'] = np.log10(df.accuracy) * fw[7]
    df['log_month'] = np.log10(3+df.time/(60 * 24 * 30)) * fw[8]
    df = df.drop(['time'], axis=1)
    
    return df

In [12]:
def calculate_distance(distances):
    return distances ** -2

In [13]:
def process_one_cell(df_train, df_test, th):    
    place_counts = df_train.place_id.value_counts()
    mask = (place_counts[df_train.place_id.values] >= th).values
    df_train = df_train.loc[mask]
    row_ids = df_test.row_id
    
    best_k=np.floor(np.sqrt(len(df_train.index)/4*best_mul)/5)
    
    #Applying the classifier
    clf = KNeighborsClassifier(n_neighbors=best_k.astype(int), weights=calculate_distance, 
                               metric='manhattan',n_jobs=2)
    clf.fit(df_train[feature_list], df_train.place_id)
    predictions = clf.predict_proba(df_test[feature_list])
    result_index = np.argsort(predictions, axis=1)[:,::-1][:,:3]
    result = pd.DataFrame(df_test.row_id)
    result['p1'] = clf.classes_[result_index][:,:1]
    result['p2'] = clf.classes_[result_index][:,1:2]
    result['p3'] = clf.classes_[result_index][:,2:3]
    
    return result

In [14]:
def run_prediction(df_train, df_test):
    df_train = prepare_data(df_train)
    
    df_weekstart = df_train[df_train.weekday == 0].copy()
    df_weekstart.weekday = 7
    df_weekend = df_train[df_train.weekday == 6].copy()
    df_weekend.weekday = -1
    df_daystart = df_train[df_train.hour == 0].copy()
    df_daystart.hour = 24
    df_dayend = df_train[df_train.hour == 23].copy()
    df_dayend.hour = -1
    df_train = df_train.append(df_weekstart)
    df_train = df_train.append(df_weekend)
    df_train = df_train.append(df_daystart)
    df_train = df_train.append(df_dayend)
    
    df_test = prepare_data(df_test)
    df_test['p1'] = np.nan
    df_test['p2'] = np.nan
    df_test['p3'] = np.nan
    
    prediction_result = process_one_cell(df_train, df_test, 1)
    prediction_result.sort_index(inplace=True)
    return prediction_result

In [15]:
def run_validation():
    df = pd.read_csv('../../train.csv',
                               usecols=['row_id','x','y','accuracy','time','place_id'])
    
    n_cell_x = 10
    n_cell_y = 20
    x_length = 10 / n_cell_x
    y_length = 10 / n_cell_y
    total_score = 0
    score_count = 0
    for x_index in range(0, n_cell_x):
        start_time = time.time()
        for y_index in range(0, n_cell_y):
            min_x = x_index * x_length
            max_x = (x_index + 1) * x_length
            min_y = y_index * y_length
            max_y = (y_index + 1) * y_length
            
            # include the edge
            if(y_index + 1 == n_cell_y):
                max_y += 0.1
            if(x_index + 1 == n_cell_x):
                max_x += 0.1

            df_train_cell = df[(df.time <= 786239 * 0.875) & \
                               (df.x >= min_x - 0.1) & \
                               (df.x < max_x + 0.1) & \
                               (df.y >= min_y - 0.1) & \
                               (df.y < max_y + 0.1)].copy()
            
            df_validation_cell = df[(df.time > 786239 * 0.875) & \
                                    (df.x >= min_x) & \
                                    (df.x < max_x) & \
                                    (df.y >= min_y) & \
                                    (df.y < max_y)].copy()
            
            prediction_result = run_prediction(df_train_cell, df_validation_cell)
            
            # Calculate score
            prediction_result.sort_index(inplace=True)
            prediction_result['score'] = (prediction_result.p1 == df_validation_cell.place_id) * 1
            prediction_result['score'] += (prediction_result.p2 == df_validation_cell.place_id) * 0.5
            prediction_result['score'] += (prediction_result.p3 == df_validation_cell.place_id) * 0.33

            score = prediction_result.score.mean()
            print('s1:', score, flush=True)
            total_score += score
            score_count += 1
            
        print("Elapsed time overall: %s seconds" % (time.time() - start_time), x_index, flush = True)
    print("Final:", total_score/score_count, flush=True)


In [16]:
fw = [400, 1000, 4.3, 6, 1./22., 2, 9, 23, 4.5]
best_mul = 0.6
run_validation()

s1: 0.5429779818073579
s1: 0.566366539287019
s1: 0.5611475076297014
s1: 0.5614804506295521
s1: 0.5372003301010895
s1: 0.5449524969549298
s1: 0.5516388353168028
s1: 0.5641233622847011
s1: 0.5706995830590265
s1: 0.5696721235233896
s1: 0.5655871041261058
s1: 0.5748225056005474
s1: 0.5691780240509278
s1: 0.5879687009316409
s1: 0.5698994897959149
s1: 0.5907835388344361
s1: 0.5598668196328772
s1: 0.5978700564971711
s1: 0.5752887729399025
s1: 0.5607730101539429
Elapsed time overall: 125.63119864463806 seconds 0
s1: 0.5513886841135237
s1: 0.5472508280638327
s1: 0.5309401311810906
s1: 0.5475472328244237
s1: 0.5065554293701369
s1: 0.5485642823749375
s1: 0.5244738221142357
s1: 0.5387115455893215
s1: 0.49527631132851885
s1: 0.5118935557541414
s1: 0.54082793017456
s1: 0.5461005020410044
s1: 0.5282089790701351
s1: 0.5065382982336917
s1: 0.5298419461242403
s1: 0.5126082723165565
s1: 0.5508974109402339
s1: 0.5414279735887619
s1: 0.5012476243268886
s1: 0.5196470642370661
Elapsed time overall: 136.91059

In [9]:
def run_test():
    
    # Run test
    df = pd.read_csv('../../train.csv',
                           usecols=['row_id','x','y','accuracy','time','place_id'])
    df_test = pd.read_csv('../../test.csv',
                           usecols=['row_id','x','y','accuracy','time'])
    
    n_cell_x = 10
    n_cell_y = 20
    x_length = 10 / n_cell_x
    y_length = 10 / n_cell_y
    total_score = 0
    score_count = 0
    total_result = pd.DataFrame()
    for x_index in range(0, n_cell_x):
        start_time = time.time()
        for y_index in range(0, n_cell_y):
            min_x = x_index * x_length
            max_x = (x_index + 1) * x_length
            min_y = y_index * y_length
            max_y = (y_index + 1) * y_length
            
            # include the edge
            if(y_index + 1 == n_cell_y):
                max_y += 0.1
            if(x_index + 1 == n_cell_x):
                max_x += 0.1

            df_train_cell = df[(df.x >= min_x - 0.1) & \
                               (df.x < max_x + 0.1) & \
                               (df.y >= min_y - 0.1) & \
                               (df.y < max_y + 0.1)].copy()
            
            df_test_cell = df_test[(df_test.x >= min_x) & \
                                   (df_test.x < max_x) & \
                                   (df_test.y >= min_y) & \
                                   (df_test.y < max_y)].copy()
            
            prediction_result = run_prediction(df_train_cell, df_test_cell)
            total_result = total_result.append(prediction_result)
        print("Elapsed time overall: %s seconds" % (time.time() - start_time), x_index, flush = True)
            
    
    total_result.sort_index(inplace=True)
    total_result['place_id'] = total_result.p1.astype(str) + " " + \
                               total_result.p2.astype(str) + " " + \
                               total_result.p3.astype(str)
    total_result[['row_id', 'place_id']].to_csv('Baseline620.csv', index=False)

In [10]:
fw = [400, 1000, 1/10.5, 1/2.0, 1./22., 2, 9, 23, 4.5]
best_mul = 0.6
run_test()

Elapsed time overall: 753.6635875701904 seconds 0
Elapsed time overall: 812.3613085746765 seconds 1
Elapsed time overall: 1146.4357242584229 seconds 2
Elapsed time overall: 1208.2849400043488 seconds 3
Elapsed time overall: 1088.6301219463348 seconds 4
Elapsed time overall: 1261.2664031982422 seconds 5
Elapsed time overall: 1085.0014958381653 seconds 6
Elapsed time overall: 1029.870967388153 seconds 7
Elapsed time overall: 885.3105454444885 seconds 8
Elapsed time overall: 644.443009853363 seconds 9


In [11]:
0.58065

0.58065