In [1]:
import datetime
from heapq import nlargest
from operator import itemgetter
import os
import time
import math
from collections import defaultdict

def apk(actual, predicted, k=3):
    """Average precision at k for a single query.

    Parameters
    ----------
    actual : collection of the relevant items.
    predicted : ordered sequence of predictions (only the first `k` are used).
    k : maximum number of predictions to score.

    Returns
    -------
    float
        Sum of precision-at-rank over every rank holding a first-time hit,
        divided by ``min(len(actual), k)``; ``0.0`` when `actual` is empty.
    """
    top_k = predicted[:k] if len(predicted) > k else predicted

    hits = 0.0
    precision_sum = 0.0
    for rank, candidate in enumerate(top_k, start=1):
        # A prediction only counts when it is relevant and has not already
        # appeared at an earlier rank.
        if candidate not in actual or candidate in top_k[:rank - 1]:
            continue
        hits += 1.0
        precision_sum += hits / rank

    return precision_sum / min(len(actual), k) if actual else 0.0

def prep_xy(x, y, virtual_range_x, virtual_range_y, real_range_x, real_range_y):
    """Convert a real-valued (x, y) offset into integer grid-cell indices.

    Each coordinate is scaled by ``virtual_range / real_range``, floored, and
    then clamped so the result lies in ``[0, virtual_range - 1]``.

    Returns
    -------
    tuple
        ``(ix, iy)`` cell indices along x and y.
    """
    def to_cell(coord, cells, extent):
        # Scale into cell units, then pull out-of-range values back onto the
        # first / last cell.
        cell = math.floor(cells*coord/extent)
        if cell < 0:
            cell = 0
        if cell >= cells:
            cell = cells - 1
        return cell

    return (to_cell(x, virtual_range_x, real_range_x),
            to_cell(y, virtual_range_y, real_range_y))

In [2]:
# Offsets of a cell plus its eight neighbours: every training row is counted
# in the full 3x3 block of cells around the cell it falls into.
_NEIGHBOUR_OFFSETS = tuple((dx, dy) for dx in (-1, 0, 1) for dy in (-1, 0, 1))


def _partition_of_day(time1, day_partition_size):
    """Return the index of the day partition that timestamp `time1` falls into."""
    # presumably `time1` is in minutes (hence the *60 and 24) -- TODO confirm.
    return math.floor((time1 + 120) / (day_partition_size*60)) % int(24/day_partition_size)


def _accumulate(target, ix, iy, partition_of_day, place_id, weight):
    """Add `weight` for `place_id` to the 3x3 block of cells around (ix, iy).

    Both the time-aware key ``(ix, iy, partition)`` and the time-agnostic key
    ``(ix, iy)`` are updated in the same dictionary.
    """
    for dx, dy in _NEIGHBOUR_OFFSETS:
        cx, cy = ix + dx, iy + dy
        target[(cx, cy, partition_of_day)][place_id] += weight
        target[(cx, cy)][place_id] += weight


def _rank_cells(cells):
    """Map every cell key to its three heaviest ``(place_id, weight)`` pairs."""
    # Sorting the items first makes tie-breaking between equal weights
    # deterministic (nlargest is stable with respect to its input order).
    return {key: nlargest(3, sorted(places.items()), key=itemgetter(1))
            for key, places in cells.items()}


def _pick_places(cell_keys, ranked, preferred, limit=3):
    """Choose up to `limit` distinct place ids for one query point.

    Two passes are made over `cell_keys` in order: the first accepts only
    places present in `preferred`, the second accepts any place. Within a
    cell, candidates are taken in their ranked order.
    """
    picked = []
    for require_preferred in (True, False):
        for key in cell_keys:
            for place_id, _weight in ranked.get(key, ()):
                if len(picked) == limit:
                    return picked
                if place_id in picked:
                    continue
                if require_preferred and place_id not in preferred:
                    continue
                picked.append(place_id)
    return picked


def calculate_region(start_x, end_x, start_y, end_y, result_name):
    """Build a grid model for one rectangular region, validate it, and write predictions.

    Steps:
      1. Read ``../train.csv`` and accumulate time-weighted place counts per grid
         cell (and per cell + partition of day) for rows inside the region
         widened by a small margin.
      2. Score a hold-out made of the training rows with ``time >= split_t``
         using mean average precision at 3.
      3. Read ``../test.csv`` and write up to three predicted place ids for
         every row inside the region to `result_name`.

    Parameters
    ----------
    start_x, end_x, start_y, end_y : region bounds (both ends inclusive).
    result_name : path of the CSV file to write.

    Returns
    -------
    float
        Mean ``apk(..., 3)`` over the hold-out rows inside the region, or
        ``0.0`` when the region contains no hold-out rows.

    NOTE(review): because both bounds are inclusive, a row lying exactly on a
    boundary shared by two regions is handled by both of them -- confirm that
    whatever merges the result files tolerates the duplicate row ids.
    """
    start_time = time.time()
    print('Preparing arrays...', flush = True)

    # Grid resolution; alternatives noted by the author: 580/870/500 (x) and
    # 1450/2175/1000 (y).
    virtual_range_x = 290
    virtual_range_y = 725
    real_range_x = end_x - start_x
    real_range_y = end_y - start_y

    # Training rows slightly outside the region still feed the border cells.
    train_extra_margin = 0.05
    train_start_x = max(start_x - train_extra_margin, 0)
    train_end_x = min(end_x + train_extra_margin, 10)
    train_start_y = max(start_y - train_extra_margin, 0)
    train_end_y = min(end_y + train_extra_margin, 10)
    day_partition_size = 4

    # 786239 is presumably the largest timestamp in train.csv -- TODO confirm.
    # Rows at or after split_t form the hold-out.
    split_t = math.floor((1.0 - 0.125) * 786239)
    out_of_business_time = 0.125
    split_test_out_of_business = math.floor((1.0 - 0.125 - out_of_business_time) * 786239)
    split_submit_out_of_business = math.floor((1.0 - out_of_business_time) * 786239)

    test_arr = []
    grid = defaultdict(lambda: defaultdict(int))
    grid_valid = defaultdict(lambda: defaultdict(int))
    # Places seen in the most recent time window; these are preferred when
    # filling predictions (despite the "out_of_business" naming).
    submit_out_of_business = set()
    test_out_of_business = set()

    total = 0
    train_samples = 0
    test_samples = 0
    with open("../train.csv", "r") as f:
        f.readline()  # skip the header row
        for line in f:
            line = line.strip()
            total += 1

            # An empty line ends the input, as does end of file.
            if line == '':
                break

            if total % 6000000 == 0:
                print('Checkpoint ...',total, flush = True)

            # Columns: row_id, x, y, accuracy, time, place_id
            arr = line.split(",")
            x = float(arr[1])
            y = float(arr[2])
            if x < train_start_x or x > train_end_x or y < train_start_y or y > train_end_y:
                continue

            time1 = int(arr[4])
            place_id = arr[5]
            partition_of_day = _partition_of_day(time1, day_partition_size)
            # Later rows get a (logarithmically) larger weight.
            log_month = math.log10(3+((time1 + 120.0) / (60 * 24 * 30)))

            ix, iy = prep_xy(x - start_x, y - start_y, virtual_range_x, virtual_range_y, real_range_x, real_range_y)
            _accumulate(grid, ix, iy, partition_of_day, place_id, log_month)

            if time1 < split_t:
                _accumulate(grid_valid, ix, iy, partition_of_day, place_id, log_month)
                train_samples += 1
                if time1 >= split_test_out_of_business:
                    test_out_of_business.add(place_id)
            else:
                test_arr.append(arr)
                test_samples += 1

            if time1 >= split_submit_out_of_business:
                submit_out_of_business.add(place_id)

    print('Sorting arrays...', flush = True)
    grid_sorted = _rank_cells(grid)
    grid_sorted_valid = _rank_cells(grid_valid)

    print('Run validation...', flush = True)
    score = 0.0
    score_num = 0
    for arr in test_arr:
        x = float(arr[1])
        y = float(arr[2])
        # Hold-out rows from the extra training margin are not scored.
        if x < start_x or x > end_x or y < start_y or y > end_y:
            continue
        time1 = int(arr[4])
        place_id = arr[5]
        partition_of_day = _partition_of_day(time1, day_partition_size)
        ix, iy = prep_xy(x - start_x, y - start_y, virtual_range_x, virtual_range_y, real_range_x, real_range_y)
        filled = _pick_places(((ix, iy, partition_of_day), (ix, iy)),
                              grid_sorted_valid, test_out_of_business)
        score += apk([place_id], filled, 3)
        score_num += 1

    # Guard against regions with no hold-out rows (previously ZeroDivisionError).
    if score_num:
        score /= score_num
    print('Predicted score: {}'.format(score), flush = True)
    print('Train samples: ', train_samples, flush = True)
    print('Test samples: ', test_samples, flush = True)

    print('Generate submission...')
    # empty_counts[n] = number of rows that received exactly n predictions (n < 3).
    empty_counts = [0, 0, 0]
    with open(result_name, "w") as out, open("../test.csv", "r") as f:
        f.readline()  # skip the header row
        out.write("row_id,place_id\n")

        for line in f:
            line = line.strip()
            if line == '':
                break

            # Columns: row_id, x, y, accuracy, time
            arr = line.split(",")
            row_id = arr[0]
            x = float(arr[1])
            y = float(arr[2])
            if x < start_x or x > end_x or y < start_y or y > end_y:
                continue

            time1 = int(arr[4])
            partition_of_day = _partition_of_day(time1, day_partition_size)
            ix, iy = prep_xy(x - start_x, y - start_y, virtual_range_x, virtual_range_y, real_range_x, real_range_y)
            filled = _pick_places(((ix, iy, partition_of_day), (ix, iy)),
                                  grid_sorted, submit_out_of_business)

            # Each place id is written with a leading space, matching the
            # original output format byte for byte.
            out.write(str(row_id) + ',' + ''.join(' ' + place for place in filled) + "\n")
            if len(filled) < 3:
                empty_counts[len(filled)] += 1

    print('Empty0 cases:', str(empty_counts[0]), flush = True)
    print('Empty1 cases:', str(empty_counts[1]), flush = True)
    print('Empty2 cases:', str(empty_counts[2]), flush = True)

    print("Elapsed time overall: %s seconds" % (time.time() - start_time), flush = True)
    return score

In [3]:
# The 10 x 10 area is processed as a 3 x 3 arrangement of regions; the
# x band varies slowest, so result files are numbered 1..9 in that order.
x_bands = [(0, 3.3), (3.3, 6.6), (6.6, 10)]
y_bands = [(0, 3.3), (3.3, 6.6), (6.6, 10)]
regions = [(x_lo, x_hi, y_lo, y_hi)
           for (x_lo, x_hi) in x_bands
           for (y_lo, y_hi) in y_bands]

total_score = 0
for region_no, (x_lo, x_hi, y_lo, y_hi) in enumerate(regions, start=1):
    total_score += calculate_region(x_lo, x_hi, y_lo, y_hi, 'result' + str(region_no) + '.csv')

# Unweighted mean of the nine per-region validation scores.
print("Final score ", total_score / 9)


Preparing arrays...
Checkpoint ... 6000000
Checkpoint ... 12000000
Checkpoint ... 18000000
Checkpoint ... 24000000
Sorting arrays...
Run validation...
Predicted score: 0.46555049198628434
Train samples:  2787616
Test samples:  491584
Generate submission...
Empty0 cases: 0
Empty1 cases: 0
Empty2 cases: 0
Elapsed time overall: 312.6515748500824 seconds
Preparing arrays...
Checkpoint ... 6000000
Checkpoint ... 12000000
Checkpoint ... 18000000
Checkpoint ... 24000000
Sorting arrays...
Run validation...
Predicted score: 0.46729703156871405
Train samples:  2761481
Test samples:  491479
Generate submission...
Empty0 cases: 0
Empty1 cases: 0
Empty2 cases: 1
Elapsed time overall: 322.97029876708984 seconds
Preparing arrays...
Checkpoint ... 6000000
Checkpoint ... 12000000
Checkpoint ... 18000000
Checkpoint ... 24000000
Sorting arrays...
Run validation...
Predicted score: 0.46190640152682866
Train samples:  2846023
Test samples:  519532
Generate submission...
Empty0 cases: 0
Empty1 cases: 0
Empt

Summary: 
	(1) Line 136: Final score  0.4622352725578575
	(2) Line 272: Final score  0.4802688404559934
	(3) Line 408: Final score  0.4887455647765599
	(4) Line 544: Final score  0.49205710365765004
	(6) Line 680: Final score  0.4913447204256991

Selected the 4-hour partition for submission: 