In [1]:
import datetime
from heapq import nlargest
from operator import itemgetter
import os
import time
import math
from collections import defaultdict

def apk(actual, predicted, k=3):
    if len(predicted) > k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i,p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i+1.0)

    if not actual:
        return 0.0

    return score / min(len(actual), k)

def prep_xy(x, y, virtual_range_x, virtual_range_y, real_range_x, real_range_y):    
    ix = math.floor(virtual_range_x*x/real_range_x)
    if ix < 0:
        ix = 0
    if ix >= virtual_range_x:
        ix = virtual_range_x-1

    iy = math.floor(virtual_range_y*y/real_range_y)
    if iy < 0:
        iy = 0
    if iy >= virtual_range_y:
        iy = virtual_range_y-1

    return ix, iy

In [2]:
def calculate_region(start_x, end_x, start_y, end_y, xyp, validation, result_name):
    start_time = time.time()
    print('Preparing arrays...', flush = True)
    f = open("../train.csv", "r")
    f.readline()
    total = 0
    virtual_range_x = 290 #580 #870 #290 #500
    virtual_range_y = 725 #1450 #2175 #725 #1000
    real_range_x = end_x - start_x
    real_range_y = end_y - start_y
    train_extra_margin = 0.1
    train_start_x = max(start_x - train_extra_margin, 0)
    train_end_x = min(end_x + train_extra_margin, 10)
    train_start_y = max(start_y - train_extra_margin, 0)
    train_end_y = min(end_y + train_extra_margin, 10)
    day_partition_size = 1
    
    out_of_business_time = 0.125
    
    if(validation):
        still_in_business_if_after = math.floor((1.0 - 0.125 - out_of_business_time) * 786239)
        train_validation_split_time = math.floor((1.0 - 0.125) * 786239)
    else:
        still_in_business_if_after = math.floor((1.0 - out_of_business_time) * 786239)
        train_validation_split_time = 786239
    
    test_arr = []
    
    grid = defaultdict(lambda: defaultdict(int))
    still_in_business = dict()
    grid_sorted = dict()
    
    train_samples = 0
    test_samples = 0
    while 1:
        line = f.readline().strip()
        total += 1
        
        if(total % 6000000 == 0):
            print('Checkpoint ...',total, flush = True)
        
        if line == '':
            break
        
        arr = line.split(",")
        #row_id = arr[0]
        x = float(arr[1])
        y = float(arr[2])
        if(x < train_start_x or x > train_end_x or y < train_start_y or y > train_end_y):
            continue
        
        #accuracy = int(arr[3])
        time1 = int(arr[4])
        place_id = arr[5]
        partition_of_day = math.floor((time1 + 120) / (day_partition_size*60)) % int(24/day_partition_size)
        log_month = math.log10(3+((time1 + 120.0) / (60 * 24 * 30)))
        
        ix, iy = prep_xy(x - start_x, y - start_y, virtual_range_x, virtual_range_y, real_range_x, real_range_y)
        
        if time1 < train_validation_split_time:
            for weight in xyp:
                grid[(ix + weight[0], iy + weight[1], partition_of_day + weight[2])][place_id] += (1) * log_month 
            
            train_samples += 1
            if time1 >= still_in_business_if_after:
                still_in_business[place_id] = 1
        else:
            test_arr.append(arr)
            test_samples += 1
    f.close()
    
    print('Sorting arrays...', flush = True)
    for element in grid:
        grid_sorted[element] = nlargest(6, sorted(grid[element].items()), key=itemgetter(1))
      
    print('Get result...', flush = True)
    if(not validation):
        out = open(result_name, "w")
        f = open("../test.csv", "r")
        f.readline()
        
    score = 0.0
    score_num = 0
    
    prediction_count = 0
    count_empty0 = 0
    count_empty1 = 0
    count_empty2 = 0
    
    while 1:
        if(validation):
            if(prediction_count == len(test_arr)):
                break;
            arr = test_arr[prediction_count]
        else:
            line = f.readline().strip()
            if line == '':
                break
            arr = line.split(",")
        prediction_count+=1
        
        x = float(arr[1])
        y = float(arr[2])
        if(x < start_x or x > end_x or y < start_y or y > end_y):
            continue
        #accuracy = int(arr[3])
        time1 = int(arr[4])
        
        partition_of_day = math.floor((time1 + 120) / (day_partition_size*60)) % int(24/day_partition_size)
        filled = []
        ix, iy = prep_xy(x - start_x, y - start_y, virtual_range_x, virtual_range_y, real_range_x, real_range_y)
        s1 = (ix, iy, partition_of_day)
        if s1 in grid_sorted:
            topitems = grid_sorted[s1]
            for i in range(len(topitems)):
                if topitems[i][0] in filled:
                    continue
                if len(filled) == 3:
                    break
                if topitems[i][0] in still_in_business:
                    filled.append(topitems[i][0])
        
        if len(filled) == 0:
            count_empty0 += 1
        if len(filled) == 1:
            count_empty1 += 1
        if len(filled) == 2:
            count_empty2 += 1
            
        if(validation):
            place_id = arr[5]
            score += apk([place_id], filled, 3)
            score_num += 1
        else:
            out.write(' '.join(filled))
            out.write("\n")

    if(validation):
        score /= score_num
        print('Predicted score: {}'.format(score), flush = True)
        print('Train samples: ', train_samples, flush = True)
        print('Test samples: ', test_samples, flush = True)
    
        print("Elapsed time overall: %s seconds" % (time.time() - start_time), flush = True)
        return score
    else:
        out.close()
        return 0

In [3]:
xyp = dict()
for x in range(-2,3):
    for y in range(-2,3):
        for p in range(-2,3):
            xyp[(x,y,p)] = 1

total_score = 0
total_score += calculate_region(0, 3.3,   0, 3.3,    xyp, True, 'result1.csv')
total_score += calculate_region(0, 3.3,   3.3, 6.6,  xyp, True, 'result2.csv')
total_score += calculate_region(0, 3.3,   6.6, 10,   xyp, True, 'result3.csv')
total_score += calculate_region(3.3, 6.6, 0, 3.3,    xyp, True, 'result4.csv')
total_score += calculate_region(3.3, 6.6, 3.3, 6.6,  xyp, True, 'result5.csv')
total_score += calculate_region(3.3, 6.6, 6.6, 10,   xyp, True, 'result6.csv')
total_score += calculate_region(6.6, 10,  0, 3.3,    xyp, True, 'result7.csv')
total_score += calculate_region(6.6, 10,  3.3, 6.6,  xyp, True, 'result8.csv')
total_score += calculate_region(6.6, 10,  6.6, 10,   xyp, True, 'result9.csv')
print("Final score ", total_score / 9)


xyp = dict()
for x in range(-2,3):
    for y in range(-2,3):
        for p in range(-2,3):
            if(p == -2 or p == 2):
                xyp[(x,y,p)] = 0.5
            else:
                xyp[(x,y,p)] = 1

total_score = 0
total_score += calculate_region(0, 3.3,   0, 3.3,    xyp, True, 'result1.csv')
total_score += calculate_region(0, 3.3,   3.3, 6.6,  xyp, True, 'result2.csv')
total_score += calculate_region(0, 3.3,   6.6, 10,   xyp, True, 'result3.csv')
total_score += calculate_region(3.3, 6.6, 0, 3.3,    xyp, True, 'result4.csv')
total_score += calculate_region(3.3, 6.6, 3.3, 6.6,  xyp, True, 'result5.csv')
total_score += calculate_region(3.3, 6.6, 6.6, 10,   xyp, True, 'result6.csv')
total_score += calculate_region(6.6, 10,  0, 3.3,    xyp, True, 'result7.csv')
total_score += calculate_region(6.6, 10,  3.3, 6.6,  xyp, True, 'result8.csv')
total_score += calculate_region(6.6, 10,  6.6, 10,   xyp, True, 'result9.csv')
print("Final score ", total_score / 9)

xyp = dict()
for x in range(-2,3):
    for y in range(-2,3):
        for p in range(-2,3):
            if(p == -2 or p == 2):
                xyp[(x,y,p)] = 0.3
            else:
                xyp[(x,y,p)] = 1

total_score = 0
total_score += calculate_region(0, 3.3,   0, 3.3,    xyp, True, 'result1.csv')
total_score += calculate_region(0, 3.3,   3.3, 6.6,  xyp, True, 'result2.csv')
total_score += calculate_region(0, 3.3,   6.6, 10,   xyp, True, 'result3.csv')
total_score += calculate_region(3.3, 6.6, 0, 3.3,    xyp, True, 'result4.csv')
total_score += calculate_region(3.3, 6.6, 3.3, 6.6,  xyp, True, 'result5.csv')
total_score += calculate_region(3.3, 6.6, 6.6, 10,   xyp, True, 'result6.csv')
total_score += calculate_region(6.6, 10,  0, 3.3,    xyp, True, 'result7.csv')
total_score += calculate_region(6.6, 10,  3.3, 6.6,  xyp, True, 'result8.csv')
total_score += calculate_region(6.6, 10,  6.6, 10,   xyp, True, 'result9.csv')
print("Final score ", total_score / 9)

xyp = dict()
for x in range(-2,3):
    for y in range(-2,3):
        for p in range(-2,3):
            if(p == -2 or p == 2):
                xyp[(x,y,p)] = 0.7
            else:
                xyp[(x,y,p)] = 1

total_score = 0
total_score += calculate_region(0, 3.3,   0, 3.3,    xyp, True, 'result1.csv')
total_score += calculate_region(0, 3.3,   3.3, 6.6,  xyp, True, 'result2.csv')
total_score += calculate_region(0, 3.3,   6.6, 10,   xyp, True, 'result3.csv')
total_score += calculate_region(3.3, 6.6, 0, 3.3,    xyp, True, 'result4.csv')
total_score += calculate_region(3.3, 6.6, 3.3, 6.6,  xyp, True, 'result5.csv')
total_score += calculate_region(3.3, 6.6, 6.6, 10,   xyp, True, 'result6.csv')
total_score += calculate_region(6.6, 10,  0, 3.3,    xyp, True, 'result7.csv')
total_score += calculate_region(6.6, 10,  3.3, 6.6,  xyp, True, 'result8.csv')
total_score += calculate_region(6.6, 10,  6.6, 10,   xyp, True, 'result9.csv')
print("Final score ", total_score / 9)


xyp = dict()
for x in range(-2,3):
    for y in range(-2,3):
        for p in range(-2,3):
            if(p == -2 or p == 2):
                pm = 0.7
            else:
                pm = 1
            if(x == -2 or x == 2 or y == -2 or y == 2):
                cm = 0.6
            xyp[(x,y,p)] = 1 * pm * cm

total_score = 0
total_score += calculate_region(0, 3.3,   0, 3.3,    xyp, True, 'result1.csv')
total_score += calculate_region(0, 3.3,   3.3, 6.6,  xyp, True, 'result2.csv')
total_score += calculate_region(0, 3.3,   6.6, 10,   xyp, True, 'result3.csv')
total_score += calculate_region(3.3, 6.6, 0, 3.3,    xyp, True, 'result4.csv')
total_score += calculate_region(3.3, 6.6, 3.3, 6.6,  xyp, True, 'result5.csv')
total_score += calculate_region(3.3, 6.6, 6.6, 10,   xyp, True, 'result6.csv')
total_score += calculate_region(6.6, 10,  0, 3.3,    xyp, True, 'result7.csv')
total_score += calculate_region(6.6, 10,  3.3, 6.6,  xyp, True, 'result8.csv')
total_score += calculate_region(6.6, 10,  6.6, 10,   xyp, True, 'result9.csv')
print("Final score ", total_score / 9)

Preparing arrays...
Checkpoint ... 6000000
Checkpoint ... 12000000
Checkpoint ... 18000000
Checkpoint ... 24000000
Sorting arrays...
Get result...
Predicted score: 0.5115023848156642
Train samples:  2872715
Test samples:  506848
Elapsed time overall: 607.1383671760559 seconds
Preparing arrays...
Checkpoint ... 6000000
Checkpoint ... 12000000
Checkpoint ... 18000000
Checkpoint ... 24000000
Sorting arrays...
Get result...
Predicted score: 0.5145690865484207
Train samples:  2888122
Test samples:  514429
Elapsed time overall: 628.5774927139282 seconds
Preparing arrays...
Checkpoint ... 6000000
Checkpoint ... 12000000
Checkpoint ... 18000000
Checkpoint ... 24000000
Sorting arrays...
Get result...
Predicted score: 0.5074098186778607
Train samples:  2939517
Test samples:  536685
Elapsed time overall: 627.7100930213928 seconds
Preparing arrays...
Checkpoint ... 6000000
Checkpoint ... 12000000
Checkpoint ... 18000000
Checkpoint ... 24000000
Sorting arrays...
Get result...
Predicted score: 0.498