In [2]:
import pandas as pd
import numpy as np
import os
from IPython.display import display
# Do not truncate columns when a wide DataFrame is displayed.
pd.set_option('display.max_columns', None)
import geopandas as gpd
from shapely import Point, Polygon, MultiPolygon
import contextily as ctx

import mapclassify
from matplotlib import pyplot as plt
import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries used below.
warnings.filterwarnings('ignore')
from scipy.spatial import Voronoi
import osmnx as ox
# NOTE(review): `ox.config` was deprecated in later OSMnx releases in favour of
# `ox.settings` — confirm against the installed version.
ox.config(log_console=True, use_cache=True)
import pyproj
import math
from tqdm.notebook import tqdm
# Registers tqdm's progress_apply / progress_map helpers on pandas objects.
tqdm.pandas()
from collections import defaultdict

In [3]:
from sklearn.neighbors import BallTree

In [4]:
# Project root, expressed relative to the current working directory.
PATH_PROJECT = '..'
# Base directory of the data files; the load and save cells build their paths from it.
PATH_DATA = f'{PATH_PROJECT}/data'

In [5]:
# Load the processed dataset. The cells below read the columns `split`, `id`,
# `lat_rad`, `lon_rad` and `sales_per_month` from it.
# NOTE(review): unpickling can execute arbitrary code — only load this file from a trusted source.
df = pd.read_pickle(f'{PATH_DATA}/processed/df.pickle')

In [6]:
# Boolean row mask: True where the row's `split` value is 'train'.
mask_train = df['split'].eq('train')

In [7]:
# Coordinates of the training rows only; the haversine metric expects
# (latitude, longitude) pairs — presumably already in radians given the column names.
train_coords_rad = df.loc[mask_train, ["lat_rad", "lon_rad"]].values
ball = BallTree(train_coords_rad, metric='haversine')

# The tree answers queries with positions into the array it was built from;
# this maps each such position back to the training row's `id`.
dic_index2id = {
    position: train_id
    for position, train_id in enumerate(df.loc[mask_train, 'id'].values)
}

In [8]:
# Nearest-neighbour features: for every row of `df`, look up its k nearest
# *training* rows (the rows `ball` was built from) and derive
#   * sales_top_r - mean `sales_per_month` of the r closest neighbours
#   * dist_top_r  - distance to the r-th closest neighbour
# for r = 1 .. k - 1.
k = 40  # neighbours requested per row; only ranks 1 .. k - 1 are kept (see below)
EARTH_RADIUS_M = 6371000  # scales the haversine result (radians on the unit sphere) to metres

distances, indices = ball.query(df[["lat_rad", "lon_rad"]].values, k=k)

# Translate the tree's positional indices into training ids with a single
# vectorised lookup. `dic_index2id` maps position -> id, so its values
# ordered by key form the lookup array.
train_ids = pd.Series(dic_index2id).sort_index().to_numpy()

# One row per (query id, neighbour) pair. Everything is flattened row-major,
# so the repeated ids line up with the flattened `indices` / `distances`.
top_n = pd.DataFrame({
    'id': np.repeat(df['id'].to_numpy(), k),
    'id_2': train_ids[indices].reshape(-1),
    'dist': distances.reshape(-1) * EARTH_RADIUS_M,
})

# A training row normally shows up among its own neighbours; drop that
# self-match. The explicit copy keeps the column assignments below from
# writing into a filtered view of the unfiltered frame.
top_n = top_n.query('id != id_2').copy()

# Re-rank the remaining neighbours 1..m by distance within each query id.
top_n['rank'] = top_n.groupby('id')['dist'].rank(method='first').astype(int)
top_n['sales_per_month'] = top_n['id_2'].map(df.set_index('id')['sales_per_month'])

# Running mean of neighbour sales over the `rank` closest neighbours. This
# relies on the rows of each id already being in ascending-distance order,
# which is the order BallTree.query returns by default (sort_results=True).
top_n['sales'] = top_n.groupby('id')['sales_per_month'].cumsum() / top_n['rank']

# Rows that did not match themselves still carry k neighbours; dropping
# rank k leaves ranks 1 .. k - 1 for every id before pivoting to wide form.
top_n = (
    top_n.query(f'rank != {k}')
    .set_index(['id', 'rank'])[['sales', 'dist']]
    .unstack()
)
top_n.columns = [f'{c1}_top_{c2}' for c1, c2 in top_n.columns]
top_n = top_n.reset_index()

In [10]:
# Preview the first rows of the wide neighbour-feature table.
top_n.head()

Unnamed: 0,id,sales_top_1,sales_top_2,sales_top_3,sales_top_4,sales_top_5,sales_top_6,sales_top_7,sales_top_8,sales_top_9,sales_top_10,sales_top_11,sales_top_12,sales_top_13,sales_top_14,sales_top_15,sales_top_16,sales_top_17,sales_top_18,sales_top_19,sales_top_20,sales_top_21,sales_top_22,sales_top_23,sales_top_24,sales_top_25,sales_top_26,sales_top_27,sales_top_28,sales_top_29,sales_top_30,sales_top_31,sales_top_32,sales_top_33,sales_top_34,sales_top_35,sales_top_36,sales_top_37,sales_top_38,sales_top_39,dist_top_1,dist_top_2,dist_top_3,dist_top_4,dist_top_5,dist_top_6,dist_top_7,dist_top_8,dist_top_9,dist_top_10,dist_top_11,dist_top_12,dist_top_13,dist_top_14,dist_top_15,dist_top_16,dist_top_17,dist_top_18,dist_top_19,dist_top_20,dist_top_21,dist_top_22,dist_top_23,dist_top_24,dist_top_25,dist_top_26,dist_top_27,dist_top_28,dist_top_29,dist_top_30,dist_top_31,dist_top_32,dist_top_33,dist_top_34,dist_top_35,dist_top_36,dist_top_37,dist_top_38,dist_top_39
0,1,1695.0,1557.5,1610.666667,1634.25,1752.8,1802.5,1940.142857,1921.625,1924.777778,1940.9,1991.181818,2035.333333,2011.692308,2003.142857,2040.733333,2038.5,2018.235294,2008.611111,1984.736842,1993.7,2006.47619,2060.363636,2065.869565,2084.5,2077.28,2093.384615,2085.037037,2127.571429,2159.448276,2162.9,2154.83871,2144.78125,2148.30303,2124.617647,2129.314286,2122.972222,2147.891892,2140.052632,2172.948718,994.463281,1474.496342,1552.048817,1706.925403,2563.711814,3265.583797,3493.935248,10555.799017,10650.065353,24667.73439,25947.249748,26209.802043,26294.399745,31635.605233,50861.743972,62042.451374,62224.985594,68770.436924,77426.526895,115526.519674,131493.165905,132108.172869,132180.477597,144374.768797,145738.958046,145780.052369,148724.713029,148960.605379,149044.345464,149364.178555,150152.446618,153799.142199,153960.843267,154683.411201,155886.526355,156150.94269,156806.292069,157599.677842,157654.095884
1,2,2020.0,2638.0,2336.333333,2450.5,2529.2,2466.0,2354.714286,2470.0,2515.111111,2567.5,2512.545455,2466.75,2421.461538,2368.857143,2355.333333,2352.6875,2330.176471,2382.388889,2445.315789,2476.15,2439.190476,2420.863636,2394.434783,2386.25,2393.04,2365.230769,2375.703704,2397.071429,2379.793103,2378.333333,2377.741935,2415.53125,2397.333333,2399.382353,2433.485714,2423.472222,2433.810811,2433.973684,2466.871795,580.621803,713.0833,972.421408,1003.664341,2221.315175,6870.30607,7325.985186,10782.638952,10843.364992,11357.395034,14313.203773,14470.003715,14654.591572,31416.827118,31785.603984,31840.445122,32707.058126,33447.791332,33498.500727,33667.133896,33930.212685,34114.338013,34488.484236,34497.145681,34548.842805,34926.285573,35187.491574,35208.815208,35305.30591,35446.420931,35781.085945,35837.193748,36099.651522,36234.859234,36514.008987,36576.124093,37065.881944,37857.068581,38102.736392
2,3,1706.0,2109.0,1973.0,1965.75,1920.4,1860.0,2081.857143,2135.5,2186.0,2208.8,2239.363636,2307.083333,2278.923077,2365.5,2334.333333,2320.25,2433.882353,2478.444444,2463.894737,2446.75,2438.333333,2454.590909,2462.782609,2469.291667,2509.32,2542.384615,2535.740741,2516.107143,2500.758621,2474.9,2458.16129,2461.5,2442.787879,2431.794118,2428.428571,2439.777778,2425.864865,2432.947368,2442.179487,51248.409236,62971.604303,66213.95634,66694.938755,67055.912957,68698.513228,70737.78505,73221.454279,74705.460532,74744.565979,75481.192949,76069.826972,76494.455302,76501.065285,77103.590289,77284.150809,77426.188136,77621.547981,78629.275544,79486.417742,79625.24594,80137.494802,80383.582657,80556.68043,81350.484694,81363.097243,81677.237128,81807.003562,82449.948958,82765.087109,82837.292239,83033.900514,83363.168466,83369.187377,84106.120382,84394.00194,84572.026913,84631.727787,85016.586033
3,4,2654.0,2387.0,2245.0,2157.0,2168.0,2142.333333,2111.428571,2049.625,2160.555556,2198.2,2205.0,2269.916667,2289.538462,2241.642857,2221.866667,2193.5625,2187.764706,2277.833333,2256.736842,2236.75,2259.142857,2275.045455,2269.391304,2279.041667,2262.0,2291.923077,2299.666667,2300.285714,2331.758621,2315.5,2305.870968,2314.15625,2306.909091,2311.088235,2301.914286,2302.111111,2355.27027,2356.263158,2365.820513,435.781495,2419.061669,3760.427889,8831.636523,11087.950396,21060.438906,26626.671389,28402.918129,40768.725424,40776.952781,42568.026381,42722.443344,43020.344852,43748.368357,43910.560498,43947.341199,43995.531241,44006.194018,44254.32118,44383.110987,44420.306848,44514.126018,44643.318244,44748.968022,45023.592839,45037.43841,45320.372745,45414.285904,45655.515696,46582.076305,46653.57511,47126.972461,47173.443882,47259.098349,47480.854271,47781.450769,47924.191439,49302.081236,49319.816768
4,5,3166.0,3137.5,3725.666667,3544.5,3315.8,3200.666667,3323.0,3268.625,3172.777778,3111.9,3042.181818,3111.416667,3119.230769,3065.642857,3020.133333,3003.625,2970.470588,2951.5,2908.578947,2884.1,2879.952381,2861.590909,2887.565217,2860.416667,2838.2,2839.846154,2853.185185,2876.285714,2857.586207,2854.866667,2871.483871,2866.1875,2855.636364,2836.088235,2856.571429,2860.083333,2848.702703,2852.684211,2851.358974,303.955595,803.812294,959.358273,1309.859667,1547.412306,1708.143742,1798.082565,1816.801249,1933.396431,1940.611529,2121.075035,2184.254188,2188.226425,2438.173871,2549.543481,2551.460199,2636.009184,2883.355483,2927.965083,2966.060139,2989.286308,3003.416723,3085.950352,3096.275726,3628.056516,3710.937903,3796.694658,3911.244642,3955.651023,3989.628185,4045.60682,4051.289081,4080.93628,4097.247905,4124.641965,4245.817968,4255.870159,4288.016831,4291.525299


In [11]:
# Largest per-column count of missing values; 0 means no column contains a NaN.
top_n.isna().sum().max()

0

In [12]:
# Correlate every merged column with `sales_per_month` and list the 50
# strongest (by signed value, descending). `merge` joins on the columns the
# two frames share.
(
    df[['id', 'sales_per_month']]
    .merge(top_n)
    .corr()
    .loc[:, 'sales_per_month']
    .sort_values(ascending=False)
    .head(50)
)

sales_per_month    1.000000
sales_top_20       0.244649
sales_top_13       0.244364
sales_top_15       0.244146
sales_top_16       0.243909
sales_top_14       0.243663
sales_top_18       0.243288
sales_top_19       0.243189
sales_top_17       0.242898
sales_top_21       0.241583
sales_top_24       0.240689
sales_top_12       0.240480
sales_top_22       0.240429
sales_top_23       0.239564
sales_top_11       0.238783
sales_top_25       0.238496
sales_top_26       0.236169
sales_top_27       0.235742
sales_top_28       0.234809
sales_top_29       0.234662
sales_top_10       0.234489
sales_top_30       0.234273
sales_top_9        0.233754
sales_top_32       0.233520
sales_top_31       0.232947
sales_top_33       0.232018
sales_top_8        0.231417
sales_top_34       0.229078
sales_top_35       0.227389
sales_top_36       0.226919
sales_top_38       0.225541
sales_top_37       0.225250
sales_top_7        0.225206
sales_top_39       0.224477
sales_top_6        0.222614
sales_top_5        0

In [13]:
# Persist the neighbour-feature table alongside the other processed data files.
top_n.to_pickle(f'{PATH_DATA}/processed/top_n.pickle')