In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
## Imports
import warnings
warnings.filterwarnings('ignore')

import sys

if "google.colab" in sys.modules:
    !pip uninstall lightgbm -y
    !pip install lightgbm==3.3.1
    !pip install Levenshtein

import os
import gc
import time
import random
import pickle
import Levenshtein
import itertools
import difflib
import multiprocessing
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.metrics.pairwise import haversine_distances
from tqdm.auto import tqdm
from requests import get
from collections import Counter, defaultdict
from sklearn.model_selection import GroupKFold, StratifiedKFold
from sklearn.neighbors import KNeighborsRegressor
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

Found existing installation: lightgbm 3.3.1
Uninstalling lightgbm-3.3.1:
  Successfully uninstalled lightgbm-3.3.1
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting lightgbm==3.3.1
  Using cached lightgbm-3.3.1-py3-none-manylinux1_x86_64.whl (2.0 MB)
Installing collected packages: lightgbm
Successfully installed lightgbm-3.3.1
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
def stratified_group_k_fold(X, y, groups, k, seed=None):
    labels_num = np.max(y) + 1
    y_counts_per_group = defaultdict(lambda: np.zeros(labels_num))
    y_distr = Counter()
    for label, g in zip(y, groups):
        y_counts_per_group[g][label] += 1
        y_distr[label] += 1

    y_counts_per_fold = defaultdict(lambda: np.zeros(labels_num))
    groups_per_fold = defaultdict(set)

    def eval_y_counts_per_fold(y_counts, fold):
        y_counts_per_fold[fold] += y_counts
        std_per_label = []
        for label in range(labels_num):
            label_std = np.std([y_counts_per_fold[i][label] / y_distr[label] for i in range(k)])
            std_per_label.append(label_std)
        y_counts_per_fold[fold] -= y_counts
        return np.mean(std_per_label)
    
    groups_and_y_counts = list(y_counts_per_group.items())
    random.Random(seed).shuffle(groups_and_y_counts)

    for g, y_counts in sorted(groups_and_y_counts, key=lambda x: -np.std(x[1])):
        best_fold = None
        min_eval = None
        for i in range(k):
            fold_eval = eval_y_counts_per_fold(y_counts, i)
            if min_eval is None or fold_eval < min_eval:
                min_eval = fold_eval
                best_fold = i
        y_counts_per_fold[best_fold] += y_counts
        groups_per_fold[best_fold].add(g)

    all_groups = set(groups)
    for i in range(k):
        train_groups = all_groups - groups_per_fold[i]
        test_groups = groups_per_fold[i]

        train_indices = [i for i, g in enumerate(groups) if g in train_groups]
        test_indices = [i for i, g in enumerate(groups) if g in test_groups]

        yield train_indices, test_indices

In [4]:
def get_distribution(y_vals):
    y_distr = Counter(y_vals)
    y_vals_sum = sum(y_distr.values())
    return [f'{y_distr[i] / y_vals_sum:.2%}' for i in range(np.max(y_vals) + 1)]

In [5]:
## Parameters
class CFG:
    AUTHOR = "kuruton"
    expID = ""
    if "google.colab" in sys.modules:
        expID = get("http://172.28.0.2:9000/api/sessions").json()[0]["name"].split(".")[0].split("-")[0]
    ROOT_DIR = '/content/drive/MyDrive/Kaggle/Foursquare'
    DATASET_DIR = os.path.join(ROOT_DIR, 'Dataset')
    INPUT_DIR = os.path.join(ROOT_DIR, 'Input')
    OUTPUT_DIR = os.path.join(ROOT_DIR, 'Output')
    is_debug = False
    SEED = 2022
    num_neighbors = 20
    num_split = 5
    feat_columns = ['name', 'address', 'city', 
                'state', 'zip', 'categories']
    vec_columns = ['name', 'categories', 'address', 
                  'state']

def seed_everything(seed):
    random.seed(seed)
    np.random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    
seed_everything(CFG.SEED)

In [6]:
## Data load
if "google.colab" in sys.modules:
    data_root = CFG.INPUT_DIR
else:
    data_root = '../input/foursquare-location-matching'
data = pd.read_csv(os.path.join(data_root, 'train.csv'))

if CFG.is_debug:
    data = data.sample(n = 10000, random_state = CFG.SEED)
    data = data.reset_index(drop = True)

In [7]:
data['POI_count'] = data['point_of_interest'].map(dict(data.groupby('point_of_interest')['id'].apply(len)))
data.loc[data['POI_count'] > 5, 'POI_count'] = 5
data['POI_count'] -= 1
data.head()

Unnamed: 0,id,name,latitude,longitude,address,city,state,zip,country,url,phone,categories,point_of_interest,POI_count
0,E_000001272c6c5d,Café Stad Oudenaarde,50.859975,3.634196,Abdijstraat,Nederename,Oost-Vlaanderen,9700.0,BE,,,Bars,P_677e840bb6fc7e,1
1,E_000002eae2a589,Carioca Manero,-22.907225,-43.178244,,,,,BR,,,Brazilian Restaurants,P_d82910d8382a83,1
2,E_000007f24ebc95,ร้านตัดผมการาเกด,13.780813,100.4849,,,,,TH,,,Salons / Barbershops,P_b1066599e78477,0
3,E_000008a8ba4f48,Turkcell,37.84451,27.844202,Adnan Menderes Bulvarı,,,,TR,,,Mobile Phone Shops,P_b2ed86905a4cd3,0
4,E_00001d92066153,Restaurante Casa Cofiño,43.338196,-4.326821,,Caviedes,Cantabria,,ES,,,Spanish Restaurants,P_809a884d4407fb,1


In [8]:
distrs = [get_distribution(data['POI_count'])]
index = ['training set']

for set_ind, (dev_ind, val_ind) in enumerate(stratified_group_k_fold(data, data['POI_count'], data["point_of_interest"], k=2, seed=CFG.SEED)):
    data.loc[val_ind, "set"] = set_ind

    dev_y, val_y = data.loc[dev_ind, 'POI_count'], data.loc[val_ind, 'POI_count']
    dev_groups, val_groups = data.loc[dev_ind, "point_of_interest"], data.loc[val_ind, "point_of_interest"]
    
    assert len(set(dev_groups) & set(val_groups)) == 0
    
    distrs.append(get_distribution(dev_y))
    index.append(f'development set - set {set_ind}')
    distrs.append(get_distribution(val_y))
    index.append(f'validation set - set {set_ind}')

display('Distribution per class:')
pd.DataFrame(distrs, index=index, columns=[f'Label {l}' for l in range(np.max(data['POI_count']) + 1)])

'Distribution per class:'

Unnamed: 0,Label 0,Label 1,Label 2,Label 3,Label 4
training set,37.32%,48.77%,6.38%,2.09%,5.44%
development set - set 0,37.32%,48.77%,6.38%,2.08%,5.44%
validation set - set 0,37.32%,48.77%,6.38%,2.09%,5.44%
development set - set 1,37.32%,48.77%,6.38%,2.09%,5.44%
validation set - set 1,37.32%,48.77%,6.38%,2.08%,5.44%


In [9]:
data = data.loc[data['set'] == 0]

# Calculate distances between matched entities

In [10]:
poi2distances = {}
all_distances = []
for poi, df in tqdm(data[["latitude", "longitude", "point_of_interest"]].groupby("point_of_interest"),
                    total=data["point_of_interest"].nunique()):
    if len(df) == 1:
        # no matches
        continue
        
    distances = []
    distances_mat = haversine_distances(df[["latitude", "longitude"]].values)
    for i in range(len(df) - 1):
        for j in range(i + 1, len(df)):
            # haversine distance -> meters
            distances.append(distances_mat[i, j] * 6371000)
            all_distances.append(distances_mat[i, j] * 6371000)
    poi2distances[poi] = distances

  0%|          | 0/369986 [00:00<?, ?it/s]

In [11]:
poi2distances_df = pd.DataFrame({
    "point_of_interest": list(poi2distances.keys()),
    "distances": list(poi2distances.values())
})
poi2distances_df.head()

Unnamed: 0,point_of_interest,distances
0,P_0000c58a53df6d,[14579.255944833496]
1,P_00010455487fcf,[3094.8833476749014]
2,P_000116a2d0a467,[2151.675426585253]
3,P_00019fb954100b,[104860.41431057264]
4,P_00027058a58996,[1804.0154625141658]


# Split categories, merge with distances per point_of_interest

In [12]:
data["categories"] = data["categories"].fillna("").map(lambda x: x.split(", "))
data_ = data[["id", "name", "categories", "point_of_interest"]].merge(
    poi2distances_df, on="point_of_interest", how="inner")
data_["median_distances"] = data_["distances"].map(np.median)
data_.head()

Unnamed: 0,id,name,categories,point_of_interest,distances,median_distances
0,E_000002eae2a589,Carioca Manero,[Brazilian Restaurants],P_d82910d8382a83,[1446.3191326542133],1446.319133
1,E_e80db432029aea,Carioca Manero,"[Bars, Snack Places]",P_d82910d8382a83,[1446.3191326542133],1446.319133
2,E_000023d8f4be44,Island Spa,[Spas],P_020de174484ec6,[4872.665506293463],4872.665506
3,E_12453effe251db,Island Spa Theater,[Spas],P_020de174484ec6,[4872.665506293463],4872.665506
4,E_0000d9e584ed9f,Signature Properties Savannah,[Real Estate Offices],P_af856e3abdcebc,[29501.310296722237],29501.310297


# Aggregate distances for each categories

In [13]:
# make each row has only one category
exploded = data_.explode("categories").drop_duplicates(["point_of_interest", "categories"])
exploded.head()

Unnamed: 0,id,name,categories,point_of_interest,distances,median_distances
0,E_000002eae2a589,Carioca Manero,Brazilian Restaurants,P_d82910d8382a83,[1446.3191326542133],1446.319133
1,E_e80db432029aea,Carioca Manero,Bars,P_d82910d8382a83,[1446.3191326542133],1446.319133
1,E_e80db432029aea,Carioca Manero,Snack Places,P_d82910d8382a83,[1446.3191326542133],1446.319133
2,E_000023d8f4be44,Island Spa,Spas,P_020de174484ec6,[4872.665506293463],4872.665506
4,E_0000d9e584ed9f,Signature Properties Savannah,Real Estate Offices,P_af856e3abdcebc,[29501.310296722237],29501.310297


In [14]:
cat2dist = exploded.groupby("categories")['distances'].apply(lambda x: list(itertools.chain.from_iterable(x)))
cat2dist = cat2dist.reset_index()
cat2dist

Unnamed: 0,categories,distances
0,,"[29501.310296722237, 3936.741324562409, 81808...."
1,ATMs,"[0.0, 3781.3211005048947, 54248.547142553994, ..."
2,Acai Houses,"[8696.758395534598, 135079.96223462516, 25044...."
3,Accessories Stores,"[50.97510391589751, 753929.3166753021, 10747.0..."
4,Acehnese Restaurants,"[7515.6660715718945, 1034.5811504425606, 19992..."
...,...,...
870,Yunnan Restaurants,"[111470.61406565801, 6530.581842168889, 33470...."
871,Zhejiang Restaurants,"[995.6200193478272, 14971.323687329656, 2465.7..."
872,Zoo Exhibits,"[2145.6911414854635, 1552.8955612173959, 2051...."
873,Zoos,"[1802.1528434044117, 3617.88582576169, 11928.3..."


In [15]:
cat2dist.loc[0, 'categories'] = 'unknown'
cat2dist['distances'][0] = all_distances

In [16]:
cat2dist['distances_num'] = cat2dist['distances'].map(len)

In [17]:
cat2dist.head()

Unnamed: 0,categories,distances,distances_num
0,unknown,"[14579.255944833496, 3094.8833476749014, 2151....",488144
1,ATMs,"[0.0, 3781.3211005048947, 54248.547142553994, ...",1231
2,Acai Houses,"[8696.758395534598, 135079.96223462516, 25044....",14
3,Accessories Stores,"[50.97510391589751, 753929.3166753021, 10747.0...",634
4,Acehnese Restaurants,"[7515.6660715718945, 1034.5811504425606, 19992...",110


In [18]:
cat2dist.to_pickle(os.path.join(CFG.DATASET_DIR, "cat2dist.pkl"))

In [19]:
cat2dist_ = pd.read_pickle(os.path.join(CFG.DATASET_DIR, "cat2dist.pkl"))
cat2dist_.head()

Unnamed: 0,categories,distances,distances_num
0,unknown,"[14579.255944833496, 3094.8833476749014, 2151....",488144
1,ATMs,"[0.0, 3781.3211005048947, 54248.547142553994, ...",1231
2,Acai Houses,"[8696.758395534598, 135079.96223462516, 25044....",14
3,Accessories Stores,"[50.97510391589751, 753929.3166753021, 10747.0...",634
4,Acehnese Restaurants,"[7515.6660715718945, 1034.5811504425606, 19992...",110
