In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
## Imports
import warnings
warnings.filterwarnings('ignore')

import sys

if "google.colab" in sys.modules:
    !pip uninstall lightgbm -y
    !pip install lightgbm==3.3.1
    !pip install Levenshtein

import os
import gc
import time
import random
import pickle
import Levenshtein
import difflib
import multiprocessing
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import lightgbm as lgb
from tqdm.auto import tqdm
from requests import get
from collections import Counter, defaultdict
from sklearn.model_selection import GroupKFold, StratifiedKFold
from sklearn.neighbors import KNeighborsRegressor
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

Found existing installation: lightgbm 3.3.1
Uninstalling lightgbm-3.3.1:
  Successfully uninstalled lightgbm-3.3.1
Collecting lightgbm==3.3.1
  Using cached lightgbm-3.3.1-py3-none-manylinux1_x86_64.whl (2.0 MB)
Installing collected packages: lightgbm
Successfully installed lightgbm-3.3.1


In [3]:
def stratified_group_k_fold(X, y, groups, k, seed=None):
    labels_num = np.max(y) + 1
    y_counts_per_group = defaultdict(lambda: np.zeros(labels_num))
    y_distr = Counter()
    for label, g in zip(y, groups):
        y_counts_per_group[g][label] += 1
        y_distr[label] += 1

    y_counts_per_fold = defaultdict(lambda: np.zeros(labels_num))
    groups_per_fold = defaultdict(set)

    def eval_y_counts_per_fold(y_counts, fold):
        y_counts_per_fold[fold] += y_counts
        std_per_label = []
        for label in range(labels_num):
            label_std = np.std([y_counts_per_fold[i][label] / y_distr[label] for i in range(k)])
            std_per_label.append(label_std)
        y_counts_per_fold[fold] -= y_counts
        return np.mean(std_per_label)
    
    groups_and_y_counts = list(y_counts_per_group.items())
    random.Random(seed).shuffle(groups_and_y_counts)

    for g, y_counts in sorted(groups_and_y_counts, key=lambda x: -np.std(x[1])):
        best_fold = None
        min_eval = None
        for i in range(k):
            fold_eval = eval_y_counts_per_fold(y_counts, i)
            if min_eval is None or fold_eval < min_eval:
                min_eval = fold_eval
                best_fold = i
        y_counts_per_fold[best_fold] += y_counts
        groups_per_fold[best_fold].add(g)

    all_groups = set(groups)
    for i in range(k):
        train_groups = all_groups - groups_per_fold[i]
        test_groups = groups_per_fold[i]

        train_indices = [i for i, g in enumerate(groups) if g in train_groups]
        test_indices = [i for i, g in enumerate(groups) if g in test_groups]

        yield train_indices, test_indices

In [4]:
def get_distribution(y_vals):
    y_distr = Counter(y_vals)
    y_vals_sum = sum(y_distr.values())
    return [f'{y_distr[i] / y_vals_sum:.2%}' for i in range(np.max(y_vals) + 1)]

In [5]:
## Parameters
class CFG:
    AUTHOR = "kuruton"
    expID = ""
    if "google.colab" in sys.modules:
        expID = get("http://172.28.0.2:9000/api/sessions").json()[0]["name"].split(".")[0].split("-")[0]
    ROOT_DIR = '/content/drive/MyDrive/Kaggle/Foursquare'
    DATASET_DIR = os.path.join(ROOT_DIR, 'Dataset')
    INPUT_DIR = os.path.join(ROOT_DIR, 'Input')
    OUTPUT_DIR = os.path.join(ROOT_DIR, 'Output')
    is_debug = True
    SEED = 2022
    num_neighbors = 20
    num_split = 5
    feat_columns = ['name', 'address', 'city', 
                'state', 'zip', 'url', 
              'phone', 'categories', 'country']
    vec_columns = ['name', 'categories', 'address', 
                  'state', 'url', 'country']

def seed_everything(seed):
    random.seed(seed)
    np.random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    
seed_everything(CFG.SEED)

In [6]:
if not os.path.exists(os.path.join(CFG.OUTPUT_DIR, CFG.expID)):
    os.makedirs(os.path.join(CFG.OUTPUT_DIR, CFG.expID))

In [7]:
%load_ext Cython

In [8]:
%%cython
def LCS(str S, str T):
    cdef int i, j
    cdef list dp = [[0] * (len(T) + 1) for _ in range(len(S) + 1)]
    for i in range(len(S)):
        for j in range(len(T)):
            dp[i + 1][j + 1] = max(dp[i][j] + (S[i] == T[j]), dp[i + 1][j], dp[i][j + 1], dp[i + 1][j + 1])
    return dp[len(S)][len(T)]

In [9]:
def recall_knn(df, Neighbors = 10):
    print('Start knn grouped by country')
    train_df_country = []
    for country, country_df in tqdm(df.groupby('country')):
        country_df = country_df.reset_index(drop = True)

        neighbors = min(len(country_df), Neighbors)
        knn = KNeighborsRegressor(n_neighbors = neighbors,
                                    metric = 'haversine',
                                    n_jobs = -1)
        knn.fit(country_df[['latitude','longitude']], country_df.index)
        dists, nears = knn.kneighbors(country_df[['latitude', 'longitude']], 
                                        return_distance = True)

        for k in range(neighbors):            
            cur_df = country_df[['id', 'name']]
            cur_df['match_id'] = country_df['id'].values[nears[:, k]]
            cur_df['kdist_country'] = dists[:, k]
            cur_df['kneighbors_country'] = k
            
            train_df_country.append(cur_df)
    train_df_country = pd.concat(train_df_country)
    
    print('Start knn')
    train_df = []
    knn = NearestNeighbors(n_neighbors = Neighbors)
    knn.fit(df[['latitude','longitude']], df.index)
    dists, nears = knn.kneighbors(df[['latitude','longitude']])
    
    for k in range(Neighbors):
        cur_df = df[['id']]
        cur_df['match_id'] = df['id'].values[nears[:, k]]
        cur_df['kdist'] = dists[:, k]
        cur_df['kneighbors'] = k
        train_df.append(cur_df)
    
    train_df = pd.concat(train_df)
    train_df = train_df.merge(train_df_country,
                                 on = ['id', 'match_id'],
                                 how = 'outer')
    del train_df_country
    
    return train_df

In [10]:
def add_features(df, id2index_d):    
    for col in tqdm(CFG.feat_columns):       
        if col in CFG.vec_columns:
            tfidf = tfidf_d[col]
            print(df.loc[0, col])
            tv_fit = tfidf.transform(df[col].fillna('nan'))
            indexs = [id2index_d[i] for i in df['id']]
            match_indexs = [id2index_d[i] for i in df['match_id']]                    
            df[f'{col}_sim'] = tv_fit[indexs].multiply(tv_fit[match_indexs]).sum(axis = 1).A.ravel()
        
        col_values = data.loc[df['id']][col].values.astype(str)
        matcol_values = data.loc[df['match_id']][col].values.astype(str)
        
        geshs = []
        levens = []
        jaros = []
        lcss = []
        for s, match_s in zip(col_values, matcol_values):
            if s != 'nan' and match_s != 'nan':                    
                geshs.append(difflib.SequenceMatcher(None, s, match_s).ratio())
                levens.append(Levenshtein.distance(s, match_s))
                jaros.append(Levenshtein.jaro_winkler(s, match_s))
                lcss.append(LCS(str(s), str(match_s)))
            else:
                geshs.append(np.nan)
                levens.append(np.nan)
                jaros.append(np.nan)
                lcss.append(np.nan)
        
        df[f'{col}_gesh'] = geshs
        df[f'{col}_leven'] = levens
        df[f'{col}_jaro'] = jaros
        df[f'{col}_lcs'] = lcss
        
        if col not in ['phone', 'zip']:
            df[f'{col}_len'] = list(map(len, col_values))
            df[f'match_{col}_len'] = list(map(len, matcol_values)) 
            df[f'{col}_len_diff'] = np.abs(df[f'{col}_len'] - df[f'match_{col}_len'])
            df[f'{col}_nleven'] = df[f'{col}_leven'] / \
                                    df[[f'{col}_len', f'match_{col}_len']].max(axis = 1)
            
            df[f'{col}_nlcsk'] = df[f'{col}_lcs'] / df[f'match_{col}_len']
            df[f'{col}_nlcs'] = df[f'{col}_lcs'] / df[f'{col}_len']
            
            df = df.drop(f'{col}_len', axis = 1)
            df = df.drop(f'match_{col}_len', axis = 1)
            gc.collect()
    
    df['kdist_diff'] = (df['kdist'] - df['kdist_country']) /\
                                df['kdist_country']
    df['kneighbors_mean'] = df[['kneighbors', 'kneighbors_country']].mean(axis = 1)
    
    return df

In [11]:
def get_id2poi(input_df: pd.DataFrame) -> dict:
    return dict(zip(input_df['id'], input_df['point_of_interest']))

def get_poi2ids(input_df: pd.DataFrame) -> dict:
    return input_df.groupby('point_of_interest')['id'].apply(set).to_dict()

def get_score(input_df: pd.DataFrame, id2poi: dict, poi2ids: dict):
    scores = []
    for id_str, matches in zip(input_df['id'].to_numpy(), input_df['matches'].to_numpy()):
        targets = poi2ids[id2poi[id_str]]
        preds = set(matches.split())
        score = len((targets & preds)) / len((targets | preds))
        scores.append(score)
    scores = np.array(scores)
    return scores.mean()

def analysis(df):
    print('Num of data: %s' % len(df))
    print('Num of unique id: %s' % df['id'].nunique())
    print('Num of unique poi: %s' % df['point_of_interest'].nunique())
    
    poi_grouped = df.groupby('point_of_interest')['id'].count().reset_index()
    print('Mean num of unique poi: %s' % poi_grouped['id'].mean())

In [12]:
## Data load
if "google.colab" in sys.modules:
    data_root = CFG.INPUT_DIR
else:
    data_root = '../input/foursquare-location-matching'
data = pd.read_csv(os.path.join(data_root, 'train.csv'))

if CFG.is_debug:
    data = data.sample(n = 10000, random_state = CFG.SEED)
    data = data.reset_index(drop = True)

In [13]:
## Data split
kf = GroupKFold(n_splits=2)
for i, (trn_idx, val_idx) in enumerate(kf.split(data, 
                                                data['point_of_interest'], 
                                                data['point_of_interest'])):
    data.loc[val_idx, 'set'] = i

print('Num of train data: %s' % len(data))
print(data['set'].value_counts())

valid_data = data[data['set'] == 0]
train_data = data[data['set'] == 1]

print('Train data: ')
analysis(train_data)
print('Valid data: ')
analysis(valid_data)

train_poi = train_data['point_of_interest'].unique().tolist()
valid_poi = valid_data['point_of_interest'].unique().tolist()

print(set(train_poi) & set(valid_poi))

train_ids = train_data['id'].unique().tolist()
valid_ids = valid_data['id'].unique().tolist()
      
print(set(train_ids) & set(valid_ids))
      
# tv_ids_d = {}
# tv_ids_d['train_ids'] = train_ids
# tv_ids_d['valid_ids'] = valid_ids

# np.save('tv_ids_d.npy', tv_ids_d)

# del train_data, valid_data
# gc.collect()

# data = data.set_index('id')
# data = data.loc[tv_ids_d['train_ids']]
# data = data.reset_index()

Num of train data: 10000
1.0    5000
0.0    5000
Name: set, dtype: int64
Train data: 
Num of data: 5000
Num of unique id: 5000
Num of unique poi: 4964
Mean num of unique poi: 1.0072522159548751
Valid data: 
Num of data: 5000
Num of unique id: 5000
Num of unique poi: 4964
Mean num of unique poi: 1.0072522159548751
set()
set()


In [14]:
## Train data generated by knn
train_id2index_d = dict(zip(train_data['id'].values, train_data.index))
valid_id2index_d = dict(zip(valid_data['id'].values, valid_data.index))

tfidf_d = {}
for col in CFG.vec_columns:
    tfidf = TfidfVectorizer()
    tfidf.fit(train_data[col].fillna('nan'))
    tfidf_d[col] = tfidf

train_data_ = train_data.copy()
valid_data_ = valid_data.copy()
train_data = recall_knn(train_data, CFG.num_neighbors)
valid_data = recall_knn(valid_data, CFG.num_neighbors)

train_data_ = train_data_.set_index('id')
valid_data_ = valid_data_.set_index('id')
train_ids = train_data['id'].tolist()
train_match_ids = train_data['match_id'].tolist()
valid_ids = valid_data['id'].tolist()
valid_match_ids = valid_data['match_id'].tolist()

train_poi = train_data_.loc[train_ids]['point_of_interest'].values
train_match_poi = train_data_.loc[train_match_ids]['point_of_interest'].values

valid_poi = valid_data_.loc[valid_ids]['point_of_interest'].values
valid_match_poi = valid_data_.loc[valid_match_ids]['point_of_interest'].values

train_data['label'] = np.array(train_poi == train_match_poi, dtype = np.int8)
valid_data['label'] = np.array(valid_poi == valid_match_poi, dtype = np.int8)
del train_poi, train_match_poi, valid_poi, valid_match_poi, train_ids, train_match_ids, valid_ids, valid_match_ids
gc.collect()

print('Num of unique id: %s' % train_data['id'].nunique())
print('Num of train data: %s' % len(train_data))
print('Pos rate: %s' % train_data['label'].mean())
display(train_data.sample(5))

print('Num of unique id: %s' % valid_data['id'].nunique())
print('Num of valid data: %s' % len(valid_data))
print('Pos rate: %s' % valid_data['label'].mean())
display(valid_data.sample(5))

Start knn grouped by country


  0%|          | 0/105 [00:00<?, ?it/s]

Start knn
Start knn grouped by country


  0%|          | 0/99 [00:00<?, ?it/s]

Start knn
Num of unique id: 5000
Num of train data: 136923
Pos rate: 0.03699159381550214


Unnamed: 0,id,match_id,kdist,kneighbors,name,kdist_country,kneighbors_country,label
60724,E_547064db6f7065,E_d84edd4b75ef23,0.249186,12.0,,,,0
15491,E_768d5a8716848d,E_8bbc72d520de51,0.389615,3.0,ชายคลองซีฟู้ด,0.387333,5.0,0
129901,E_d13bf372dfdeb8,E_f0f61b9c9bc62b,,,The Lofts at West 7th,0.099514,11.0,0
68370,E_5ac2a9e7921868,E_e876ab6eb9cdd5,0.054268,13.0,XIIA 3- SMANELA,0.043242,8.0,0
112648,E_4e40cda9c77def,E_ebd4dbfb9121e9,,,Bazaar Ramadhan Kamunting,0.29151,6.0,0


Num of unique id: 5000
Num of valid data: 137115
Pos rate: 0.03689603617401451


Unnamed: 0,id,match_id,kdist,kneighbors,name,kdist_country,kneighbors_country,label
85201,E_060b96b2448d15,E_e9bc5e2d2f78f8,10.223656,17.0,,,,0
20743,E_509914b0f9dfe7,E_09cba38b701fb6,0.886871,4.0,,,,0
130057,E_b428673bde10cc,E_405184e607c4a3,,,"Menards, Brown Road, Auburn Hills, Mi",0.155471,11.0,0
121573,E_a8ec8a424bf91b,E_d2f17fbb2db324,,,Erciyes Konağı B Blok,0.22589,17.0,0
73707,E_875e83a0218975,E_088f6b1f3e7cf1,2.6193,14.0,,,,0


In [15]:
train_data = add_features(train_data, train_id2index_d)
valid_data = add_features(valid_data, valid_id2index_d)

  0%|          | 0/9 [00:00<?, ?it/s]

Banco Itau


KeyError: ignored

In [None]:
## Eval
train_data_ = train_data_.reset_index()
valid_data_ = valid_data_.reset_index()

train_id2poi = get_id2poi(train_data_)
train_poi2ids = get_poi2ids(train_data_)

valid_id2poi = get_id2poi(valid_data_)
valid_poi2ids = get_poi2ids(valid_data_)

eval_df = pd.DataFrame()
eval_df['id'] = valid_data_['id'].unique().tolist()
eval_df['match_id'] = eval_df['id']
print('Unique id: %s' % len(eval_df))

eval_df_ = valid_data[valid_data['label'] == 1][['id', 'match_id']]
eval_df = pd.concat([eval_df, eval_df_])

eval_df = eval_df.groupby('id')['match_id'].\
                        apply(list).reset_index()
eval_df['matches'] = eval_df['match_id'].apply(lambda x: ' '.join(set(x)))
print('Unique id: %s' % len(eval_df))

iou_score = get_score(eval_df, valid_id2poi, valid_poi2ids)
print('IoU score: %s' % iou_score)

In [None]:
# distrs = [get_distribution(train_data["label"])]
# index = ['training set']

# for fold_ind, (dev_ind, val_ind) in enumerate(stratified_group_k_fold(train_data, train_data["label"], train_data["id"], k=CFG.num_split, seed=CFG.SEED)):
#     train_data.loc[val_ind, "fold"] = fold_ind

#     dev_y, val_y = train_data.loc[dev_ind, "label"], train_data.loc[val_ind, "label"]
#     dev_groups, val_groups = train_data.loc[dev_ind, "id"], train_data.loc[val_ind, "id"]
    
#     assert len(set(dev_groups) & set(val_groups)) == 0
    
#     distrs.append(get_distribution(dev_y))
#     index.append(f'development set - fold {fold_ind}')
#     distrs.append(get_distribution(val_y))
#     index.append(f'validation set - fold {fold_ind}')

# display('Distribution per class:')
# pd.DataFrame(distrs, index=index, columns=[f'Label {l}' for l in range(np.max(train_data["label"]) + 1)])

# model learning

In [None]:
train_data_ = train_data_.set_index('id')
valid_data_ = valid_data_.set_index('id')

In [None]:
features = ['kdist','kneighbors','kdist_country','kneighbors_country', 'kdist_diff', 'kneighbors_mean']

columns = ['name', 'address', 'city', 'state',
       'zip', 'country', 'url', 'phone', 'categories']

for c in columns:
    if c == 'country':
        features += ['country_leven', 'country_sim', 'country_gesh', 'country_nleven']
        continue
    features += [f"{c}_gesh", f"{c}_jaro", f"{c}_lcs", f"{c}_leven"]
    if c == 'city':
        features += [f"{c}_len_diff", f"{c}_nleven", f"{c}_nlcsk", f"{c}_nlcs"]
    if c in ['address', 'categories', 'country', 'name', 'state', 'url']:
        features += [f"{c}_len_diff", f"{c}_nleven", f"{c}_nlcsk", f"{c}_nlcs", f"{c}_sim"]

print(len(features))
print(features)

In [None]:
# def fit_lgb(X, y, params=None, es_rounds=20, seed=42, N_SPLITS=5, 
#              n_class=None, model_dir=None, folds=None):
#     models = []
#     oof = np.zeros((len(y), 2), dtype=np.float64)
    
#     for i in tqdm(range(CFG.num_split)):
        
#         print(f"== fold {i} ==")
#         trn_idx = folds==((i - 1) % CFG.num_split)
#         val_idx = folds==i
#         X_train, y_train = X.iloc[trn_idx], y.iloc[trn_idx]
#         X_valid, y_valid = X.iloc[val_idx], y.iloc[val_idx]
        
#         X_train = add_features(X_train)
#         X_valid = add_features(X_valid)
#         X_train = X_train[features]
#         X_valid = X_valid[features]

#         if model_dir is None:
#             model = lgb.LGBMClassifier(**params)
#             model.fit(
#                 X_train, y_train, 
#                 eval_set=[(X_valid, y_valid)],  
#                 early_stopping_rounds=es_rounds, 
#                 eval_metric='binary',  
#     #             verbose=-1)
#                 verbose=50)
#         else:
#             with open(f'{model_dir}/lgb_fold{i}.pkl', 'rb') as f:
#                 model = pickle.load(f)
            
#         pred = model.predict_proba(X_valid)
#         oof[val_idx] = pred
#         models.append(model)
        
#         file = os.path.join(CFG.OUTPUT_DIR, os.path.join(CFG.expID, f'lgb_fold{i}.pkl'))
#         pickle.dump(model, open(file, 'wb'))
#         print()

#         del X_train, X_valid
#         gc.collect()

#     cv = (oof.argmax(axis=-1) == y).mean()
#     print(f"CV-accuracy: {cv}")

#     return oof, models

# def inference_lgb(models, feat_df):
#     pred = np.array([model.predict_proba(feat_df) for model in models])
#     pred = np.mean(pred, axis=0)
#     return pred

In [None]:
def fit_lgb(X_train, y_train, X_valid, y_valid, params=None, es_rounds=20, seed=42, N_SPLITS=5, 
             n_class=None, model_dir=None, folds=None):
    models = []
    oof = np.zeros((len(y_valid), 2), dtype=np.float64)
    
    if model_dir is None:
        model = lgb.LGBMClassifier(**params)
        model.fit(
            X_train, y_train, 
            eval_set=[(X_valid, y_valid)],  
            early_stopping_rounds=es_rounds, 
            eval_metric='binary',  
#             verbose=-1)
            verbose=50)
    else:
        with open(f'{model_dir}/lgb_fold.pkl', 'rb') as f:
            model = pickle.load(f)
        
    pred = model.predict_proba(X_valid)
    oof = pred
    models.append(model)
    
    file = os.path.join(CFG.OUTPUT_DIR, os.path.join(CFG.expID, f'lgb_fold.pkl'))
    pickle.dump(model, open(file, 'wb'))
    print()

    del X_train, X_valid
    gc.collect()

    cv = (oof.argmax(axis=-1) == y).mean()
    print(f"CV-accuracy: {cv}")

    return oof, models

def inference_lgb(models, feat_df):
    pred = np.array([model.predict_proba(feat_df) for model in models])
    pred = np.mean(pred, axis=0)
    return pred

In [None]:
params = {
    'objective': "binary",
    'learning_rate': 0.2,
    'reg_alpha': 0.1,
    'reg_lambda': 0.1,
    'random_state': 42,

    'max_depth': 7,   
    'num_leaves': 35, 
    'n_estimators': 1000000, 
    "colsample_bytree": 0.9,
}


oof, models = fit_lgb(train_data[featues], train_data["label"].astype(int), 
                      valid_data[featues], valid_data["label"].astype(int), 
                      params=params, n_class=2, 
                      N_SPLITS=CFG.num_split, folds=train_data["fold"].values)

In [None]:
valid_data["pred"] = oof[:, -1]

#Check Feature Importances

In [None]:
def plot_importances(models):
    importance_df = pd.DataFrame(models[0].feature_importances_, 
                                 index=features, 
                                 columns=['importance'])\
                        .sort_values("importance", ascending=False)

    plt.subplots(figsize=(len(features) // 4, 5))
    plt.bar(importance_df.index, importance_df.importance)
    plt.grid()
    plt.xticks(rotation=90)
    plt.ylabel("importance")
    plt.tight_layout()
    plt.show()

plot_importances(models)

#Check CV

In [None]:
!pip install optuna

In [None]:
def post_process(df):
    id2match = dict(zip(df['id'].values, df['matches'].str.split()))

    for base, match in df[['id', 'matches']].values:
        match = match.split()
        if len(match) == 1:        
            continue

        for m in match:
            if base not in id2match[m]:
                id2match[m].append(base)
    df['matches'] = df['id'].map(id2match).map(' '.join)
    return df 

In [None]:
import optuna

def objective(trial):
    x = trial.suggest_uniform('threshold', 0, 1)
    valid_pred_df = valid_data[valid_data['pred'] > x][['id', 'match_id']]
    out_df = pd.DataFrame()
    out_df['id'] = valid_data['id'].unique().tolist()
    out_df['match_id'] = out_df['id']
    out_df = pd.concat([out_df, valid_pred_df])
    out_df = out_df.groupby('id')['match_id'].\
                        apply(list).reset_index()
    out_df['matches'] = out_df['match_id'].apply(lambda x: ' '.join(set(x)))
    out_df = post_process(out_df)

    score = get_score(out_df, valid_id2poi, valid_poi2ids)
    print(f"CV: {score:.6f}")
    return score

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

In [None]:
threshold = study.best_params['threshold']
threshold

In [None]:
study.best_value

In [None]:
valid_pred_df = valid_data[valid_data['pred'] > threshold][['id', 'match_id']]
out_df = pd.DataFrame()
out_df['id'] = valid_data['id'].unique().tolist()
out_df['match_id'] = out_df['id']
out_df = pd.concat([out_df, valid_pred_df])
out_df = out_df.groupby('id')['match_id'].\
                    apply(list).reset_index()
out_df['matches'] = out_df['match_id'].apply(lambda x: ' '.join(set(x)))
out_df = post_process(out_df)

score = get_score(out_df, valid_id2poi, valid_poi2ids)
print(f"CV: {score:.6f}")