In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.neighbors import BallTree
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.neighbors import NearestNeighbors
from sklearn.neighbors import KNeighborsRegressor
import Levenshtein
import difflib
import lightgbm as lgb
import collections
from haversine import haversine

In [2]:
# Load the test set to be matched and the sample submission, and remember where
# the pre-trained LightGBM model file lives (it is only loaded in the prediction cell).
test = pd.read_csv('../input/foursquare-location-matching/test.csv')
sample_submission = pd.read_csv('../input/foursquare-location-matching/sample_submission.csv')
model_weights = '../input/lgbm-weights/lgb_model.txt'
# Preview the expected submission layout: an `id` column and a space-separated `matches` column.
sample_submission.head(5)

Unnamed: 0,id,matches
0,E_00001118ad0191,E_00001118ad0191
1,E_000020eb6fed40,E_000020eb6fed40
2,E_00002f98667edf,E_00002f98667edf
3,E_001b6bad66eb98,E_001b6bad66eb98 E_0283d9f61e569d
4,E_0283d9f61e569d,E_0283d9f61e569d E_001b6bad66eb98


In [3]:
%load_ext Cython

In [4]:
%%cython
def LCS(str S, str T):
    """Return the length of the longest common subsequence of ``S`` and ``T``.

    Dynamic programme over prefixes: ``table[r][c]`` is the answer for
    ``S[:r]`` and ``T[:c]``; row 0 and column 0 stay at zero.
    """
    cdef int r, c
    cdef list table = [[0] * (len(T) + 1) for _ in range(len(S) + 1)]
    for r in range(1, len(S) + 1):
        for c in range(1, len(T) + 1):
            # Either extend the diagonal (adding 1 on a character match) or
            # carry over the best value from the left / upper neighbour.
            table[r][c] = max(
                table[r - 1][c - 1] + (S[r - 1] == T[c - 1]),
                table[r][c - 1],
                table[r - 1][c],
            )
    return table[len(S)][len(T)]

In [5]:
def init_data(df,knn_data):
    """Fill ``df`` with one row per (id, neighbour id) pair and their distance.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that receives the ``id``, ``match_id`` and ``dist`` columns. It is
        modified in place and also returned.
    knn_data : dict
        Maps each id to an iterable of its neighbour ids. Rows are emitted in
        dictionary order, neighbours in the order they are stored.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.

    Notes
    -----
    Coordinates are read from the module-level ``test`` frame, which must be
    indexed by id and provide ``latitude`` and ``longitude`` columns. ``dist``
    holds whatever ``haversine()`` returns for the two (lat, lon) points.
    """
    ids = []
    match_ids = []

    for key, neighbour_ids in tqdm(knn_data.items()):
        for neighbour_id in neighbour_ids:
            ids.append(key)
            match_ids.append(neighbour_id)

    df['id'] = ids
    df['match_id'] = match_ids

    # One label lookup per side of the pair (the lookups dominate the cost on
    # large frames, so do not repeat them per coordinate).
    coord_cols = ['latitude', 'longitude']
    src = test.loc[ids, coord_cols].to_numpy()
    dst = test.loc[match_ids, coord_cols].to_numpy()

    # add knn haversine distance
    df['dist'] = [
        haversine((lat_a, lon_a), (lat_b, lon_b))
        for lat_a, lon_a, lat_b, lon_b in tqdm(
            zip(src[:, 0], src[:, 1], dst[:, 0], dst[:, 1]), total=len(src)
        )
    ]

    # Locals are released when the function returns; no manual `del` needed.
    return df

def add_features(df,features_col):
    """Return a copy of ``df`` extended with string-similarity features.

    Parameters
    ----------
    df : pandas.DataFrame
        Candidate pairs with at least ``id`` and ``match_id`` columns. The
        frame passed in is left untouched.
    features_col : iterable of str
        Names of the text columns of the module-level ``test`` frame to compare.

    Returns
    -------
    pandas.DataFrame
        ``df`` plus, for every column ``c`` in ``features_col``:
        ``c_gesh`` (difflib ``SequenceMatcher`` ratio), ``c_leven``
        (Levenshtein distance), ``c_jaro`` (Jaro-Winkler) and ``c_lcs``
        (longest-common-subsequence length). For ``name``, ``country`` and
        ``categories`` it additionally holds ``c_len_diff``, ``c_nleven``
        (``c_leven`` / longer length), ``c_nlcsk`` (``c_lcs`` / match length)
        and ``c_nlcs`` (``c_lcs`` / own length). Pairs where either side is
        the string ``'nan'`` get NaN for the four base features.

    Notes
    -----
    Values are looked up in the module-level ``test`` frame (indexed by id),
    and ``LCS`` must already be defined. Columns are created in a fixed
    order; do not reorder them, since the frame is later converted to a
    positional array.
    """
    # Work on a private copy. Previously the first few columns were written
    # into the caller's frame until the first `df.drop` rebound the local name,
    # leaving the caller with a half-populated (and, for chunks produced by a
    # split, SettingWithCopy-prone) frame.
    df = df.copy()

    # word feature similarity
    for col in features_col:
        print('column:',col)

        geshs = []
        levens = []
        jaros = []
        lcss = []

        col_values = test.loc[df['id']][col].values.astype(str)
        matcol_values = test.loc[df['match_id']][col].values.astype(str)

        for s, match_s in tqdm(zip(col_values, matcol_values), total=len(col_values)):
            if s != 'nan' and match_s != 'nan':
                geshs.append(difflib.SequenceMatcher(None, s, match_s).ratio())
                levens.append(Levenshtein.distance(s, match_s))
                jaros.append(Levenshtein.jaro_winkler(s, match_s))
                # LCS is compiled with `str` arguments, so pass plain Python strings.
                lcss.append(LCS(str(s), str(match_s)))
            else:
                geshs.append(np.nan)
                levens.append(np.nan)
                jaros.append(np.nan)
                lcss.append(np.nan)

        df[f'{col}_gesh'] = geshs
        df[f'{col}_leven'] = levens
        df[f'{col}_jaro'] = jaros
        df[f'{col}_lcs'] = lcss

        # features about string length
        if col in ['name','country','categories']:
            df[f'{col}_len'] = list(map(len, col_values))
            df[f'match_{col}_len'] = list(map(len, matcol_values))
            df[f'{col}_len_diff'] = np.abs(df[f'{col}_len'] - df[f'match_{col}_len'])
            df[f'{col}_nleven'] = df[f'{col}_leven'] / df[[f'{col}_len', f'match_{col}_len']].max(axis = 1)

            df[f'{col}_nlcsk'] = df[f'{col}_lcs'] / df[f'match_{col}_len']
            df[f'{col}_nlcs'] = df[f'{col}_lcs'] / df[f'{col}_len']

            # The raw lengths are only intermediates for the ratios above.
            df = df.drop(f'{col}_len', axis = 1)
            df = df.drop(f'match_{col}_len', axis = 1)

        # reduce memory usage
        # integer columns -> smallest unsigned type that fits (e.g. int64 to uint8)
        df_int = df.select_dtypes(include=['int'])
        converted_int = df_int.apply(pd.to_numeric,downcast='unsigned')
        for cc in df_int.columns:
            df[cc] = converted_int[cc]
        # float64 to float32
        df_float = df.select_dtypes(include=['float'])
        converted_float = df_float.apply(pd.to_numeric,downcast='float')
        for cc in df_float.columns:
            df[cc] = converted_float[cc]

        # Drop this column's buffers before the next iteration allocates new
        # ones, so two columns' worth of intermediates are never alive at once.
        del col_values, matcol_values, geshs, levens, jaros, lcss

    return df

In [6]:
def find_neighbors(df,neighbors):
    """Find the nearest rows of ``df`` for every row, by great-circle distance.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide ``id``, ``latitude`` and ``longitude`` columns (degrees,
        since they are converted with ``np.deg2rad``). Any index is accepted.
    neighbors : int
        Number of neighbours to return per row; capped at ``len(df)``.

    Returns
    -------
    knn : dict
        Maps each id to the list of its neighbour ids, in the order returned
        by the tree query.
    knn_dist : list
        Flat list of the matching distances, row after row, as returned by the
        BallTree ``haversine`` metric on radian inputs.
    """
    knn = {} # dictionary, key=id, value=[neighbors ids]
    knn_dist = []
    if len(df) == 0:
        # Nothing to index; a tree cannot be built on an empty array.
        return knn, knn_dist

    rads = np.deg2rad(df[['latitude', 'longitude']].values)
    tree = BallTree(rads, metric='haversine')

    # A single batch query replaces one Python-level query per row.
    k = min(neighbors, len(df))
    dist, ind = tree.query(rads, k = k)

    # `ind` holds row positions, so translate them through a positional array
    # rather than `.loc`, which would treat them as index labels.
    ids = df['id'].to_numpy()
    for i in tqdm(range(len(ids))):
        knn[ids[i]] = ids[ind[i]].tolist()

    knn_dist.extend(dist.ravel())

    return knn, knn_dist

## Generate model input

In [7]:
# Ask for up to 10 neighbours per row, but never more than there are rows in `test`.
neighbors = min(len(test),10)
# knn_data maps each id to its neighbour ids; knn_dist is the flat list of the
# corresponding tree distances.
knn_data, knn_dist = find_neighbors(test,neighbors)

100%|██████████| 5/5 [00:00<00:00, 1363.38it/s]


In [8]:
# Build the candidate-pair table (id, match_id, dist) for the test set.
test_data = pd.DataFrame()
features_col = ['name','categories','address','state','city','country','url','zip','phone']

# Mark missing text attributes with the literal string 'nan'.
# The filled columns are assigned back instead of calling
# `test[col].fillna(..., inplace=True)`: that form mutates a temporary column
# selection, which warns on recent pandas and has no effect under Copy-on-Write.
test[features_col] = test[features_col].fillna('nan')

# init_data / add_features look rows up by id through `test.loc[...]`.
test = test.set_index('id')

# String-similarity features are added later, chunk by chunk, in the prediction cell.
test_data = init_data(test_data, knn_data)

100%|██████████| 5/5 [00:00<00:00, 45100.04it/s]
100%|██████████| 25/25 [00:00<00:00, 42145.34it/s]


In [9]:
# Number of chunks the candidate table is processed in.
n = 3
# Split row positions and select with `iloc`: same chunk sizes as
# `np.array_split(test_data, n)`, but without passing the DataFrame itself to
# NumPy (which relies on the deprecated `DataFrame.swapaxes`), and each chunk
# is an independent frame rather than a slice of `test_data`.
data_split = [
    test_data.iloc[rows]
    for rows in np.array_split(np.arange(len(test_data)), n)
]

In [10]:
# Load the pre-trained booster and score the candidate pairs chunk by chunk.
model_load = lgb.Booster(model_file=model_weights)
pred = []
for chunk in data_split:
    # Keep the per-chunk features under their own name so the full pair table
    # in `test_data` is not overwritten (re-running the split cell stays valid).
    chunk_features = add_features(chunk, features_col)
    # The model consumes a positional array: everything except the two id columns.
    X_test = chunk_features.drop(columns=['id','match_id']).to_numpy()
    temp_pred = model_load.predict(X_test) # return probability
    # Scores are appended in chunk order, i.e. in the row order of `test_data`.
    pred.extend(temp_pred)

column: name


100%|██████████| 9/9 [00:00<00:00, 3413.70it/s]


column: categories


100%|██████████| 9/9 [00:00<00:00, 14914.55it/s]


column: address


100%|██████████| 9/9 [00:00<00:00, 18586.28it/s]


column: state


100%|██████████| 9/9 [00:00<00:00, 59074.70it/s]


column: city


100%|██████████| 9/9 [00:00<00:00, 26905.73it/s]


column: country


100%|██████████| 9/9 [00:00<00:00, 21041.66it/s]


column: url


100%|██████████| 9/9 [00:00<00:00, 62601.55it/s]


column: zip


100%|██████████| 9/9 [00:00<00:00, 41896.49it/s]


column: phone


100%|██████████| 9/9 [00:00<00:00, 74898.29it/s]


column: name


100%|██████████| 8/8 [00:00<00:00, 6856.24it/s]


column: categories


100%|██████████| 8/8 [00:00<00:00, 16685.45it/s]


column: address


100%|██████████| 8/8 [00:00<00:00, 8701.88it/s]


column: state


100%|██████████| 8/8 [00:00<00:00, 30039.78it/s]


column: city


100%|██████████| 8/8 [00:00<00:00, 19831.22it/s]


column: country


100%|██████████| 8/8 [00:00<00:00, 16989.59it/s]


column: url


100%|██████████| 8/8 [00:00<00:00, 25991.04it/s]


column: zip


100%|██████████| 8/8 [00:00<00:00, 32640.50it/s]


column: phone


100%|██████████| 8/8 [00:00<00:00, 25536.10it/s]


column: name


100%|██████████| 8/8 [00:00<00:00, 5101.01it/s]


column: categories


100%|██████████| 8/8 [00:00<00:00, 9409.54it/s]


column: address


100%|██████████| 8/8 [00:00<00:00, 8095.16it/s]


column: state


100%|██████████| 8/8 [00:00<00:00, 19854.69it/s]

column:




 city


100%|██████████| 8/8 [00:00<00:00, 16929.58it/s]


column: country


100%|██████████| 8/8 [00:00<00:00, 13871.20it/s]


column: url


100%|██████████| 8/8 [00:00<00:00, 40329.85it/s]


column: zip


100%|██████████| 8/8 [00:00<00:00, 35172.36it/s]


column: phone


100%|██████████| 8/8 [00:00<00:00, 51941.85it/s]


## Write the predicted matches to submission.csv

In [11]:
# Minimum predicted probability for a candidate pair to be reported as a match.
threshold = 0.95
matches = []

# `pred` is consumed positionally, so it must hold exactly one score per
# (id, neighbour) pair in `knn_data`.
n_pairs = sum(len(neighbour_ids) for neighbour_ids in knn_data.values())
assert len(pred) == n_pairs, f'expected {n_pairs} predictions, got {len(pred)}'

i = 0  # offset of the current id's first pair inside `pred`
for k in tqdm(knn_data.keys()):
    neighbour_ids = knn_data[k]
    scores = pred[i:i + len(neighbour_ids)]
    i += len(neighbour_ids)

    kept = [m for m, score in zip(neighbour_ids, scores) if score >= threshold]
    # Every row of the sample submission lists its own id among the matches,
    # so make sure it is present even if its self-pair scored below threshold.
    if k not in kept:
        kept.insert(0, k)

    # Join with single spaces; no trailing separator.
    matches.append(' '.join(kept))

# Take the ids straight from `knn_data` so `id` and `matches` are aligned by
# construction; re-reading test.csv is not needed for that.
output = pd.DataFrame({'id': list(knn_data.keys()), 'matches': matches})
output.head(5)

100%|██████████| 5/5 [00:00<00:00, 31631.25it/s]


Unnamed: 0,id,matches
0,E_00001118ad0191,E_00001118ad0191 E_001b6bad66eb98 E_000020eb6f...
1,E_000020eb6fed40,E_000020eb6fed40 E_00001118ad0191
2,E_00002f98667edf,E_00002f98667edf
3,E_001b6bad66eb98,E_001b6bad66eb98 E_00001118ad0191
4,E_0283d9f61e569d,E_0283d9f61e569d


In [12]:
# Write the id/matches table without the DataFrame index, matching the
# two-column layout of the sample submission.
output.to_csv('submission.csv',index=False)
print('done')

done
