In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.neighbors import BallTree
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import Levenshtein
import difflib
import lightgbm as lgb
import collections
from haversine import haversine


In [2]:
# read data
# Load the three competition CSV files into DataFrames.
# Only `train` is referenced by the cells shown in this notebook; `test` and
# `sample_submission` are loaded here but not used further in this section.
train = pd.read_csv('../input/foursquare-location-matching/train.csv')
test = pd.read_csv('../input/foursquare-location-matching/test.csv')
sample_submission = pd.read_csv('../input/foursquare-location-matching/sample_submission.csv')

## Generate training data from train.csv

In [3]:
# For every id in train.csv, collect the ids that share its point of interest (POI).
true_matches = {}  # key = id, value = set of all ids with the same POI (including itself)

# Group the id column by POI; every member of a group maps to the same shared set.
for _, poi_ids in train.groupby('point_of_interest')['id']:
    members = set(poi_ids.values)  # unordered, duplicates removed
    true_matches.update(dict.fromkeys(members, members))

In [4]:
# for each id, find its nearest neighbors

def find_neighbors(df, neighbors, chunk_size=10000):
    """Find the nearest rows (by great-circle distance) for every row of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``id``, ``latitude`` and ``longitude``
        (coordinates in degrees).
    neighbors : int
        Number of neighbours to return per row. It is clamped to ``len(df)``
        because the tree cannot return more points than it holds.
    chunk_size : int, optional
        Number of rows sent to the tree per query call.

    Returns
    -------
    knn : dict
        Maps each id to the list of ids of its nearest rows, closest first.
    knn_dist : list
        Flat list of the corresponding haversine distances, in row order.
        The tree is built on radians, so distances are in radians on the
        unit sphere rather than kilometres.
    """
    knn = {}  # dictionary, key=id, value=[neighbors ids]
    knn_dist = []

    n_rows = len(df)
    if n_rows == 0:
        # BallTree cannot be built on an empty array.
        return knn, knn_dist

    # The tree returns *positions* into `rads`, so ids must be looked up by
    # position as well. Using df['id'].loc[...] would treat those positions as
    # index labels and break for any frame without a default RangeIndex.
    ids = df['id'].tolist()
    rads = np.deg2rad(df[['latitude', 'longitude']].values)
    tree = BallTree(rads, metric='haversine')
    k = min(neighbors, n_rows)

    # Query in batches: one call per chunk instead of one call per row.
    for start in tqdm(range(0, n_rows, chunk_size)):
        dist, ind = tree.query(rads[start:start + chunk_size], k=k)
        for offset in range(len(ind)):
            knn[ids[start + offset]] = [ids[j] for j in ind[offset]]
            knn_dist.extend(dist[offset])

    return knn, knn_dist

In [5]:
%load_ext Cython

In [6]:
%%cython
def LCS(str S, str T):
    """Return the length of the longest common subsequence of S and T."""
    cdef int row, col
    cdef int s_len = len(S)
    cdef int t_len = len(T)
    cdef list above, current
    # table[r][c] holds the LCS length of S[:r] and T[:c]; row 0 and column 0 stay 0.
    cdef list table = [[0] * (t_len + 1) for _ in range(s_len + 1)]
    for row in range(s_len):
        above = table[row]
        current = table[row + 1]
        for col in range(t_len):
            # Best of: extend the diagonal (plus one on a character match),
            # skip a character of T, or skip a character of S. The target cell
            # itself is still 0 at this point, so it never affects the maximum.
            current[col + 1] = max(above[col] + (S[row] == T[col]), current[col], above[col + 1])
    return table[s_len][t_len]

In [7]:
def init_train_data(df, knn_data, true_matches):
    """Build labelled (id, match_id) candidate pairs.

    For every id in ``knn_data`` two kinds of pairs are emitted:

    * one pair per entry of ``true_matches[id]``, labelled 1;
    * one pair per neighbour in ``knn_data[id]`` that is not a true match,
      labelled 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that receives the ``id`` and ``match_id`` columns. It is modified
        in place and also returned; callers in this notebook pass an empty frame.
    knn_data : dict
        Maps an id to an iterable of neighbour ids.
    true_matches : dict
        Maps an id to a container of ids that truly match it. An id that is
        missing from this mapping contributes no positive pairs.

    Returns
    -------
    df : pandas.DataFrame
        The input frame with ``id`` and ``match_id`` filled in.
    label : list of int
        One 0/1 label per row of ``df``, in the same row order.
    """
    ids = []
    match_ids = []
    label = []

    for k in tqdm(knn_data.keys()):
        # Look the ground truth up once per id; fall back to "no matches"
        # instead of failing on None when the id is unknown.
        positives = true_matches.get(k, ())

        # true match
        for m in positives:
            ids.append(k)
            match_ids.append(m)
            label.append(1)
        # false match
        for m in knn_data[k]:
            if m not in positives:
                ids.append(k)
                match_ids.append(m)
                label.append(0)

    df['id'] = ids
    df['match_id'] = match_ids

    # Report the class balance; skipped when no pairs were produced so an
    # empty input does not raise ZeroDivisionError.
    total = len(label)
    if total:
        n_true = sum(label)
        print('true match:', n_true / total)
        print('false match:', (total - n_true) / total)

    return df, label

In [8]:
# Number of nearest rows requested per training row. Each row is itself part
# of the tree, so it is presumably among its own neighbours - TODO confirm.
neighbors = 10
# knn_data: id -> list of neighbour ids; knn_dist: flat list of the distances.
knn_data,knn_dist = find_neighbors(train,neighbors)

In [9]:
# The neighbour distances are not referenced by any later cell shown here; drop the name to release the list.
del knn_dist

In [10]:
# generate training data
# Reloading train here keeps the cell re-runnable: set_index('id') below would
# fail on a frame whose 'id' column has already been moved into the index.
train = pd.read_csv('../input/foursquare-location-matching/train.csv')
df = pd.DataFrame()
features_col = ['name','categories','address','state','city','country']

# Replace missing text with the sentinel string 'nan'.
# The filled columns are assigned back explicitly: `train[col].fillna(..., inplace=True)`
# operates on a temporary Series under pandas Copy-on-Write and leaves `train` untouched.
train[features_col] = train[features_col].fillna('nan')
# Index by id so rows can be fetched with train.loc[<ids>].
train = train.set_index('id')

# df receives the (id, match_id) pairs; label holds the matching 0/1 targets in row order.
df, label = init_train_data(df,knn_data,true_matches)

In [11]:
# Preview the first 10 candidate pairs (columns: id, match_id).
df.head(10)

In [12]:
# split data
# The pair table is processed in n chunks so each chunk's features can be
# written to disk separately.
n = 5
# Split row *positions* and slice with iloc. Calling np.array_split on the
# DataFrame itself goes through DataFrame.swapaxes, which pandas has deprecated.
# Chunk sizes are the same as np.array_split(df, n) would produce.
row_chunks = np.array_split(np.arange(len(df)), n)
df_split = [df.iloc[rows] for rows in row_chunks]
del df

In [13]:
# add features
# For every chunk of pairs and every text column, compute string-similarity
# features between the row `id` and the row `match_id`, then write the chunk to CSV.
for nn in range(n):
    # Work on a copy: the chunk may be a slice of the original pair table, and
    # a copy keeps the feature columns out of df_split[nn] itself.
    sub_df = df_split[nn].copy()
    for col in features_col:
        print('column:',col)

        geshs = []
        levens = []
        jaros = []
        lcss = []

        # Pick the single column together with the row lookup so that only one
        # column (not every column of train) is materialised per lookup.
        col_values = train.loc[sub_df['id'], col].values.astype(str)
        matcol_values = train.loc[sub_df['match_id'], col].values.astype(str)

        for s, match_s in tqdm(zip(col_values, matcol_values), total=len(col_values)):
            # 'nan' is the missing-value sentinel; similarities are undefined
            # when either side is missing.
            if s != 'nan' and match_s != 'nan':
                geshs.append(difflib.SequenceMatcher(None, s, match_s).ratio())
                levens.append(Levenshtein.distance(s, match_s))
                jaros.append(Levenshtein.jaro_winkler(s, match_s))
                lcss.append(LCS(str(s), str(match_s)))
            else:
                geshs.append(np.nan)
                levens.append(np.nan)
                jaros.append(np.nan)
                lcss.append(np.nan)
        sub_df[f'{col}_gesh'] = geshs
        sub_df[f'{col}_leven'] = levens
        sub_df[f'{col}_jaro'] = jaros
        sub_df[f'{col}_lcs'] = lcss

        # features about string length
        if col in ['name','country','categories']:
            # Lengths live in local arrays instead of temporary columns that
            # would have to be dropped again afterwards.
            col_len = np.fromiter(map(len, col_values), dtype=np.int64, count=len(col_values))
            matcol_len = np.fromiter(map(len, matcol_values), dtype=np.int64, count=len(matcol_values))

            sub_df[f'{col}_len_diff'] = np.abs(col_len - matcol_len)
            # Levenshtein distance normalised by the longer of the two strings.
            sub_df[f'{col}_nleven'] = sub_df[f'{col}_leven'] / np.maximum(col_len, matcol_len)

            # LCS length normalised by each side's length.
            sub_df[f'{col}_nlcsk'] = sub_df[f'{col}_lcs'] / matcol_len
            sub_df[f'{col}_nlcs'] = sub_df[f'{col}_lcs'] / col_len

    sub_df.to_csv('../working/train_data'+str(nn+1)+'.csv', index = False)
    del sub_df

In [14]:
# clear memory usage of list
# Release the intermediates left over from the last iteration of the feature
# loop, plus the large inputs that are no longer needed once the chunks are on disk.
# Each list is emptied in place (`del x[:]`) before its name is removed.
del col_values
del matcol_values
del geshs[:]
del geshs
del levens[:]
del levens
del jaros[:]
del jaros
del lcss[:]
del lcss
del knn_data
del train
del df_split

## Train model

In [15]:
# concatenate training data
# Read the per-chunk feature files back, shrink their numeric dtypes, and
# combine them in chunk order (the label list relies on this row order).
chunks = []

for i in tqdm(range(n)):
    temp = pd.read_csv('../working/train_data'+str(i+1)+'.csv')

    # reduce memory usage
    # integer columns -> smallest unsigned integer type that holds the values
    df_int = temp.select_dtypes(include=['int'])
    converted_int = df_int.apply(pd.to_numeric,downcast='unsigned')
    for cc in df_int.columns:
        temp[cc] = converted_int[cc]
    # float64 to float32
    df_float = temp.select_dtypes(include=['float'])
    converted_float = df_float.apply(pd.to_numeric,downcast='float')
    for cc in df_float.columns:
        temp[cc] = converted_float[cc]

    chunks.append(temp)
    del temp

# Concatenate once at the end: growing a frame with pd.concat inside the loop
# re-copies all previously loaded rows on every iteration.
train_data = pd.concat(chunks) if chunks else pd.DataFrame()
del chunks


In [16]:
# Drop the dtype-conversion temporaries left over from the last loaded chunk.
del df_int
del converted_int
del df_float
del converted_float

In [17]:
# Show column dtypes and the memory footprint after downcasting
# ('deep' also counts the contents of object columns such as id / match_id).
train_data.info(memory_usage='deep')

In [18]:
# Preview the first 3 rows of the combined feature table.
train_data.head(3)

In [19]:
# generate train and test data
# X: feature columns only (the two id columns are identifiers, not features).
X = train_data.drop(columns=['id','match_id'])
# y: the 0/1 label list returned by init_train_data. It is matched to X purely
# by row order, so this relies on the CSV chunks having been written and
# re-loaded in their original order.
y = label
del train_data
del label

In [20]:
# Hold out 30% of the pairs for evaluation, stratified on the label so both
# splits keep the same match / non-match ratio. A fixed random_state makes the
# split, and therefore the reported metrics, reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)
print('X_train:', X_train.shape)
print('X_test:', X_test.shape)

In [21]:
# LightGBM
# Fit a LightGBM classifier on the training split.
# is_unbalance=True is presumably set because matches and non-matches are not
# equally frequent (see the ratios printed by init_train_data) - TODO confirm.
model = lgb.LGBMClassifier(is_unbalance=True,learning_rate=0.05)
model.fit(X_train,y_train)
# Persist the underlying booster so it can be reloaded without retraining.
model.booster_.save_model('../working/lgb_model.txt')

In [22]:
# load model and predict
# Reload the booster from the file written in the training cell and score the
# held-out split with it.
model_load = lgb.Booster(model_file='../working/lgb_model.txt')
# Per the original note, these are probabilities, which the evaluation cell
# thresholds - NOTE(review): confirm for the saved objective.
pred = model_load.predict(X_test) # return probability

In [23]:
# The training split is not used after fitting; release it.
del X_train
del y_train

In [24]:
# calculate F1 score and accuracy on the held-out split
threshold = 0.5

# Compare by position: np.asarray drops any index y_test might carry, so the
# i-th prediction is always paired with the i-th label.
predicted_pos = np.asarray(pred) >= threshold
actual = np.asarray(y_test)

# Confusion-matrix counts.
TP = int(np.sum(predicted_pos & (actual == 1)))
FP = int(np.sum(predicted_pos & (actual != 1)))
TN = int(np.sum(~predicted_pos & (actual == 0)))
FN = int(np.sum(~predicted_pos & (actual != 0)))

# Guard every ratio: with no predicted positives, no actual positives, or an
# empty split, the plain divisions would raise ZeroDivisionError.
precision = TP / (TP+FP) if (TP+FP) else 0.0
recall = TP / (TP+FN) if (TP+FN) else 0.0
f1_score = (2*precision*recall) / (precision+recall) if (precision+recall) else 0.0

total = TP+TN+FP+FN
print('F1 score',f1_score)
print('Accuracy',(TP+TN)/total if total else float('nan'))