In [1]:
# my_functions contains the pre-defined custom functions to avoid defining again during modelling and predicting process
# The module needs to be copied to the working directory so we can access itfrom shutil import copyfile
from shutil import copyfile
copyfile(src = "../input/foursquare-my-data/my_functions.py", dst = "../working/my_functions.py")

'../working/my_functions.py'

In [2]:
import pandas as pd
import networkx as nx
import joblib
from my_functions import *

Collecting recordlinkage
  Downloading recordlinkage-0.15-py3-none-any.whl (926 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 926.5/926.5 kB 1.2 MB/s eta 0:00:00
Collecting jellyfish>=0.8.0
  Downloading jellyfish-0.9.0.tar.gz (132 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 132.6/132.6 kB 5.8 MB/s eta 0:00:00
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Building wheels for collected packages: jellyfish
  Building wheel for jellyfish (setup.py): started
  Building wheel for jellyfish (setup.py): finished with status 'done'
  Created wheel for jellyfish: filename=jellyfish-0.9.0-cp37-cp37m-linux_x86_64.whl size=102005 sha256=7cb9a3030cb9b8fe684f2e2bf9d654e8797a1aa9bb62bef6861e8f7dc6400ef9
  Stored in directory: /root/.cache/pip/wheels/fe/99/4e/646ce766df0d070b0ef04db27aa11543e2767fda3075aec31b
Successfully built jellyfish
Installing collected packages: jellyfish, recordlinkage
Successfully installed jellyfish-0.9.0 recordl



Collecting pykakasi
  Downloading pykakasi-2.2.1-py3-none-any.whl (2.4 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 2.4/2.4 MB 2.0 MB/s eta 0:00:00
Collecting jaconv
  Downloading jaconv-0.3.tar.gz (15 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting deprecated
  Downloading Deprecated-1.2.13-py2.py3-none-any.whl (9.6 kB)
Building wheels for collected packages: jaconv
  Building wheel for jaconv (setup.py): started
  Building wheel for jaconv (setup.py): finished with status 'done'
  Created wheel for jaconv: filename=jaconv-0.3-py3-none-any.whl size=15564 sha256=767247fddc57032c6d89c7d754cc6c07f791d519654f208b2fd1fb1fed50b31f
  Stored in directory: /root/.cache/pip/wheels/8f/4f/c2/a2a3b14d0e94f855f4aa8887bf0267bee9ecfb8e62a9ee2d92
Successfully built jaconv
Installing collected packages: jaconv, deprecated, pykakasi
Successfully installed deprecated-1.2.13 jaconv-0.3 pykakasi-2.2.1




# Load test dataset

In [3]:
test_df = pd.read_csv('../input/foursquare-location-matching/test.csv', index_col='id')

# Load pre-trained model

In [4]:
model = joblib.load('../input/foursquare-my-data/saved_model.pkl')

# Preprocess and make predictions

In [5]:
# This process is similar to the one during model building
text_preprocess(test_df)
test_score_dfs = get_score(test_df)
test_score = pd.concat(test_score_dfs)

In [6]:
# Predict on the official testing dataset and add predictions as a new column
feat_cols = ['name_score', 'latitude_score', 'longitude_score', 'city_score', 
             'address_score', 'country_score', 'categories_score']

X = test_score[feat_cols].values
final_pred = model.predict(X)
test_score['pred'] = final_pred

# Generate proper format for submission

In [7]:
# Create new DataFrame that contains the IDs first
submission = pd.read_csv('../input/foursquare-location-matching/test.csv', usecols=['id'])

In [8]:
# Keep only IDs of the matching pairs
test_score = test_score[test_score['pred'] == 1][['id_1', 'id_2']]

In [9]:
# Create a swapped version of test_score
test_score_swapped = pd.DataFrame()
test_score_swapped['id_1'] = test_score['id_2']
test_score_swapped['id_2'] = test_score['id_1']

In [10]:
# Concatenate test_score and test_score_swapped so we will not miss any pairs
test_score = pd.concat([test_score, test_score_swapped])
test_score

Unnamed: 0,id_1,id_2
1,E_0283d9f61e569d,E_001b6bad66eb98
1,E_001b6bad66eb98,E_0283d9f61e569d


In [11]:
# Get the groups of entries predicted as pointing to same location 
grouped = test_score.groupby('id_1')['id_2'].apply(list)
grouped = grouped.apply(' '.join)
matches = grouped.index + ' ' + grouped
matches = matches.str.split(' ')
matches = matches.tolist()

G = nx.Graph()
for match in matches:
    nx.add_path(G, match)
    
match_groups = list(nx.connected_components(G))

In [12]:
# Separate duplicated and non-duplicated entities so we will go through less loops in the next step
duplicates = test_score['id_1'].tolist()
submission['duplicated'] = (submission['id'].isin(duplicates)).astype('int64')

single = submission[submission['duplicated'] == 0]
multiple = submission[submission['duplicated'] == 1]

In [13]:
# No further processing on non-duplicated entities
single['matches'] = single['id']
single

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,id,duplicated,matches
0,E_00001118ad0191,0,E_00001118ad0191
1,E_000020eb6fed40,0,E_000020eb6fed40
2,E_00002f98667edf,0,E_00002f98667edf


In [14]:
# Find group of POIs the place entries belong to and put all the matching entries together
rs = []
for s in multiple['id']:
    rs.append(get_group(s, match_groups))

multiple['matches'] = rs
multiple['matches'] = multiple['matches'].apply(' '.join)
multiple

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


Unnamed: 0,id,duplicated,matches
3,E_001b6bad66eb98,1,E_0283d9f61e569d E_001b6bad66eb98
4,E_0283d9f61e569d,1,E_0283d9f61e569d E_001b6bad66eb98


In [15]:
# Get the final output for submission
submission = pd.concat([single, multiple])
submission = submission.drop('duplicated', axis=1)
submission

Unnamed: 0,id,matches
0,E_00001118ad0191,E_00001118ad0191
1,E_000020eb6fed40,E_000020eb6fed40
2,E_00002f98667edf,E_00002f98667edf
3,E_001b6bad66eb98,E_0283d9f61e569d E_001b6bad66eb98
4,E_0283d9f61e569d,E_0283d9f61e569d E_001b6bad66eb98


In [16]:
submission.to_csv('submission.csv', index=False)