# Notebook Summary
In this notebook we use the `sentence_transformers` package to encode the text features of the pairs dataset and compute the cosine similarity (the dot product of the length-normalized embeddings) between the two sides of each pair. We also calculate the geodesic distance between the two locations in each pair.

In [1]:
# import basic packages

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Read the pairs table from the CSV file in the working directory.
pairs = pd.read_csv(filepath_or_buffer='pairs.csv')

## Vector Encoding of Text Features Using BERT
We'll use a pretrained multilingual BERT model.

In [3]:
!pip install sentence_transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentence_transformers
  Downloading sentence-transformers-2.2.0.tar.gz (79 kB)
[K     |████████████████████████████████| 79 kB 6.7 MB/s 
[?25hCollecting transformers<5.0.0,>=4.6.0
  Downloading transformers-4.20.1-py3-none-any.whl (4.4 MB)
[K     |████████████████████████████████| 4.4 MB 57.7 MB/s 
Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 60.6 MB/s 
[?25hCollecting huggingface-hub
  Downloading huggingface_hub-0.8.1-py3-none-any.whl (101 kB)
[K     |████████████████████████████████| 101 kB 14.8 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 61.6 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloa

In [4]:
# import SentenceTransformers and cosine similarity function

from sentence_transformers import SentenceTransformer
from sentence_transformers.util import cos_sim



We will initialize a pre-trained language model selected from this list: https://www.sbert.net/docs/pretrained_models.html. We choose a model that combines high performance with low computational overhead, and that is multilingual.

In [5]:
# Load the pretrained sentence-embedding model by name.
model = SentenceTransformer(
    model_name_or_path='paraphrase-multilingual-MiniLM-L12-v2',
)



Downloading:   0%|          | 0.00/968 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.79k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/645 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/229 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/471M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/480 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/14.8M [00:00<?, ?B/s]

In [6]:
features = ['name', 'address', 'city', 'state', 'zip', 'country', 'url', 'phone', 'categories']


def _rowwise_cos_sim(a, b, eps=1e-12):
    """Return the cosine similarity between matching rows of two (n, d) arrays.

    Only the n row-to-row similarities are computed, so no n-by-n similarity
    matrix is ever materialised. `eps` guards against division by zero for
    all-zero vectors.
    """
    a = np.asarray(a)
    b = np.asarray(b)
    dots = np.einsum('ij,ij->i', a, b)
    norms = np.maximum(np.linalg.norm(a, axis=1), eps) * np.maximum(np.linalg.norm(b, axis=1), eps)
    return dots / norms


# For every text feature, add a '<feature>_sim' column to `pairs` holding the
# cosine similarity between the embeddings of '<feature>_1' and '<feature>_2'.
# Rows where either side is missing receive the mean similarity of the filled
# rows (mean imputation).
for f in features:
    col_1, col_2 = f + '_1', f + '_2'
    missing_bools = pairs[col_1].isnull() | pairs[col_2].isnull()  # either side missing
    filled_bools = ~missing_bools  # both sides present

    if filled_bools.any():
        # Cast to str so numeric-looking values cannot break tokenisation;
        # this is a no-op for values that are already strings.
        # Assumes model.encode returns a 2-D array of embeddings, one row per input.
        encode_1 = model.encode(pairs.loc[filled_bools, col_1].astype(str).tolist())
        encode_2 = model.encode(pairs.loc[filled_bools, col_2].astype(str).tolist())
        sims = _rowwise_cos_sim(encode_1, encode_2).astype(np.float64)
        mean_sim = sims.mean()
    else:
        # Nothing to encode: there is no mean to impute, so the column is all NaN.
        sims = np.empty(0, dtype=np.float64)
        mean_sim = np.nan

    # Start from the imputed value everywhere, then overwrite the filled rows.
    sim_column = np.full(len(pairs), mean_sim, dtype=np.float64)
    sim_column[filled_bools.to_numpy()] = sims
    pairs[f + '_sim'] = sim_column




In [7]:
# Persist the similarity columns. index=False keeps the row index out of the
# file; otherwise every read/write cycle adds another 'Unnamed: 0'-style column.
pairs.to_csv('pairs.csv', index=False)

## Computing Geodesic distances between pairs

In [11]:
#import geodesic distance from geopy
from geopy.distance import geodesic

In [12]:
# Takes two sequences of the same length, returns a list of pairs
def to_pair(l1, l2):
    """Pair up the elements of two equal-length sequences by position.

    Elements are matched positionally by iteration, so pandas Series with a
    non-default index are handled correctly (plain `l1[i]` on a Series is a
    label lookup, not a positional one).

    Parameters
    ----------
    l1, l2 : sized iterables (list, array, Series, ...) of equal length.

    Returns
    -------
    list of tuple
        `[(l1_0, l2_0), (l1_1, l2_1), ...]`.

    Raises
    ------
    ValueError
        If the two inputs have different lengths.
    """
    if len(l1) != len(l2):
        raise ValueError(
            "to_pair expects inputs of equal length, got %d and %d" % (len(l1), len(l2))
        )
    return list(zip(l1, l2))

# Takes two sequences of (latitude, longitude) pairs, returns their geodesic distances in km
def get_dists(l1, l2):
    """Compute the geodesic distance in kilometres between matching points.

    Parameters
    ----------
    l1, l2 : sized iterables of equal length whose elements are points
        accepted by `geodesic`; the caller in this notebook passes
        (latitude, longitude) tuples.

    Returns
    -------
    list of float
        One distance (the `.km` attribute of the geodesic result) per position.

    Raises
    ------
    ValueError
        If the two inputs have different lengths.
    """
    if len(l1) != len(l2):
        raise ValueError(
            "get_dists expects inputs of equal length, got %d and %d" % (len(l1), len(l2))
        )
    # Iterate positionally rather than via l1[i], which is a label lookup on a Series.
    return [geodesic(p1, p2).km for p1, p2 in zip(l1, l2)]


In [13]:
# Build one (latitude, longitude) tuple per row for each side of the pair,
# then store the distance between the two sides in a new 'distance' column.
loc1s, loc2s = (
    to_pair(pairs['latitude_' + side], pairs['longitude_' + side])
    for side in ('1', '2')
)
pairs['distance'] = get_dists(loc1s, loc2s)

In [14]:
# Persist the distance column. index=False keeps the row index out of the
# file; otherwise every read/write cycle adds another 'Unnamed: 0'-style column.
pairs.to_csv('pairs.csv', index=False)

## Removing Outliers

In [16]:
# Remove distance outliers:
#   * matching pairs at or beyond the 99th percentile of match distances, and
#   * non-matching pairs at or below the 1st percentile of non-match distances.
# Each condition is a single boolean mask aligned with `pairs`; chaining
# pairs[mask_a][mask_b] instead passes a misaligned key to the second lookup
# and triggers pandas' "Boolean Series key will be reindexed" warning.
is_match = pairs['match']
is_nonmatch = pairs['match'].eq(False)

top = pairs.loc[is_match, 'distance'].quantile(0.99)
bottom = pairs.loc[is_nonmatch, 'distance'].quantile(0.01)

far_matches = is_match & (pairs['distance'] >= top)
near_nonmatches = is_nonmatch & (pairs['distance'] <= bottom)

# Row labels of the removed pairs, kept under their original names for inspection.
long_indices = pairs.index[far_matches]
short_indices = pairs.index[near_nonmatches]

# Build the cleaned frame in one step; `pairs` itself is left untouched.
pairs_clean = pairs.loc[~(far_matches | near_nonmatches)].copy()

pairs_clean.to_csv('pairs_clean.csv')

  
  import sys


In [17]:
# Preview the first 50 rows of the cleaned pairs.
pairs_clean.iloc[:50]

Unnamed: 0,id_1,name_1,latitude_1,longitude_1,address_1,city_1,state_1,zip_1,country_1,url_1,...,name_sim,address_sim,city_sim,state_sim,zip_sim,country_sim,url_sim,phone_sim,categories_sim,distance
0,E_000001272c6c5d,Café Stad Oudenaarde,50.859975,3.634196,Abdijstraat,Nederename,Oost-Vlaanderen,9700,BE,,...,0.966012,0.739737,0.896136,0.911244,0.97721,1.0,0.842439,0.93041,1.0,1.046198
1,E_000008a8ba4f48,Turkcell,37.84451,27.844202,Adnan Menderes Bulvarı,,,,TR,,...,1.0,0.467641,0.896136,0.911244,0.97721,1.0,0.842439,0.93041,0.607998,0.730228
2,E_000023d8f4be44,Island Spa,14.51897,121.018702,"5th Flr, Newport Mall, Resorts World Manila",Pasay City,Metro Manila,,PH,,...,0.820979,0.739737,0.896136,0.911244,0.97721,1.0,0.842439,0.93041,1.0,0.140466
3,E_00007dcd2bb53f,TOGO'S Sandwiches,38.257797,-122.064599,"1380 Holiday Ln., Ste. B",Fairfield,CA,94534,US,https://locations.togos.com/ll/US/CA/Fairfield...,...,0.774163,0.739737,1.0,1.000001,0.97721,1.0,0.842439,0.93041,1.0,0.015708
4,E_0000c362229d93,Coffee Cat,7.082218,125.610244,F. Torres St.,Davao City,Davao Region,8000,PH,,...,0.781658,0.157384,1.0,1.0,1.0,1.0,0.842439,0.93041,0.925032,0.698792
5,E_0000c566a81ea1,つじ田 味噌の章,35.694796,139.767348,神田小川町1-1,千代田区,東京都,101-0052,JP,https://tsukemen-tsujita.com,...,0.879329,0.739737,0.896136,0.911244,0.97721,1.0,0.842439,0.93041,0.499441,0.061369
6,E_0000d9e584ed9f,Signature Properties Savannah,32.012582,-81.113156,100 Commercial Ct Ste C,Savannah,GA,31406,US,http://www.oursignatureproperties.com,...,1.0,0.422008,1.0,1.0,1.0,1.0,0.842439,0.93041,0.75828,0.525861
7,E_00011cca3f0bd6,Inner Workings,35.68693,-105.939167,Serving General Area,Santa Fe,NM,87501,US,http://innerworkingsmassage.com,...,1.0,1.0,1.0,1.0,1.0,1.0,0.842439,0.778478,1.0,0.119343
8,E_00015cd7e0227f,Lake Destiny,28.643613,-81.389132,,Maitland,FL,32751,US,,...,0.871865,0.739737,1.0,1.0,1.0,1.0,0.842439,0.93041,0.184734,1.609819
9,E_0001827d6b4ee2,Marshall Boya Bayi,38.534796,30.548005,İplik Mah. Zafer Sok. 4 Şuhut,Afyonkarahisar,,,TR,http://www.marshallboya.com,...,1.0,0.735139,1.0,0.911244,0.97721,1.0,1.0,0.981449,1.0,0.118222
