# Geolocation Matching

In [None]:
import numpy as np
import pandas as pd

# Never truncate long cell contents when displaying frames.
# None is the supported "no limit" value; the old -1 sentinel was deprecated
# in pandas 1.0 and is rejected by later versions.
pd.set_option('display.max_colwidth', None)

In [None]:
# Country code used to filter the Google places data: only rows whose
# `inputCountry` equals this value are kept.
country_code = 'AR'

## Data ingestion

**Raw operator data**

In [None]:
# Load the raw operator geolocation export (semicolon-separated, Latin-1
# encoded), then show its dimensions and a transposed preview so that each
# column reads as a row.
opr_geo_raw = pd.read_csv(
    '../../data/Phase_I/Phase_I_Input/AR_geolocation.csv',
    sep=';',
    encoding='latin1',
)
print(opr_geo_raw.shape)
opr_geo_raw.head().transpose()

**Cleaned operator data**

In [None]:
# Columns retained from the cleaned operator file (after `Latitud`/`Longitud`
# have been renamed to `lat`/`lng`).
keep_columns = ['Distribuidor', 'Cliente', 'lat', 'lng']

In [None]:
# Load the cleaned operator sample, rename its coordinate columns to the
# `lat`/`lng` names used by the Google data, and keep only `keep_columns`.
opr_geo = (
    pd.read_csv('../../data/Phase_I/Phase_I_Input/AR_sample.csv', sep=';')
    .rename(columns={'Latitud': 'lat', 'Longitud': 'lng'})
    [keep_columns]
)
print(opr_geo.shape)
opr_geo.head()

**Google places data**

In [None]:
# Columns retained from the Google places details file.
# NOTE: this rebinds `keep_columns`, replacing the operator column list above.
keep_columns = ['inputCountry', 'formattedAddress', 'formattedPhoneNumber', 'inputCity', 'inputKeyword', 'lat', 'lng', 'name', 'outputCountry', 'outputPostalCode', 'placeId']

In [None]:
# Load the Google places details, keep only the rows for `country_code` and
# the columns in `keep_columns`, and drop repeated places (same `placeId`).
opr_google = pd.read_csv(
    '../../data/Phase_II/Phase_II_Output/OPR_ids_details.csv',
    sep=',',
)
opr_google = (
    opr_google
    .loc[opr_google['inputCountry'] == country_code, keep_columns]
    .drop_duplicates(subset=['placeId'])
)
print(opr_google.shape)
opr_google.sample(10)

## Match using latitude and longitude

In [None]:
# Inspect the column dtypes of the operator data; `lat`/`lng` are used in
# arithmetic by the matching step below.
opr_geo.dtypes

In [None]:
# Inspect the column dtypes of the Google places data; `lat`/`lng` are used
# in arithmetic by the matching step below.
opr_google.dtypes

**Retrieve closest observation based on geolocation**

In [None]:
def match_geolocations(row, keep_columns=None, places=None):
    """Attach the closest Google place to a single operator record.

    Closeness is the sum of the absolute latitude and longitude differences
    (an L1 distance in the coordinates' own units, not a geodesic distance).

    Args:
        row: Series with at least `lat` and `lng` entries.
        keep_columns: Columns of the candidate table to copy onto the result.
            Defaults to `['lat', 'lng', 'formattedAddress', 'outputCountry',
            'name', 'placeId']`. `lat` and `lng` are renamed to `lat_google`
            and `lng_google` so they do not clash with the row's own values.
        places: Candidate DataFrame with `lat` and `lng` columns. Defaults to
            the module-level `opr_google` frame.

    Returns:
        A new Series holding the entries of `row`, followed by the matched
        candidate columns and a `geo_diff` entry with the distance to the
        match. If no distance can be computed (a coordinate of `row` is
        missing, or there are no usable candidates), the matched entries and
        `geo_diff` are NaN instead of raising.
    """
    keep_columns = keep_columns or ['lat', 'lng', 'formattedAddress', 'outputCountry', 'name', 'placeId']
    if places is None:
        places = opr_google
    renames = {'lat': 'lat_google', 'lng': 'lng_google'}

    diff = np.abs(row.lat - places.lat) + np.abs(row.lng - places.lng)

    if diff.notna().any():
        index = diff.idxmin()
        matched = places.loc[index, keep_columns].rename(renames)
        geo_diff = diff.loc[index]
    else:
        # Nothing comparable: keep the output shape stable so that a single
        # bad record does not abort a whole DataFrame.apply run.
        matched = pd.Series(
            np.nan,
            index=[renames.get(column, column) for column in keep_columns],
            dtype=object,
        )
        geo_diff = np.nan

    # Series.append was removed in pandas 2.0; pd.concat builds the same
    # combined Series and likewise leaves the input row untouched.
    result = pd.concat([row, matched])
    result['geo_diff'] = geo_diff
    return result

In [None]:
# Run the matcher once per operator record (row-wise apply) and preview the
# combined operator + Google columns.
opr_matches = opr_geo.apply(match_geolocations, axis='columns')
opr_matches.head()

**Summary of differences**


In [None]:
# Summary statistics of the distance between each record and its match.
opr_matches['geo_diff'].describe()

### Example of name matches

In [None]:
# Operator client name next to the matched Google place, closest matches first.
(
    opr_matches[
        [
            'Cliente',
            'name',
            'formattedAddress',
            'outputCountry',
            'lat',
            'lat_google',
            'lng',
            'lng_google',
            'geo_diff',
        ]
    ]
    .sort_values(by='geo_diff')
)