In [None]:
import numpy as np
import pandas as pd
import os
import sys
# Make the directory two levels above the current working directory importable,
# so that the project-level `references` package can be found.
# TODO: find way to put this into some global settings
notebook_parent_dir = os.path.dirname(os.getcwd())
root_dir = os.path.dirname(notebook_parent_dir)
if sys.path.count(root_dir) == 0:
    # only append once, so re-running the cell does not grow sys.path
    sys.path.append(root_dir)

from geopy import geocoders
from geopy.extra.rate_limiter import RateLimiter
from tqdm import tqdm  # enable progress bar display
tqdm.pandas()

from references import common_cfg, istat_kpi, add_geolocation

In [None]:
# Load the raw pharmacy table. The file is semicolon-separated and uses a
# comma as decimal mark; low_memory=False makes pandas read the file in a
# single pass so column dtypes are inferred from the whole column.
farmacie_raw = pd.read_csv(
    '../../data/raw/health/farmacie.csv',
    sep=';',
    decimal=',',
    low_memory=False,
)

In [None]:
# Keep only the rows whose end-of-validity field is the '-' placeholder
# (presumably pharmacies that are still active -- TODO confirm against the
# dataset documentation) and index them by pharmacy id.
# set_index is chained instead of called with inplace=True: mutating a
# boolean-filtered slice in place can raise SettingWithCopyWarning.
farmacie = (
    farmacie_raw[farmacie_raw.DATAFINEVALIDITA == '-']
    .set_index('CODICEIDENTIFICATIVOFARMACIA')
)
assert farmacie.index.is_unique, 'Duplicates in pharmacy ids'

# rename location columns (order is longitude, latitude)
minsalute_loc_names = ['orig_long', 'orig_lat']
farmacie = farmacie.rename(columns={
    'LONGITUDINE': minsalute_loc_names[0],
    'LATITUDINE': minsalute_loc_names[1],
})

In [None]:
# ArcGIS geocoding client, wrapped so that consecutive geocode calls are
# spaced at least 1.5 seconds apart.
coder = geocoders.ArcGIS()
limited_coder = RateLimiter(
    coder.geocode,
    min_delay_seconds=1.5,
)

In [None]:
arcgis_col_name = 'arcGIS_response'


def _has_location(response):
    """Return True when `response` is truthy and exposes longitude/latitude."""
    # hasattr is checked first so that None, NaN or pd.NA (none of which carry
    # coordinates) are rejected without ever being evaluated for truthiness.
    return (hasattr(response, 'longitude')
            and hasattr(response, 'latitude')
            and bool(response))


def patch_coordinates(df, coord_cols=None, fallback_cols=None, response_col=None):
    """Fill the coordinate columns of `df`, preferring the geocoder response.

    For every row the longitude/latitude are taken from the object stored in
    `response_col` when it carries a location; otherwise the values already
    present in `fallback_cols` are copied over unchanged (they may be
    placeholders such as '-').

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding `response_col` and both `fallback_cols`. It is modified
        in place.
    coord_cols : sequence of str, optional
        Output column names, ordered (longitude, latitude). Defaults to
        `common_cfg.coord_col_names`.
    fallback_cols : sequence of str, optional
        Source column names, ordered (longitude, latitude). Defaults to
        `minsalute_loc_names`.
    response_col : str, optional
        Column holding the geocoder responses. Defaults to `arcgis_col_name`.

    Returns
    -------
    pandas.DataFrame
        The same `df` object, with the two coordinate columns (re)written.
    """
    # Defaults are resolved at call time so the function can also be used
    # with explicit column names, independently of the notebook globals.
    if coord_cols is None:
        coord_cols = common_cfg.coord_col_names
    if fallback_cols is None:
        fallback_cols = minsalute_loc_names
    if response_col is None:
        response_col = arcgis_col_name

    long_col, lat_col = coord_cols[0], coord_cols[1]

    longitudes, latitudes = [], []
    for response, orig_long, orig_lat in zip(
            df[response_col], df[fallback_cols[0]], df[fallback_cols[1]]):
        if _has_location(response):
            # use arcGIS response
            longitudes.append(response.longitude)
            latitudes.append(response.latitude)
        else:
            # copy over the available location from the dataset
            longitudes.append(orig_long)
            latitudes.append(orig_lat)

    # Build each column in one go: writing strings cell-by-cell into a float
    # NaN column relies on silent dtype upcasting, which pandas has deprecated.
    # An empty frame still gets float columns.
    empty_dtype = float if len(df) == 0 else None
    df[long_col] = pd.Series(longitudes, index=df.index, dtype=empty_dtype)
    df[lat_col] = pd.Series(latitudes, index=df.index, dtype=empty_dtype)
    return df

# Geocode the pharmacies of the selected city (currently only the last entry
# of common_cfg.city_list) and keep the rows that end up with usable,
# strictly positive coordinates.
for citta_scelta in [common_cfg.city_list[-1]]:
    print('\n \n ---  %s ' % citta_scelta)
    # .str.lower() yields NaN for a missing municipality name (which then
    # simply fails the comparison) where .apply(str.lower) would raise.
    b_in_city = farmacie.DESCRIZIONECOMUNE.str.lower() == citta_scelta.lower()
    farmacie_citta = farmacie[b_in_city].copy()

    # update geolocation if arcGIS provides an answer
    query_col = 'indirizzo_query'
    farmacie_citta[query_col] = (
        farmacie_citta.INDIRIZZO + ', '
        + farmacie_citta.CAP + ', '
        + farmacie_citta.DESCRIZIONECOMUNE
    )

    farmacie_citta[arcgis_col_name] = farmacie_citta[query_col].progress_apply(limited_coder)
    # To skip the geocoding requests, use instead:
    #farmacie_citta[arcgis_col_name] = None

    farmacie_citta = patch_coordinates(farmacie_citta)

    # drop missing coordinates ('-' placeholder or NaN); mask computed once
    b_missing = farmacie_citta[common_cfg.coord_col_names[0]].isin(['-', np.nan])
    print('Pharmacies that still have no location:')
    print(farmacie_citta[b_missing].index.values)
    # copy so the column assignment below acts on an independent frame
    farmacie_citta = farmacie_citta[~b_missing].copy()  # do not reset index

    # convert the valid ones to floats
    farmacie_citta[common_cfg.coord_col_names] = farmacie_citta[common_cfg.coord_col_names].astype(float)
    b_positive = (
        (farmacie_citta[common_cfg.coord_col_names[0]] > 0)
        & (farmacie_citta[common_cfg.coord_col_names[1]] > 0)
    )
    print('Bad ones: %i' % (~b_positive).sum())
    farmacie_citta = farmacie_citta[b_positive]