In [15]:
import findspark 
findspark.init()
findspark.find()
import requests
import json
import csv
import os
import pandas as pd 
import numpy as np 

# # accessing env file 
# import os 
# from dotenv import load_dotenv, find_dotenv 
# load_dotenv(find_dotenv()) 

In [16]:
from geopy.geocoders import Nominatim
from pyspark.sql import SparkSession
from pyspark.sql.functions import isnan, when, count, col, lit, trim, ltrim, rtrim, split, regexp_extract, expr, coalesce
import time

# Build (or reuse) the local SparkSession that backs every Spark DataFrame in
# this notebook; the Iceberg SQL extensions are registered on the session.
spark = (
    SparkSession.builder
    .master("local")
    .appName("coordinates extractor")
    .config(
        "spark.sql.extensions",
        "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions",
    )
    .getOrCreate()
)

In [17]:
# Geocode every cleaned drugstore address with the MapQuest Geocoding API and
# append one CSV line per drugstore (Row, Name, Address, Contact, lat, lng).
# Writing line-by-line lets an interrupted run resume where it stopped.

# Single location for the progress/output file. The resume check, the writer
# below and the later cells must all point at the same file.
COORDINATES_CSV = '../data/transformed-data/cleaned_drugstores_with_coordinates.csv'
GEOCODING_URL = "https://www.mapquestapi.com/geocoding/v1/address"

# Credentials must not live in the notebook: read the key from the environment.
MAPQUEST_API_KEY = os.environ.get("MAPQUEST_API_KEY")
if not MAPQUEST_API_KEY:
    raise RuntimeError("Set the MAPQUEST_API_KEY environment variable before running this cell.")

df_drugstores = spark.read.parquet("../data/transformed-data/cleaned_drugstores.parquet")

# Collect each record once so the four fields of a drugstore are guaranteed to
# stay together (and Spark runs one job instead of four).
drugstores_list = [
    list(record)
    for record in df_drugstores.select("Row", "Name", "Address", "Contact").collect()
]

# Per-column lists kept for any later cell that still refers to them.
drugstores_row = [record[0] for record in drugstores_list]
drugstores_name = [record[1] for record in drugstores_list]
drugstores_address = [record[2] for record in drugstores_list]
drugstores_contact = [record[3] for record in drugstores_list]

# Find out where a previous run left off. `last_row` is the highest Row number
# already present in the CSV (0 when nothing has been written yet). Non-numeric
# first-column values (e.g. a header line) are ignored.
last_row = 0
if os.path.isfile(COORDINATES_CSV) and os.path.getsize(COORDINATES_CSV) > 0:
    df = pd.read_csv(COORDINATES_CSV, encoding='latin-1', header=None)
    written_rows = pd.to_numeric(df[0], errors='coerce').dropna()
    if not written_rows.empty:
        last_row = int(written_rows.max())

# NOTE(review): the CSV is written with the platform default encoding but read
# back as latin-1 elsewhere in this notebook — confirm the two agree.
# NOTE(review): returned coordinates are not range-checked; verify that the
# geocoder did not fall back to a location outside the expected country.
for row in drugstores_list:
    # Skip rows that a previous (interrupted) run already wrote.
    if row[0] <= last_row:
        continue

    try:
        parameters = {
            "key": MAPQUEST_API_KEY,
            "location": row[2],
        }

        # A timeout keeps one stalled request from hanging the whole run, and
        # raise_for_status() turns HTTP errors into exceptions handled below.
        response = requests.get(GEOCODING_URL, params=parameters, timeout=30)
        response.raise_for_status()
        lat_lng = response.json()['results'][0]['locations'][0]['latLng']
        lat = lat_lng['lat']
        lng = lat_lng['lng']

        # Mode 'a' creates the file when it does not exist yet, and the
        # `with` block closes it — no explicit close() needed.
        updated_drugstores_list = [row[0], row[1], row[2], row[3], lat, lng]
        with open(COORDINATES_CSV, 'a', newline='') as file:
            csv.writer(file).writerow(updated_drugstores_list)

    except Exception as e:
        # Best-effort: report the failing Row and carry on. Failed rows are
        # picked up by the missing-rows check in the next cell.
        print("error caused by row {}".format(row[0]))
        # requests error messages embed the full URL; keep the key out of the output.
        print(str(e).replace(MAPQUEST_API_KEY, "***"))

In [30]:
# Load everything geocoded so far; the file is read without a header row, so
# columns are positional (column 0 holds the Row number).
coordinates_csv_path = '../data/transformed-data/cleaned_drugstores_with_coordinates.csv'
read_csv = pd.read_csv(coordinates_csv_path, header=None, encoding='latin-1')
updated_drugstores_list = read_csv.to_numpy().tolist()

def missing_rows_checker(x, expected_rows=None):
    """Return the Row numbers that have no geocoded record.

    Parameters
    ----------
    x : list of sequences
        Geocoded records; the first element of each record is its Row number.
    expected_rows : iterable, optional
        Row numbers that should be present. When omitted, falls back to
        ``1..last_row`` using the notebook-level ``last_row`` variable.

    Returns
    -------
    list
        Expected Row numbers absent from ``x``, in ascending order.
    """
    if expected_rows is None:
        expected_rows = range(1, last_row + 1)
    geocoded_rows = {record[0] for record in x}
    # Sorted so the result is deterministic instead of following set order.
    return sorted(set(expected_rows) - geocoded_rows)

# Row numbers that are expected but absent from the geocoded CSV.
# NOTE: missing_rows_checker relies on `last_row` from the geocoding cell, so
# that cell must have been run in this kernel first.
missing_rows = tuple(missing_rows_checker(updated_drugstores_list))

# Inspect the source records of the failed rows to see what is wrong with them.
df_drugstores.createOrReplaceTempView("df_drugstores")
if missing_rows:
    # Build the IN list explicitly: formatting a Python tuple directly yields
    # "(5,)" for a single element and "()" for none, neither of which is valid SQL.
    missing_rows_sql = ", ".join(str(int(row_number)) for row_number in missing_rows)
    spark.sql("""
        SELECT * 
        FROM df_drugstores
        WHERE Row IN ({})
    """.format(missing_rows_sql)).show()
else:
    print("no missing rows")

+----+--------------------+--------------------+--------------------+
| Row|                Name|             Address|             Contact|
+----+--------------------+--------------------+--------------------+
|1839|The Generics Phar...|Poblacion Macabeb...|           454350896|
|2798|The Generics Phar...|Stall 2 Blk 95 Lo...|                   0|
|2804|The Generics Phar...|20 Ph1 Pck1 L36 B...|             5603547|
|3600|            Generika|No. 326, Dr. Uygu...|0932-2359883 / 26...|
|3842|            Generika|Purok 6 Poblacion...|       ‎0917-1053604|
|3843|            Generika|Zone 1, Taboc, Op...|       ‎0955-5660033|
|4939|    Watsons Pharmacy|Bulacan, Walterma...|(044) 237-4141�/ ...|
|4943|    Watsons Pharmacy|Ground Floor Km 1...|     (02) 8-653-0026|
|5466|    Watsons Pharmacy|Ground Floor, Tan...|      (046) 430-3407|
|5522|    Watsons Pharmacy|Ground Floor, Osm...|        0917-8161504|
|5538|    Watsons Pharmacy|Miranda Building ...|        0917-8189176|
|5541|    Watsons Ph

In [45]:
# Patch the rows the geocoding loop could not write by hard-coding their
# coordinates, then rewrite the CSV sorted by Row with a header line.
# The cell is safe to re-run: duplicated Row numbers are collapsed (the last
# occurrence wins) and a header line left by a previous run is ignored.
# NOTE(review): these coordinates were entered by hand and their provenance is
# not recorded here — spot-check them against the addresses (e.g. row 5522).
COORDINATES_CSV = '../data/transformed-data/cleaned_drugstores_with_coordinates.csv'
COORDINATE_COLUMNS = ["Row","Name","Address","Contact","Latitude","Longitude"]

row_1839 = [1839, "The Generics Pharmacy", "Poblacion Macabebe Pampanga 2018", "454350896", 14.922570, 120.723510]
row_2798 = [2798, "The Generics Pharmacy", "Stall 2 Blk 95 Lot 21 Main Road Brgy 176 Bagong Silang Caloocan City", "0", 14.771744, 121.054321]
row_2804 = [2804, "The Generics Pharmacy","20 Ph1 Pck1 L36 Bagong Silang, Caloocan City", "5603547", 14.781080, 121.037600]
row_3600 = [3600, "Generika", "No. 326, Dr. Uyguangco Ave., Bo. Sto. Nino, Tala, Caloocan City", "0932-2359883 / 264-1275", 14.750244, 121.053688]
row_3842 = [3842, "Generika","Purok 6 Poblacion, Naawan, Misamis Oriental", "0917-1053604", 8.433621, 124.290973]
row_3843 = [3843, "Generika", "Zone 1, Taboc, Opol, Misamis Oriental", "0955-5660033", 8.50612, 124.6028]
row_4939 = [4939, "Watsons Pharmacy", "Bulacan, Waltermart Plaridel, Cagayan Valley Road Cor Plaridel Diversion Road Barangay Banga 1,  3004  , Philippines", "(044) 237-4141 / (044) 764-0026", 14.878310, 120.864150]
row_4943 = [4943, "Watsons Pharmacy", "Ground Floor Km 16 Manalac Ave. Cor. East Service Road, Brgy. San Martin  De Porres,  1713  ,Philippines", "(02) 8-653-0026", 14.496310, 121.041160]
row_5466 = [5466, "Watsons Pharmacy", "Ground Floor, Tan Bldg. Lot. A1, 1B, 2B, Brgy. San Gabriel, Governor's Drive, Corner Congressional Rd, 4117, Philippines", "(046) 430-3407", 14.599635, 120.975959]
row_5522 = [5522, "Watsons Pharmacy", "Ground Floor, Osmena Ave Highway, Poblacion, Dalaguete, 6022, Philippines", "0917-8161504", 7.30023, 125.68052]
row_5538 = [5538, "Watsons Pharmacy", "Miranda Building National Highway, 4209, Philippines", "0917-8189176", 13.87384, 120.97333]
row_5541 = [5541, "Watsons Pharmacy", "Alvez Building  National Highway, 4231, Philippines", "0917-8134564", 9.16842, 122.90454]
row_5552 = [5552, "Watsons Pharmacy", "TMHC Bldg along National Highway cor. Cortez St., 3305, Philippines", "(078) 447-0101", 16.93975, 121.76853]

# Append the hard-coded fixes to the CSV; the `with` block closes the file.
cleaned_coordinates = [row_1839, row_2798, row_2804, row_3600, row_3842, row_3843, row_4939, row_4943, row_5466, row_5522, row_5538, row_5541, row_5552]
with open(COORDINATES_CSV, 'a', newline='') as file:
    csv.writer(file).writerows(cleaned_coordinates)

# Read everything back as text so a header line from an earlier run cannot
# change the column dtypes, then keep only lines whose Row is numeric.
raw_coordinates_df = pd.read_csv(
    COORDINATES_CSV,
    encoding='latin-1',
    header=None,
    names=COORDINATE_COLUMNS,
    dtype=str,
)
row_numbers = pd.to_numeric(raw_coordinates_df["Row"], errors="coerce")
is_record = row_numbers.notna()

# Restore numeric types, drop repeated Row numbers (the most recently appended
# line wins, so the hard-coded fixes take precedence) and order by Row.
geocoded_row_reorder_df = (
    raw_coordinates_df.loc[is_record]
    .assign(
        Row=row_numbers[is_record].astype(int),
        Latitude=lambda frame: pd.to_numeric(frame["Latitude"]),
        Longitude=lambda frame: pd.to_numeric(frame["Longitude"]),
    )
    .drop_duplicates(subset="Row", keep="last")
    .sort_values("Row")
)

# Overwrite the unordered CSV with the ordered, de-duplicated one.
geocoded_row_reorder_df.to_csv(COORDINATES_CSV, index=False, header=True)

# Last expression of the cell so the preview is actually displayed.
geocoded_row_reorder_df.tail()

Unnamed: 0,Row,Name,Address,Contact,Latitude,Longitude
5542,5556,Watsons Pharmacy,GF Carmart Building (Puregold Lanang) along JP...,0917-8742350,39.390897,-99.066067
5543,5557,Watsons Pharmacy,"Robinsons Place La Union, McArthur Highway, Br...",(02)7-791-5000,16.59072,120.31951
5544,5558,Watsons Pharmacy,3R Sergie Bldg Brgy Poblacion Sta Rosa Tarlac ...,0917-8177065,15.44854,120.79508
5545,5559,Watsons Pharmacy,University of Perpetual Help System Laguna - B...,0917-8445082,14.32985,121.08789
5546,5560,Watsons Pharmacy,"Florentina Arcade, National Highway, Barangay ...",(02)7-791-5000,10.60812,122.92988
