## PropertyHub: 5.2 Geocoding

In [1]:
# Set up the environment so Selenium can drive Chrome inside a Kaggle kernel.
# The install commands below discard their output, so a failed step is silent;
# the two version checks at the end of the cell are the only confirmation.

# Install Google Chrome: fetch Google's signing key, register it with apt,
# add the Chrome apt source, refresh the package index and install.
!wget https://dl.google.com/linux/linux_signing_key.pub &>/dev/null 2>&1
!sudo apt-key add linux_signing_key.pub &>/dev/null 2>&1
# NOTE(review): `>>` appends, so re-running this cell adds the same source line again.
!echo 'deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main' >> /etc/apt/sources.list.d/google-chrome.list;
!sudo apt-get -y update &>/dev/null 2>&1
!sudo apt-get install -y google-chrome-stable &>/dev/null 2>&1

# Install chromedriver: download the build named by the LATEST_RELEASE endpoint
# and extract the `chromedriver` binary into /usr/local/bin.
# NOTE(review): the version is taken from LATEST_RELEASE, not from the Chrome
# build installed above — confirm the two versions printed below are compatible.
# (disabled) !apt-get install -y qq unzip
!wget -O /tmp/chromedriver.zip http://chromedriver.storage.googleapis.com/`curl -sS chromedriver.storage.googleapis.com/LATEST_RELEASE`/chromedriver_linux64.zip >/dev/null 2>&1
!unzip /tmp/chromedriver.zip chromedriver -d /usr/local/bin/ &>/dev/null 2>&1

# Install Selenium (both the apt package and the pip package are installed).
!sudo apt install -y python3-selenium &>/dev/null 2>&1
!pip install selenium &>/dev/null 2>&1

import os
# Remove the downloaded signing key from /kaggle/working if it is there.
# NOTE(review): assumes wget saved the key into /kaggle/working — confirm the
# kernel's working directory.
file = 'linux_signing_key.pub'
path = '/kaggle/working/'+file

if os.path.isfile(path):
    os.remove(path)
    
# Print the installed Google Chrome version
!google-chrome --version;

# Print the installed ChromeDriver version
!chromedriver -v;

curl: /opt/conda/lib/libcurl.so.4: no version information available (required by curl)
Google Chrome 114.0.5735.198 
ChromeDriver 114.0.5735.90 (386bc09e8f4f2e025eddae123f36f6263096ae49-refs/branch-heads/5735@{#1052})


In [2]:
# Import libraries
import pandas as pd
import numpy as np
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains

In [3]:
class Geocode():
    """Scrape latitude/longitude for property projects from Google Maps.

    The pipeline (see ``main``) loads a project CSV, starts a headless Chrome
    driver, calibrates two coordinate offsets against reference places whose
    coordinates are hard-coded, searches Google Maps for every project,
    retries the rows that failed, and writes the result to CSV.

    Coordinates are read from the last ``@lat,lon`` segment of the browser URL
    and shifted by the calibrated offsets.
    """

    # Pause between two reads of the browser URL while waiting for it to
    # change, so the wait loops do not spin at full speed.
    _POLL_INTERVAL = 0.1

    # XPath selectors for the Google Maps elements the scraper relies on.
    _SEARCH_BOX_XPATH = '//input[contains(@id,"searchboxinput")]'
    _SEARCH_BUTTON_XPATH = '//button[contains(@id,"searchbox-searchbutton")]'
    _RESULT_LINK_XPATH = '//a[contains(@class,"hfpxzc")]'
    _ADDRESS_BUTTON_XPATH = '//button[contains(@data-item-id,"address")]'

    def __init__(self):
        # Tag used in the input and output file names (presumably YYYYMM).
        self.date = '202306'

        self.gg_map_url = 'https://www.google.co.th/maps'
        # Reference places with hard-coded coordinates. The difference between
        # these and the coordinates read from the URL becomes the offset that
        # is added to every scraped coordinate.
        self.adj_place = 'Nye by Sansiri, Krung Thon Buri'
        self.lat_adj_place = 13.721834732432393
        self.lon_adj_place = 100.4979593259578
        self.adj_place_2 = '107 Baring Condo, Muang Samut Prakarn Samut Prakarn'
        self.lat_adj_place_2 = 13.65613877106645
        self.lon_adj_place_2 = 100.6063992928716

        # Offsets start at 0.0 ("no correction") so that find_lat_lon still
        # works when find_adj2 gives up and returns without calibrating.
        self.lat_adj = 0.0
        self.lon_adj = 0.0
        self.lat_adj_2 = 0.0
        self.lon_adj_2 = 0.0

        # A result outside this box is treated as a failed lookup
        # (presumably a rough bounding box for Thailand — confirm).
        self.lat_min = 5
        self.lon_min = 96
        self.lat_max = 22
        self.lon_max = 106
        # Per-row results, appended in row order by find_geocode.
        self.lat = []
        self.lon = []
        self.gg_address = []
        # Address text of the most recent successful lookup.
        self.gg_address_temp = ''
        # Row positions for which every query variant failed.
        self.error_index = []
        self.done_count = 0
        # Seconds to sleep at the end of every lookup.
        self.delay_time = 1
        # Maximum seconds to wait for the browser URL to change.
        self.timeout_while = 20

    def import_data(self):
        """Load the project CSV and keep only its second half.

        The rows from the midpoint onwards are kept and re-indexed from 0.
        """
        path = '/kaggle/input/ph-4-data-engineering'
        file_names = f"{self.date}_project.csv"
        project_all = pd.read_csv(f'{path}/{file_names}')
        half_project_len = round(len(project_all)/2)
        self.project = project_all.iloc[half_project_len:, :].reset_index(drop=True)
        print(f'Data imported Project: {len(self.project)} records')

    def driver_setup(self):
        """Start a headless Chrome driver and the ActionChains bound to it."""
        chrome_options = webdriver.ChromeOptions()
        chrome_options.add_argument('--no-sandbox')
        chrome_options.add_argument('--headless')
        chrome_options.add_argument('--disable-gpu')
        chrome_options.add_argument('--disable-dev-shm-usage')
        chrome_options.add_argument("--window-size=1920,1080")
        self.driver = webdriver.Chrome(options=chrome_options)
        self.action = ActionChains(self.driver)
        print(f'Web driver is set up')

    @staticmethod
    def _parse_lat_lon(url):
        """Return ``(lat, lon)`` from the last ``@lat,lon`` segment of ``url``.

        Raises:
            ValueError: if the URL has no ``@`` or the segment after the last
                ``@`` does not start with two comma-separated numbers.
        """
        _, marker, tail = url.rpartition('@')
        if not marker:
            raise ValueError(f'No "@lat,lon" segment in URL: {url}')
        fields = tail.split(',')
        if len(fields) < 2:
            raise ValueError(f'Incomplete "@lat,lon" segment in URL: {url}')
        return float(fields[0]), float(fields[1])

    def _wait_for_url(self, is_ready):
        """Poll the browser URL until ``is_ready(url)`` holds or the timeout expires.

        Returns the last URL read, whether or not the condition was met.
        """
        deadline = time.time() + self.timeout_while
        url = self.driver.current_url
        while not is_ready(url) and time.time() <= deadline:
            time.sleep(self._POLL_INTERVAL)
            url = self.driver.current_url
        return url

    def _search(self, query, settle_seconds):
        """Open Google Maps, submit ``query`` and wait for the URL to change.

        Raises IndexError if the search box or search button is not on the page.
        """
        self.driver.get(self.gg_map_url)
        self.driver.implicitly_wait(5)
        time.sleep(settle_seconds)
        text_box = self.driver.find_elements(by=By.XPATH, value=self._SEARCH_BOX_XPATH)
        search_button = self.driver.find_elements(by=By.XPATH, value=self._SEARCH_BUTTON_XPATH)
        text_box[0].send_keys(query)
        self.action.click(on_element = search_button[0])
        self.action.perform()
        return self._wait_for_url(lambda url: url != self.gg_map_url)

    def _open_first_result(self):
        """Follow the first link of a result list and wait for one more ``@`` in the URL.

        Raises IndexError when the page has no result link.
        """
        link = self.driver.find_elements(by=By.XPATH, value=self._RESULT_LINK_XPATH)[0].get_attribute('href')
        self.driver.get(link)
        self.driver.implicitly_wait(5)
        at_count = self.driver.current_url.count('@')
        return self._wait_for_url(lambda url: url.count('@') >= at_count + 1)

    def _read_address(self):
        """Return the text of the address button; raises IndexError if it is absent."""
        return self.driver.find_elements(by=By.XPATH, value=self._ADDRESS_BUTTON_XPATH)[0].text

    def find_adj(self):
        """Calibrate the first offset from the first reference place.

        Sets ``lat_adj``/``lon_adj`` to the hard-coded coordinate minus the
        coordinate read from the URL after searching for ``adj_place``.
        Raises ValueError if the URL carries no coordinates.
        """
        url = self._search(self.adj_place, settle_seconds=2)
        lat, lon = self._parse_lat_lon(url)

        self.lat_adj = self.lat_adj_place-lat
        self.lon_adj = self.lon_adj_place-lon
        time.sleep(3)
        print(f'1st lat and lon adjustments are calculated')

    def find_adj2(self):
        """Calibrate the second offset, used when a search lands on a result list.

        Searches for ``adj_place_2``, opens the first result and sets
        ``lat_adj_2``/``lon_adj_2`` the same way ``find_adj`` does. Returns
        ``(0, 0)`` without calibrating if the address button cannot be found;
        otherwise returns None.
        """
        # The search wait is bounded by timeout_while, like every other wait.
        self._search(self.adj_place_2, settle_seconds=2)

        time.sleep(3)
        url = self._open_first_result()

        try:
            self._read_address()
        except Exception:
            print(f'Cannot find address {self.adj_place_2}')
            return 0,0

        lat, lon = self._parse_lat_lon(url)

        self.lat_adj_2 = self.lat_adj_place_2-lat
        self.lon_adj_2 = self.lon_adj_place_2-lon
        time.sleep(3)
        print(f'2nd lat and lon adjustments are calculated')

    def find_lat_lon(self, place):
        """Search Google Maps for ``place`` and return its adjusted ``(lat, lon)``.

        Dots are stripped from ``place`` before it is typed. If the search
        opens a place page directly, the first offset is applied; if it opens
        a result list, the first result is followed and the second offset is
        applied. On success ``gg_address_temp`` holds the address text.

        Returns ``(0, 0)`` when no address can be found or the URL carries no
        coordinates; callers reject that value through the bounding box.
        """
        self._search(place.replace('.',''), settle_seconds=1)

        adj_2_flag = 0
        try:
            self.gg_address_temp = self._read_address()
        except Exception:
            # No address on the page: assume a result list and open its first entry.
            try:
                time.sleep(0.5)
                self._open_first_result()
                self.gg_address_temp = self._read_address()
                adj_2_flag = 1
            except Exception:
                time.sleep(self.delay_time)
                return 0,0

        try:
            lat_temp, lon_temp = self._parse_lat_lon(self.driver.current_url)
        except ValueError:
            # A URL without coordinates is a failed lookup, not a fatal error.
            time.sleep(self.delay_time)
            return 0,0

        if adj_2_flag == 0:
            lat_temp = lat_temp + self.lat_adj
            lon_temp = lon_temp + self.lon_adj
        else:
            lat_temp = lat_temp + self.lat_adj_2
            lon_temp = lon_temp + self.lon_adj_2
        time.sleep(self.delay_time)
        return lat_temp, lon_temp

    @staticmethod
    def _candidate_places(name, address):
        """Yield the search queries to try for one project, in order.

        Implemented as a generator so each variant is only built when the
        previous ones have failed.
        """
        yield name + ', ' + address
        yield name + ', ' + address.split()[-1]
        yield name
        yield name + ' Condo, '
        yield name + ' Condo, ' + address
        yield name + ' Condo, ' + address.split()[-1]
        # Last two variants drop the final word of the project name.
        yield ' '.join(name.split()[:-1]) + ' Condo, ' + address
        yield ' '.join(name.split()[:-1]) + ' Condo, ' + address.split()[-1]

    def _out_of_bounds(self, lat, lon):
        """Return True when ``(lat, lon)`` lies outside the accepted bounding box."""
        return (lat < self.lat_min) or (lat > self.lat_max) or (lon < self.lon_min) or (lon > self.lon_max)

    def _geocode_row(self, i):
        """Try each query variant for row ``i`` until one falls inside the box.

        Returns ``(lat, lon, place, found)`` for the last variant tried.
        """
        name = self.project.loc[i,'project_name']
        address = self.project.loc[i,'address']
        lat_temp = lon_temp = place = None
        for place in self._candidate_places(name, address):
            lat_temp, lon_temp = self.find_lat_lon(place)
            if not self._out_of_bounds(lat_temp, lon_temp):
                return lat_temp, lon_temp, place, True
        return lat_temp, lon_temp, place, False

    def find_geocode(self):
        """Geocode every row of ``self.project``.

        Appends one entry per row to ``lat``, ``lon`` and ``gg_address``. Rows
        for which every query variant fails get NaN coordinates, an empty
        address, and their position recorded in ``error_index``.
        """
        print(f'Start scraping geocodes ...')
        print('---'*10)
        for i in range(len(self.project)):
            lat_temp, lon_temp, place, found = self._geocode_row(i)
            if not found:
                place = self.project.loc[i,'project_name'] + ' Condo, ' + self.project.loc[i,'address']
                print(f'Error getting lat,lon at {i}: ({lat_temp},{lon_temp}) {place}')
                self.error_index.append(i)
                lat_temp = np.nan
                lon_temp = np.nan
                self.gg_address_temp = ''

            self.lat.append(lat_temp)
            self.lon.append(lon_temp)
            self.gg_address.append(self.gg_address_temp)

            print(f'Done {self.done_count}: ({lat_temp},{lon_temp}) {place}')
            self.done_count += 1

    def concat_df(self):
        """Attach the collected results to ``self.project`` as new columns."""
        self.project['lat'] = self.lat
        self.project['lon'] = self.lon
        self.project['gg_address'] = self.gg_address

    def find_geocode_again_for_na(self):
        """Retry every row whose ``lat`` is NaN and write the outcome back in place."""
        self.project_na = self.project[self.project['lat'].isna()]
        for i in self.project_na.index:
            lat_temp, lon_temp, place, found = self._geocode_row(i)
            if not found:
                place = self.project.loc[i,'project_name'] + ', ' + self.project.loc[i,'address']
                print(f'Still error getting lat,lon at {i}: ({lat_temp},{lon_temp}) {place}')
                lat_temp = np.nan
                lon_temp = np.nan
                self.gg_address_temp = ''

            self.project.loc[i,'lat'] = lat_temp
            self.project.loc[i,'lon'] = lon_temp
            self.project.loc[i,'gg_address'] = self.gg_address_temp

    def export_results(self):
        """Write ``self.project`` to ``<date>_project_geo_2.csv`` in the working directory."""
        self.project.to_csv(f"{self.date}_project_geo_2.csv",index=False)
        print('---'*10)
        print('Results exported')

    def main(self):
        """Run the whole pipeline; the browser is closed even if a step fails."""
        self.import_data()
        self.driver_setup()
        try:
            self.find_adj()
            self.find_adj2()
            self.find_geocode()
            self.concat_df()
            self.find_geocode_again_for_na()
            self.export_results()
        finally:
            self.driver.quit()

In [4]:
# Build the scraper and run the full pipeline: import the data, start the
# driver, calibrate both offsets, geocode every row, retry the failures and
# export the CSV. Progress is printed one line per row.
GC = Geocode()
GC.main()

Data imported Project: 1664 records
Web driver is set up
1st lat and lon adjustments are calculated
2nd lat and lon adjustments are calculated
Start scraping geocodes ...
------------------------------
Done 0: (13.799890732432392,100.5735267259578) Pano Ville, Din Daeng Bangkok
Done 1: (18.799913632432393,98.9732634259578) Pansook The Urban Condo, Muang Chiang Mai Chiang Mai
Done 2: (13.240654371066451,100.9490403928716) Panya Resort Condominium, Muang Chon Buri Chonburi
Done 3: (7.882723671066451,98.2758561928716) Paradise Beach Residence, Kathu Phuket
Done 4: (13.010512732432392,100.9281552259578) Paradise Ocean View, Bang Lamung Chonburi
Done 5: (12.887696832432393,100.8862585259578) Paradise Park Jomtien Resort, Bang Lamung Chonburi
Done 6: (18.764841732432394,99.0351385259578) Parano Condo @ Chiangmai, Muang Chiang Mai Chiang Mai
Done 7: (13.81173667106645,100.6671855928716) Parc Exo Kaset - Navamintra, Bueng Kum Bangkok
Done 8: (13.889092032432393,100.5987044259578) Parc Residenc