## PropertyHub: 1 Getting Locations

In [1]:
# First let's set up the environment for Selenium to work in Kaggle
# install google chrome
!wget https://dl.google.com/linux/linux_signing_key.pub &>/dev/null 2>&1
!sudo apt-key add linux_signing_key.pub &>/dev/null 2>&1
!echo 'deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main' >> /etc/apt/sources.list.d/google-chrome.list;
!sudo apt-get -y update &>/dev/null 2>&1
!sudo apt-get install -y google-chrome-stable &>/dev/null 2>&1

# install chromedriver
# !apt-get install -y qq unzip
!wget -O /tmp/chromedriver.zip http://chromedriver.storage.googleapis.com/`curl -sS chromedriver.storage.googleapis.com/LATEST_RELEASE`/chromedriver_linux64.zip >/dev/null 2>&1
!unzip /tmp/chromedriver.zip chromedriver -d /usr/local/bin/ &>/dev/null 2>&1

# install selenium
!sudo apt install -y python3-selenium &>/dev/null 2>&1
!pip install selenium &>/dev/null 2>&1

import os
# Delete unused files
file = 'linux_signing_key.pub'
path = '/kaggle/working/'+file

if os.path.isfile(path):
    os.remove(path)
    
# To check Google Chrome's version
!google-chrome --version;

# To check Chrome Driver's version
!chromedriver -v;

curl: /opt/conda/lib/libcurl.so.4: no version information available (required by curl)
Google Chrome 114.0.5735.198 
ChromeDriver 114.0.5735.90 (386bc09e8f4f2e025eddae123f36f6263096ae49-refs/branch-heads/5735@{#1052})


In [2]:
# Import libraries
import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains

In [3]:
class GetLinks():
    """Scrape location names and listing counts from PropertyHub and group them.

    ``main`` drives the whole pipeline: it opens the start page in headless
    Chrome, walks four location menus (train stations, schools, areas and
    districts), collects every location's name, link slug and for-rent /
    for-sale counts, merges them into ``self.place_link_names``, labels each
    row with a ``flag`` group of roughly equal listing volume, and writes the
    result to ``locations.csv``.

    The accumulator lists are created once in ``__init__`` and only ever
    appended to, so use a fresh instance for every run.
    """

    # Prefix stripped from each location href so that only the slug is stored.
    CONDO_URL_PREFIX = 'https://propertyhub.in.th/en/condo/'

    # XPath selectors. The class names come from the site's generated CSS and
    # are matched exactly as the scraper was written.
    TAB_BUTTON_XPATH = '//button[contains(@class,"sc-1piz3ft-1 daQjbi")]'
    TRAIN_LINE_XPATH = '//li[contains(@class,"sc-kna4ye-14 dWMNA-D")]'
    SCHOOL_TYPE_XPATH = '//li[contains(@class,"sc-kna4ye-13 jExIsm")]'
    SCHOOL_PROVINCE_XPATH = '//div[contains(@class,"sc-kna4ye-8 iWXqEw")]/ul/li'
    PROVINCE_XPATH = '//div[contains(@class,"sc-kna4ye-7 eLDMsX")]/ul/li'
    PLACE_LINK_XPATH = '//a[contains(@class,"zoneTypeStyle")]'
    NUM_RENT_XPATH = '//div[contains(@class,"sc-1j4jwdg-6 ehYqed")]'
    NUM_SALE_XPATH = '//div[contains(@class,"sc-1j4jwdg-7 jMCJvE")]'

    # Column names shared by the four per-category frames.
    PLACE_COLUMNS = ['place_type','place_name','link_name','num_rent','num_sale']

    def __init__(self):
        """Create the empty accumulators that the ``get_*`` methods append to."""
        # Train stations, grouped by train line
        self.train_line = []
        self.station_name = []
        self.station_link_name = []
        self.station_num_rent = []
        self.station_num_sale = []

        # Schools, grouped by school type
        # (school_province is initialised here but never filled by this class)
        self.school_type = []
        self.school_province = []
        self.school_name = []
        self.school_link_name = []
        self.school_num_rent = []
        self.school_num_sale = []

        # Areas, grouped by province
        self.area_province = []
        self.area_name = []
        self.area_link_name = []
        self.area_num_rent = []
        self.area_num_sale = []

        # Districts, grouped by province
        self.district_province = []
        self.district_name = []
        self.district_link_name = []
        self.district_num_rent = []
        self.district_num_sale = []

        # place_link_names is replaced by a DataFrame in concat_df
        self.place_link_names = []
        self.place_link_list = []

    def driver_setup(self):
        """Start a headless Chrome session and an ActionChains bound to it."""
        chrome_options = webdriver.ChromeOptions()
        for argument in (
            '--no-sandbox',
            '--headless',
            '--disable-gpu',
            '--disable-dev-shm-usage',
            "--window-size=1920,1080",
        ):
            chrome_options.add_argument(argument)
        self.driver = webdriver.Chrome(options=chrome_options)
        self.action = ActionChains(self.driver)

    def _find(self, xpath):
        """Return every element currently matching ``xpath``."""
        return self.driver.find_elements(by=By.XPATH, value=xpath)

    def _click(self, element):
        """Click ``element`` through the shared ActionChains instance."""
        self.action.click(on_element = element)
        self.action.perform()

    def _open_tab(self, tab_index):
        """Load the start page and click the menu button at position ``tab_index``."""
        self.driver.get(self.web_link)
        self.driver.implicitly_wait(2)
        self._click(self._find(self.TAB_BUTTON_XPATH)[tab_index])

    @staticmethod
    def _parse_count(text):
        """Return the leading integer of ``text``, ignoring thousands separators."""
        return int(text.replace(',','').split()[0])

    def _collect_places(self, group_element, group_names, place_names, link_names, num_rents, num_sales):
        """Append every location currently listed on the page to the given lists.

        ``group_element`` is the menu entry that was just clicked; its text is
        stored once per location as the group label. Nothing is appended when
        the page lists no locations.
        """
        places = self._find(self.PLACE_LINK_XPATH)
        if not places:
            return

        group_label = group_element.text.strip()
        for place in places:
            group_names.append(group_label)
            place_names.append(place.text.replace('Condo ','').strip())
            link_names.append(place.get_attribute('href').replace(self.CONDO_URL_PREFIX,''))

        num_rents.extend(self._parse_count(cell.text) for cell in self._find(self.NUM_RENT_XPATH))
        num_sales.extend(self._parse_count(cell.text) for cell in self._find(self.NUM_SALE_XPATH))

    def _build_frame(self, group_names, place_names, link_names, num_rents, num_sales):
        """Combine five parallel lists into a frame with ``PLACE_COLUMNS`` columns."""
        frame = pd.DataFrame([group_names, place_names, link_names, num_rents, num_sales]).T
        frame.columns = self.PLACE_COLUMNS
        return frame

    def get_stations(self):
        """Collect stations for every train line into ``self.df_station``."""
        self._open_tab(0)
        find_train_lines = self._find(self.TRAIN_LINE_XPATH)

        for find_train_line in find_train_lines:
            self._click(find_train_line)
            self._collect_places(find_train_line, self.train_line, self.station_name,
                                 self.station_link_name, self.station_num_rent, self.station_num_sale)

        self.df_station = self._build_frame(self.train_line, self.station_name, self.station_link_name,
                                            self.station_num_rent, self.station_num_sale)
        print(f"Done getting names for {len(find_train_lines)} train lines ({len(self.station_name)} stations with {self.df_station['num_rent'].sum()} for-rents and {self.df_station['num_sale'].sum()} for-sales)")

    def get_schools(self):
        """Collect schools for every school type and province into ``self.df_school``."""
        self._open_tab(1)
        find_school_types = self._find(self.SCHOOL_TYPE_XPATH)

        for find_school_type in find_school_types:
            self._click(find_school_type)
            find_provinces_len = len(self._find(self.SCHOOL_PROVINCE_XPATH))
            for i in range(find_provinces_len):
                # Click the school type again and look the province list up
                # afresh before selecting the i-th province.
                self._click(find_school_type)
                self._click(self._find(self.SCHOOL_PROVINCE_XPATH)[i])
                self._collect_places(find_school_type, self.school_type, self.school_name,
                                     self.school_link_name, self.school_num_rent, self.school_num_sale)

        self.df_school = self._build_frame(self.school_type, self.school_name, self.school_link_name,
                                           self.school_num_rent, self.school_num_sale)
        print(f"Done getting names for {len(find_school_types)} school types ({len(self.school_name)} schools with {self.df_school['num_rent'].sum()} for-rents and {self.df_school['num_sale'].sum()} for-sales)")

    def get_areas(self):
        """Collect areas for every province into ``self.df_area``."""
        self._open_tab(2)
        find_area_provinces = self._find(self.PROVINCE_XPATH)

        for find_area_province in find_area_provinces:
            self._click(find_area_province)
            self._collect_places(find_area_province, self.area_province, self.area_name,
                                 self.area_link_name, self.area_num_rent, self.area_num_sale)

        self.df_area = self._build_frame(self.area_province, self.area_name, self.area_link_name,
                                         self.area_num_rent, self.area_num_sale)
        print(f"Done getting names for {len(find_area_provinces)} provinces ({len(self.area_name)} areas with {self.df_area['num_rent'].sum()} for-rents and {self.df_area['num_sale'].sum()} for-sales)")

    def get_districts(self):
        """Collect districts for every province into ``self.df_district``."""
        self._open_tab(3)
        find_district_provinces = self._find(self.PROVINCE_XPATH)

        for find_district_province in find_district_provinces:
            self._click(find_district_province)
            self._collect_places(find_district_province, self.district_province, self.district_name,
                                 self.district_link_name, self.district_num_rent, self.district_num_sale)

        self.df_district = self._build_frame(self.district_province, self.district_name, self.district_link_name,
                                             self.district_num_rent, self.district_num_sale)
        print(f"Done getting names for {len(find_district_provinces)} provinces ({len(self.district_name)} districts with {self.df_district['num_rent'].sum()} for-rents and {self.df_district['num_sale'].sum()} for-sales)")

    def concat_df(self, divide_group=4):
        """Merge the four category frames and label rows with balanced ``flag`` groups.

        Rows with neither rentals nor sales are dropped. ``num_avg`` is the
        mean of ``num_rent`` and ``num_sale``; rows are sorted by it in
        descending order and walked once while a running total is kept. Each
        row receives the current group number, and the next group starts right
        after the row at which the running total reaches that group's share
        (``total / divide_group * group``). The last row never opens a new
        group. The result is stored in ``self.place_link_names`` and a summary
        is printed.

        Args:
            divide_group: Number of groups to split the locations into.

        Raises:
            ValueError: If ``divide_group`` is smaller than 1.
        """
        if divide_group < 1:
            raise ValueError("divide_group must be at least 1")

        combined = pd.concat([self.df_station,self.df_school,self.df_area,self.df_district], ignore_index=True)
        has_listing = ~((combined['num_rent']==0) & (combined['num_sale']==0))
        combined = combined.loc[has_listing].reset_index(drop=True)
        combined = combined.assign(num_avg=(combined['num_rent']+combined['num_sale'])/2)
        combined = combined.sort_values(by=['num_avg'], ascending=False, ignore_index=True)

        total = combined['num_avg'].sum()
        averages = combined['num_avg'].tolist()
        last_row = len(averages) - 1

        divide_index = []   # row index at which groups 2, 3, ... start
        flag_list = []
        group = 1
        running_total = 0
        for row, average in enumerate(averages):
            # Clamp so a row can never be labelled beyond the last group.
            flag_list.append(min(group, divide_group))
            if pd.notna(average):   # Series.sum() skips missing values too
                running_total += average
            if row < last_row and group <= divide_group and running_total >= total/divide_group*group:
                divide_index.append(row+1)
                group += 1
        combined.insert(len(combined.columns), "flag", flag_list)
        self.place_link_names = combined

        print('---'*20)
        print(f"Total dataset: len of {len(combined)} with {combined['num_avg'].sum()} links")
        print(f"Average (total/{divide_group}): {combined['num_avg'].sum()/divide_group}")
        print('---'*20)
        for number in range(1, divide_group+1):
            members = combined[combined['flag']==number]
            if number == 1:
                start = 0
            elif number-2 < len(divide_index):
                start = divide_index[number-2]
            else:
                # Too few rows to reach this group; report it instead of failing.
                start = 'n/a'
            print(f"Dataset {number}: len of {len(members)} with {members['num_avg'].sum()} links, starting from index {start}")

    def export_data(self):
        """Write ``self.place_link_names`` to ``locations.csv`` without the index."""
        self.place_link_names.to_csv('locations.csv', index=False)

    def main(self, web_link):
        """Run the whole pipeline against ``web_link`` and always close the browser.

        Args:
            web_link: URL of the page whose location menus are scraped.
        """
        print(f'Start getting locations ...')
        self.web_link = web_link
        self.driver_setup()
        try:
            self.get_stations()
            self.get_schools()
            self.get_areas()
            self.get_districts()
            self.concat_df()
            self.export_data()
        finally:
            # Quit even when a step fails so no headless Chrome process is leaked.
            self.driver.quit()

In [4]:
# Start page whose location menus are scraped.
link = 'https://propertyhub.in.th/en'
# Run the full pipeline: scrape stations, schools, areas and districts,
# group the locations, and write the result to locations.csv.
GL = GetLinks()
GL.main(link)

Start getting locations ...
Done getting names for 11 train lines (243 stations with 283229 for-rents and 126285 for-sales)
Done getting names for 3 school types (430 schools with 805301 for-rents and 362637 for-sales)
Done getting names for 77 provinces (288 areas with 110242 for-rents and 47264 for-sales)
Done getting names for 77 provinces (927 districts with 162292 for-rents and 70751 for-sales)
------------------------------------------------------------
Total dataset: len of 893 with 984000.5 links
Average (total/4): 246000.125
------------------------------------------------------------
Dataset 1: len of 15 with 250140.0 links, starting from index 0
Dataset 2: len of 34 with 244301.0 links, starting from index 15
Dataset 3: len of 70 with 243968.5 links, starting from index 49
Dataset 4: len of 774 with 245591.0 links, starting from index 119
