In [1]:
# Import libraries
import os
import glob
import time
import pandas as pd
from datetime import datetime
import requests
from requests.adapters import HTTPAdapter, Retry
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor, wait

In [2]:
class ScrapeData():
    """Scrape PropertyHub condo listing pages and export the rows to a CSV file.

    Every instance owns its own ``requests.Session`` (configured with retries)
    and its own result buffers; :meth:`main` is the entry point that scrapes a
    frame of links and writes ``condo_data_<order>.csv``.
    """

    # Column order of every scraped row; also used as the header of the CSV.
    COLUMN_NAMES = ['rent_flag',
                    'condo_link',
                    'post_name',
                    'last_update_date',
                    'last_update_time',
                    'poster_name',
                    'poster_status',
                    'price',
                    'price_unit',
                    'deposit',
                    'deposit_unit',
                    'advance_payment',
                    'advance_payment_unit',
                    'room_info_header',
                    'room_info_value',
                    'room_description',
                    'room_amenities_have',
                    'room_amenities_not_have',
                    'project_details_attributes',
                    'project_details_values',
                    'facilities_have',
                    'facilities_not_have',
                    'nearby_property_type',
                    'nearby_property_name',
                    'nearby_property_distance']

    # CSS selectors of the price box on for-rent and for-sale listing pages.
    RENT_PRICE_SELECTOR = "div[class*='sc-152o12i-7 dKuoZx priceTag sc-s9r052-5 dvOoHM']"
    SALE_PRICE_SELECTOR = "div[class*='sc-152o12i-7 dKuoZx priceTag sc-s9r052-6 gRLtIb']"

    def __init__(self, delay_time=0):
        """Set up counters, result buffers and the retrying HTTP session.

        Args:
            delay_time: seconds to sleep after each successfully parsed link.
                **** Please set this > 1 not to overload the server ****
        """
        self.delay_time = delay_time
        self.condo_level_data = []      # one list per link; [] marks a failed link
        self.parse_count = 0            # number of links processed so far
        self.error_index = []           # positions (parse_count) of unexpected failures
        self.invalid_link_count = 0     # links recognised as no longer valid
        self.retries = 5
        # backoff_factor handed to urllib3's Retry; the original note listed the
        # resulting sleeps as [0.5, 1, 2, 4, 8] -- TODO confirm for the installed urllib3
        self.backoff = 1
        self.status_forcelist = [403, 500, 502, 503, 504]
        self.timeout = (10, 10)         # passed to requests as (connect, read) timeouts

        # Initialize request session for retries and timeout
        self.s = requests.Session()
        retries = Retry(total=self.retries,
                        backoff_factor=self.backoff,
                        status_forcelist=self.status_forcelist)
        self.s.mount('http://', HTTPAdapter(max_retries=retries))
        self.s.mount('https://', HTTPAdapter(max_retries=retries))

    @staticmethod
    def _parse_price(soup, selector):
        """Return ``(price, price_unit)`` read from the price box matched by `selector`."""
        spans = soup.select(selector)[0].find_all('span')
        price_unit = spans[-1].text
        if price_unit.lower().strip() == 'please contact':
            # Some listings still carry a number in the second span; fall back to
            # the literal "please contact" split over the two columns otherwise.
            try:
                price = int(spans[1].text.split('THB')[0].replace(',', ''))
                price_unit = spans[2].text.strip()
            except Exception:
                price = 'please'
                price_unit = 'contact'
        else:
            price = int(spans[-2].text.split('THB')[0].replace(',', ''))
        return price, price_unit

    @staticmethod
    def _split_amount(text):
        """Split text like ``"<amount> <unit>"`` into ``(amount, unit)``; unit is '' if absent."""
        tokens = text.strip().split()
        amount = tokens[0]      # IndexError on empty text is handled by the caller
        unit = tokens[1] if len(tokens) > 1 else ""
        return amount, unit

    @staticmethod
    def _is_invalid_link(page, soup):
        """Tell whether a failed link is a dead listing rather than a scraping error.

        `page` / `soup` are None when the request or the HTML parsing itself failed,
        in which case the failure is never classified as an invalid link.
        """
        if page is None:
            return False
        if page.url == 'https://propertyhub.in.th/en':
            # The request ended up on the site's landing page.
            return True
        if soup is None:
            return False
        return len(soup.select("div[class*='sc-1552ugy-1 sc-1552ugy-5 kvqbdg eugViW']")) != 0

    def _parse_listing(self, soup, condo_link):
        """Build one result row (ordered like ``COLUMN_NAMES``) from a parsed page.

        Raises whatever lookup error occurs when a mandatory element is missing;
        :meth:`parse_data` turns that into an empty row.
        """
        # Parse condo details
        post_name = soup.select("h1[class*='sc-14haut3-0 kiSLkD']")[0].text.strip()
        update_tokens = soup.select("div[class*='sc-ogfj7g-14 coaZDA']")[0].text.strip().split()
        last_update_date = update_tokens[-2]
        last_update_time = update_tokens[-1]

        poster_item = soup.select("li[class*='sc-ves8oa-9 hCPUWp']")[0]
        # The poster name is either inside a link or a plain paragraph.
        try:
            poster_name = poster_item.div.a.text
        except Exception:
            try:
                poster_name = poster_item.div.p.text
            except Exception:
                poster_name = ''
        poster_status = poster_item.div.div.text
        # view_count = soup.select("div[class*='sc-ves8oa-0 hWLLWW']")[0].text.strip().split()[-1]        # Cannot get view count as the page needs to load up

        if self.rent_flag == 1:
            price, price_unit = self._parse_price(soup, self.RENT_PRICE_SELECTOR)
            payment_items = soup.select("li[class*='sc-s9r052-3 iJHhTM']")
            deposit, deposit_unit = self._split_amount(payment_items[0].find_all('p')[-1].text)
            advance_payment, advance_payment_unit = self._split_amount(payment_items[1].find_all('p')[-1].text)
        else:
            price, price_unit = self._parse_price(soup, self.SALE_PRICE_SELECTOR)
            # Deposit / advance payment are only read on for-rent pages.
            deposit = ''
            deposit_unit = ''
            advance_payment = ''
            advance_payment_unit = ''

        room_info_items = soup.select("li[class*='sc-s9r052-1 bLavUw']")
        room_info_header = [item.p.text.strip(':') for item in room_info_items]
        room_info_value = [item.span.text.strip() for item in room_info_items]
        description_blocks = soup.select("div[class*='sc-ves8oa-21 bBnKvP']")
        # An empty list (not '') marks a missing description, as in earlier exports.
        room_description = description_blocks[0].text.strip() if len(description_blocks) != 0 else []

        amenities_box = soup.select("div[class*='sc-1qj7qf1-1 czrvpe']")[0]
        room_amenities_have = [amen.text.strip() for amen in amenities_box.find_all('span')]
        room_amenities_not_have = [amen.text.strip() for amen in amenities_box.find_all('strike')]

        # Parse project details
        project_table = soup.select("table[class*='sc-7l0zor-1 jEPVvF']")[0]
        project_details_attributes = [header.text.strip() for header in project_table.find_all('th')]
        project_details_values = []
        for cell in project_table.find_all('td'):
            cell_items = cell.find_all('li')
            if len(cell_items) <= 1:
                project_details_values.append(cell.text.strip())
            else:
                # Multi-item cells are kept as a list of their entries.
                project_details_values.append([v.text.strip() for v in cell_items])

        facilities_box = soup.select("div[class*='sc-vxzykp-0 dTLeQV sc-ogfj7g-18 sc-iv2rdv-17 bLEjbt citTje']")[0]
        facilities_have = [facil.text.strip() for facil in facilities_box.find_all('span')]
        facilities_not_have = [facil.text.strip() for facil in facilities_box.find_all('strike')]

        # Parse properties in nearby area
        nearby_property_type = []
        nearby_property_name = []
        nearby_property_distance = []

        blocks = soup.select("div[class*='sc-vxzykp-0 dNnjli sc-nnw194-1 bnkbhT']")
        blocks.extend(soup.select("div[class*='sc-vxzykp-0 dTLeQV sc-nnw194-1 bnkbhT']"))

        for block in blocks:
            headings = block.find_all('h3')
            for i, sub_block in enumerate(block.select("div[class*='row sc-nnw194-2 gAWLJy']")):
                distance_spans = sub_block.find_all('span')
                for prop in sub_block.select("a[class*='zoneTypeStyle']"):
                    # The i-th heading of the block labels the i-th row of properties.
                    nearby_property_type.append(headings[i].text.strip())
                    nearby_property_name.append(prop.text.strip().replace('Condo ', ''))
                    if len(distance_spans) == 0:
                        nearby_property_distance.append('')
                    else:
                        nearby_property_distance.append(distance_spans[0].text.strip())

        return [self.rent_flag,
                condo_link,
                post_name,
                last_update_date,
                last_update_time,
                poster_name,
                poster_status,
                price,
                price_unit,
                deposit,
                deposit_unit,
                advance_payment,
                advance_payment_unit,
                room_info_header,
                room_info_value,
                room_description,
                room_amenities_have,
                room_amenities_not_have,
                project_details_attributes,
                project_details_values,
                facilities_have,
                facilities_not_have,
                nearby_property_type,
                nearby_property_name,
                nearby_property_distance]

    def parse_data(self):
        """Fetch and parse every link in ``self.condo_links``.

        Appends one row per link to ``self.condo_level_data`` (``[]`` for a link
        that could not be scraped) and updates ``parse_count``, ``error_index``
        and ``invalid_link_count``. A failing link never aborts the loop.
        """
        for condo_link in self.condo_links:
            # Reset per link so the error handler can tell "request failed" from
            # "page fetched but not parseable" and never sees a previous response.
            page = None
            soup = None
            try:
                page = self.s.get(condo_link, timeout=self.timeout)
                soup = BeautifulSoup(page.content, "html.parser")
                row = self._parse_listing(soup, condo_link)
            except Exception:
                self.condo_level_data.append([])
                if self._is_invalid_link(page, soup):
                    self.invalid_link_count += 1
                else:
                    print(f'Error at {condo_link}')
                    self.error_index.append(self.parse_count)
                self.parse_count += 1
                continue
            else:
                self.condo_level_data.append(row)
                self.parse_count += 1
                time.sleep(self.delay_time)

    def export_results(self):
        """Write the collected rows to ``condo_data_<order>.csv`` in the working directory.

        Failed links are exported as all-empty rows so the row count still matches
        the number of links; the header is written even if every link failed.
        """
        blank_row = [None] * len(self.COLUMN_NAMES)
        # Pad failed links explicitly: a frame built only from [] rows would have
        # zero columns and could not take the 25 column names.
        rows = [row if row else blank_row for row in self.condo_level_data]
        condo_data = pd.DataFrame(rows, columns=self.COLUMN_NAMES)
        condo_data.to_csv(f"condo_data_{self.order}.csv",index=False)

    def main(self, df_links, order, rent_flag=1):
        """Scrape all links of `df_links`, export them and print a summary.

        Args:
            df_links: frame whose column at position 3 holds the listing URLs.
            order: identifier used in log messages and in the output file name.
            rent_flag: 1 for-rent and 0 for-sale.
        """
        self.condo_links = df_links.iloc[:,3]
        self.order = order
        self.rent_flag = rent_flag   # 1 for-rent and 0 for-sale

        print(f'Thread {self.order}: Scraping for {len(self.condo_links)} links ...')

        self.parse_data()
        self.export_results()

        if len(self.error_index) > 0:
            print(f'Thread {self.order}: !!! Scraping completed ({len(self.error_index)} errors, {self.invalid_link_count} invalid links)')
        else:
            print(f'Thread {self.order}: !!! Scraping completed')

In [3]:
# Once scraping condo links notebook is done, upload the generated output files to this notebook and update the input path
# The four link-scraping notebooks share one naming scheme that differs only in the part number.
path_1, path_2, path_3, path_4 = (
    f'/kaggle/input/ph-2-{part}-scraping-condo-links' for part in range(1, 5)
)

# One CSV per part, named <YYYYMM>_rent_condo_links_<part>.csv for the current month.
df_rent_links_1, df_rent_links_2, df_rent_links_3, df_rent_links_4 = (
    pd.read_csv(f"{input_dir}/{datetime.now().strftime('%Y%m')}_rent_condo_links_{part}.csv")
    for part, input_dir in enumerate((path_1, path_2, path_3, path_4), start=1)
)

# Stack the four parts and keep the first occurrence of every listing URL.
df_rent_links = pd.concat(
    [df_rent_links_1, df_rent_links_2, df_rent_links_3, df_rent_links_4],
    axis=0,
    ignore_index=True,
).drop_duplicates(subset=['condo_link'], keep='first', ignore_index=True)

# Dividing into groups
# NOTE(review): the label below says "4th batch", while the scraping cell handles
# groups 65-80 and the merge cell writes *_condo_data_5.csv -- confirm the label.
rent_len = round(len(df_rent_links) / 5)
print(f'4th batch: for-rent data len: {rent_len}')

4th batch: for-rent data len: 21834


In [4]:
# Scraping for-rent condo data
rent_group_num = 80       # number of link groups the full link list is split into
rent_thread_num = 16      # number of groups scraped by this notebook
rent_group_offset = 64    # zero-based index of the first group scraped here (groups 65-80)

if rent_group_offset + rent_thread_num > rent_group_num:
    raise ValueError(
        f'Groups {rent_group_offset + 1}-{rent_group_offset + rent_thread_num} '
        f'exceed the {rent_group_num} available groups'
    )

# Split the links into contiguous, near-equal groups
rent_links = []
for i in range(rent_group_num):
    start = round(i*len(df_rent_links)/rent_group_num)
    end = round((i+1)*len(df_rent_links)/rent_group_num)
    rent_links.append(df_rent_links.iloc[start:end,:])

# One scraper (with its own session and buffers) per group
rent_scrape_threads = [ScrapeData() for _ in range(rent_thread_num)]
rent_threadList = []
# Leaving the `with` block waits for every submitted job to finish.
with ThreadPoolExecutor() as executor:
    for i in range(rent_thread_num):
        group_index = i + rent_group_offset
        rent_threadList.append(
            executor.submit(rent_scrape_threads[i].main, rent_links[group_index], group_index + 1, 1)
        )

# A job that raised would otherwise fail silently (its CSV is simply missing),
# so report every worker exception explicitly.
for i, future in enumerate(rent_threadList):
    worker_error = future.exception()
    if worker_error is not None:
        print(f'Thread {i + 1 + rent_group_offset}: !!! Scraping failed: {worker_error!r}')

Thread 65: Scraping for 1364 links ...Thread 66: Scraping for 1365 links ...

Thread 67: Scraping for 1365 links ...
Thread 68: Scraping for 1364 links ...
Thread 69: Scraping for 1365 links ...
Thread 70: Scraping for 1365 links ...
Thread 71: Scraping for 1364 links ...
Thread 72: Scraping for 1365 links ...
Error at https://propertyhub.in.th/en/listings/lumpini-ram44-size-64-sq-m-2-beds-14-b-fl-17-000-baht-094-549-4104---3584700
Thread 72: !!! Scraping completed
Thread 73: Scraping for 1364 links ...
Thread 67: !!! Scraping completed
Thread 74: Scraping for 1365 links ...
Thread 68: !!! Scraping completed
Thread 75: Scraping for 1365 links ...
Thread 69: !!! Scraping completed (1 errors, 1 invalid links)
Thread 76: Scraping for 1364 links ...
Thread 71: !!! Scraping completed
Thread 77: Scraping for 1365 links ...
Error at https://propertyhub.in.th/en/listings/park-ramindra--a19e08b2---3818978
Error at https://propertyhub.in.th/en/listings/the-origin-phahol-saphanmai-for-rent-12000-

In [5]:
# Read all files in the output folder
# Sorted so the merged row order does not depend on the file system's listing order.
output_files = sorted(glob.glob(f"{os.getcwd()}/condo_data*.csv"))
if not output_files:
    raise FileNotFoundError(f"No condo_data*.csv files found in {os.getcwd()}; run the scraping cell first")
outputs = [pd.read_csv(output_file) for output_file in output_files]

# Combine outputs (single concat instead of growing the frame pairwise)
output_all = pd.concat(outputs, axis=0, ignore_index=True)

# Drop empty rows resulting from errors from scraping
output_all = output_all.dropna(subset=['post_name'], axis=0).reset_index(drop=True)
output = output_all

# Export the combined result
output.to_csv(f"{datetime.now().strftime('%Y%m')}_condo_data_5.csv",index=False)

# Delete the per-thread files now that they are merged
for output_file in output_files:
    os.remove(output_file)

# Show results
print(f'Total output length: {len(output)}')

Total output length: 21731
