## PropertyHub: 2.4 Scraping Condo Links

In [1]:
# Import libraries
import os
import glob
import time
import pandas as pd
from datetime import datetime
import requests
from requests.adapters import HTTPAdapter, Retry
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor, wait

In [2]:
class GetLinks():
    """Collect PropertyHub condo listing links for rows of a locations table.

    Each instance owns its own retrying ``requests`` session and its own
    accumulator lists, one set for for-rent listings and one for for-sale
    listings. ``get_condo_links`` scrapes one location row and writes
    per-call CSV files; ``join_df`` merges those files into two combined
    CSVs and deletes the intermediates.
    """

    # Column order shared by every CSV this class writes.
    _COLUMNS = ['place_type', 'place_name', 'link_name', 'condo_link']
    _BASE_URL = 'https://propertyhub.in.th'

    def __init__(self):
        # Pause (seconds) applied once after each listing kind is scraped.
        self.delay_time = 0.1

        # Accumulators; index i of each list describes the same listing.
        self.rent_place_type = []
        self.rent_place_name = []
        self.rent_link_name = []
        self.rent_condo_links = []
        self.sale_place_type = []
        self.sale_place_name = []
        self.sale_link_name = []
        self.sale_condo_links = []

        self.retries = 5
        # Passed to urllib3's Retry as backoff_factor; the sleep between
        # retries grows roughly as backoff * 2**(attempt - 1) seconds
        # (exact schedule depends on the installed urllib3 version).
        self.backoff = 1
        self.status_forcelist = [403, 500, 502, 503, 504]
        self.timeout = (10, 10)     # (connect, read) timeouts in seconds

        # Initialize request session for retries and timeout
        self.s = requests.Session()
        retries = Retry(total=self.retries,
                        backoff_factor=self.backoff,
                        status_forcelist=self.status_forcelist)
        self.s.mount('http://', HTTPAdapter(max_retries=retries))
        self.s.mount('https://', HTTPAdapter(max_retries=retries))

    def import_shuffle_df(self, path, num):
        """Load the locations CSV at *path*, keeping rows whose ``flag`` equals *num*."""
        table = pd.read_csv(path)
        self.location = table[table['flag'] == num]

    def _fetch_soup(self, url):
        """Download *url* with the retrying session and parse it as HTML."""
        response = self.s.get(url, timeout=self.timeout)
        return BeautifulSoup(response.content, "html.parser")

    @staticmethod
    def _find_max_page(soup):
        """Return the last page number from the pagination list, or 1 if absent."""
        pagination = soup.find_all("ul", {"class": "sc-1p20b44-0 IoRRS"})
        try:
            # The second-to-last <li> holds the link to the final page; its
            # aria-label ends with the page number.
            label = pagination[0].find_all('li')[-2].a['aria-label']
            return int(label.split()[-1])
        except (IndexError, AttributeError, KeyError, TypeError, ValueError):
            # No pagination widget (or an unexpected layout): single page.
            return 1

    def _scrape_listing_links(self, kind, row):
        """Append every listing link of *kind* ('rent' or 'sale') for *row*.

        Failures are reported and swallowed so that one bad location does
        not abort the caller; links gathered before the failure are kept.
        """
        place_link = f"{self._BASE_URL}/en/condo-for-{kind}/{row['link_name']}"
        place_types = getattr(self, f'{kind}_place_type')
        place_names = getattr(self, f'{kind}_place_name')
        link_names = getattr(self, f'{kind}_link_name')
        condo_links = getattr(self, f'{kind}_condo_links')

        try:
            first_soup = self._fetch_soup(place_link)
            max_page = self._find_max_page(first_soup)
            for page in range(1, max_page + 1):
                # Page 1 is the bare place URL and was already downloaded
                # to read the page count, so it is not requested again.
                if page == 1:
                    soup = first_soup
                else:
                    soup = self._fetch_soup(f'{place_link}/{page}')
                for anchor in soup.select("a[href*='en/listings/']"):
                    place_types.append(row['place_type'])
                    place_names.append(row['place_name'])
                    link_names.append(row['link_name'])
                    condo_links.append(self._BASE_URL + anchor['href'])
            time.sleep(self.delay_time)
        except Exception as err:
            # Best-effort by design, but report the actual cause instead of
            # hiding it (and let KeyboardInterrupt/SystemExit propagate).
            print(f"There is an invalid link: {place_link} "
                  f"({type(err).__name__}: {err})")

    def _build_frame(self, kind):
        """Turn the *kind* accumulators into a DataFrame de-duplicated by link."""
        frame = pd.DataFrame({
            'place_type': getattr(self, f'{kind}_place_type'),
            'place_name': getattr(self, f'{kind}_place_name'),
            'link_name': getattr(self, f'{kind}_link_name'),
            'condo_link': getattr(self, f'{kind}_condo_links'),
        }, columns=self._COLUMNS)
        frame.drop_duplicates(subset=['condo_link'], keep='first',
                              inplace=True, ignore_index=True)
        return frame

    def get_condo_links(self, order):
        """Scrape rent and sale links for the *order*-th location (1-based).

        Results are stored on ``self.df_rent_condo_links`` /
        ``self.df_sale_condo_links`` and written to
        ``df_rent_condo_links_{order}.csv`` / ``df_sale_condo_links_{order}.csv``
        in the current working directory.
        """
        row = self.location.iloc[order-1, :]

        print(f"Thread {order}: getting {row['num_rent']+row['num_sale']} links ...")

        # Skip a listing kind entirely when the table reports no listings.
        for kind in ('rent', 'sale'):
            if row[f'num_{kind}'] != 0:
                self._scrape_listing_links(kind, row)

        # Convert to df and remove duplicates
        self.df_rent_condo_links = self._build_frame('rent')
        self.df_sale_condo_links = self._build_frame('sale')

        # Export
        self.df_rent_condo_links.to_csv(f"df_rent_condo_links_{order}.csv", index=False)
        self.df_sale_condo_links.to_csv(f"df_sale_condo_links_{order}.csv", index=False)

        print(f"Thread {order}: finished !!!")

    def _combine_outputs(self, kind):
        """Read and merge every per-call CSV of *kind* in the working directory.

        Returns ``(files, frame)``. When no file exists the frame is empty
        but still carries the expected columns.
        """
        files = glob.glob(os.getcwd() + f'/df_{kind}*.csv')
        frames = [pd.read_csv(path) for path in files]
        if frames:
            combined = pd.concat(frames, axis=0, ignore_index=True)
        else:
            combined = pd.DataFrame(columns=self._COLUMNS)
        combined.drop_duplicates(subset=['condo_link'], keep='first',
                                 inplace=True, ignore_index=True)
        return files, combined

    def join_df(self, suffix=4):
        """Merge the per-call CSVs into two combined files and clean up.

        Writes ``{YYYYMM}_rent_condo_links_{suffix}.csv`` and
        ``{YYYYMM}_sale_condo_links_{suffix}.csv`` to the current working
        directory, then removes the intermediate ``df_rent*.csv`` /
        ``df_sale*.csv`` files. A listing kind with no intermediate files
        yields a header-only CSV instead of raising.
        """
        output_rent_files, output_rent_all = self._combine_outputs('rent')
        output_sale_files, output_sale_all = self._combine_outputs('sale')

        # Print results
        print('!!!!!! Finished Scraping Links !!!!!!')
        print(f'for-rent data len: {len(output_rent_all)}')
        print(f'for-sale data len: {len(output_sale_all)}')

        # Export the combine result
        stamp = datetime.now().strftime('%Y%m')
        output_rent_all.to_csv(f"{stamp}_rent_condo_links_{suffix}.csv", index=False)
        output_sale_all.to_csv(f"{stamp}_sale_condo_links_{suffix}.csv", index=False)

        # Delete all the unused files
        for leftover in output_rent_files + output_sale_files:
            os.remove(leftover)

In [3]:
# Select the rows of the locations table that this run is responsible for.
num = 4
input_path = '/kaggle/input/ph-1-getting-locations'
file_name = 'locations.csv'
locations = pd.read_csv(f"{input_path}/{file_name}")
locations = locations[locations['flag']==num]

# One GetLinks instance (own session and accumulators) per selected row.
thread_num = len(locations)
print(f'Number of threads: {thread_num}')
getlink_threads = [GetLinks() for _ in range(thread_num)]
for getlink_thread in getlink_threads:
    getlink_thread.import_shuffle_df(f'{input_path}/{file_name}', num)

# Submit one task per row; leaving the `with` block waits for all of them.
threadList = []
with ThreadPoolExecutor() as executor:
    for i, getlink_thread in enumerate(getlink_threads):
        threadList.append(executor.submit(getlink_thread.get_condo_links, i+1))

# A Future holds any exception raised by its task until it is inspected;
# report failures here so a crashed task does not go unnoticed.
for order, future in enumerate(threadList, start=1):
    error = future.exception()
    if error is not None:
        print(f"Thread {order}: failed with {type(error).__name__}: {error}")

# join_df only reads files from the working directory, so any instance
# works; guard against an empty selection, where there is no instance.
if getlink_threads:
    getlink_threads[0].join_df()
else:
    print(f'No locations with flag == {num}; nothing to combine.')

Number of threads: 774
Thread 1: getting 4466 links ...
Thread 2: getting 4406 links ...
Thread 3: getting 4354 links ...
Thread 4: getting 4328 links ...
Thread 5: getting 4308 links ...
Thread 6: getting 4235 links ...
Thread 7: getting 4196 links ...
Thread 8: getting 4009 links ...
Thread 5: finished !!!
Thread 9: getting 4001 links ...
Thread 6: finished !!!
Thread 10: getting 3929 links ...
Thread 3: finished !!!
Thread 11: getting 3876 links ...
Thread 8: finished !!!
Thread 12: getting 3868 links ...
Thread 1: finished !!!
Thread 13: getting 3848 links ...
Thread 4: finished !!!
Thread 14: getting 3819 links ...
Thread 7: finished !!!
Thread 15: getting 3780 links ...
Thread 2: finished !!!
Thread 16: getting 3742 links ...
Thread 9: finished !!!
Thread 17: getting 3733 links ...
Thread 11: finished !!!
Thread 18: getting 3664 links ...
Thread 10: finished !!!
Thread 19: getting 3577 links ...
Thread 16: finished !!!
Thread 20: getting 3537 links ...
Thread 12: finished !!!
Thr