<a href="https://colab.research.google.com/github/jm-tan-jm/web_scrap_agent_info/blob/main/get_Agent_Info_ibilik.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
## Multiple functions are required because the main URL doesn't contain the agents' info, only a list of properties

In [None]:
!pip install selenium webdriver_manager

In [None]:
# Function 1 - get the property links for the requested region from the ibilik URL
## Limitation - handles only one page per call
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time

def get_property_links(core_url="https://www.ibilik.my/rooms", region=None, page=None):
    """Collect the property-page links shown on one ibilik listing page.

    Args:
        core_url: Base listing URL; ``region`` is appended as a path segment.
        region: Region slug appended to ``core_url`` (e.g. ``'kuala_lumpur'``).
            Required.
        page: Page number to request. ``None`` or ``1`` requests the region
            URL without a ``page`` query parameter.

    Returns:
        A list with the ``href`` of every matched listing anchor. An empty
        list is returned when loading or scraping the page fails (the error
        is printed rather than raised).

    Raises:
        ValueError: If ``region`` is ``None``.
    """
    if region is None:
        raise ValueError("Please provide valid values for 'region'")

    # Treat a missing page as the first page; otherwise the default
    # ``page=None`` would produce a URL ending in "?page=None".
    if page is None or page == 1:
        url = f"{core_url}/{region}"
    else:
        url = f"{core_url}/{region}?page={page}"

    options = Options()
    # The "--headless" argument alone enables headless mode; the
    # ``options.headless`` setter is deprecated in recent Selenium releases.
    options.add_argument("--headless")
    driver = webdriver.Firefox(options=options)

    try:
        # Inside the try block so the browser is always shut down by the
        # ``finally`` clause, even if this call fails.
        driver.minimize_window()
        driver.get(url)

        # Wait (up to 10 s) for the listing container before querying links.
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CLASS_NAME, 'dir-alp-con-right-1'))
        )

        # The trailing "/a" selects the 'a' element that is a direct child
        # of the matching 'div' element.
        anchors = driver.find_elements(
            By.XPATH,
            "//div[contains(@class, 'home-list-pop-desc inn-list-pop-desc')]/a",
        )
        return [anchor.get_attribute('href') for anchor in anchors]

    except Exception as e:
        print(f"Error: {e}")
        return []

    finally:
        driver.quit()

In [None]:
# Function 2 - get the agent's name and phone number from a property link
## Limitation - handles only one link per call; call the function once for each link
def get_agent_info(property_link):
    """Scrape the agent's name and phone number from one property page.

    Args:
        property_link: URL of a single property page.

    Returns:
        One of:

        * ``{'agent_name': ..., 'phone_number': ...}`` when at least one
          "name" span is found. The first span's text is used as the name
          and the second as the phone number; ``phone_number`` is ``None``
          when only one span is present.
        * The ``href`` string of the ``list-pg-btn`` element when no "name"
          span is found.
        * An empty dict when the page cannot be loaded or scraped (the error
          is printed rather than raised), or when that element has no
          ``href``.
    """
    options = Options()
    # The "--headless" argument alone enables headless mode; the
    # ``options.headless`` setter is deprecated in recent Selenium releases.
    options.add_argument("--headless")
    driver = webdriver.Firefox(options=options)

    try:
        # Inside the try block so the browser is always shut down by the
        # ``finally`` clause, even if this call fails.
        driver.minimize_window()
        driver.get(property_link)

        # Wait (up to 10 s) for the contact button before querying the page.
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CLASS_NAME, 'list-pg-btn'))
        )

        name_spans = driver.find_elements(
            By.XPATH,
            "//div[contains(@class, 'col-xs-9')]/span[contains(@class, 'name')]",
        )

        if name_spans:
            texts = [span.text.strip() for span in name_spans]
            return {
                'agent_name': texts[0],
                # Keep the name even when the phone span is missing instead
                # of failing with an IndexError and discarding everything.
                'phone_number': texts[1] if len(texts) > 1 else None,
            }

        # No name spans: fall back to the contact button's link.
        href = driver.find_element(By.CLASS_NAME, 'list-pg-btn').get_attribute('href')
        # get_attribute() returns None for a missing attribute; callers
        # expect a dict or a string, so never hand back None.
        return href if href is not None else {}

    except Exception as e:
        print(f"Error: {e}")
        return {}

    finally:
        driver.quit()

In [None]:
# Run the two functions above and save the collected agent info to a text file
import pandas as pd

# Collect property links across several listing pages
property_links = []

for page_number in range(1, 3):  # Scrapes pages 1 to 2
    # function 1
    page_links = get_property_links(core_url="https://www.ibilik.my/rooms", region='kuala_lumpur', page=page_number)
    property_links.extend(page_links)

agent_info_list = []
# Get the agent's info from each of the links above
for property_link in property_links:
    # function 2
    agent_info = get_agent_info(property_link)

    # Check the type first: `'agent_name' in agent_info` on a string is a
    # substring test, and on None it raises TypeError.
    if isinstance(agent_info, dict) and 'agent_name' in agent_info and 'phone_number' in agent_info:
        agent_info_list.append({'Agent Name': agent_info['agent_name'], 'Phone Number': agent_info['phone_number']})
    elif isinstance(agent_info, str):
        # function 2 returned a link instead of name/phone details
        agent_info_list.append({'Agent Name': agent_info, 'Phone Number': None})
    else:
        agent_info_list.append({'Agent Name': None, 'Phone Number': None})

# Convert the list to a DataFrame; explicit columns keep the header even
# when nothing was scraped.
df = pd.DataFrame(agent_info_list, columns=['Agent Name', 'Phone Number'])

print(df)

# print(df) abbreviates large frames with "...", so write the full table with
# to_string(); explicit UTF-8 avoids encoding errors on non-ASCII names.
with open('output.txt', 'w', encoding='utf-8') as f:
    f.write(df.to_string())
    f.write('\n')