In [None]:
import re
import csv
import math
from urllib.parse import quote
from urllib.parse import urlsplit, urlunsplit

from playwright.async_api import async_playwright
import pandas as pd
from parsel import Selector




# Data cleaning functions
def split_address(raw_address):
    """Split a raw address into zip code, city, street name and street number.

    The parser expects ``"<zip> <city>, <street> <number>"``, for example
    ``"10000 Zagreb, Karlovačka cesta 2b"``.

    Args:
        raw_address: Address text, or ``None`` when nothing was scraped.

    Returns:
        A ``(zip_code, city, street_name, street_number)`` tuple.
        ``street_number`` is ``None`` when the street part contains no digits.
        All four values are ``None`` when the input is not a string or does
        not follow the expected layout.
    """
    unparsed = (None, None, None, None)
    if not isinstance(raw_address, str):
        return unparsed

    components = raw_address.split(', ')
    if len(components) < 2:
        return unparsed

    # Split once only, so multi-word cities ("Novi Zagreb") stay in one piece.
    zip_and_city = components[0].strip().split(None, 1)
    if len(zip_and_city) != 2:
        return unparsed
    zip_code, city = zip_and_city

    street = components[1]
    match = re.search(r'(\D+)(\d+\w*)', street)
    if match:
        street_name, street_number = match.groups()
        # The non-digit group also captures the space before the number.
        street_name = street_name.strip()
    else:
        street_name, street_number = street, None

    return zip_code, city, street_name, street_number


def encode_url(url):
    """Percent-encode a URL without damaging what is already encoded.

    ``:`` and ``/`` are kept so the scheme and path separators survive.
    Existing ``%XX`` escapes are left alone, so passing an already-encoded
    URL (or calling this twice) does not double-encode it. A ``%`` that does
    not start a valid escape is encoded as ``%25``.

    Args:
        url: The URL text to encode.

    Returns:
        The encoded URL string.
    """
    encoded = quote(url, safe=':/%')
    # Only escape percent signs that are not followed by two hex digits.
    return re.sub(r'%(?![0-9A-Fa-f]{2})', '%25', encoded)




def calculate_total_pages(data_found, items_per_page=10):
    """Return how many result pages are needed to list ``data_found`` items.

    Args:
        data_found: Total number of results, as an int or numeric text.
        items_per_page: Results shown per page; must be positive.
            Defaults to 10.

    Returns:
        The page count, rounded up. Falls back to 1 when ``data_found``
        cannot be converted to an integer (for example ``None`` or
        non-numeric text).

    Raises:
        ValueError: If ``items_per_page`` is not positive.
    """
    if items_per_page <= 0:
        raise ValueError("items_per_page must be positive")
    try:
        total_items = int(data_found)
    except (TypeError, ValueError, OverflowError):
        # Unreadable counter: still visit the first page.
        return 1
    return math.ceil(total_items / items_per_page)

In [None]:
playwright = await async_playwright().start()
browser = await playwright.chromium.launch(
    headless=False,
    channel="chrome",
    # slow_mo=500,
    args=["--disable-blink-features", "--disable-blink-features=AutomationControlled"],
    # timeout=0
)

context = await browser.new_context(
    user_agent="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36",
    # java_script_enabled=False
)

page = await context.new_page()
page.set_default_timeout(0)

In [None]:
# Load the search keywords from the 'keyword' column.
# The column is read as text and missing cells are dropped, so no NaN/float
# value ends up being typed into the search box later on.
df = pd.read_csv('1st_keyword.csv', dtype={'keyword': str})
keyword_list = df['keyword'].dropna().tolist()
# Independent copy of the full list; slice it here to try a smaller run.
test_keyword_list = keyword_list[:]

In [None]:
# await page.goto("https://www.imenik.hr/imenik/trazi/2/Osobe/sve/sve/vaznost/zagreb,%20karlova%C4%8Dka%20cesta%202b.html")

In [None]:
# Accumulator for the scraped rows; the scraping loop appends one dict per profile.
item_list = list()

In [None]:
keyword_number = 1

for keyword in test_keyword_list:
    # Keyword and search navigation code
    await page.goto("https://www.imenik.hr/imenik/trazi/2/Osobe/sve/sve/vaznost/zagreb,%20karlova%C4%8Dka%20cesta%202b.html")
    await page.wait_for_timeout(4000)
    await page.locator("#find_desc").fill(keyword)
    await page.get_by_label("OSOBE", exact=True).check()
    await page.locator("#submit_filter").click()


    # Pagination Code
    await page.wait_for_timeout(200)
    html = await page.content()
    profile_url_page_response = Selector(text=html)
    _08_DATA_FOUND = profile_url_page_response.xpath("//td[@class='c_32']/strong/text()").get() # Total data found
    number_of_page = calculate_total_pages(_08_DATA_FOUND)
    base_url = "https://www.imenik.hr/imenik/trazi/{}"
    current_url = page.url # Playwright current page
    url_components = urlsplit(current_url)
    url_part = url_components.path + url_components.query

    await page.goto(new_url)
    await page.get_by_label("OSOBE", exact=True).check()
    await page.locator("#submit_filter").click()

    
    for page_number in range(1, number_of_page + 1):
        new_path = url_part.replace('/trazi/1', f'/trazi/{page_number}')
        new_url_components = url_components._replace(path=new_path)
        global new_url
        new_url = urlunsplit(new_url_components) # This url for data extraction


        # Search page request code
        await page.goto(new_url)
        # await page.get_by_label("OSOBE", exact=True).check()
        # await page.locator("#submit_filter").click()
        await page.wait_for_timeout(400)
        
        html = await page.content()
        search_page_response = Selector(text=html)

        rez_item = search_page_response.xpath("//div[@class='rez_item rez_item_0']")
        for item in rez_item:
            profile_url_raw = f"https://www.imenik.hr{item.xpath('.//h4/a/@href').get()}"
            _06_PROFILE_URLs = encode_url(profile_url_raw)


            # Sending request to profile urls
            await page.goto(_06_PROFILE_URLs)
            await page.wait_for_timeout(100)
            html = await page.content()
            response = Selector(text=html)

            # Data extraction code
            _01_FULL_NAME = response.xpath("//h2/strong/text()").get()
            _04_PHONE = response.xpath("(//th[contains(text(), 'TELEFON')])[1]/../following-sibling::tr/td/strong/text()").get()
            _05_CELLPHONE = response.xpath("(//th[contains(text(), 'MOB')])[1]/../following-sibling::tr/td/strong/text()").get()
            _10_FULL_ADDRESS = response.xpath("//h4/a/text()").get()
            _16_ZIP_CODE, _15_CITY, _02_ADDRESS, _03_ADDRESS_NUMBER = split_address(_10_FULL_ADDRESS)
            _11_MOBILE_2 = response.xpath("(//th[contains(text(), 'MOB')])[2]/../following-sibling::tr/td/strong/text()").get()
            _12_MOBILE_3 = response.xpath("(//th[contains(text(), 'MOB')])[3]/../following-sibling::tr/td/strong/text()").get()
            _13_CELLPHONE_2  = response.xpath("(//th[contains(text(), 'TELEFON')])[2]/../following-sibling::tr/td/strong/text()").get()
            _14_CELLPHONE_3 = response.xpath("(//th[contains(text(), 'TELEFON')])[3]/../following-sibling::tr/td/strong/text()").get()


            item_dic = {
                "_01_FULL_NAME": _01_FULL_NAME,
                "_02_ADDRESS": _02_ADDRESS,
                "_03_ADDRESS_NUMBER": _03_ADDRESS_NUMBER,
                "_04_PHONE": _04_PHONE,
                "_05_CELLPHONE": _05_CELLPHONE,
                "_06_PROFILE_URLs": _06_PROFILE_URLs,
                "_07_SEARCH_KEYWORD": keyword,
                "_08_DATA_FOUND": _08_DATA_FOUND,
                "_09_SEARCH_URL": new_url,
                "_10_FULL_ADDRESS": _10_FULL_ADDRESS,
                "_11_MOBILE_2": _11_MOBILE_2,
                "_12_MOBILE_3": _12_MOBILE_3,
                "_13_CELLPHONE_2": _13_CELLPHONE_2,
                "_14_CELLPHONE_3": _14_CELLPHONE_3,
                "_15_CITY": _15_CITY,
                "_16_ZIP_CODE": _16_ZIP_CODE,
            }

            item_list.append(item_dic)
        




    # Current keyword tracking code
    print(f"\rProcessing keyword: {keyword_number}...", end="")
    keyword_number += 1

In [None]:
# Save the scraped profile data as CSV (appending to any earlier run).
csv_filename = 'profile-data.csv'
field_names = list(item_list[0].keys()) if item_list else []

if item_list:
    # UTF-8 is set explicitly because the scraped names and addresses can
    # contain non-ASCII letters that a platform default encoding may reject.
    with open(csv_filename, 'a', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=field_names)
        # Append mode starts at the end of the file, so position 0 means the
        # file is new or empty and still needs its header row.
        if file.tell() == 0:
            writer.writeheader()
        writer.writerows(item_list)
else:
    print(f"No rows scraped; nothing written to {csv_filename}")

In [None]:
# NOTE(review): bare literal; running this cell only displays 38 and changes no state.
# Presumably a leftover scratch value. Consider deleting the cell.
38

In [None]:
"""
from playwright.sync_api import Playwright, sync_playwright, expect


def run(playwright: Playwright) -> None:
    browser = playwright.chromium.launch(headless=False)
    context = browser.new_context()
    page = context.new_page()
    page.goto("https://www.imenik.hr/imenik/trazi/2/Osobe/sve/sve/vaznost/zagreb,%20karlova%C4%8Dka%20cesta%202b.html")
    page.locator("#find_desc").click()
    page.locator("#find_desc").click()
    page.locator("#find_desc").press("ArrowRight")
    page.locator("#find_desc").press("ArrowRight")
    page.locator("#find_desc").press("ArrowRight")
    page.locator("#find_desc").press("ArrowRight")
    page.locator("#find_desc").fill("Zagreb Cvijete Zuzorić 29")
    page.get_by_label("OSOBE", exact=True).check()
    page.locator("#submit_filter").click()
    page.locator("#find_desc").click()
    page.locator("#find_desc").fill("Zagreb Jaruščica 17")
    page.locator("#submit_filter").click()

    # ---------------------
    context.close()
    browser.close()


with sync_playwright() as playwright:
    run(playwright)

"""