# Importing Libraries

In [None]:
!pip install playwright tqdm nest_asyncio openpyxl

!playwright install --with-deps chromium

Collecting playwright
  Downloading playwright-1.52.0-py3-none-manylinux1_x86_64.whl.metadata (3.5 kB)
Collecting openpyxl
  Downloading openpyxl-3.1.5-py2.py3-none-any.whl.metadata (2.5 kB)
Collecting pyee<14,>=13 (from playwright)
  Downloading pyee-13.0.0-py3-none-any.whl.metadata (2.9 kB)
Collecting greenlet<4.0.0,>=3.1.1 (from playwright)
  Downloading greenlet-3.2.3-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (4.1 kB)
Collecting et-xmlfile (from openpyxl)
  Downloading et_xmlfile-2.0.0-py3-none-any.whl.metadata (2.7 kB)
Downloading playwright-1.52.0-py3-none-manylinux1_x86_64.whl (45.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.1/45.1 MB[0m [31m31.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading openpyxl-3.1.5-py2.py3-none-any.whl (250 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m250.9/250.9 kB[0m [31m16.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading greenlet-3.2.3-cp311-cp311-manylinux_2_24_x86_64.ma

In [None]:
import asyncio
import pandas as pd
from playwright.async_api import async_playwright, Page, BrowserContext
from tqdm.asyncio import tqdm_asyncio
import nest_asyncio
from google.colab import files
from google.colab import data_table

In [None]:
# Load the input table from a CSV file; main() later reads the job links from its "URL" column.
df = pd.read_csv("/content/df_hinhthuc_null.csv")

# Define functions for Industry and Type

In [None]:
# --- Helper Functions to Scrape Specific Details ---
# get_nganh_nghe_from_page and get_hinh_thuc_from_page each read one labelled field from a loaded job page.
# fetch_single_url_details and scrape_job_details_concurrently apply them to many URLs concurrently.
async def get_nganh_nghe_from_page(page: Page) -> str:
    """Return the link text of the page's 'Ngành nghề' entry, or "N/A".

    Two lookups are tried in order:

    1. The first ``<li>`` containing a ``<strong>`` whose text includes
       'Ngành nghề'; the value is the first ``p > a`` inside that ``<li>``.
    2. The ``<strong>`` label itself; the value is the first ``<a>`` inside
       either the first ``<p>`` of the label's ``<li>`` ancestor or, when the
       label has no ``<li>`` ancestor, the ``<p>`` sibling that follows it.

    A lookup only succeeds when the stripped link text is non-empty.

    Args:
        page: An already-navigated Playwright page.

    Returns:
        The stripped link text, or "N/A" when nothing usable is found or any
        exception is raised while querying (the exception is printed, not
        re-raised).
    """
    try:
        # Attempt 1: resolve the value straight from the labelled list item.
        entry = await page.query_selector("xpath=//li[.//strong[contains(normalize-space(.), 'Ngành nghề')]]")
        link = await entry.query_selector("p > a") if entry else None
        if link:
            industry = (await link.inner_text()).strip()
            if industry:
                return industry

        # Attempt 2: start from the label and walk to the paragraph holding the value.
        label = await page.query_selector("xpath=//strong[contains(normalize-space(.), 'Ngành nghề')]")
        if not label:
            return "N/A"

        enclosing_li = await label.query_selector("xpath=ancestor::li")
        if enclosing_li:
            paragraph = await enclosing_li.query_selector("p")
        else:
            # No list item around the label: use the paragraph right after it.
            paragraph = await label.query_selector("xpath=./following-sibling::p[1]")

        link = await paragraph.query_selector("a") if paragraph else None
        if link:
            industry = (await link.inner_text()).strip()
            if industry:
                return industry
    except Exception as e:
        # Best-effort scraping: report the problem and fall through to "N/A".
        print(f"LỖI (Ngành nghề) {page.url}: {e}")
    return "N/A"

async def get_hinh_thuc_from_page(page: Page) -> str:
    """Return the paragraph text of the page's 'Hình thức' entry, or "N/A".

    Two lookups are tried in order:

    1. The first ``<li>`` containing a ``<strong>`` whose text includes
       'Hình thức'; the value is the first ``<p>`` inside that ``<li>``.
    2. The ``<strong>`` label itself; the value is the first ``<p>`` of the
       label's ``<li>`` ancestor or, when the label has no ``<li>`` ancestor,
       the ``<p>`` sibling that follows it.

    A lookup only succeeds when the stripped paragraph text is non-empty.

    Args:
        page: An already-navigated Playwright page.

    Returns:
        The stripped paragraph text, or "N/A" when nothing usable is found or
        any exception is raised while querying (the exception is printed, not
        re-raised).
    """
    try:
        # Attempt 1: resolve the value straight from the labelled list item.
        entry = await page.query_selector("xpath=//li[.//strong[contains(normalize-space(.), 'Hình thức')]]")
        paragraph = await entry.query_selector("p") if entry else None
        if paragraph:
            job_type = (await paragraph.inner_text()).strip()
            if job_type:
                return job_type

        # Attempt 2: start from the label and walk to the paragraph holding the value.
        label = await page.query_selector("xpath=//strong[contains(normalize-space(.), 'Hình thức')]")
        if not label:
            return "N/A"

        enclosing_li = await label.query_selector("xpath=ancestor::li")
        if enclosing_li:
            paragraph = await enclosing_li.query_selector("p")
        else:
            # No list item around the label: use the paragraph right after it.
            paragraph = await label.query_selector("xpath=./following-sibling::p[1]")

        if paragraph:
            job_type = (await paragraph.inner_text()).strip()
            if job_type:
                return job_type
    except Exception as e:
        # Best-effort scraping: report the problem and fall through to "N/A".
        print(f"LỖI (Hình thức) {page.url}: {e}")
    return "N/A"

async def fetch_single_url_details(url: str, context: BrowserContext, semaphore: asyncio.Semaphore) -> dict:
    """Scrape 'Ngành nghề' and 'Hình thức' for one job URL.

    The worker opens its own page in ``context``, navigates to ``url``, reads
    both fields and closes the page again. It never raises for a per-URL
    failure: problems are reported in the returned row so that one bad URL
    cannot abort a whole batch.

    Args:
        url: Job page address. Anything that is not a string starting with
            "http" is rejected without touching the browser.
        context: Browser context in which the temporary page is created.
        semaphore: Limits how many workers hold an open page at once.

    Returns:
        A dict with the keys "URL", "NganhNghe" and "HinhThuc". Both value
        fields are "Invalid URL" for a rejected input, or "Lỗi: ..." followed
        by the first 50 characters of the error message when page creation,
        routing, navigation or extraction raised.
    """
    # Reject unusable inputs before taking a semaphore slot; no page is needed for them.
    if not isinstance(url, str) or not url.startswith("http"):
        print(f"URL không hợp lệ, bỏ qua: {url}")
        return {"URL": url, "NganhNghe": "Invalid URL", "HinhThuc": "Invalid URL"}

    nganh_nghe = "N/A"
    hinh_thuc = "N/A"

    async with semaphore:  # Acquire a slot before creating a page
        page = None
        try:
            page = await context.new_page()
            # Abort image, stylesheet and font requests for speed.
            await page.route("**/*.{png,jpg,jpeg,gif,webp,css,woff,woff2,svg}", lambda route: route.abort())

            # "domcontentloaded" returns earlier than waiting for the full load event.
            await page.goto(url, wait_until="domcontentloaded", timeout=45000)

            nganh_nghe = await get_nganh_nghe_from_page(page)
            hinh_thuc = await get_hinh_thuc_from_page(page)
        except Exception as e:
            print(f"Lỗi nghiêm trọng khi xử lý URL {url}: {e}")
            nganh_nghe = f"Lỗi: {str(e)[:50]}"
            hinh_thuc = f"Lỗi: {str(e)[:50]}"
        finally:
            if page is not None:
                try:
                    await page.close()
                except Exception as close_error:
                    # A failing close (e.g. after the page already died) must not
                    # escape the worker: the caller awaits every worker and would
                    # otherwise lose the rows collected so far.
                    print(f"Không thể đóng trang cho URL {url}: {close_error}")
            # Short pause while still holding the slot, to go easy on the server.
            await asyncio.sleep(0.5)

    return {"URL": url, "NganhNghe": nganh_nghe, "HinhThuc": hinh_thuc}

async def scrape_job_details_concurrently(job_urls: list, concurrency: int = 5) -> pd.DataFrame:
    """Scrape 'Ngành nghề' and 'Hình thức' for a list of job URLs concurrently.

    One headless Chromium browser and one browser context are shared by all
    workers; each worker (``fetch_single_url_details``) opens its own page.

    Args:
        job_urls: Job page addresses to visit.
        concurrency: Maximum number of workers holding an open page at once.
            Must be at least 1.

    Returns:
        A DataFrame with the columns "URL", "NganhNghe" and "HinhThuc", one
        row per input URL, in completion order (not input order). An empty
        input yields an empty frame with the same columns.

    Raises:
        ValueError: If ``concurrency`` is smaller than 1.
    """
    # A semaphore created with 0 never admits a worker, so the run would hang forever.
    if concurrency < 1:
        raise ValueError(f"concurrency must be >= 1, got {concurrency}")

    result_columns = ["URL", "NganhNghe", "HinhThuc"]
    job_urls = list(job_urls)
    if not job_urls:
        # Nothing to visit: skip the browser launch and keep the schema stable.
        return pd.DataFrame(columns=result_columns)

    user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36"
    semaphore = asyncio.Semaphore(concurrency)  # Limit concurrent workers
    results = []

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        try:
            context = await browser.new_context(user_agent=user_agent, java_script_enabled=True)
            try:
                # Workers create their own pages, so no page is opened here.
                tasks = [fetch_single_url_details(url, context, semaphore) for url in job_urls]

                # as_completed yields awaitables in finishing order; tqdm counts them as they are consumed.
                for future in tqdm_asyncio(asyncio.as_completed(tasks), total=len(tasks), desc="Scraping job details"):
                    results.append(await future)
            finally:
                await context.close()  # Close the context before the browser
        finally:
            await browser.close()

    return pd.DataFrame(results, columns=result_columns)


In [None]:
# --- Main Execution ---
async def main():
    """Scrape every unique URL in the global ``df`` and save the result to Excel.

    Reads the non-null, de-duplicated values of ``df["URL"]``, passes them to
    ``scrape_job_details_concurrently`` and, when rows come back, writes them
    to an .xlsx file in the current working directory and then renders them as
    an interactive Colab table. The file is written before the table is shown
    so that a display problem cannot discard the scraped rows.

    Returns:
        None. Progress and outcome are reported through ``print``.
    """
    # The DataFrame 'df' is loaded by an earlier cell.
    if df.empty or 'URL' not in df.columns:
        print("DataFrame 'df' is empty or does not contain a 'URL' column. Exiting.")
        return

    # Unique, non-null URLs only. For a quick trial run, slice this list
    # (e.g. list_of_job_urls[:20]) before scraping.
    list_of_job_urls = df["URL"].dropna().unique().tolist()

    if not list_of_job_urls:
        print("Danh sách URL trống sau khi lọc. Vui lòng cung cấp URL để cào dữ liệu.")
        return

    print(f"Bắt đầu cào dữ liệu cho {len(list_of_job_urls)} URLs...")

    # Number of pages open at once. 3-5 is a cautious starting point on Colab;
    # higher values may be faster but can also lead to crashes or blocks.
    CONCURRENCY_LEVEL = 4
    detailed_df = await scrape_job_details_concurrently(list_of_job_urls, concurrency=CONCURRENCY_LEVEL)

    print("\n--- Kết quả cào dữ liệu ---")
    if detailed_df.empty:
        print("Không có dữ liệu nào được cào.")
        return

    # Persist first: the scrape is the expensive step, the preview is optional.
    output_filename = "careerviet_nganh_nghe_hinh_thuc_CONCURRENT.xlsx"
    detailed_df.to_excel(output_filename, index=False, engine='openpyxl')
    print(f"\nDữ liệu đã được lưu vào file: {output_filename}")

    # data_table comes from the notebook's import cell.
    display(data_table.DataTable(detailed_df, include_index=False, num_rows_per_page=10))

In [None]:
# Run the scraping pipeline; a top-level `await` is accepted directly in an IPython/Jupyter cell.
await main()

Bắt đầu cào dữ liệu cho 1521 URLs...


Scraping job details:  26%|██▌       | 389/1521 [06:29<08:00,  2.35it/s]

Lỗi nghiêm trọng khi xử lý URL https://careerviet.vn/vi/tim-viec-lam/chuyen-vien-ngan-hang-dau-tu-m-a.35C464F5.html: Page.goto: Timeout 45000ms exceeded.
Call log:
  - navigating to "https://careerviet.vn/vi/tim-viec-lam/chuyen-vien-ngan-hang-dau-tu-m-a.35C464F5.html", waiting until "domcontentloaded"



Scraping job details:  64%|██████▎   | 966/1521 [08:46<02:17,  4.04it/s]