# In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import time
import os
import asyncio
from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError

# --- Configuration ---
# CSV read by main(); its 'CIN' column supplies the identifiers to look up.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
INPUT_LOG_FILE = r"C:\Users\jyoth\Downloads\Master_data\log.csv"
# CSV written by main() with the scraped records (relative to the working directory).
OUTPUT_CSV_FILE = "enriched_dataset_automated.csv"
# Page opened before every search; the CIN is typed into its "#searchid" field.
ZAUBACORP_URL = "https://www.zaubacorp.com/"

async def enrich_cin_with_playwright(page, cin: str):
    """
    Automates the scraping of a single CIN from ZaubaCorp.

    Opens ZAUBACORP_URL on the given Playwright page, searches for the CIN,
    follows the matching search result when the search does not land on a
    company page directly, and parses the resulting HTML with BeautifulSoup.

    Args:
        page: An open Playwright (async API) page, reused across calls.
        cin: The identifier typed into the site's search box.

    Returns:
        A list of dicts, one per extracted value, each with the keys
        COMPANY_NAME, STATE, STATUS, SOURCE, FIELD, VALUE and SOURCE_URL.
        FIELD is 'Registered_address' or 'Director_name'. The list is empty
        when nothing was found, on a Playwright timeout, or on any other
        exception (errors are printed, never raised).
    """
    enriched_records = []
    print(f"Processing CIN: {cin}...")

    try:
        await page.goto(ZAUBACORP_URL, timeout=60000)
        await page.fill("#searchid", cin)
        await page.click("button.search-button")
        await page.wait_for_load_state("networkidle")

        if "/company/" not in page.url:
            print("  -> Search results page detected. Clicking link...")
            # Pass the CIN as has_text instead of splicing it into the selector
            # so quotes in the value cannot break it, and take the first match
            # so several matching links do not trigger a strict-mode error.
            await page.locator("a", has_text=cin).first.click()
            # The results page may already contain an <h1>; wait until the URL
            # is really a company page before reading anything from it.
            await page.wait_for_url(lambda url: "/company/" in url, timeout=30000)

        await page.wait_for_selector("h1", timeout=30000)

        company_page_url = page.url
        print(f"  -> Scraping page: {company_page_url}")

        page_source = await page.content()
        soup = BeautifulSoup(page_source, 'html.parser')

        # --- Data Extraction ---
        heading = soup.find('h1')
        company_name = heading.text.strip() if heading else 'N/A'

        status, state = 'N/A', 'N/A'
        company_info_table = soup.find('table', class_='table-striped')
        if company_info_table:
            for row in company_info_table.find_all('tr'):
                cells = row.find_all('td')
                # Only label/value rows (exactly two cells) are considered.
                if len(cells) == 2:
                    if 'Company Status' in cells[0].text:
                        status = cells[1].text.strip()
                    if 'Registered State' in cells[0].text:
                        state = cells[1].text.strip()

        def _record(field, value):
            # Every record repeats the company-level columns next to its own value.
            return {'COMPANY_NAME': company_name, 'STATE': state, 'STATUS': status, 'SOURCE': 'ZaubaCorp', 'FIELD': field, 'VALUE': value, 'SOURCE_URL': company_page_url}

        # The address is taken from the <p> that follows the "Registered Address" label.
        address_label = soup.find('p', string='Registered Address')
        address_p = address_label.find_next_sibling('p') if address_label else None
        if address_p:
            address = address_p.text.strip().replace('\n', ', ')
            enriched_records.append(_record('Registered_address', address))

        directors_h2 = soup.find('h2', string='Director Details')
        directors_table = directors_h2.find_next_sibling('table') if directors_h2 else None
        if directors_table:
            # The first row is skipped (presumably the header row).
            for row in directors_table.find_all('tr')[1:]:
                cells = row.find_all('td')
                if len(cells) > 1:
                    # Assumes the director's name is in the second column — TODO confirm.
                    enriched_records.append(_record('Director_name', cells[1].text.strip()))

        return enriched_records

    except PlaywrightTimeoutError:
        print(f"  -> ❌ Timed out for CIN '{cin}'. Might be a CAPTCHA or page load issue.")
        return []
    except Exception as e:
        # Best-effort scraping: report the failure and let the caller continue.
        print(f"  -> ❌ An unexpected error occurred for {cin}: {e}")
        return []

async def main():
    """
    Main function to orchestrate the scraping process.

    Reads the unique, non-null values of the 'CIN' column from INPUT_LOG_FILE,
    scrapes each one sequentially on a single Chromium page via
    enrich_cin_with_playwright, and writes all collected records to
    OUTPUT_CSV_FILE with the originating CIN as the first column.

    Returns early with a printed error when the input file is missing or has
    no 'CIN' column. Nothing is written when no record could be collected.
    """
    if not os.path.exists(INPUT_LOG_FILE):
        print(f"❌ Error: Input file '{INPUT_LOG_FILE}' not found.")
        return

    changes_df = pd.read_csv(INPUT_LOG_FILE, dtype=str)
    if 'CIN' not in changes_df.columns:
        # Report the problem instead of failing with a bare KeyError below.
        print(f"❌ Error: Input file '{INPUT_LOG_FILE}' has no 'CIN' column.")
        return

    unique_cins = changes_df['CIN'].dropna().unique().tolist()
    print(f"Found {len(unique_cins)} unique CINs to enrich.")
    all_enriched_data = []

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=False)
        try:
            page = await browser.new_page()

            for cin in unique_cins:
                enriched_info = await enrich_cin_with_playwright(page, cin)
                # Tag each record with the CIN that was searched for; parsing it
                # back out of SOURCE_URL breaks on trailing slashes, query
                # strings, or URL slugs that are not exactly the CIN.
                all_enriched_data.extend({'CIN': cin, **record} for record in enriched_info)
                # Short pause between lookups.
                await page.wait_for_timeout(1000)
        finally:
            # Close the browser even if the loop is interrupted.
            await browser.close()

    if all_enriched_data:
        enriched_df = pd.DataFrame(all_enriched_data)

        enriched_df.to_csv(OUTPUT_CSV_FILE, index=False)
        print(f"\n✅ Enrichment complete! Data saved to '{OUTPUT_CSV_FILE}'")
    else:
        print("\nCould not enrich any data.")