# In [None]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import time
import os
import json
import csv

def _split_address(alamat_full):
    """Split a comma-separated address string into its positional parts.

    Components are assigned by their position counted from the end of the
    string: the last one is the province, then city, then kecamatan, then
    kelurahan; whatever precedes them is the street address.  Shorter
    addresses fill the province/city/kecamatan slots first and leave the
    remaining ones empty.

    Returns:
        A tuple ``(alamat, kelurahan, kecamatan, kota, provinsi)`` of strings.
    """
    parts = [x.strip() for x in alamat_full.split(',')]
    alamat = kelurahan = kecamatan = kota = provinsi = ""

    count = len(parts)
    if count >= 5:
        *street, kelurahan, kecamatan, kota, provinsi = parts
        alamat = ', '.join(street)
    elif count == 4:
        alamat, kecamatan, kota, provinsi = parts
    elif count == 3:
        alamat, kota, provinsi = parts
    elif count == 2:
        alamat, provinsi = parts
    elif count == 1:
        alamat = parts[0]

    return alamat, kelurahan, kecamatan, kota, provinsi


def _parse_row(row):
    """Turn one table row into a record dict, or None if required cells are missing."""
    nomor_cell = row.select_one("td:nth-child(1)")
    if not nomor_cell:
        return None
    # Remove dot from nomor
    nomor = nomor_cell.text.strip().replace('.', '')

    perusahaan_cell = row.select_one("td:nth-child(2) b")
    if not perusahaan_cell:
        return None
    perusahaan = perusahaan_cell.text.strip()

    alamat_cell = row.select_one("td:nth-child(2)")
    if not alamat_cell:
        return None

    kbli_cell = row.select_one("td:nth-child(3)")
    kbli = kbli_cell.text.strip() if kbli_cell else ""

    # Break the second cell into its text fragments, dropping empty ones
    # and the fragment that is just the company name.
    fragments = [
        part.strip()
        for part in alamat_cell.get_text(separator='|').split('|')
        if part.strip()
    ]
    fragments = [p for p in fragments if p != perusahaan]

    # Any fragment mentioning "Telp" is the phone line (the last one wins);
    # everything else is joined back into the full address.
    telepon = ""
    address_bits = []
    for part in fragments:
        if "Telp" in part:
            telepon = part
        else:
            address_bits.append(part)
    alamat_full = " ".join(address_bits)

    alamat, kelurahan, kecamatan, kota, provinsi = _split_address(alamat_full)

    return {
        "No": nomor,
        "Perusahaan": perusahaan,
        "Alamat": alamat,
        "Kelurahan": kelurahan,
        "Kecamatan": kecamatan,
        "Kota": kota,
        "Provinsi": provinsi,
        "Telepon": telepon,
        "KBLI": kbli,
    }


def scrape_page(url, timeout=30):
    """Fetch one listing page and parse the rows of its ``#newspaper-a`` table.

    Sleeps 3 seconds before the request, then tries the request up to five
    times with a 3 second pause between failed attempts.

    Args:
        url: Page URL to fetch.
        timeout: Seconds to wait for the server before an attempt is treated
            as failed.  Without a timeout a stalled connection would block
            forever and the retry loop would never get a chance to run.

    Returns:
        ``(None, None)`` if every attempt failed, ``([], soup)`` if the page
        has no table rows, otherwise ``(records, soup)`` where ``records`` is
        a list of dicts with the keys No, Perusahaan, Alamat, Kelurahan,
        Kecamatan, Kota, Provinsi, Telepon and KBLI, and ``soup`` is the
        parsed page.
    """
    # Add delay between requests
    time.sleep(3)

    # Configure headers
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
    }

    # Make request with retries
    max_retries = 5
    for attempt in range(max_retries):
        try:
            response = requests.get(url, headers=headers, timeout=timeout)
            response.raise_for_status()
            break
        except requests.exceptions.RequestException as e:
            if attempt == max_retries - 1:
                print(f"Failed after {max_retries} attempts: {e}")
                return None, None
            print(f"Attempt {attempt + 1} failed, retrying...")
            time.sleep(3)

    soup = BeautifulSoup(response.text, 'html.parser')

    # Extract data from table
    table_rows = soup.select("#newspaper-a tbody tr")
    if not table_rows:
        print(f"No data found on page: {url}")
        return [], soup

    data = []
    for row in table_rows:
        # Best effort: one malformed row must not discard the rest of the page.
        try:
            record = _parse_row(row)
        except Exception as e:
            print(f"Error processing row: {e}")
            continue
        if record is not None:
            data.append(record)

    return data, soup

def save_progress(current_province_index, current_url, record_count):
    """Persist the scraper's current position to ``output/scraping_progress.json``.

    Args:
        current_province_index: Index into the list returned by
            get_provinces() of the province being processed.
        current_url: URL of the next page to scrape, or None.
        record_count: Number of records written so far.

    The file is written to a temporary sibling first and then moved into
    place, so an interruption in the middle of the write cannot leave a
    truncated progress file behind.
    """
    progress = {
        "current_province_index": current_province_index,
        "current_url": current_url,
        "record_count": record_count,
        "last_updated": time.strftime("%Y-%m-%d %H:%M:%S")
    }

    progress_path = "output/scraping_progress.json"
    temp_path = progress_path + ".tmp"

    # Make sure the target directory exists even when called on its own
    os.makedirs("output", exist_ok=True)

    # Write the new state next to the real file, then swap it in atomically
    with open(temp_path, "w") as f:
        json.dump(progress, f, indent=2)
    os.replace(temp_path, progress_path)

    # Get province name for logging
    provinces = get_provinces()
    province_name = provinces[current_province_index]["name"] if current_province_index < len(provinces) else "Completed"
    print(f"Progress saved. Current province: {province_name}, Records: {record_count}")

def load_progress():
    """Read the saved progress from ``output/scraping_progress.json``.

    Returns:
        The decoded JSON content, or None when the file does not exist or
        when reading it (or printing its ``last_updated`` entry) fails.
    """
    progress_path = "output/scraping_progress.json"

    if not os.path.exists(progress_path):
        return None

    try:
        with open(progress_path, "r") as handle:
            saved = json.load(handle)

        print(f"Resuming from previous session: {saved['last_updated']}")
    except Exception as e:
        # Any failure is reported and treated as "no usable progress"
        print(f"Error loading progress file: {e}")
        return None

    return saved

def get_provinces():
    """Return the provinces to scrape, in processing order.

    Each entry is a new dict with a ``name`` and a ``code``; the code is the
    value placed in the ``prov`` query parameter of the listing URL.  The
    list is kept in this one place so every caller sees the same order.
    """
    province_table = (
        ("Bali", "0bCtGgIPKU5sHVP-I3RJR_zaGkICRuxrBuLF8pn6okw,"),
        ("Banten", "szh3Nx9NmSTOTpqeCuh7rOYNcZov8Oricx3WNJaMJkg,"),
        ("Bengkulu", "7PXdLsCfLTabVYR-HdFCQ1lgobeRW7wKz4lderTiLAA,"),
        ("DI Yogyakarta", "YaPZnRqzpP2obO5M2vJBT-05qeMzPo7KQSLLhi4zW28,"),
        ("DKI Jakarta", "JWlMr9dBZoh4NcRGhLV2lw1ZzJLjyJbE3zQxksmdgRg,"),
        ("Gorontalo", "lj03S7Gb2eruSZuL8ba_7yLcdYZJ-4LWKyIXOfQRw_Y,"),
        ("Jambi", "sY3Wej_tKISCKJBFQVPbgxGBtclmA1CBsP2XzHLIQnE,"),
        ("Jawa Barat", "g-g92cJf63GcZzFru_hX80HG3NA95zwE5tWTVGAI5xY,"),
        ("Jawa Tengah", "JQYaw_F3IWxjLT5vFsXUh6CwfCBsw3zUdgJGGaNtqc0,"),
        ("Jawa Timur", "HJCA4sCEb2EHadmM-d2MTdZLHIqCSlODwIEaR0IZuz0,"),
        ("Kalimantan Barat", "IxrCrXAp3g1zI59mewRpTAmQPCTS890Aict2e5UMFTc,"),
        ("Kalimantan Selatan", "lNWKArPtvRuMF1NSp4iFoz2d5bnuzIiwQ2vkyHhak_0,"),
        ("Kalimantan Tengah", "IBaS1sFWk7P4GBK8cUj0wJQECSfN05EH7hE3fSYvcsI,"),
        ("Kalimantan Timur", "mtGQZOTaw8Pwbfknh9NnVNXHPjm3AYU24il3lqoHsKY,"),
        ("Kalimantan Utara", "9l2EyMtxPzG7DrI1BYbqzO2dsYxmSrsR-F5mUuznkUI,"),
        ("Kepulauan Bangka Belitung", "qPoGfR2CPXOIzHC-qtcv3dZPViN_U5d_HfvBDijVZDU,"),
        ("Kepulauan Riau", "d_6LRe0K-myaHNofmityPTFUXnCdgWUDYPbIwv41Lpo,"),
        ("Lampung", "EtDhV24I_AgUTjwdPnaAJCuEdnS41hZIYUPlkeLCd04,"),
        ("Luar Negeri", "QAjmoLvOjrGxggCMLV73K4-Z1v3XRBEIOjJ5r4KRRII,"),
        ("Maluku", "zNm24FybEw4j4JVWEbA14S2YgBKEkGiHknYNpZNps88,"),
        ("Maluku Utara", "rY-Vee9aPuuq68KevO_FPXkNFm_LAavy4LvMDC86yf0,"),
        ("Nangroe Aceh Darussalam", "qH6KCKkWFhD8UGv2rwsLvzOk-778LwSl9O2-qCSP-x8,"),
        ("Nusa Tenggara Barat", "DRelepyjdDkPntAag94avf-ou4ALhHIRkWctL4k3cxk,"),
        ("Nusa Tenggara Timur", "mC-0BzjjZqLYu-Kkc9Rg4j15GLTQBzgR3uIgp3mgKSg,"),
        ("Papua", "RdPXmu5cadJ7SQLmAalkvlc1qJA4XYziZUJL6kLsePs,"),
        ("Papua Barat", "of0LzbEMgs5YXxmpnXy5hL9vkPwtQy-j8nlEBIjZMy4,"),
        ("Papua Barat Daya", "jNG3dvTuNYIoPv1m8HWs-vghiRVIYvMkKg18O9kKAyo,"),
        ("Papua Pegunungan", "9BGr8lSPa1IfmvK8n-rRQjN7e3QdCzLKpvCQetCwT2Q,"),
        ("Papua Selatan", "ub8mQqHT259JPsWjDrZxfJNswELNe2ODk4No391hiKM,"),
        ("Papua Tengah", "4kld4CXPtKFdCyuTe_2kU8EIn5s-TipyTDFQ8RGHcHo,"),
        ("Riau", "h5kdoWTdkKA6SJRYKVGHBzC_kzW74s0P5uSTAZ3Qeqs,"),
        ("Sulawesi Barat", "uyqjnkrb04MuauY79cEh_jydyZPiKrNgd__nBrN89W0,"),
        ("Sulawesi Selatan", "NgxlXm4O9RisBSPbBxFCkU3EICDeSjBUOKkjQSYLo2c,"),
        ("Sulawesi Tengah", "75xafGvwr11mzUWpufY1t8OR4CJHbvT1cfvt7N7sO10,"),
        ("Sulawesi Tenggara", "AJ1mml5HqF88qgIV7PYpX4zyWJ7G0jCWodfIKonUyPM,"),
        ("Sulawesi Utara", "wALq168Ez0AZMs5Nc3aoSM5Zr-E0jM0Z_hYKkl-i2Zo,"),
        ("Sumatera Barat", "dH1L8Mh_lQt9ynweGqgAQt06SfXnOsW51ueJX7uv5Kg,"),
        ("Sumatera Selatan", "LT6Nl1-93HfnTOoo8L8ysPSKCRwmEnlqszx6HfH2tdk,"),
        ("Sumatera Utara", "ehCDQCBFt5DYprYqIgwr1h-bqWX8nCstPP1LL3NuLXY,"),
    )

    # Build fresh dicts on every call so callers never share mutable state
    return [{"name": label, "code": token} for label, token in province_table]

def is_scraping_completed(progress):
    """Tell whether a loaded progress record describes a finished run.

    Returns False for a missing/empty record.  Otherwise the run counts as
    finished when the saved province index is past the end of the province
    list, or when it points at the last province and no next URL was saved.
    """
    if not progress:
        return False

    province_total = len(get_provinces())
    saved_index = progress.get("current_province_index", 0)

    # Index beyond the final province: nothing left to process
    if saved_index >= province_total:
        return True

    # On the final province with no pending URL: treated as complete
    no_pending_url = progress.get("current_url") is None
    return no_pending_url and saved_index == province_total - 1

def main():
    """Scrape every province's company listing into ``output/kemenperin.csv``.

    Walks the provinces from get_provinces() in order, following each
    province's pagination links.  Every page's records are appended to the
    CSV and the position is saved with save_progress() after each page, so a
    later run can pick up from the saved province and URL.  A run whose saved
    progress is already complete returns immediately.  On Ctrl-C or any other
    error the current position is saved before returning.
    """
    base_url = "https://kemenperin.go.id/direktori-perusahaan"
    output_file = "output/kemenperin.csv"

    # Create output directory
    os.makedirs("output", exist_ok=True)

    # A zero-byte file is treated like a missing one so it still receives a
    # header row and is not counted as holding -1 records.
    file_exists = os.path.exists(output_file) and os.path.getsize(output_file) > 0
    record_count = 0

    # Count existing records if file exists
    if file_exists:
        try:
            with open(output_file, 'r', encoding='utf-8') as f:
                record_count = sum(1 for _ in f) - 1  # Subtract 1 for header
            print(f"Found existing file with {record_count} records")
        except Exception as e:
            print(f"Error counting existing records: {e}")
            record_count = 0

    # Get provinces list
    provinces = get_provinces()

    # Check for saved progress
    progress = load_progress()

    # Check if scraping is already completed
    if is_scraping_completed(progress):
        print("Scraping was already completed in the previous run.")
        print(f"Last update: {progress.get('last_updated')}")
        print(f"Total records: {progress.get('record_count', record_count)}")
        return

    current_province_index = 0
    current_url = None

    if progress:
        current_province_index = progress.get("current_province_index", 0)
        current_url = progress.get("current_url")
        record_count = progress.get("record_count", record_count)

        # Check if current_province_index is valid
        if current_province_index < len(provinces):
            print(f"Resuming with province: {provinces[current_province_index]['name']}")
            if current_url:
                print(f"Starting URL: {current_url}")
            else:
                current_url = f"{base_url}?what=&prov={provinces[current_province_index]['code']}"
                print(f"No URL saved, starting with: {current_url}")
        else:
            # Index out of range, reset to start with the first province
            print("Saved province index is out of range. Starting from the beginning.")
            current_province_index = 0
            current_url = f"{base_url}?what=&prov={provinces[0]['code']}"
    else:
        # No progress file, start with first province
        current_url = f"{base_url}?what=&prov={provinces[0]['code']}"

    # Define headers for CSV
    headers = ["No", "Perusahaan", "Alamat", "Kelurahan", "Kecamatan", "Kota", "Provinsi", "Telepon", "KBLI"]

    # Write headers if file doesn't exist
    if not file_exists:
        with open(output_file, mode='w', newline='', encoding='utf-8') as file:
            writer = csv.writer(file)
            writer.writerow(headers)

    # Bind `url` before entering the try block: the handlers below save it,
    # and an interrupt arriving before the loop's first assignment would
    # otherwise raise NameError inside the handler.
    url = current_url

    try:
        # Process each province starting from the saved position
        for i in range(current_province_index, len(provinces)):
            province = provinces[i]
            print(f"\nProcessing province: {province['name']}")

            # Set the starting URL
            if i == current_province_index and current_url:
                url = current_url
            else:
                url = f"{base_url}?what=&prov={province['code']}"

            while url:
                print(f"Scraping: {url}")
                page_data, soup = scrape_page(url)

                # Check if soup is None (request failed)
                if soup is None:
                    print(f"Failed to get data for {url}, skipping to next province")
                    break

                if page_data:
                    # Append data to CSV file directly; the header list
                    # doubles as the column order for each record.
                    with open(output_file, mode='a', newline='', encoding='utf-8') as file:
                        writer = csv.writer(file)
                        writer.writerows(
                            [item[column] for column in headers]
                            for item in page_data
                        )

                    record_count += len(page_data)
                    print(f"Scraped {len(page_data)} records. Total records: {record_count}")

                    # Get next page URL: the <li> following the active
                    # pagination item, if it carries a link.
                    try:
                        active_item = soup.select_one('.pagination li.active')

                        if active_item and active_item.find_next_sibling('li'):
                            next_item = active_item.find_next_sibling('li')
                            next_link = next_item.find('a')
                            if next_link and next_link.get('href'):
                                next_href = next_link['href']
                                # Drop the first occurrence of the path
                                # segment already present at the end of
                                # base_url before appending the rest.
                                next_href = next_href.replace('direktori-perusahaan', '', 1)
                                url = base_url + next_href
                            else:
                                url = None
                        else:
                            url = None
                    except Exception as e:
                        print(f"Error finding next page: {e}")
                        url = None
                else:
                    print("No data found on page, moving to next province")
                    url = None

                # Save progress after each page
                save_progress(i, url, record_count)

                # Add delay between pages
                if url:
                    print(f"Next URL: {url}")
                    time.sleep(2)

            # Update progress when moving to next province
            current_province_index = i + 1
            if current_province_index < len(provinces):
                current_url = f"{base_url}?what=&prov={provinces[current_province_index]['code']}"
                save_progress(current_province_index, current_url, record_count)

            # Add delay between provinces
            time.sleep(5)

        print("Scraping completed successfully!")

    except KeyboardInterrupt:
        print("\nScraping interrupted by user.")
        save_progress(current_province_index, url, record_count)
        print("Progress saved. You can resume later.")
    except Exception as e:
        print(f"\nError occurred: {e}")
        save_progress(current_province_index, url, record_count)
        print("Progress saved despite error.")

# Run the scraper only when this file is executed directly, not when imported.
if __name__ == "__main__":
    main()