In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
import time, os

# === SELENIUM SETUP ===
options = Options()
options.add_argument("--start-maximized")  # show the browser window (non-headless for now)
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

url_awal = "https://katalog.inaproc.id/search?keyword=penerangan+jalan+umum"
TOTAL_HALAMAN = 45  # number of result pages to walk through

produk_list = []     # rows collected from every page scraped so far
halaman_selesai = 0  # last page that was fully scraped and saved
os.makedirs("data_scrape_final2", exist_ok=True)


def _teks_elemen(block, sel):
    """Return the text of the first descendant of `block` matching CSS selector `sel`.

    Returns "" when the lookup fails (e.g. the card has no such element), so a
    missing field never aborts the scrape. Only `Exception` is caught so that
    Ctrl-C (KeyboardInterrupt) still stops the run.
    """
    try:
        return block.find_element(By.CSS_SELECTOR, sel).text
    except Exception:
        return ""


# try/finally guarantees the browser is closed even when a wait times out or
# the run is interrupted; the per-page CSVs written so far stay on disk.
try:
    driver.get(url_awal)
    time.sleep(5)

    # === LOOP OVER ALL PAGES ===
    for p in range(1, TOTAL_HALAMAN + 1):
        print(f"\nüåê Sedang di halaman {p}")

        # Wait until the first product card is present in the DOM.
        # NOTE(review): after a page change the previous page's cards may still
        # satisfy this wait; only the fixed sleep after the click guards against
        # re-reading the old page -- confirm against the site's behaviour.
        WebDriverWait(driver, 30).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "a[data-score]"))
        )
        time.sleep(2)

        # Scroll to the bottom repeatedly until the number of cards stops growing
        # (or max_scroll attempts have been made).
        last_count = 0
        max_scroll = 15
        for s in range(max_scroll):
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(1.5)
            produk_blocks = driver.find_elements(By.CSS_SELECTOR, "a[data-score]")
            current_count = len(produk_blocks)
            print(f"üîÑ Scroll {s+1}: {current_count} produk tampil")

            # No new products appeared since the previous scroll -> stop scrolling.
            if current_count == last_count:
                break
            last_count = current_count

        produk_blocks = driver.find_elements(By.CSS_SELECTOR, "a[data-score]")
        print(f"‚úÖ Total {len(produk_blocks)} produk ditemukan di halaman {p}")

        # === EXTRACT DATA ===
        halaman_data = []
        for block in produk_blocks:
            nama = _teks_elemen(block, ".line-clamp-2.text-sm.text-tertiary500")
            harga = _teks_elemen(block, ".w-fit.truncate.text-sm.font-bold.text-tertiary500 div")
            label_produk = " | ".join(
                [x.text for x in block.find_elements(By.CSS_SELECTOR, ".flex.flex-wrap.gap-1 div")]
            )
            penyedia = _teks_elemen(block, ".bg-information25.px-1.font-semibold.text-information500")
            lokasi = _teks_elemen(block, ".h-4.cursor-pointer.overflow-hidden.text-tertiary300 span:nth-child(1)")
            vendor = _teks_elemen(block, ".h-4.cursor-pointer.overflow-hidden.text-tertiary300 span:nth-child(2)")
            terjual = _teks_elemen(block, ".text-xs.text-tertiary300 div.ml-1")
            kategori = _teks_elemen(block, ".rounded-2.bg-information25.px-2.py-1.text-xs.font-semibold.text-information500")
            # An empty status badge is recorded as "Aktif".
            status = _teks_elemen(block, ".rounded-2.bg-tertiary500.p-1.text-xs.font-semibold.text-warning25") or "Aktif"
            href = block.get_attribute("href")

            # NOTE(review): the "Vendor " key carries a trailing space, which becomes
            # the CSV column name; kept unchanged so existing outputs stay compatible.
            halaman_data.append({
                "Halaman": p, "Nama Produk": nama, "Harga": harga,
                "Label Produk": label_produk, "Jenis Penyedia": penyedia,
                "Lokasi": lokasi, "Vendor ": vendor, "Jumlah Terjual": terjual,
                "Kategori": kategori, "Status Produk": status, "URL": href
            })

        # === SAVE PER PAGE ===
        df_page = pd.DataFrame(halaman_data)
        df_page.to_csv(f"data_scrape_final2/halaman_{p}.csv", index=False, encoding="utf-8-sig")
        print(f"üíæ halaman_{p}.csv disimpan ({len(halaman_data)} produk)")
        produk_list.extend(halaman_data)
        halaman_selesai = p

        # === SAVE A CUMULATIVE BACKUP EVERY 10 PAGES ===
        if p % 10 == 0:
            df_backup = pd.DataFrame(produk_list)
            df_backup.to_csv(f"data_scrape_final2/backup_sampai_halaman_{p}.csv", index=False, encoding="utf-8-sig")
            print(f"üß© Backup disimpan: data_scrape_final2/backup_sampai_halaman_{p}.csv")

        # === CLICK THE NEXT PAGE (numbered button at the bottom) ===
        if p < TOTAL_HALAMAN:
            try:
                driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                time.sleep(1)
                next_button = driver.find_element(By.XPATH, f"//button[normalize-space(text())='{p+1}']")
                driver.execute_script("arguments[0].scrollIntoView(true);", next_button)
                time.sleep(0.5)
                # Click through JavaScript rather than next_button.click().
                driver.execute_script("arguments[0].click();", next_button)
                print(f"‚û°Ô∏è Klik tombol halaman {p+1}")
                time.sleep(5)
            except Exception as e:
                # Without the next-page button the scrape cannot continue; keep
                # what has been collected and stop.
                print(f"‚ö†Ô∏è Gagal klik halaman {p+1}: {e}")
                break
finally:
    driver.quit()

# === SAVE THE COMBINED RESULT ===
df_final = pd.DataFrame(produk_list)
df_final.to_csv("data_scrape_final2/penerangan_jalan_umum_full.csv", index=False, encoding="utf-8-sig")
# Report the number of pages actually scraped (the loop may have stopped early).
print(f"\n Selesai! Total {len(df_final)} produk dari {halaman_selesai} halaman tersimpan.")
print("Semua file tersimpan di folder: data_scrape_final2/")



üåê Sedang di halaman 1
üîÑ Scroll 1: 60 produk tampil
üîÑ Scroll 2: 60 produk tampil
‚úÖ Total 60 produk ditemukan di halaman 1
üíæ halaman_1.csv disimpan (60 produk)
‚û°Ô∏è Klik tombol halaman 2

üåê Sedang di halaman 2
üîÑ Scroll 1: 60 produk tampil
üîÑ Scroll 2: 60 produk tampil
‚úÖ Total 60 produk ditemukan di halaman 2
üíæ halaman_2.csv disimpan (60 produk)
‚û°Ô∏è Klik tombol halaman 3

üåê Sedang di halaman 3
üîÑ Scroll 1: 60 produk tampil
üîÑ Scroll 2: 60 produk tampil
‚úÖ Total 60 produk ditemukan di halaman 3
üíæ halaman_3.csv disimpan (60 produk)
‚û°Ô∏è Klik tombol halaman 4

üåê Sedang di halaman 4
üîÑ Scroll 1: 60 produk tampil
üîÑ Scroll 2: 60 produk tampil
‚úÖ Total 60 produk ditemukan di halaman 4
üíæ halaman_4.csv disimpan (60 produk)
‚û°Ô∏è Klik tombol halaman 5


KeyboardInterrupt: 

## ANALISIS

In [11]:
import pandas as pd

# Load the product table from the Excel workbook and preview its first five rows.
df = pd.read_excel(io="datarow.xlsx")
df.head(n=5)

Unnamed: 0,Halaman,Nama Produk,Harga,Label Produk,Jenis Penyedia,Lokasi,Jumlah Terjual,Kategori,Status Produk,URL
0,1,LAMPU PENERANGAN JALAN UMUM,"Rp 13.737.364,00",PDN,UMKK,Kota Adm. Jakarta Barat,0,Barang,Aktif,https://katalog.inaproc.id/solusi-klik-cerdas-...
1,1,LAMPU PENERANGAN JALAN UMUM,"Rp 13.737.364,00",PDN,UMKK,Kota Tangerang,0,Barang,Aktif,https://katalog.inaproc.id/cv-mulia-berkahtama...
2,1,LAMPU PENERANGAN JALAN UMUM,"Rp 13.737.364,00",PDN,UMKK,Kota Tangerang,0,Barang,Aktif,https://katalog.inaproc.id/solusi-klik/lampu-p...
3,1,LAMPU PENERANGAN JALAN UMUM,"Rp 13.737.364,00",PDN,UMKK,Kota Balikpapan,0,Barang,Aktif,https://katalog.inaproc.id/solusi-klik-tanjung...
4,1,LAMPU PENERANGAN JALAN UMUM,"Rp 13.737.364,00",PDN,UMKK,Kab. Bandung,0,Barang,Aktif,https://katalog.inaproc.id/solusi-klik-global/...


In [14]:
import re
def _parse_rupiah_number(token):
    """Convert one Indonesian-formatted numeric token (e.g. "13.737.364,00") to float.

    Commas are turned into dots, then every dot that is followed by exactly
    three digits and a word boundary is treated as a thousands separator and
    removed. Raises ValueError when the remainder is not a valid float.

    Note: a three-digit fraction such as "1,500" is therefore read as 1500.
    """
    normalized = token.replace(',', '.')
    normalized = re.sub(r'(?<=\d)\.(?=\d{3}\b)', '', normalized)
    return float(normalized)


def clean_harga(value):
    """Convert a raw price cell into a float amount in rupiah.

    Handled inputs:
      * missing values (None / NaN)         -> None
      * values that are already int/float   -> float(value)
      * text containing "jt" (juta/million) -> number * 1_000_000; for a range
        such as "10-20 jt" the mean of the first two numbers is used;
        decimals written with "," or "." ("1,5 jt") are honoured
      * text containing "rp"                -> full rupiah format, e.g.
        "Rp 13.737.364,00" -> 13737364.0
      * any other text                      -> first numeric token, parsed with
        the same separator rules

    Returns None when no number can be extracted.
    """
    if pd.isna(value):
        return None

    # Already numeric: no text parsing needed (bool is excluded on purpose).
    if isinstance(value, (int, float)) and not isinstance(value, bool):
        return float(value)

    text = str(value).lower().strip()

    # --- 1) Amount expressed in millions ("jt") ---
    if "jt" in text:
        text = text.replace("rp", "").replace("jt", "").replace(" ", "")
        # Capture "xx", "xx,y" or a range "xx-yy"; keep the decimal part attached
        # so "1,5" is one number (1.5) rather than two (1 and 5).
        nums = [float(n.replace(',', '.')) for n in re.findall(r'\d+(?:[.,]\d+)?', text)]
        if not nums:
            return None
        if len(nums) == 1:
            return nums[0] * 1_000_000
        return sum(nums[:2]) / 2 * 1_000_000

    # --- 2) Full rupiah format (Rp 13.737.364,00) ---
    if "rp" in text:
        # Drop everything except digits, dots and commas.
        num = re.sub(r'[^0-9,\.]', '', text)
        try:
            return _parse_rupiah_number(num)
        except ValueError:
            return None

    # --- 3) Neither "rp" nor "jt": use the first numeric token ---
    match = re.search(r'\d[\d.,]*', text)
    if match is None:
        return None
    token = match.group()
    try:
        return _parse_rupiah_number(token)
    except ValueError:
        # Unparseable separators (e.g. "1.2.3"): fall back to the leading digits.
        return float(re.match(r'\d+', token).group())

# Derive the numeric price column by running clean_harga on every raw "Harga" entry.
harga_mentah = df["Harga"]
df["Harga_Bersih"] = harga_mentah.apply(clean_harga)

# Spot check: raw vs. cleaned price for the first 20 rows.
print(df.loc[:, ["Harga", "Harga_Bersih"]].head(20))


                Harga  Harga_Bersih
0    Rp 13.737.364,00    13737364.0
1    Rp 13.737.364,00    13737364.0
2    Rp 13.737.364,00    13737364.0
3    Rp 13.737.364,00    13737364.0
4    Rp 13.737.364,00    13737364.0
5    Rp 13.737.364,00    13737364.0
6    Rp 13.737.364,00    13737364.0
7    Rp 13.737.364,00    13737364.0
8    Rp 13.737.364,00    13737364.0
9    Rp 13.875.000,00    13875000.0
10   Rp 49.950.000,00    49950000.0
11  Rp 177.600.000,00   177600000.0
12   Rp 16.872.000,00    16872000.0
13  Rp 111.000.000,00   111000000.0
14   Rp 55.000.001,00    55000001.0
15            Rp 2,00           2.0
16    Rp 4.301.250,00     4301250.0
17    Rp 4.301.250,00     4301250.0
18            Rp 2,00           2.0
19    Rp 4.301.250,00     4301250.0


In [15]:
# Summary statistics (count, mean, std, quartiles, min/max) of the cleaned price column.
df.loc[:, "Harga_Bersih"].describe()

count    2.650000e+03
mean     2.307513e+07
std      5.726485e+07
min      0.000000e+00
25%      5.247426e+06
50%      1.660705e+07
75%      3.150000e+07
max      2.221610e+09
Name: Harga_Bersih, dtype: float64

In [16]:
# Number of rows per distinct "Label Produk" value.
df.loc[:, "Label Produk"].value_counts()

Label Produk
PDN             2642
PDN | Grosir       5
Import             3
Name: count, dtype: int64

In [17]:
# Share of each "Jenis Penyedia" value, expressed as a percentage of all rows.
df["Jenis Penyedia"].value_counts(normalize=True).mul(100)

Jenis Penyedia
UMKK    100.0
Name: proportion, dtype: float64