In [None]:
import json
import os
import re
import threading
import time
from concurrent.futures import ThreadPoolExecutor, as_completed

import pandas as pd
import pytesseract
import requests
from bs4 import BeautifulSoup
from openai import OpenAI
from pdf2image import convert_from_bytes
from tqdm import tqdm

# =============================================
# CONFIG
# =============================================

# Last listing page to scrape, and how many listing pages are grouped into one
# chunk (each chunk is written to its own CSV by the main loop).
TOTAL_PAGES = 1261
CHUNK_SIZE = 100

# Thread-pool sizes. DETAIL_WORKERS drives the detail-page pool and
# OPENAI_WORKERS drives the pool that runs extract_programs (OCR + OpenAI call).
# NOTE(review): OCR_WORKERS is not referenced by any code visible in this notebook.
DETAIL_WORKERS = 20
OCR_WORKERS = 10
OPENAI_WORKERS = 3   # keep small

# URL templates: BASE_LIST_URL is formatted with a page number, DETAIL_BASE
# with an institution's NPSN.
BASE_LIST_URL = "https://kursus.kemendikdasmen.go.id/?page={}"
DETAIL_BASE = "https://referensi.data.kemendikdasmen.go.id/pendidikan/npsn/{}"

# Folder for the per-chunk CSVs and the JSON file holding the NPSN -> programs cache.
OUTPUT_DIR = "output"
CACHE_FILE = "program_cache.json"

# Create the output folder up front; exist_ok makes re-running the cell harmless.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Headers sent with the listing-page and detail-page requests.
HEADERS = {
    "User-Agent": "Mozilla/5.0",
    "Referer": "https://referensi.data.kemendikdasmen.go.id/"
}

# Shared OpenAI client used by extract_programs.
# NOTE(review): presumably picks up OPENAI_API_KEY from the environment — confirm.
client = OpenAI()

# Optional for Mac
# pytesseract.pytesseract.tesseract_cmd = "/opt/homebrew/bin/tesseract"

# =============================================
# LOAD CACHE
# =============================================

# Load the NPSN -> program-list cache written by extract_programs.
# A cache that cannot be read (truncated by an interrupted write, wrong
# encoding, not a JSON object) is ignored with a warning instead of aborting
# the run: the cache only saves work, it is never required for correctness.
program_cache = {}

if os.path.exists(CACHE_FILE):
    try:
        with open(CACHE_FILE, "r", encoding="utf-8") as f:
            loaded_cache = json.load(f)
    except (OSError, ValueError) as exc:
        # json.JSONDecodeError and UnicodeDecodeError are both ValueError.
        print(f"Ignoring unreadable cache file {CACHE_FILE}: {exc!r}")
    else:
        if isinstance(loaded_cache, dict):
            program_cache = loaded_cache
        else:
            print(f"Ignoring cache file {CACHE_FILE}: expected a JSON object")

# =============================================
# RESUME LOGIC
# =============================================

# Work out where to resume: every chunk file is named lkp_<first>_<last>.csv,
# so the next page to scrape is one past the highest <last> already on disk.
pattern = r"lkp_(\d+)_(\d+)\.csv"
chunk_name = re.compile(pattern)

name_matches = (chunk_name.match(name) for name in os.listdir(OUTPUT_DIR))
completed = [int(found.group(2)) for found in name_matches if found]

# With no chunk files present we start from the first page.
start_page = max(completed) + 1 if completed else 1

print(f"Resuming from page {start_page}")

# =============================================
# OCR FUNCTION
# =============================================

def ocr_pdf(sk_url):
    """Download the PDF at ``sk_url`` and return its OCR text.

    Every page is rasterised with ``convert_from_bytes`` and read with
    Tesseract using the ``"ind"`` language data; the per-page texts are joined
    with newlines and the result is cut to the first 8000 characters.

    This is a best-effort helper: any failure (bad URL, HTTP error, PDF that
    cannot be rendered, OCR error) is reported and yields ``""`` rather than
    raising, so one bad document cannot abort a whole chunk.

    Args:
        sk_url: URL of the PDF document.

    Returns:
        The recognised text (at most 8000 characters), or ``""`` on failure.
    """
    try:
        pdf_response = requests.get(sk_url, timeout=20)
        # Fail early on 4xx/5xx instead of feeding an error page to the PDF renderer.
        pdf_response.raise_for_status()
        images = convert_from_bytes(pdf_response.content)

        text = "".join(
            pytesseract.image_to_string(img, lang="ind") + "\n"
            for img in images
        )
        return text[:8000]

    except Exception as exc:
        # Deliberately broad (third-party libraries raise many types), but no
        # longer a bare ``except`` that would also swallow KeyboardInterrupt.
        print(f"OCR failed for {sk_url}: {exc!r}")
        return ""

# =============================================
# OPENAI FUNCTION
# =============================================

# extract_programs runs inside a ThreadPoolExecutor, so every mutation of
# program_cache and every rewrite of CACHE_FILE is serialised by this lock.
_CACHE_LOCK = threading.Lock()


def _parse_program_list(raw):
    """Turn the model reply into a list, tolerating Markdown code fences."""
    cleaned = (raw or "").strip()
    cleaned = cleaned.replace("```json", "").replace("```", "").strip()
    parsed = json.loads(cleaned)
    if not isinstance(parsed, list):
        raise ValueError(f"expected a JSON array, got {type(parsed).__name__}")
    return parsed


def _store_programs(npsn, programs):
    """Record ``programs`` in the cache and persist the whole cache to disk."""
    with _CACHE_LOCK:
        program_cache[npsn] = programs
        tmp_path = CACHE_FILE + ".tmp"
        try:
            # Write to a side file and swap it in, so an interrupted run never
            # leaves a half-written cache behind.
            with open(tmp_path, "w") as f:
                json.dump(program_cache, f)
            os.replace(tmp_path, CACHE_FILE)
        except OSError as exc:
            # The in-memory cache is still updated; only persistence failed.
            print(f"Could not persist {CACHE_FILE}: {exc!r}")


def extract_programs(npsn, sk_url):
    """Return the training programs listed in an institution's SK document.

    The PDF at ``sk_url`` is OCR'd with ``ocr_pdf`` and the text is sent to the
    OpenAI chat API, which is asked for a JSON array of program names. Results
    are memoised in ``program_cache`` (keyed by ``npsn``) and persisted to
    ``CACHE_FILE`` after every successful extraction.

    Args:
        npsn: Institution identifier, used as the cache key.
        sk_url: URL of the SK PDF; a falsy value means "no document".

    Returns:
        A list of program names. ``[]`` when there is no URL, the OCR produced
        no text, or all three API attempts failed. An empty OCR result is
        remembered in memory only; API failures are not cached at all.
    """

    if not sk_url:
        return []

    if npsn in program_cache:
        return program_cache[npsn]

    text = ocr_pdf(sk_url)

    if not text.strip():
        with _CACHE_LOCK:
            program_cache[npsn] = []
        return []

    prompt = f"""
Extract ONLY the list of training programs offered in this Indonesian SK document.
Return ONLY a JSON array of strings.
If none found, return [].

Document:
\"\"\"{text}\"\"\"
"""

    max_attempts = 3
    for attempt in range(max_attempts):
        try:
            response = client.chat.completions.create(
                model="gpt-4.1-mini",
                messages=[
                    {"role": "system", "content": "Extract structured training program names."},
                    {"role": "user", "content": prompt}
                ],
                temperature=0
            )
            programs = _parse_program_list(response.choices[0].message.content)
        except Exception as exc:
            print(f"Program extraction attempt {attempt + 1} failed for {npsn}: {exc!r}")
            if attempt < max_attempts - 1:
                time.sleep(2)
            continue

        # Persisting happens outside the retry ``try`` so that a disk problem
        # can never trigger another (billed) API request.
        _store_programs(npsn, programs)
        return programs

    return []

# =============================================
# MAIN LOOP BY CHUNK
# =============================================

def scrape_detail(npsn):
    """Scrape the key/value table of one institution's detail page.

    Rows inside ``div.tabby-content`` that have at least four cells contribute
    one entry: the text of the second cell is the key and the text of the
    fourth cell is the value. For the "File SK Operasional" row the value is
    the link target (``href``) instead of the text, or ``None`` without a link.

    Best-effort: on any error only ``{"NPSN": npsn}`` is returned so the row
    still survives the later left merge.
    """
    try:
        url = DETAIL_BASE.format(npsn)
        response = requests.get(url, headers=HEADERS, timeout=10)
        soup = BeautifulSoup(response.text, "lxml")

        data = {"NPSN": npsn}

        for container in soup.find_all("div", class_="tabby-content"):
            for row in container.find_all("tr"):
                cols = row.find_all("td")
                if len(cols) < 4:
                    continue

                key = cols[1].get_text(strip=True)

                if "File SK Operasional" in key:
                    link_tag = cols[3].find("a", href=True)
                    value = link_tag["href"] if link_tag else None
                else:
                    value = cols[3].get_text(strip=True)

                if key:
                    data[key] = value

        return data

    except Exception:
        return {"NPSN": npsn}


# Process the listing in chunks of CHUNK_SIZE pages; each chunk is scraped,
# enriched and written to its own CSV so an interrupted run can resume.
for chunk_start in range(start_page, TOTAL_PAGES + 1, CHUNK_SIZE):

    chunk_end = min(chunk_start + CHUNK_SIZE - 1, TOTAL_PAGES)
    print(f"\nProcessing pages {chunk_start}-{chunk_end}")

    # ----------------------------------------
    # LIST SCRAPE
    # ----------------------------------------

    list_records = []

    for page in tqdm(range(chunk_start, chunk_end + 1),
                     desc="List Pages"):
        # A timeout turns a stalled connection into an error instead of an
        # indefinite hang; the resume logic picks the chunk up again on rerun.
        response = requests.get(BASE_LIST_URL.format(page),
                                headers=HEADERS,
                                timeout=20)
        soup = BeautifulSoup(response.text, "lxml")

        table = soup.find("table")
        if not table:
            continue

        # Skip the header row; keep only rows with the six expected cells.
        for row in table.find_all("tr")[1:]:
            cols = row.find_all("td")
            if len(cols) >= 6:
                list_records.append({
                    "NPSN": cols[0].get_text(strip=True),
                    "Nama LKP": cols[1].get_text(strip=True),
                    "Provinsi (List)": cols[2].get_text(strip=True),
                    "Kabupaten (List)": cols[3].get_text(strip=True),
                    "Kecamatan (List)": cols[4].get_text(strip=True),
                    "Status (List)": cols[5].get_text(strip=True)
                })

    if not list_records:
        # Without rows the DataFrame has no "NPSN" column and the code below
        # would die with an opaque KeyError. Stop with a clear message and do
        # not write a chunk file, so this range is retried on the next run.
        raise RuntimeError(
            f"No listing rows scraped for pages {chunk_start}-{chunk_end}"
        )

    list_df = pd.DataFrame(list_records)

    # ----------------------------------------
    # DETAIL SCRAPE (PARALLEL)
    # ----------------------------------------

    # Scrape each NPSN once: duplicate keys on both sides of the merge below
    # would otherwise multiply rows.
    unique_npsn = list_df["NPSN"].drop_duplicates().tolist()

    detail_records = []

    with ThreadPoolExecutor(max_workers=DETAIL_WORKERS) as executor:
        futures = {
            executor.submit(scrape_detail, npsn): npsn
            for npsn in unique_npsn
        }

        for future in tqdm(as_completed(futures),
                           total=len(futures),
                           desc="Detail Pages"):
            detail_records.append(future.result())

    detail_df = pd.DataFrame(detail_records)

    final_df = list_df.merge(detail_df, on="NPSN", how="left")

    # ----------------------------------------
    # PROGRAM EXTRACTION (CONTROLLED)
    # ----------------------------------------

    # NOTE(review): the column name must match the scraped key text exactly —
    # confirm that "File SK Operasional ()" is what the detail page yields.
    sk_column = "File SK Operasional ()"
    if sk_column in final_df.columns:
        sk_values = final_df[sk_column]
    else:
        sk_values = [None] * len(final_df)

    # One job per NPSN. Missing links come out of the merge as NaN, which is
    # truthy, so anything that is not a string is normalised to None here.
    sk_by_npsn = {}
    for npsn, sk_url in zip(final_df["NPSN"], sk_values):
        sk_by_npsn.setdefault(npsn, sk_url if isinstance(sk_url, str) else None)

    programs_map = {}

    with ThreadPoolExecutor(max_workers=OPENAI_WORKERS) as executor:
        futures = {
            executor.submit(extract_programs, npsn, sk_url): npsn
            for npsn, sk_url in sk_by_npsn.items()
        }

        for future in tqdm(as_completed(futures),
                           total=len(futures),
                           desc="Programs"):
            programs_map[futures[future]] = future.result()

    final_df["Programs"] = final_df["NPSN"].map(programs_map)

    # ----------------------------------------
    # SAVE CHUNK
    # ----------------------------------------

    output_path = os.path.join(
        OUTPUT_DIR,
        f"lkp_{chunk_start}_{chunk_end}.csv"
    )

    final_df.to_csv(output_path,
                    index=False,
                    encoding="utf-8-sig")

    print(f"Saved {output_path}")

print("\nALL DONE.")


In [None]:
import os
import ast
import json
import pandas as pd
from openai import OpenAI
from tqdm import tqdm

# =============================================
# CONFIG
# =============================================

# Folder whose *.csv files are merged in STEP 1 below.
OUTPUT_DIR = "output"

# Category labels. They are interpolated into the classification prompt, used
# as the keys of the all-zero fallback dict, and selected as DataFrame columns
# when the result is printed.
CATEGORIES = [
    "Tata_Rias_dan_Kecantikan",
    "Tata_Busana",
    "Tata_Boga_Memasak",
    "Teknik_Komputer",
    "Teknik_Non_Komputer",
    "Bahasa",
    "Bimbel_Kursus_Pendidikan",
    "Hotel_Pariwisata",
    "Seni_dan_Budaya",
    "Lainnya"
]

# OpenAI client used by classify_programs.
# NOTE(review): presumably picks up OPENAI_API_KEY from the environment — confirm.
client = OpenAI()

# =============================================
# STEP 1 — MERGE ALL FILES
# =============================================

def _chunk_sort_key(path):
    """Sort key that orders lkp_<first>_<last>.csv files numerically."""
    stem = os.path.splitext(os.path.basename(path))[0]
    numbers = tuple(int(part) for part in stem.split("_") if part.isdecimal())
    # The path itself breaks ties and orders files without numbers.
    return (numbers, path)


# os.listdir returns names in arbitrary order; sort numerically so the merged
# rows follow page order (plain string sorting puts lkp_1001_... before lkp_101_...).
files = sorted(
    (
        os.path.join(OUTPUT_DIR, f)
        for f in os.listdir(OUTPUT_DIR)
        if f.endswith(".csv")
    ),
    key=_chunk_sort_key,
)

if not files:
    # pd.concat on an empty list fails with an unhelpful "No objects to concatenate".
    raise FileNotFoundError(f"No CSV files found in: {OUTPUT_DIR}")

# NPSN is an identifier, not a quantity: read it as text so that pandas does
# not coerce it to a number (dropping leading zeros) in some chunks only.
df_list = [pd.read_csv(f, dtype={"NPSN": str}) for f in files]
final_df = pd.concat(df_list, ignore_index=True)

# =============================================
# STEP 2 — FIX PROGRAMS COLUMN
# =============================================

def safe_eval(x):
    """Parse a stringified Python literal, falling back to an empty list.

    Args:
        x: A cell value. Strings are parsed with ``ast.literal_eval`` (which
            only accepts literals and never executes code); anything else,
            such as the NaN pandas uses for an empty cell, is treated as
            "no value".

    Returns:
        The parsed object, or ``[]`` when ``x`` is not a string or is not a
        valid literal.
    """
    if not isinstance(x, str):
        return []
    try:
        return ast.literal_eval(x)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # The failures literal_eval raises for malformed input. A bare
        # ``except`` would also have swallowed KeyboardInterrupt.
        return []

# Turn every "Programs" cell from its CSV text form back into a Python object;
# cells that are not strings or cannot be parsed become [] (see safe_eval).
final_df["Programs"] = final_df["Programs"].apply(safe_eval)



# =============================================
# STEP 4 — CLASSIFICATION FUNCTION
# =============================================

def classify_programs(program_list):
    """Flag which CATEGORIES the given training programs fall into.

    The programs are sent to the OpenAI chat API, which is asked for a JSON
    object mapping each category to 0 or 1. The reply is normalised so that
    the result always has exactly the keys in ``CATEGORIES`` with integer 0/1
    values, which keeps the DataFrame built from these dicts free of missing
    or stray columns.

    Args:
        program_list: Program names for one institution; falsy means none.

    Returns:
        ``{category: 0 or 1}`` for every category. All zeros when there are no
        programs, the request fails, or the reply is not a JSON object.
    """
    zeros = {cat: 0 for cat in CATEGORIES}

    if not program_list:
        return zeros

    prompt = f"""
Classify these Indonesian training programs:

{program_list}

Into these categories:
{CATEGORIES}

Return ONLY valid JSON like:
{{
  "Tata_Rias_dan_Kecantikan": 0 or 1,
  ...
}}

Use 1 if at least one program fits category.
Otherwise 0.
"""

    try:
        response = client.chat.completions.create(
            model="gpt-4.1-mini",
            messages=[
                {"role": "system", "content": "Classify training programs."},
                {"role": "user", "content": prompt}
            ],
            temperature=0
        )

        raw = (response.choices[0].message.content or "").strip()
        # Models often wrap JSON in Markdown fences; strip them before parsing.
        raw = raw.replace("```json", "").replace("```", "").strip()
        parsed = json.loads(raw)

    except Exception as exc:
        # Report the failure: a silent all-zero row is indistinguishable from
        # a genuine "fits no category" answer.
        print(f"Classification failed; defaulting to all zeros: {exc!r}")
        return zeros

    if not isinstance(parsed, dict):
        return zeros

    # Accept the common encodings of "yes" (1, 1.0, True, "1"); everything
    # else, including a missing key, counts as 0.
    return {cat: int(parsed.get(cat) in (1, True, "1")) for cat in CATEGORIES}

# =============================================
# STEP 5 — RUN CLASSIFICATION
# =============================================

# One classification dict per row, in row order, with a progress bar.
results = [
    classify_programs(programs)
    for programs in tqdm(final_df["Programs"], desc="Classifying")
]

# Attach the category flags as extra columns next to the original data.
classification_df = pd.DataFrame(results)
final_df = pd.concat([final_df, classification_df], axis=1)

# =============================================
# OUTPUT
# =============================================

print("\nCLASSIFIED SAMPLE:")
print(final_df[["Programs", *CATEGORIES]])

# Bare expression kept from the original cell; it displays nothing unless it
# is the last statement of a notebook cell.
final_df

# =============================================
# SAVE TO CSV
# =============================================

final_df.to_csv("lkp_all.csv", index=False, encoding="utf-8-sig")

print("\nSaved to lkp_all.csv")



In [None]:
# Paste a key here for a local run only; never commit a real key.
_OPENAI_API_KEY = ""

# Export the key only when one was actually provided. Assigning the empty
# placeholder unconditionally would overwrite a key that is already configured
# in the environment with "".
if _OPENAI_API_KEY:
    os.environ["OPENAI_API_KEY"] = _OPENAI_API_KEY

In [None]:
import os
import ast
import json
import re
import math
import pandas as pd
from openai import OpenAI
from tqdm import tqdm

# =============================================
# CONFIG
# =============================================

# INPUT_DIR holds the CSVs merged in STEP 1; OUTPUT_DIR receives one classified
# CSV per batch of BATCH_SIZE rows.
INPUT_DIR = "output"
OUTPUT_DIR = "output_2"
BATCH_SIZE = 500

# Category labels. They are interpolated into the classification prompt, used
# as the keys of every result dict, and become the flag columns of each batch file.
CATEGORIES = [
    "Tata_Rias_dan_Kecantikan",
    "Tata_Busana",
    "Tata_Boga_Memasak",
    "Teknik_Komputer",
    "Teknik_Non_Komputer",
    "Bahasa",
    "Bimbel_Kursus_Pendidikan",
    "Hotel_Pariwisata",
    "Seni_dan_Budaya",
    "Lainnya"
]

# Create the batch output folder up front; exist_ok makes re-running harmless.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# OpenAI client used by classify_programs.
# NOTE(review): presumably picks up OPENAI_API_KEY from the environment — confirm.
client = OpenAI()

# =============================================
# STEP 1 — MERGE ALL FILES
# =============================================

def _chunk_sort_key(path):
    """Sort key that orders lkp_<first>_<last>.csv files numerically."""
    stem = os.path.splitext(os.path.basename(path))[0]
    numbers = tuple(int(part) for part in stem.split("_") if part.isdecimal())
    # The path itself breaks ties and orders files without numbers.
    return (numbers, path)


# Sort numerically rather than as plain strings: lexicographic order puts
# lkp_1001_1100.csv before lkp_101_200.csv, so merged rows (and therefore the
# row ranges of the batch files written later) would not follow page order.
files = sorted(
    (
        os.path.join(INPUT_DIR, f)
        for f in os.listdir(INPUT_DIR)
        if f.endswith(".csv")
    ),
    key=_chunk_sort_key,
)

if not files:
    # pd.concat on an empty list fails with an unhelpful "No objects to concatenate".
    raise FileNotFoundError(f"No CSV files found in: {INPUT_DIR}")

# NPSN is an identifier, not a quantity: read it as text so that pandas does
# not coerce it to a number (dropping leading zeros) in some chunks only.
df_list = [pd.read_csv(f, dtype={"NPSN": str}) for f in files]
final_df = pd.concat(df_list, ignore_index=True)

# =============================================
# STEP 2 — FIX PROGRAMS COLUMN
# =============================================

def safe_eval(x):
    """Parse a stringified Python literal, falling back to an empty list.

    Args:
        x: A cell value. Strings are parsed with ``ast.literal_eval`` (which
            only accepts literals and never executes code); anything else,
            such as the NaN pandas uses for an empty cell, is treated as
            "no value".

    Returns:
        The parsed object, or ``[]`` when ``x`` is not a string or is not a
        valid literal.
    """
    if not isinstance(x, str):
        return []
    try:
        return ast.literal_eval(x)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # The failures literal_eval raises for malformed input. A bare
        # ``except`` would also have swallowed KeyboardInterrupt.
        return []

# Turn every "Programs" cell from its CSV text form back into a Python object;
# cells that are not strings or cannot be parsed become [] (see safe_eval).
final_df["Programs"] = final_df["Programs"].apply(safe_eval)

# Report how many rows will be classified.
print("Total rows:", len(final_df))

# =============================================
# SAFE JSON PARSER (prevents all-0 due to JSON noise)
# =============================================

def safe_json_parse(raw: str):
    """Extract a JSON object from a model reply, tolerating surrounding noise.

    Markdown code fences are stripped first. The cleaned text is then parsed
    as a whole; if that does not produce a JSON object, the span from the
    first ``{`` to the last ``}`` is tried instead, which copes with prose or
    array wrappers around the object.

    Args:
        raw: The reply text; ``None`` and ``""`` are accepted.

    Returns:
        The decoded dict, or ``{category: 0}`` for every entry of
        ``CATEGORIES`` when no JSON object can be recovered. The result is
        always a dict, so callers may use dict methods on it unconditionally.
    """
    raw = (raw or "").strip()
    raw = raw.replace("```json", "").replace("```", "").strip()

    candidates = [raw]
    m = re.search(r"\{.*\}", raw, re.DOTALL)
    if m:
        candidates.append(m.group())

    for candidate in candidates:
        try:
            parsed = json.loads(candidate)
        except ValueError:
            # json.JSONDecodeError is a ValueError; try the next candidate.
            continue
        if isinstance(parsed, dict):
            return parsed

    return {cat: 0 for cat in CATEGORIES}

# =============================================
# STEP 3 — CLASSIFICATION FUNCTION
# =============================================

def classify_programs(program_list):
    """Flag which CATEGORIES the given training programs fall into.

    The programs are sent to the OpenAI chat API, which is asked for a strict
    JSON object with one 0/1 entry per category. The reply is decoded with
    ``safe_json_parse`` and normalised to integer 0/1 values.

    Args:
        program_list: Program names for one institution; falsy means none.

    Returns:
        ``{category: 0 or 1}`` with exactly the keys in ``CATEGORIES``. All
        zeros when there are no programs, the request fails, or no JSON
        object could be recovered from the reply.
    """
    zeros = {cat: 0 for cat in CATEGORIES}

    if not program_list:
        return zeros

    prompt = f"""
Classify these Indonesian training programs:

{program_list}

Into these categories:
{CATEGORIES}

Return STRICT JSON ONLY with these exact keys:
{{
  "Tata_Rias_dan_Kecantikan": 0,
  "Tata_Busana": 0,
  "Tata_Boga_Memasak": 0,
  "Teknik_Komputer": 0,
  "Teknik_Non_Komputer": 0,
  "Bahasa": 0,
  "Bimbel_Kursus_Pendidikan": 0,
  "Hotel_Pariwisata": 0,
  "Seni_dan_Budaya": 0,
  "Lainnya": 0
}}

Use 1 if at least one program fits category, else 0.
No explanation. No extra text.
"""

    try:
        response = client.chat.completions.create(
            model="gpt-4.1-mini",
            messages=[
                {"role": "system", "content": "You are a strict JSON classifier."},
                {"role": "user", "content": prompt}
            ],
            temperature=0
        )

        raw = (response.choices[0].message.content or "").strip()
        parsed = safe_json_parse(raw)

    except Exception as exc:
        # Report the failure: a silent all-zero row is indistinguishable from
        # a genuine "fits no category" answer.
        print(f"Classification failed; defaulting to all zeros: {exc!r}")
        return zeros

    if not isinstance(parsed, dict):
        return zeros

    # Accept the common encodings of "yes" (1, 1.0, True, "1"); everything
    # else, including a missing key, counts as 0.
    return {cat: int(parsed.get(cat, 0) in (1, True, "1")) for cat in CATEGORIES}

# =============================================
# STEP 4 — PROCESS IN BATCHES & SAVE PER 500
# =============================================

# Classify the merged table BATCH_SIZE rows at a time and write every batch to
# its own CSV, named after the 1-based row range it covers.
total_rows = len(final_df)
total_batches = math.ceil(total_rows / BATCH_SIZE)

for batch_idx, start in enumerate(range(0, total_rows, BATCH_SIZE)):
    end = min(start + BATCH_SIZE, total_rows)
    batch_df = final_df.iloc[start:end].copy()

    progress = tqdm(
        batch_df["Programs"],
        desc=f"Classifying rows {start+1}-{end} (Batch {batch_idx+1}/{total_batches})"
    )
    results = [classify_programs(programs) for programs in progress]

    # Fix the column order, then line both frames up on a fresh 0..n-1 index
    # so the side-by-side concat pairs rows by position.
    classification_df = pd.DataFrame(results)[CATEGORIES]
    left_part = batch_df.reset_index(drop=True)
    right_part = classification_df.reset_index(drop=True)
    batch_out = pd.concat([left_part, right_part], axis=1)

    out_path = os.path.join(OUTPUT_DIR, f"lkp_{start+1}_{end}.csv")
    batch_out.to_csv(out_path, index=False, encoding="utf-8-sig")

    print(f"\nSaved: {out_path}")
    print("Category totals in this batch:")
    print(batch_out[CATEGORIES].sum())

print("\nDONE. All batches saved to:", OUTPUT_DIR)


In [None]:
import os
import pandas as pd

# =============================================
# CONFIG
# =============================================
# Folder holding the per-batch classified CSVs, and the merged file to write.
INPUT_FOLDER = "output_2"
OUTPUT_FILE = "lkp_all.csv"


def _batch_sort_key(path):
    """Sort key that orders lkp_<first>_<last>.csv files numerically."""
    stem = os.path.splitext(os.path.basename(path))[0]
    numbers = tuple(int(part) for part in stem.split("_") if part.isdecimal())
    # The path itself breaks ties and orders files without numbers.
    return (numbers, path)


# =============================================
# STEP 1 — COLLECT & SORT FILES
# =============================================
# Sort numerically rather than as plain strings: lexicographic order puts
# lkp_1001_1500.csv before lkp_501_1000.csv and would scramble the row order
# of the merged file.
files = sorted(
    (
        os.path.join(INPUT_FOLDER, f)
        for f in os.listdir(INPUT_FOLDER)
        if f.endswith(".csv")
    ),
    key=_batch_sort_key,
)

print("Files found:", len(files))
if not files:
    raise FileNotFoundError(f"No CSV files found in: {INPUT_FOLDER}")

# =============================================
# STEP 2 — MERGE
# =============================================
# NPSN is an identifier, not a quantity: read it as text so that pandas does
# not coerce it to a number (dropping leading zeros) in some batches only.
df_list = [pd.read_csv(f, dtype={"NPSN": str}) for f in files]
final_df = pd.concat(df_list, ignore_index=True)

print("Merged shape:", final_df.shape)

# =============================================
# STEP 3 — SAVE
# =============================================
final_df.to_csv(OUTPUT_FILE, index=False, encoding="utf-8-sig")
print(f"Saved merged file: {OUTPUT_FILE}")
