In [50]:
import os
import re
import time
import json
import requests
import pandas as pd
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
from pdf2image import convert_from_bytes
import pytesseract
from openai import OpenAI

# =============================================
# CONFIG
# =============================================

TOTAL_PAGES = 1261
CHUNK_SIZE = 100

DETAIL_WORKERS = 20
OCR_WORKERS = 10
OPENAI_WORKERS = 3   # keep small

BASE_LIST_URL = "https://kursus.kemendikdasmen.go.id/?page={}"
DETAIL_BASE = "https://referensi.data.kemendikdasmen.go.id/pendidikan/npsn/{}"

OUTPUT_DIR = "output"
CACHE_FILE = "program_cache.json"

os.makedirs(OUTPUT_DIR, exist_ok=True)

HEADERS = {
    "User-Agent": "Mozilla/5.0",
    "Referer": "https://referensi.data.kemendikdasmen.go.id/"
}

client = OpenAI()

# Optional for Mac
# pytesseract.pytesseract.tesseract_cmd = "/opt/homebrew/bin/tesseract"

# =============================================
# LOAD CACHE
# =============================================

if os.path.exists(CACHE_FILE):
    with open(CACHE_FILE, "r") as f:
        program_cache = json.load(f)
else:
    program_cache = {}

# =============================================
# RESUME LOGIC
# =============================================

pattern = r"lkp_(\d+)_(\d+)\.csv"
completed = []

for file in os.listdir(OUTPUT_DIR):
    match = re.match(pattern, file)
    if match:
        completed.append(int(match.group(2)))

if completed:
    start_page = max(completed) + 1
else:
    start_page = 1

print(f"Resuming from page {start_page}")

# =============================================
# OCR FUNCTION
# =============================================

def ocr_pdf(sk_url):
    try:
        pdf_response = requests.get(sk_url, timeout=20)
        pdf_bytes = pdf_response.content
        images = convert_from_bytes(pdf_bytes)

        text = ""
        for img in images:
            text += pytesseract.image_to_string(img, lang="ind") + "\n"

        return text[:8000]

    except:
        return ""

# =============================================
# OPENAI FUNCTION
# =============================================

def extract_programs(npsn, sk_url):

    if not sk_url:
        return []

    if npsn in program_cache:
        return program_cache[npsn]

    text = ocr_pdf(sk_url)

    if not text.strip():
        program_cache[npsn] = []
        return []

    prompt = f"""
Extract ONLY the list of training programs offered in this Indonesian SK document.
Return ONLY a JSON array of strings.
If none found, return [].

Document:
\"\"\"{text}\"\"\"
"""

    for _ in range(3):
        try:
            response = client.chat.completions.create(
                model="gpt-4.1-mini",
                messages=[
                    {"role": "system", "content": "Extract structured training program names."},
                    {"role": "user", "content": prompt}
                ],
                temperature=0
            )

            programs = json.loads(response.choices[0].message.content.strip())
            program_cache[npsn] = programs

            with open(CACHE_FILE, "w") as f:
                json.dump(program_cache, f)

            return programs

        except:
            time.sleep(2)

    return []

# =============================================
# MAIN LOOP BY CHUNK
# =============================================

for chunk_start in range(start_page, TOTAL_PAGES + 1, CHUNK_SIZE):

    chunk_end = min(chunk_start + CHUNK_SIZE - 1, TOTAL_PAGES)
    print(f"\nProcessing pages {chunk_start}-{chunk_end}")

    # ----------------------------------------
    # LIST SCRAPE
    # ----------------------------------------

    list_records = []

    for page in tqdm(range(chunk_start, chunk_end + 1),
                     desc="List Pages"):
        response = requests.get(BASE_LIST_URL.format(page),
                                headers=HEADERS)
        soup = BeautifulSoup(response.text, "lxml")

        table = soup.find("table")
        if not table:
            continue

        rows = table.find_all("tr")[1:]

        for row in rows:
            cols = row.find_all("td")
            if len(cols) >= 6:
                list_records.append({
                    "NPSN": cols[0].get_text(strip=True),
                    "Nama LKP": cols[1].get_text(strip=True),
                    "Provinsi (List)": cols[2].get_text(strip=True),
                    "Kabupaten (List)": cols[3].get_text(strip=True),
                    "Kecamatan (List)": cols[4].get_text(strip=True),
                    "Status (List)": cols[5].get_text(strip=True)
                })

    list_df = pd.DataFrame(list_records)

    # ----------------------------------------
    # DETAIL SCRAPE (PARALLEL)
    # ----------------------------------------

    def scrape_detail(npsn):
        try:
            url = DETAIL_BASE.format(npsn)
            response = requests.get(url, headers=HEADERS, timeout=10)
            soup = BeautifulSoup(response.text, "lxml")

            data = {"NPSN": npsn}

            for container in soup.find_all("div", class_="tabby-content"):
                rows = container.find_all("tr")
                for row in rows:
                    cols = row.find_all("td")

                    if len(cols) >= 4:
                        key = cols[1].get_text(strip=True)

                        if "File SK Operasional" in key:
                            link_tag = cols[3].find("a", href=True)
                            value = link_tag["href"] if link_tag else None
                        else:
                            value = cols[3].get_text(strip=True)

                        if key:
                            data[key] = value

            return data

        except:
            return {"NPSN": npsn}

    detail_records = []

    with ThreadPoolExecutor(max_workers=DETAIL_WORKERS) as executor:
        futures = {
            executor.submit(scrape_detail, npsn): npsn
            for npsn in list_df["NPSN"]
        }

        for future in tqdm(as_completed(futures),
                           total=len(futures),
                           desc="Detail Pages"):
            detail_records.append(future.result())

    detail_df = pd.DataFrame(detail_records)

    final_df = list_df.merge(detail_df, on="NPSN", how="left")

    # ----------------------------------------
    # PROGRAM EXTRACTION (CONTROLLED)
    # ----------------------------------------

    programs_map = {}

    with ThreadPoolExecutor(max_workers=OPENAI_WORKERS) as executor:
        futures = {
            executor.submit(
                extract_programs,
                row["NPSN"],
                row.get("File SK Operasional ()")
            ): row["NPSN"]
            for _, row in final_df.iterrows()
        }

        for future in tqdm(as_completed(futures),
                           total=len(futures),
                           desc="Programs"):
            programs_map[futures[future]] = future.result()

    final_df["Programs"] = final_df["NPSN"].map(programs_map)

    # ----------------------------------------
    # SAVE CHUNK
    # ----------------------------------------

    output_path = os.path.join(
        OUTPUT_DIR,
        f"lkp_{chunk_start}_{chunk_end}.csv"
    )

    final_df.to_csv(output_path,
                    index=False,
                    encoding="utf-8-sig")

    print(f"Saved {output_path}")

print("\nALL DONE.")


Resuming from page 1

Processing pages 1-100


List Pages: 100%|██████████| 100/100 [01:44<00:00,  1.04s/it]
Detail Pages: 100%|██████████| 1000/1000 [00:25<00:00, 39.34it/s]
Programs: 100%|██████████| 1000/1000 [14:37<00:00,  1.14it/s]


Saved output/lkp_1_100.csv

Processing pages 101-200


List Pages: 100%|██████████| 100/100 [01:14<00:00,  1.34it/s]
Detail Pages: 100%|██████████| 1000/1000 [00:26<00:00, 37.88it/s]
Programs: 100%|██████████| 1000/1000 [14:35<00:00,  1.14it/s]


Saved output/lkp_101_200.csv

Processing pages 201-300


List Pages: 100%|██████████| 100/100 [01:52<00:00,  1.12s/it]
Detail Pages: 100%|██████████| 1000/1000 [00:25<00:00, 38.97it/s]
Programs: 100%|██████████| 1000/1000 [16:17<00:00,  1.02it/s]


Saved output/lkp_201_300.csv

Processing pages 301-400


List Pages: 100%|██████████| 100/100 [01:18<00:00,  1.27it/s]
Detail Pages: 100%|██████████| 1000/1000 [00:26<00:00, 37.90it/s]
Programs: 100%|██████████| 1000/1000 [14:40<00:00,  1.14it/s]


Saved output/lkp_301_400.csv

Processing pages 401-500


List Pages: 100%|██████████| 100/100 [01:20<00:00,  1.25it/s]
Detail Pages: 100%|██████████| 1000/1000 [00:26<00:00, 37.05it/s]
Programs: 100%|██████████| 1000/1000 [15:48<00:00,  1.05it/s]


Saved output/lkp_401_500.csv

Processing pages 501-600


List Pages: 100%|██████████| 100/100 [01:18<00:00,  1.27it/s]
Detail Pages: 100%|██████████| 1000/1000 [00:25<00:00, 39.30it/s]
Programs: 100%|██████████| 1000/1000 [13:52<00:00,  1.20it/s]


Saved output/lkp_501_600.csv

Processing pages 601-700


List Pages: 100%|██████████| 100/100 [01:19<00:00,  1.26it/s]
Detail Pages: 100%|██████████| 1000/1000 [00:27<00:00, 35.83it/s]
Programs: 100%|██████████| 1000/1000 [12:02<00:00,  1.38it/s]


Saved output/lkp_601_700.csv

Processing pages 701-800


List Pages: 100%|██████████| 100/100 [01:19<00:00,  1.26it/s]
Detail Pages: 100%|██████████| 1000/1000 [00:28<00:00, 35.07it/s]
Programs: 100%|██████████| 1000/1000 [14:01<00:00,  1.19it/s]


Saved output/lkp_701_800.csv

Processing pages 801-900


List Pages: 100%|██████████| 100/100 [01:20<00:00,  1.25it/s]
Detail Pages: 100%|██████████| 1000/1000 [00:28<00:00, 35.26it/s]
Programs: 100%|██████████| 1000/1000 [15:54<00:00,  1.05it/s]


Saved output/lkp_801_900.csv

Processing pages 901-1000


List Pages: 100%|██████████| 100/100 [01:20<00:00,  1.25it/s]
Detail Pages: 100%|██████████| 1000/1000 [00:30<00:00, 32.95it/s]
Programs: 100%|██████████| 1000/1000 [18:57<00:00,  1.14s/it]


Saved output/lkp_901_1000.csv

Processing pages 1001-1100


List Pages: 100%|██████████| 100/100 [01:23<00:00,  1.20it/s]
Detail Pages: 100%|██████████| 1000/1000 [00:29<00:00, 33.88it/s]
Programs: 100%|██████████| 1000/1000 [15:57<00:00,  1.04it/s]


Saved output/lkp_1001_1100.csv

Processing pages 1101-1200


List Pages: 100%|██████████| 100/100 [01:22<00:00,  1.22it/s]
Detail Pages: 100%|██████████| 1000/1000 [00:32<00:00, 31.21it/s]
Programs: 100%|██████████| 1000/1000 [16:36<00:00,  1.00it/s]


Saved output/lkp_1101_1200.csv

Processing pages 1201-1261


List Pages: 100%|██████████| 61/61 [00:50<00:00,  1.21it/s]
Detail Pages: 100%|██████████| 601/601 [00:17<00:00, 35.19it/s]
Programs: 100%|██████████| 601/601 [09:31<00:00,  1.05it/s]

Saved output/lkp_1201_1261.csv

ALL DONE.





In [None]:
import os
import ast
import json
import pandas as pd
from openai import OpenAI
from tqdm import tqdm

# =============================================
# CONFIG
# =============================================

OUTPUT_DIR = "output"

CATEGORIES = [
    "Tata_Rias_dan_Kecantikan",
    "Tata_Busana",
    "Tata_Boga_Memasak",
    "Teknik_Komputer",
    "Teknik_Non_Komputer",
    "Bahasa",
    "Bimbel_Kursus_Pendidikan",
    "Hotel_Pariwisata",
    "Seni_dan_Budaya",
    "Lainnya"
]

client = OpenAI()

# =============================================
# STEP 1 — MERGE ALL FILES
# =============================================

files = [
    os.path.join(OUTPUT_DIR, f)
    for f in os.listdir(OUTPUT_DIR)
    if f.endswith(".csv")
]

df_list = [pd.read_csv(f) for f in files]
final_df = pd.concat(df_list, ignore_index=True)

# =============================================
# STEP 2 — FIX PROGRAMS COLUMN
# =============================================

def safe_eval(x):
    try:
        return ast.literal_eval(x) if isinstance(x, str) else []
    except:
        return []

final_df["Programs"] = final_df["Programs"].apply(safe_eval)



# =============================================
# STEP 4 — CLASSIFICATION FUNCTION
# =============================================

def classify_programs(program_list):

    if not program_list:
        return {cat: 0 for cat in CATEGORIES}

    prompt = f"""
Classify these Indonesian training programs:

{program_list}

Into these categories:
{CATEGORIES}

Return ONLY valid JSON like:
{{
  "Tata_Rias_dan_Kecantikan": 0 or 1,
  ...
}}

Use 1 if at least one program fits category.
Otherwise 0.
"""

    try:
        response = client.chat.completions.create(
            model="gpt-4.1-mini",
            messages=[
                {"role": "system", "content": "Classify training programs."},
                {"role": "user", "content": prompt}
            ],
            temperature=0
        )

        return json.loads(response.choices[0].message.content.strip())

    except:
        return {cat: 0 for cat in CATEGORIES}

# =============================================
# STEP 5 — RUN CLASSIFICATION
# =============================================

results = []

for programs in tqdm(final_df["Programs"], desc="Classifying"):
    results.append(classify_programs(programs))

classification_df = pd.DataFrame(results)

final_df = pd.concat([final_df, classification_df], axis=1)

# =============================================
# OUTPUT
# =============================================

print("\nCLASSIFIED SAMPLE:")
print(final_df[["Programs"] + CATEGORIES])

final_df

# =============================================
# SAVE SAMPLE TO CSV
# =============================================

final_df.to_csv(
    "lkp_all.csv",
    index=False,
    encoding="utf-8-sig"
)

print("\nSaved to lkp_all.csv")



Classifying: 100%|██████████| 100/100 [01:33<00:00,  1.07it/s]


CLASSIFIED SAMPLE:
                                 Programs  Tata_Rias_dan_Kecantikan  \
0                                      []                         0   
1                                      []                         0   
2                                      []                         0   
3                                      []                         0   
4                                      []                         0   
..                                    ...                       ...   
95                                     []                         0   
96         [Pelatihan Kursus Bahasa Arab]                         0   
97  [Kursus dan Pelatihan Bahasa Inggris]                         0   
98      [Pelatihan Kursus Bahasa Inggris]                         0   
99                                     []                         0   

    Tata_Busana  Tata_Boga_Memasak  Teknik_Komputer  Teknik_Non_Komputer  \
0             0                  0                0




Unnamed: 0,NPSN,Nama LKP,Provinsi (List),Kabupaten (List),Kecamatan (List),Status (List),Nama,Alamat,Desa/Kelurahan,Kecamatan/Kota (LN),...,Tata_Rias_dan_Kecantikan,Tata_Busana,Tata_Boga_Memasak,Teknik_Komputer,Teknik_Non_Komputer,Bahasa,Bimbel_Kursus_Pendidikan,Hotel_Pariwisata,Seni_dan_Budaya,Lainnya
0,K9998710,LKP. QUANTUM SYIFA LEARNING,Jawa Timur,Kab. Jombang,Mojoagung,SWASTA,LKP. QUANTUM SYIFA LEARNING,Ds. Seketi RT.01 RW.01,SEKETI,KEC. MOJOAGUNG,...,0,0,0,0,0,0,0,0,0,0
1,K5653984,LKP Modes Al-Farizi,Jawa Timur,Kab. Jombang,Mojowarno,SWASTA,LKP Modes Al-Farizi,"Jl. Yos Sudarso No.89 Rt.02/07, Catakgayam, Mo...",CATAK GAYAM,KEC. MOJOWARNO,...,0,0,0,0,0,0,0,0,0,0
2,K5653983,LKP Modes Andri,Jawa Timur,Kab. Jombang,Mojowarno,SWASTA,LKP Modes Andri,Jl. Mayor Tumijo Sidokerto Mojowarno Jombang J...,SIDOKERTO,KEC. MOJOWARNO,...,0,0,0,0,0,0,0,0,0,0
3,K5653987,LKP Modes Indah,Jawa Timur,Kab. Jombang,Mojowarno,SWASTA,LKP Modes Indah,Jl. Bidan kukuh no 14 Cata Gayam Mojowarno Jom...,CATAK GAYAM,KEC. MOJOWARNO,...,0,0,0,0,0,0,0,0,0,0
4,K5653986,LKP Modes Widya Indah,Jawa Timur,Kab. Jombang,Mojowarno,SWASTA,LKP Modes Widya Indah,Jl. Melati Jetak Sidokerto mojoowarno Jombang ...,SIDOKERTO,KEC. MOJOWARNO,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,K9990117,LKP MANGALA,Jawa Timur,Kab. Kediri,Pare,SWASTA,LKP MANGALA,Jl. Pancawarna No. 6C,TULUNGREJO,KEC. PARE,...,0,0,0,0,0,0,0,0,0,0
96,K9980825,LKP MARKAZ ARABIYAH,Jawa Timur,Kab. Kediri,Pare,SWASTA,LKP MARKAZ ARABIYAH,Jl. Pancawarna,TULUNGREJO,KEC. PARE,...,0,0,0,0,0,1,0,0,0,0
97,K9999498,LKP MERRY ENGLISH,Jawa Timur,Kab. Kediri,Pare,SWASTA,LKP MERRY ENGLISH,Jl. Anyelir No. 62,TULUNGREJO,KEC. PARE,...,0,0,0,0,0,1,0,0,0,0
98,K9989848,LKP MIRACLE ENGLISH COURSE,Jawa Timur,Kab. Kediri,Pare,SWASTA,LKP MIRACLE ENGLISH COURSE,Jl. Flamboyan No. 160,TULUNGREJO,KEC. PARE,...,0,0,0,0,0,1,0,0,0,0


In [None]:
os.environ["OPENAI_API_KEY"]=""

In [5]:
import os
import ast
import json
import re
import math
import pandas as pd
from openai import OpenAI
from tqdm import tqdm

# =============================================
# CONFIG
# =============================================

INPUT_DIR = "output"
OUTPUT_DIR = "output_2"
BATCH_SIZE = 500

CATEGORIES = [
    "Tata_Rias_dan_Kecantikan",
    "Tata_Busana",
    "Tata_Boga_Memasak",
    "Teknik_Komputer",
    "Teknik_Non_Komputer",
    "Bahasa",
    "Bimbel_Kursus_Pendidikan",
    "Hotel_Pariwisata",
    "Seni_dan_Budaya",
    "Lainnya"
]

os.makedirs(OUTPUT_DIR, exist_ok=True)

client = OpenAI()

# =============================================
# STEP 1 — MERGE ALL FILES
# =============================================

files = [
    os.path.join(INPUT_DIR, f)
    for f in os.listdir(INPUT_DIR)
    if f.endswith(".csv")
]
files = sorted(files)

df_list = [pd.read_csv(f) for f in files]
final_df = pd.concat(df_list, ignore_index=True)

# =============================================
# STEP 2 — FIX PROGRAMS COLUMN
# =============================================

def safe_eval(x):
    try:
        return ast.literal_eval(x) if isinstance(x, str) else []
    except:
        return []

final_df["Programs"] = final_df["Programs"].apply(safe_eval)

print("Total rows:", len(final_df))

# =============================================
# SAFE JSON PARSER (prevents all-0 due to JSON noise)
# =============================================

def safe_json_parse(raw: str):
    raw = (raw or "").strip()
    raw = raw.replace("```json", "").replace("```", "").strip()

    try:
        return json.loads(raw)
    except:
        m = re.search(r"\{.*\}", raw, re.DOTALL)
        if m:
            try:
                return json.loads(m.group())
            except:
                pass

    return {cat: 0 for cat in CATEGORIES}

# =============================================
# STEP 3 — CLASSIFICATION FUNCTION
# =============================================

def classify_programs(program_list):

    if not program_list:
        return {cat: 0 for cat in CATEGORIES}

    prompt = f"""
Classify these Indonesian training programs:

{program_list}

Into these categories:
{CATEGORIES}

Return STRICT JSON ONLY with these exact keys:
{{
  "Tata_Rias_dan_Kecantikan": 0,
  "Tata_Busana": 0,
  "Tata_Boga_Memasak": 0,
  "Teknik_Komputer": 0,
  "Teknik_Non_Komputer": 0,
  "Bahasa": 0,
  "Bimbel_Kursus_Pendidikan": 0,
  "Hotel_Pariwisata": 0,
  "Seni_dan_Budaya": 0,
  "Lainnya": 0
}}

Use 1 if at least one program fits category, else 0.
No explanation. No extra text.
"""

    try:
        response = client.chat.completions.create(
            model="gpt-4.1-mini",
            messages=[
                {"role": "system", "content": "You are a strict JSON classifier."},
                {"role": "user", "content": prompt}
            ],
            temperature=0
        )

        raw = response.choices[0].message.content.strip()
        parsed = safe_json_parse(raw)

        # Ensure all keys exist
        for cat in CATEGORIES:
            parsed.setdefault(cat, 0)

        # Force 0/1 ints
        parsed = {cat: int(parsed.get(cat, 0) == 1) for cat in CATEGORIES}
        return parsed

    except Exception as e:
        return {cat: 0 for cat in CATEGORIES}

# =============================================
# STEP 4 — PROCESS IN BATCHES & SAVE PER 500
# =============================================

total_rows = len(final_df)
total_batches = math.ceil(total_rows / BATCH_SIZE)

for batch_idx in range(total_batches):
    start = batch_idx * BATCH_SIZE
    end = min((batch_idx + 1) * BATCH_SIZE, total_rows)

    batch_df = final_df.iloc[start:end].copy()

    results = []
    for programs in tqdm(
        batch_df["Programs"],
        desc=f"Classifying rows {start+1}-{end} (Batch {batch_idx+1}/{total_batches})"
    ):
        results.append(classify_programs(programs))

    classification_df = pd.DataFrame(results)[CATEGORIES]
    batch_out = pd.concat([batch_df.reset_index(drop=True),
                           classification_df.reset_index(drop=True)], axis=1)

    out_path = os.path.join(OUTPUT_DIR, f"lkp_{start+1}_{end}.csv")
    batch_out.to_csv(out_path, index=False, encoding="utf-8-sig")

    print(f"\nSaved: {out_path}")
    print("Category totals in this batch:")
    print(batch_out[CATEGORIES].sum())

print("\nDONE. All batches saved to:", OUTPUT_DIR)


Total rows: 12601


Classifying rows 1-500 (Batch 1/26): 100%|██████████| 500/500 [05:52<00:00,  1.42it/s]



Saved: output_2/lkp_1_500.csv
Category totals in this batch:
Tata_Rias_dan_Kecantikan    55
Tata_Busana                 52
Tata_Boga_Memasak           39
Teknik_Komputer             25
Teknik_Non_Komputer         29
Bahasa                      36
Bimbel_Kursus_Pendidikan    27
Hotel_Pariwisata             6
Seni_dan_Budaya             25
Lainnya                     22
dtype: int64


Classifying rows 501-1000 (Batch 2/26): 100%|██████████| 500/500 [04:33<00:00,  1.83it/s]



Saved: output_2/lkp_501_1000.csv
Category totals in this batch:
Tata_Rias_dan_Kecantikan    23
Tata_Busana                 24
Tata_Boga_Memasak           15
Teknik_Komputer             46
Teknik_Non_Komputer         21
Bahasa                      46
Bimbel_Kursus_Pendidikan    43
Hotel_Pariwisata            12
Seni_dan_Budaya             13
Lainnya                     21
dtype: int64


Classifying rows 1001-1500 (Batch 3/26): 100%|██████████| 500/500 [07:49<00:00,  1.07it/s]



Saved: output_2/lkp_1001_1500.csv
Category totals in this batch:
Tata_Rias_dan_Kecantikan    39
Tata_Busana                 23
Tata_Boga_Memasak           14
Teknik_Komputer             33
Teknik_Non_Komputer         32
Bahasa                      75
Bimbel_Kursus_Pendidikan    40
Hotel_Pariwisata            20
Seni_dan_Budaya             22
Lainnya                     20
dtype: int64


Classifying rows 1501-2000 (Batch 4/26): 100%|██████████| 500/500 [02:30<00:00,  3.31it/s]



Saved: output_2/lkp_1501_2000.csv
Category totals in this batch:
Tata_Rias_dan_Kecantikan    14
Tata_Busana                  7
Tata_Boga_Memasak            2
Teknik_Komputer             11
Teknik_Non_Komputer          9
Bahasa                      32
Bimbel_Kursus_Pendidikan     9
Hotel_Pariwisata            12
Seni_dan_Budaya             14
Lainnya                      4
dtype: int64


Classifying rows 2001-2500 (Batch 5/26): 100%|██████████| 500/500 [08:42<00:00,  1.05s/it]



Saved: output_2/lkp_2001_2500.csv
Category totals in this batch:
Tata_Rias_dan_Kecantikan    58
Tata_Busana                 53
Tata_Boga_Memasak           10
Teknik_Komputer             81
Teknik_Non_Komputer         11
Bahasa                      39
Bimbel_Kursus_Pendidikan    17
Hotel_Pariwisata             5
Seni_dan_Budaya              7
Lainnya                      4
dtype: int64


Classifying rows 2501-3000 (Batch 6/26): 100%|██████████| 500/500 [06:26<00:00,  1.29it/s]



Saved: output_2/lkp_2501_3000.csv
Category totals in this batch:
Tata_Rias_dan_Kecantikan    40
Tata_Busana                 47
Tata_Boga_Memasak            9
Teknik_Komputer             64
Teknik_Non_Komputer         16
Bahasa                      15
Bimbel_Kursus_Pendidikan    11
Hotel_Pariwisata             0
Seni_dan_Budaya              7
Lainnya                      7
dtype: int64


Classifying rows 3001-3500 (Batch 7/26): 100%|██████████| 500/500 [04:34<00:00,  1.82it/s]



Saved: output_2/lkp_3001_3500.csv
Category totals in this batch:
Tata_Rias_dan_Kecantikan    55
Tata_Busana                 28
Tata_Boga_Memasak           13
Teknik_Komputer             14
Teknik_Non_Komputer         15
Bahasa                       8
Bimbel_Kursus_Pendidikan     9
Hotel_Pariwisata             2
Seni_dan_Budaya             11
Lainnya                      6
dtype: int64


Classifying rows 3501-4000 (Batch 8/26): 100%|██████████| 500/500 [04:35<00:00,  1.82it/s]



Saved: output_2/lkp_3501_4000.csv
Category totals in this batch:
Tata_Rias_dan_Kecantikan    24
Tata_Busana                 44
Tata_Boga_Memasak           16
Teknik_Komputer             31
Teknik_Non_Komputer         18
Bahasa                      22
Bimbel_Kursus_Pendidikan    18
Hotel_Pariwisata             1
Seni_dan_Budaya             18
Lainnya                      7
dtype: int64


Classifying rows 4001-4500 (Batch 9/26): 100%|██████████| 500/500 [07:22<00:00,  1.13it/s]



Saved: output_2/lkp_4001_4500.csv
Category totals in this batch:
Tata_Rias_dan_Kecantikan    45
Tata_Busana                 21
Tata_Boga_Memasak           19
Teknik_Komputer             22
Teknik_Non_Komputer         19
Bahasa                      71
Bimbel_Kursus_Pendidikan    20
Hotel_Pariwisata            31
Seni_dan_Budaya             17
Lainnya                     24
dtype: int64


Classifying rows 4501-5000 (Batch 10/26): 100%|██████████| 500/500 [06:36<00:00,  1.26it/s]



Saved: output_2/lkp_4501_5000.csv
Category totals in this batch:
Tata_Rias_dan_Kecantikan    31
Tata_Busana                 49
Tata_Boga_Memasak           25
Teknik_Komputer             32
Teknik_Non_Komputer         12
Bahasa                      39
Bimbel_Kursus_Pendidikan    23
Hotel_Pariwisata             5
Seni_dan_Budaya             13
Lainnya                      4
dtype: int64


Classifying rows 5001-5500 (Batch 11/26): 100%|██████████| 500/500 [07:07<00:00,  1.17it/s]



Saved: output_2/lkp_5001_5500.csv
Category totals in this batch:
Tata_Rias_dan_Kecantikan    31
Tata_Busana                 51
Tata_Boga_Memasak            7
Teknik_Komputer             53
Teknik_Non_Komputer         14
Bahasa                      47
Bimbel_Kursus_Pendidikan    14
Hotel_Pariwisata             7
Seni_dan_Budaya             12
Lainnya                     14
dtype: int64


Classifying rows 5501-6000 (Batch 12/26): 100%|██████████| 500/500 [07:30<00:00,  1.11it/s]



Saved: output_2/lkp_5501_6000.csv
Category totals in this batch:
Tata_Rias_dan_Kecantikan    42
Tata_Busana                 60
Tata_Boga_Memasak           19
Teknik_Komputer             61
Teknik_Non_Komputer         34
Bahasa                      63
Bimbel_Kursus_Pendidikan    23
Hotel_Pariwisata             9
Seni_dan_Budaya             23
Lainnya                     14
dtype: int64


Classifying rows 6001-6500 (Batch 13/26): 100%|██████████| 500/500 [03:17<00:00,  2.53it/s]



Saved: output_2/lkp_6001_6500.csv
Category totals in this batch:
Tata_Rias_dan_Kecantikan    21
Tata_Busana                 14
Tata_Boga_Memasak            8
Teknik_Komputer             13
Teknik_Non_Komputer         10
Bahasa                      20
Bimbel_Kursus_Pendidikan    19
Hotel_Pariwisata            11
Seni_dan_Budaya              7
Lainnya                     12
dtype: int64


Classifying rows 6501-7000 (Batch 14/26): 100%|██████████| 500/500 [05:58<00:00,  1.40it/s]



Saved: output_2/lkp_6501_7000.csv
Category totals in this batch:
Tata_Rias_dan_Kecantikan    35
Tata_Busana                 14
Tata_Boga_Memasak           11
Teknik_Komputer             35
Teknik_Non_Komputer         25
Bahasa                      54
Bimbel_Kursus_Pendidikan    32
Hotel_Pariwisata            18
Seni_dan_Budaya             28
Lainnya                     27
dtype: int64


Classifying rows 7001-7500 (Batch 15/26): 100%|██████████| 500/500 [07:28<00:00,  1.11it/s]



Saved: output_2/lkp_7001_7500.csv
Category totals in this batch:
Tata_Rias_dan_Kecantikan    32
Tata_Busana                 66
Tata_Boga_Memasak           12
Teknik_Komputer             51
Teknik_Non_Komputer         48
Bahasa                      60
Bimbel_Kursus_Pendidikan    29
Hotel_Pariwisata            11
Seni_dan_Budaya             22
Lainnya                     23
dtype: int64


Classifying rows 7501-8000 (Batch 16/26): 100%|██████████| 500/500 [06:08<00:00,  1.36it/s]



Saved: output_2/lkp_7501_8000.csv
Category totals in this batch:
Tata_Rias_dan_Kecantikan    30
Tata_Busana                 51
Tata_Boga_Memasak           12
Teknik_Komputer             40
Teknik_Non_Komputer         32
Bahasa                      39
Bimbel_Kursus_Pendidikan    30
Hotel_Pariwisata            12
Seni_dan_Budaya             23
Lainnya                     19
dtype: int64


Classifying rows 8001-8500 (Batch 17/26): 100%|██████████| 500/500 [05:15<00:00,  1.59it/s]



Saved: output_2/lkp_8001_8500.csv
Category totals in this batch:
Tata_Rias_dan_Kecantikan    22
Tata_Busana                 18
Tata_Boga_Memasak            5
Teknik_Komputer             23
Teknik_Non_Komputer         19
Bahasa                      53
Bimbel_Kursus_Pendidikan    20
Hotel_Pariwisata            19
Seni_dan_Budaya             14
Lainnya                     12
dtype: int64


Classifying rows 8501-9000 (Batch 18/26): 100%|██████████| 500/500 [05:09<00:00,  1.62it/s]



Saved: output_2/lkp_8501_9000.csv
Category totals in this batch:
Tata_Rias_dan_Kecantikan    19
Tata_Busana                 21
Tata_Boga_Memasak            6
Teknik_Komputer             14
Teknik_Non_Komputer         10
Bahasa                      76
Bimbel_Kursus_Pendidikan    12
Hotel_Pariwisata             4
Seni_dan_Budaya             12
Lainnya                     14
dtype: int64


Classifying rows 9001-9500 (Batch 19/26): 100%|██████████| 500/500 [03:46<00:00,  2.20it/s]



Saved: output_2/lkp_9001_9500.csv
Category totals in this batch:
Tata_Rias_dan_Kecantikan    25
Tata_Busana                 25
Tata_Boga_Memasak            7
Teknik_Komputer             24
Teknik_Non_Komputer         14
Bahasa                      27
Bimbel_Kursus_Pendidikan    18
Hotel_Pariwisata             2
Seni_dan_Budaya             10
Lainnya                     10
dtype: int64


Classifying rows 9501-10000 (Batch 20/26): 100%|██████████| 500/500 [05:09<00:00,  1.61it/s]



Saved: output_2/lkp_9501_10000.csv
Category totals in this batch:
Tata_Rias_dan_Kecantikan    20
Tata_Busana                 15
Tata_Boga_Memasak            4
Teknik_Komputer             25
Teknik_Non_Komputer         32
Bahasa                      42
Bimbel_Kursus_Pendidikan    44
Hotel_Pariwisata            13
Seni_dan_Budaya             24
Lainnya                     10
dtype: int64


Classifying rows 10001-10500 (Batch 21/26): 100%|██████████| 500/500 [06:14<00:00,  1.33it/s]



Saved: output_2/lkp_10001_10500.csv
Category totals in this batch:
Tata_Rias_dan_Kecantikan    33
Tata_Busana                 37
Tata_Boga_Memasak           19
Teknik_Komputer             71
Teknik_Non_Komputer         48
Bahasa                      40
Bimbel_Kursus_Pendidikan    36
Hotel_Pariwisata             7
Seni_dan_Budaya             28
Lainnya                     28
dtype: int64


Classifying rows 10501-11000 (Batch 22/26): 100%|██████████| 500/500 [05:44<00:00,  1.45it/s]



Saved: output_2/lkp_10501_11000.csv
Category totals in this batch:
Tata_Rias_dan_Kecantikan    18
Tata_Busana                 15
Tata_Boga_Memasak            7
Teknik_Komputer             70
Teknik_Non_Komputer         38
Bahasa                      50
Bimbel_Kursus_Pendidikan    51
Hotel_Pariwisata            14
Seni_dan_Budaya             13
Lainnya                     32
dtype: int64


Classifying rows 11001-11500 (Batch 23/26): 100%|██████████| 500/500 [04:24<00:00,  1.89it/s]



Saved: output_2/lkp_11001_11500.csv
Category totals in this batch:
Tata_Rias_dan_Kecantikan    13
Tata_Busana                 27
Tata_Boga_Memasak            7
Teknik_Komputer             53
Teknik_Non_Komputer         11
Bahasa                      43
Bimbel_Kursus_Pendidikan    19
Hotel_Pariwisata             1
Seni_dan_Budaya              6
Lainnya                      9
dtype: int64


Classifying rows 11501-12000 (Batch 24/26): 100%|██████████| 500/500 [05:21<00:00,  1.56it/s]



Saved: output_2/lkp_11501_12000.csv
Category totals in this batch:
Tata_Rias_dan_Kecantikan    49
Tata_Busana                 22
Tata_Boga_Memasak           13
Teknik_Komputer             37
Teknik_Non_Komputer         15
Bahasa                      40
Bimbel_Kursus_Pendidikan    16
Hotel_Pariwisata             7
Seni_dan_Budaya             13
Lainnya                      8
dtype: int64


Classifying rows 12001-12500 (Batch 25/26): 100%|██████████| 500/500 [04:45<00:00,  1.75it/s]



Saved: output_2/lkp_12001_12500.csv
Category totals in this batch:
Tata_Rias_dan_Kecantikan    23
Tata_Busana                 29
Tata_Boga_Memasak            9
Teknik_Komputer             55
Teknik_Non_Komputer         22
Bahasa                      47
Bimbel_Kursus_Pendidikan    25
Hotel_Pariwisata             4
Seni_dan_Budaya             13
Lainnya                     24
dtype: int64


Classifying rows 12501-12601 (Batch 26/26): 100%|██████████| 101/101 [01:03<00:00,  1.58it/s]


Saved: output_2/lkp_12501_12601.csv
Category totals in this batch:
Tata_Rias_dan_Kecantikan     5
Tata_Busana                  8
Tata_Boga_Memasak            2
Teknik_Komputer              9
Teknik_Non_Komputer          7
Bahasa                      13
Bimbel_Kursus_Pendidikan     4
Hotel_Pariwisata             3
Seni_dan_Budaya              2
Lainnya                     12
dtype: int64

DONE. All batches saved to: output_2





In [6]:
import os
import pandas as pd

# =============================================
# CONFIG
# =============================================
INPUT_FOLDER = "output_2"
OUTPUT_FILE = "lkp_all.csv"

# =============================================
# STEP 1 — COLLECT & SORT FILES
# =============================================
files = [
    os.path.join(INPUT_FOLDER, f)
    for f in os.listdir(INPUT_FOLDER)
    if f.endswith(".csv")
]
files = sorted(files)

print("Files found:", len(files))
if not files:
    raise FileNotFoundError(f"No CSV files found in: {INPUT_FOLDER}")

# =============================================
# STEP 2 — MERGE
# =============================================
df_list = [pd.read_csv(f) for f in files]
final_df = pd.concat(df_list, ignore_index=True)

print("Merged shape:", final_df.shape)

# =============================================
# STEP 3 — SAVE
# =============================================
final_df.to_csv(OUTPUT_FILE, index=False, encoding="utf-8-sig")
print(f"Saved merged file: {OUTPUT_FILE}")


Files found: 26
Merged shape: (12601, 45)
Saved merged file: lkp_all.csv
