# Prediksi Tingkat Kemiskinan di Indonesia
**Deskripsi:** Notebook ini membangun model prediksi tingkat kemiskinan dengan menggabungkan data klasifikasi kemiskinan, data sosial-ekonomi 2021, dan indikator pendidikan provinsi 2023.


In [1]:
import os
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
import joblib

# optional / advanced
try:
    import xgboost as xgb
except Exception:
    xgb = None

try:
    import shap
except Exception:
    shap = None

print("Libraries imported")


Libraries imported


In [2]:
DATA_DIR = "data"  # folder tempat CSV
Poverty_path = os.path.join(DATA_DIR, "klasifikasi_kemiskinan.csv")
Socio_path   = os.path.join(DATA_DIR, "socio_economic.csv")
Edu_path     = os.path.join(DATA_DIR, "data_pendidikan.csv")

print("Expected paths:")
print(Poverty_path)
print(Socio_path)
print(Edu_path)


Expected paths:
data/klasifikasi_kemiskinan.csv
data/socio_economic.csv
data/data_pendidikan.csv


In [3]:
def safe_read_csv(path):
    if not os.path.exists(path):
        print(f"File not found: {path}")
        return None
    encodings = ['utf-8', 'latin1', 'cp1252']
    for e in encodings:
        try:
            return pd.read_csv(path, encoding=e)
        except Exception:
            continue
    try:
        return pd.read_csv(path)
    except Exception as ex:
        print(f"Failed to read {path}: {ex}")
        return None

def normalize_columns(df):
    df = df.copy()
    df.columns = (
        df.columns.str.strip()
                  .str.lower()
                  .str.replace(' ', '_')
                  .str.replace('-', '_')
                  .str.replace('\\n', '_')
    )
    return df


In [4]:
poverty = safe_read_csv(Poverty_path)
socio   = safe_read_csv(Socio_path)
edu     = safe_read_csv(Edu_path)

for name, df in [('poverty', poverty), ('socio', socio), ('edu', edu)]:
    if df is None:
        print(f"{name} -> NOT LOADED")
    else:
        print(f"{name} -> loaded, shape: {df.shape}")
        display(df.head(3))


File not found: data/klasifikasi_kemiskinan.csv
File not found: data/socio_economic.csv
File not found: data/data_pendidikan.csv
poverty -> NOT LOADED
socio -> NOT LOADED
edu -> NOT LOADED


In [5]:
if poverty is not None:
    poverty = normalize_columns(poverty)
if socio is not None:
    socio = normalize_columns(socio)
if edu is not None:
    edu = normalize_columns(edu)

print("Sample columns (poverty):", (poverty.columns.tolist()[:30] if poverty is not None else []))
print("Sample columns (socio):", (socio.columns.tolist()[:30] if socio is not None else []))
print("Sample columns (edu):", (edu.columns.tolist()[:30] if edu is not None else []))


Sample columns (poverty): []
Sample columns (socio): []
Sample columns (edu): []


In [6]:
def find_possible_join_cols(df):
    candidates = ['provinsi','province','kode_prov','kode_provinsi','nama_provinsi','nama_prov','kabupaten','district','prov']
    found = [c for c in df.columns if any(k in c for k in candidates)]
    return found

print("Possible join cols poverty:", find_possible_join_cols(poverty) if poverty is not None else [])
print("Possible join cols socio:", find_possible_join_cols(socio) if socio is not None else [])
print("Possible join cols edu:", find_possible_join_cols(edu) if edu is not None else [])


Possible join cols poverty: []
Possible join cols socio: []
Possible join cols edu: []


In [7]:
def unify_and_merge(poverty, socio, edu, join_name='provinsi'):
    dfs = [poverty, socio, edu]
    # attempt to rename first matching join col to 'provinsi' in each df
    candidates = ['provinsi','province','nama_prov','nama_provinsi','prov']
    for df in dfs:
        if df is None:
            continue
        found = None
        for c in df.columns:
            for cand in candidates:
                if cand in c:
                    found = c
                    break
            if found:
                break
        if found:
            df.rename(columns={found: join_name}, inplace=True)
    # now try full merge on provinsi
    merged = poverty.copy() if poverty is not None else None
    if merged is not None and socio is not None and 'provinsi' in merged.columns and 'provinsi' in socio.columns:
        merged = merged.merge(socio, on='provinsi', how='left')
    elif merged is not None and socio is not None:
        # try any shared column
        shared = set(merged.columns).intersection(set(socio.columns))
        if shared:
            key = list(shared)[0]
            merged = merged.merge(socio, on=key, how='left')
    # attach edu (provinsi) if possible
    if merged is not None and edu is not None and 'provinsi' in edu.columns:
        merged = merged.merge(edu, on='provinsi', how='left')
    return merged

In [8]:
df = unify_and_merge(poverty, socio, edu)
print("Merged dataframe shape:", df.shape if df is not None else None)

Merged dataframe shape: None


In [None]:
def detect_target_column(df):
    for c in df.columns:
        key = c.lower()
        if 'kemiskin' in key or 'poverty' in key or 'tingkat' in key or 'status' in key:
            return c
    return None

target_col = detect_target_column(df)
print("Auto-detected target column:", target_col)

if target_col is None:
    print("==> Edit variable target_col manually. Contoh: target_col = 'tingkat_kemiskinan'")
    print("Available columns:", df.columns.tolist()[:100])


AttributeError: 'NoneType' object has no attribute 'columns'