In [1]:
# ============================================================
# PREPROCESSING II – FEATURE ENGINEERING
# LASSO & PCA (Opsi A: PCA hanya pada fitur numerik)
# ============================================================

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression, Lasso
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

import warnings, os
warnings.filterwarnings("ignore")

In [2]:
# ============================================================
# 1. LOAD DATA
# ============================================================

train_df = pd.read_csv("data_train_preprocessed.csv")
test_df  = pd.read_csv("data_test_preprocessed.csv")

print("Train shape :", train_df.shape)
print("Test shape  :", test_df.shape)
train_df.head()

Train shape : (350, 11)
Test shape  : (150, 11)


Unnamed: 0,Umur,Pendapatan,JumlahPinjaman,LamaBekerja,Pendidikan,StatusPerkawinan,Pekerjaan,TujuanPinjaman,KodeKota,RiwayatPinjaman,StatusPinjaman
0,56,7060000.0,2570000.0,15,1,0,0,4,58,0,1
1,31,4110000.0,7210000.0,7,1,1,1,4,66,0,1
2,45,7070000.0,7150000.0,6,3,0,4,1,61,1,0
3,19,10470000.0,17900000.0,3,1,0,4,2,12,1,0
4,27,7070000.0,4540000.0,13,2,1,4,3,31,0,0


In [3]:

# ============================================================
# ---------------------- BAGIAN A ----------------------------
#     FEATURE ENGINEERING UNTUK KLASIFIKASI
#     Target: StatusPinjaman
# ============================================================

target_class = "StatusPinjaman"

y_class_train = train_df[target_class]
y_class_test  = test_df[target_class]

X_class_train = train_df.drop(columns=[target_class]).copy()
X_class_test  = test_df.drop(columns=[target_class]).copy()

# Numeric columns untuk PCA (Opsi A)
numeric_cols_class = ["Umur", "Pendapatan", "JumlahPinjaman", "LamaBekerja"]
numeric_cols_class = [c for c in numeric_cols_class if c in X_class_train.columns]

print("Numeric cols (Classification PCA):", numeric_cols_class)


Numeric cols (Classification PCA): ['Umur', 'Pendapatan', 'JumlahPinjaman', 'LamaBekerja']


In [4]:
# ============================================================
# A.1 — LASSO FEATURE SELECTION (LOGISTIC L1)
# ============================================================

max_feats_class = min(15, X_class_train.shape[1])

l1_model_class = LogisticRegression(
    penalty="l1",
    C=0.05,
    solver="liblinear",
    random_state=42,
    max_iter=1000
)

selector_class = SelectFromModel(l1_model_class, max_features=max_feats_class)
selector_class.fit(X_class_train, y_class_train)

selected_features_class = X_class_train.columns[selector_class.get_support()]
print("Selected features (classification):")
print(selected_features_class)

# Simpan hasil LASSO
X_class_train_selected = X_class_train[selected_features_class]
X_class_test_selected  = X_class_test[selected_features_class]

X_class_train_selected.to_csv("X_class_train_selected.csv", index=False)
X_class_test_selected.to_csv("X_class_test_selected.csv", index=False)

# Importance
lasso_importance_class = pd.DataFrame({
    "Feature": X_class_train.columns,
    "Importance": np.abs(selector_class.estimator_.coef_[0])
}).sort_values("Importance", ascending=False)

lasso_importance_class.to_csv("lasso_class_importances.csv", index=False)


Selected features (classification):
Index(['Umur', 'LamaBekerja', 'KodeKota'], dtype='object')


In [5]:

# ============================================================
# A.2 — PCA UNTUK FITUR NUMERIK
# ============================================================

scaler_class = StandardScaler()

X_train_num = X_class_train[numeric_cols_class]
X_test_num  = X_class_test[numeric_cols_class]

X_train_scaled = scaler_class.fit_transform(X_train_num)
X_test_scaled  = scaler_class.transform(X_test_num)

# PCA (retain 95% variance)
pca_class = PCA(n_components=0.95, random_state=42)
pca_class.fit(X_train_scaled)

print("Jumlah PC (classification):", pca_class.n_components_)

X_train_pca = pca_class.transform(X_train_scaled)
X_test_pca  = pca_class.transform(X_test_scaled)

pc_cols_class = [f"PC{i+1}" for i in range(pca_class.n_components_)]

df_train_pca = pd.DataFrame(X_train_pca, columns=pc_cols_class)
df_test_pca  = pd.DataFrame(X_test_pca,  columns=pc_cols_class)

# Gabungkan dengan fitur non-numeric
non_num_cols_class = [c for c in X_class_train.columns if c not in numeric_cols_class]

X_class_train_pca = pd.concat([X_class_train[non_num_cols_class].reset_index(drop=True),
                               df_train_pca.reset_index(drop=True)], axis=1)

X_class_test_pca = pd.concat([X_class_test[non_num_cols_class].reset_index(drop=True),
                              df_test_pca.reset_index(drop=True)], axis=1)

X_class_train_pca.to_csv("X_class_train_pca.csv", index=False)
X_class_test_pca.to_csv("X_class_test_pca.csv", index=False)

Jumlah PC (classification): 4


In [6]:
# ============================================================
# ---------------------- BAGIAN B ----------------------------
#     FEATURE ENGINEERING UNTUK REGRESI
#     Target: Pendapatan
# ============================================================

target_reg = "Pendapatan"

y_reg_train = train_df[target_reg]
y_reg_test  = test_df[target_reg]

X_reg_train = train_df.drop(columns=[target_reg]).copy()
X_reg_test  = test_df.drop(columns=[target_reg]).copy()

# Numeric PCA fields untuk regresi
numeric_cols_reg = ["Umur", "JumlahPinjaman", "LamaBekerja"]
numeric_cols_reg = [c for c in numeric_cols_reg if c in X_reg_train.columns]

print("Numeric cols (Regression PCA):", numeric_cols_reg)


Numeric cols (Regression PCA): ['Umur', 'JumlahPinjaman', 'LamaBekerja']


In [7]:
# ============================================================
# B.1 — LASSO FEATURE SELECTION (REGRESSION)
# ============================================================

max_feats_reg = min(15, X_reg_train.shape[1])

lasso_reg = Lasso(alpha=0.1, random_state=42, max_iter=10000)
selector_reg = SelectFromModel(lasso_reg, max_features=max_feats_reg)
selector_reg.fit(X_reg_train, y_reg_train)

selected_features_reg = X_reg_train.columns[selector_reg.get_support()]
print("Selected features (regression):")
print(selected_features_reg)

X_reg_train_selected = X_reg_train[selected_features_reg]
X_reg_test_selected  = X_reg_test[selected_features_reg]

X_reg_train_selected.to_csv("X_reg_train_selected.csv", index=False)
X_reg_test_selected.to_csv("X_reg_test_selected.csv", index=False)

# Importance
lasso_importance_reg = pd.DataFrame({
    "Feature": X_reg_train.columns,
    "Importance": np.abs(selector_reg.estimator_.coef_)
}).sort_values("Importance", ascending=False)

lasso_importance_reg.to_csv("lasso_reg_importances.csv", index=False)

Selected features (regression):
Index(['Umur', 'JumlahPinjaman', 'LamaBekerja', 'Pendidikan',
       'StatusPerkawinan', 'Pekerjaan', 'TujuanPinjaman', 'KodeKota',
       'RiwayatPinjaman', 'StatusPinjaman'],
      dtype='object')


In [8]:
# ============================================================
# B.2 — PCA UNTUK FITUR NUMERIK REGRESI
# ============================================================

scaler_reg = StandardScaler()

X_train_num_reg = X_reg_train[numeric_cols_reg]
X_test_num_reg  = X_reg_test[numeric_cols_reg]

X_train_scaled_reg = scaler_reg.fit_transform(X_train_num_reg)
X_test_scaled_reg  = scaler_reg.transform(X_test_num_reg)

pca_reg = PCA(n_components=0.95, random_state=42)
pca_reg.fit(X_train_scaled_reg)

print("Jumlah PC (regression):", pca_reg.n_components_)

X_train_pca_reg = pca_reg.transform(X_train_scaled_reg)
X_test_pca_reg  = pca_reg.transform(X_test_scaled_reg)

pc_cols_reg = [f"PC{i+1}" for i in range(pca_reg.n_components_)]

df_train_pca_reg = pd.DataFrame(X_train_pca_reg, columns=pc_cols_reg)
df_test_pca_reg  = pd.DataFrame(X_test_pca_reg,  columns=pc_cols_reg)

non_num_cols_reg = [c for c in X_reg_train.columns if c not in numeric_cols_reg]

X_reg_train_pca = pd.concat([X_reg_train[non_num_cols_reg].reset_index(drop=True),
                             df_train_pca_reg.reset_index(drop=True)], axis=1)

X_reg_test_pca = pd.concat([X_reg_test[non_num_cols_reg].reset_index(drop=True),
                            df_test_pca_reg.reset_index(drop=True)], axis=1)

X_reg_train_pca.to_csv("X_reg_train_pca.csv", index=False)
X_reg_test_pca.to_csv("X_reg_test_pca.csv", index=False)

Jumlah PC (regression): 3


In [9]:
# ============================================================
# SIMPAN TARGET LABEL
# ============================================================

y_class_train.to_csv("y_class_train.csv", index=False)
y_class_test.to_csv("y_class_test.csv", index=False)
y_reg_train.to_csv("y_reg_train.csv", index=False)
y_reg_test.to_csv("y_reg_test.csv", index=False)

print("\n=== DONE: All Feature Engineering Outputs Generated ===")


=== DONE: All Feature Engineering Outputs Generated ===
