In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# =========================================================
# 1. LOAD DATA & RENAME COLUMNS
# =========================================================

# Path of the Excel workbook, resolved relative to the current working directory.
file_path = "Students_Performance_data_set.xlsx"
df_raw = pd.read_excel(file_path)

# Maps the long survey-question headers of the workbook to short column
# names used by the rest of the script. Headers that are not listed here
# keep their original name after the rename.
rename_columns = {
    "University Admission year": "AdmYear",
    "Age": "Age",
    "H.S.C passing year": "HSCYear",
    "Current Semester": "Semester",
    "Do you have meritorious scholarship ?": "Scholarship",
    "How many hour do you study daily?": "Study Hours",
    "How many times do you seat for study in a day?": "Study Frequency",
    "What is your preferable learning mode?": "Learn Mode",
    "How many hour do you spent daily in social media?": "Social Media",
    "Status of your English language proficiency": "English Skill",
    "Average attendance on class": "Attendance",
    "Did you ever fall in probation?": "Probation",
    "Do you attend in teacher consultancy for any kind of academical problems?": "Consultancy",
    "How many hour do you spent daily on your skill development?": "Skill Hours",
    "Are you engaged with any co-curriculum activities?": "Co Curricular",
    "Do you have any health issues?": "Health Issues",
    "What was your previous SGPA?": "SGPA",
    "What is your current CGPA?": "CGPA",
    "How many Credit did you have completed?": "Credits",
    "What is your monthly family income?": "Family Income",
}
# rename() returns a new frame; df_raw keeps the original headers.
df = df_raw.rename(columns=rename_columns)

# The 20 criteria, listed in the same order as the Excel/AHP/TOPSIS sheets.
criteria_cols = [
    "AdmYear",
    "Age",
    "HSCYear",
    "Semester",
    "Scholarship",
    "Study Hours",
    "Study Frequency",
    "Learn Mode",
    "Social Media",
    "English Skill",
    "Attendance",
    "Probation",
    "Consultancy",
    "Skill Hours",
    "Co Curricular",
    "Health Issues",
    "SGPA",
    "CGPA",
    "Credits",
    "Family Income",
]

# Keep only the first 20 records and relabel them A1..A20 (the alternatives).
data = df.loc[:, criteria_cols].iloc[:20].copy()
alt_ids = ["A" + str(position) for position, _ in enumerate(data.index, start=1)]
data.index = alt_ids

# =========================================================
# 2. ENCODING KATEGORIK (sesuai dataset kamu)
# =========================================================

def bin_encode(series, pos_values):
    """Return a 0/1 Series flagging which elements count as positive.

    Each element is converted to text, stripped of surrounding whitespace
    and lower-cased; it maps to 1 when that text is a member of
    ``pos_values`` and to 0 otherwise. The index of ``series`` is kept.
    """

    def _flag(raw):
        token = str(raw).strip().lower()
        if token in pos_values:
            return 1
        return 0

    return series.apply(_flag)

# Yes/No columns: "yes" or "1" (case/whitespace-insensitive) becomes 1, anything else 0.
for _binary_col in (
    "Scholarship",
    "Probation",
    "Consultancy",
    "Co Curricular",
    "Health Issues",
):
    data[_binary_col] = bin_encode(data[_binary_col], {"yes", "1"})

# Learn Mode: 1 only for "Offline", every other answer becomes 0
# (mirrors the Excel formula).
data["Learn Mode"] = bin_encode(data["Learn Mode"], {"offline"})


# English Skill
def encode_english_skill(x):
    """Convert an English-proficiency answer to an ordinal code.

    The answer is compared after str(), strip() and lower():
    "basic" -> 1, "intermediate" -> 2, "advanced"/"advance" -> 3.
    Any other value (including missing ones) yields 0.
    """
    level_codes = {
        "basic": 1,
        "intermediate": 2,
        "advanced": 3,
        "advance": 3,
    }
    return level_codes.get(str(x).strip().lower(), 0)

data["English Skill"] = data["English Skill"].map(encode_english_skill)

# Columns that are numeric in the source sheet (not produced by encoding).
numeric_cols = [
    "AdmYear",
    "Age",
    "HSCYear",
    "Semester",
    "Study Hours",
    "Study Frequency",
    "Social Media",
    "Attendance",
    "Skill Hours",
    "SGPA",
    "CGPA",
    "Credits",
    "Family Income",
]

# Values that cannot be parsed as numbers become NaN.
data[numeric_cols] = data[numeric_cols].apply(pd.to_numeric, errors="coerce")

# Cast everything to float, then replace NaN with the per-column median.
data = data.astype(float)
column_medians = data.median(numeric_only=True)
data = data.fillna(column_medians)

print("\n=== DATASET SETELAH ENCODING (20 ALT x 20 KRITERIA) ===")
print(data)

# =========================================================
# 3. MIN–MAX NORMALISATION (like the "Normalisasi" table in Excel)
# =========================================================

# Fit the scaler on the encoded criteria, then rescale the same matrix
# column by column.
scaler = MinMaxScaler()
encoded_matrix = data[criteria_cols].to_numpy()
scaler.fit(encoded_matrix)
norm_array = scaler.transform(encoded_matrix)
norm_df = pd.DataFrame(data=norm_array, columns=criteria_cols, index=alt_ids)

print("\n=== TABEL NORMALISASI MIN–MAX (0–1) ===")
print(norm_df)

# =========================================================
# 4. K-MEANS MANUAL (k=3, centroid awal = A1, A2, A3)
# =========================================================

def kmeans_manual(X, k=3, max_iter=100, tol=1e-6):
    """
    Cluster the rows of X with a deterministic, hand-rolled k-means.

    The first k rows of X are used as the initial centroids (no random
    seeding), so the same input always produces the same result.

    Parameters
    ----------
    X : DataFrame of shape (n_alt, n_feat). Its index labels the rows of
        every per-iteration distance table.
    k : number of clusters; must satisfy 1 <= k <= n_alt.
    max_iter : maximum number of assignment/update rounds.
    tol : stop once the largest absolute centroid coordinate change in a
        round is below this value.

    Returns
    -------
    labels : int array of shape (n_alt,), 0-based cluster index of each
        row from the last assignment step.
    centroids : float array of shape (k, n_feat), the final centroids.
    history : list of (iteration_number, DataFrame) tuples. Each frame
        holds the Euclidean distance of every row to C1..Ck plus a
        "Cluster" column with the assigned cluster name.

    Raises
    ------
    ValueError
        If k is not between 1 and the number of rows of X.
    """
    # Work in float: with integer input the centroid arrays would inherit
    # the integer dtype and every cluster mean would be truncated.
    X_values = np.asarray(X.values, dtype=float)
    n_alt = X_values.shape[0]

    if not 1 <= k <= n_alt:
        raise ValueError(
            f"k must be between 1 and the number of rows ({n_alt}); got {k}"
        )

    # Initial centroids: rows 0..k-1 (A1..Ak) become C1..Ck.
    centroids = X_values[:k].copy()

    labels = np.zeros(n_alt, dtype=int)  # returned unchanged when max_iter < 1
    last_labels = None
    history = []
    centroid_names = [f"C{j+1}" for j in range(k)]

    for it in range(1, max_iter + 1):
        # Euclidean distance of every row to every centroid, shape (n_alt, k).
        diffs = X_values[:, None, :] - centroids[None, :, :]
        dists = np.sqrt((diffs ** 2).sum(axis=2))

        # Assignment step: nearest centroid wins (lowest index on ties).
        labels = np.argmin(dists, axis=1)

        # Record the distance table and assignment of this iteration.
        dist_df = pd.DataFrame(dists, index=X.index, columns=centroid_names)
        dist_df["Cluster"] = [centroid_names[c] for c in labels]
        history.append((it, dist_df))

        # Update step: a cluster without members keeps its previous centroid.
        new_centroids = centroids.copy()
        for j in range(k):
            members = X_values[labels == j]
            if len(members) > 0:
                new_centroids[j] = members.mean(axis=0)

        # Converged when the assignment repeats or the centroids barely move.
        labels_stable = last_labels is not None and np.array_equal(labels, last_labels)
        shift = np.max(np.abs(new_centroids - centroids))
        centroids = new_centroids
        if labels_stable or shift < tol:
            break

        last_labels = labels.copy()

    return labels, centroids, history

cluster_labels_idx, centroids, kmeans_history = kmeans_manual(norm_df, k=3)

# Distance table and assignment of the last iteration
# (comparable to the "Iterasi 2" sheet in Excel).
last_it, last_dist_df = kmeans_history[-1]
print(f"\n=== K-MEANS – ITERASI TERAKHIR (Iterasi {last_it}) ===")
print(last_dist_df)

# Alternative id -> cluster name (C1..C3), in the order of alt_ids.
cluster_map = {
    alt: f"C{label_idx + 1}"
    for alt, label_idx in zip(alt_ids, cluster_labels_idx)
}
cluster_series = pd.Series(cluster_map, name="Cluster")
print("\n=== CLUSTER SETIAP ALTERNATIF ===")
print(cluster_series)

# =========================================================
# 5. NEED INDEX (RISK INDEX) PER CLUSTER
#    based on the Benefit/Cost/Neutral attribute of each criterion
# =========================================================

# Direction of each criterion for the Need Index (NOT used by TOPSIS).
need_attr = {
    "AdmYear": "benefit", "Age": "neutral", "HSCYear": "benefit",
    "Semester": "benefit", "Scholarship": "benefit",
    "Study Hours": "benefit", "Study Frequency": "benefit",
    "Learn Mode": "benefit", "Social Media": "cost",
    "English Skill": "benefit", "Attendance": "benefit",
    "Probation": "cost", "Consultancy": "benefit",
    "Skill Hours": "benefit", "Co Curricular": "benefit",
    "Health Issues": "cost", "SGPA": "benefit", "CGPA": "benefit",
    "Credits": "benefit", "Family Income": "neutral",
}

cluster_ids = [0, 1, 2]
cluster_names = ["C" + str(cid + 1) for cid in cluster_ids]

# Members of each cluster, kept in the iteration order of cluster_map.
cluster_members = {
    name: [alt for alt, assigned in cluster_map.items() if assigned == name]
    for name in cluster_names
}

# Risk of one criterion at a centroid: benefit -> 1 - value,
# cost -> value, neutral -> constant 0.5. The need index of a cluster is
# the mean of these risks over all criteria.
need_index_list = []
for j, cname in zip(cluster_ids, cluster_names):
    centroid = centroids[j]  # normalised centroid, one value per criterion
    risk_vals = [
        (1.0 - centroid[pos]) if need_attr[name] == "benefit"
        else centroid[pos] if need_attr[name] == "cost"
        else 0.5
        for pos, name in enumerate(criteria_cols)
    ]
    members_of_cluster = cluster_members[cname]
    need_index_list.append(
        dict(
            Cluster=cname,
            Mean_Need_Index=float(np.mean(risk_vals)),
            Count=len(members_of_cluster),
            Members=", ".join(members_of_cluster),
        )
    )

need_index_df = pd.DataFrame(need_index_list).set_index("Cluster")
print("\n=== MEAN NEED INDEX PER CLUSTER (C1/C2/C3) ===")
print(need_index_df)

# Highest mean need index -> "Tinggi", then "Sedang", then "Rendah".
ordered_clusters = need_index_df.sort_values(
    "Mean_Need_Index", ascending=False
).index.tolist()
label_map = {}
if len(ordered_clusters) == 3:
    label_map.update(
        zip(
            ordered_clusters,
            (
                "Butuh Bimbingan Tinggi",
                "Butuh Bimbingan Sedang",
                "Butuh Bimbingan Rendah",
            ),
        )
    )

need_index_df["Label"] = need_index_df.index.map(label_map)

print("\n=== CLUSTER + LABEL BUTUH BIMBINGAN ===")
print(need_index_df)

# Cluster with the highest need and the alternatives it contains.
high_cluster = ordered_clusters[0]
high_need_alts = cluster_members[high_cluster]

print(f"\n=== ALTERNATIF DI CLUSTER '{high_cluster}' ({label_map[high_cluster]}) ===")
print(high_need_alts)

# Alternative -> cluster -> label lookup table, indexed by alternative id.
alt_cluster_df = pd.DataFrame(
    {"Cluster": [cluster_map[a] for a in alt_ids]},
    index=pd.Index(alt_ids, name="Alternatif"),
)
alt_cluster_df["Label"] = alt_cluster_df["Cluster"].map(label_map)

print("\n=== TABEL ALTERNATIF + CLUSTER + LABEL FINAL ===")
print(alt_cluster_df)

# =========================================================
# 6. AHP – FROM THE PAIRWISE COMPARISON MATRIX
# =========================================================

# 20 x 20 pairwise comparison matrix. Row i / column j holds the stated
# importance of criterion i relative to criterion j; rows and columns follow
# the order of the row-name comments below (same order as criteria_cols).
# NOTE(review): the matrix is not perfectly reciprocal, e.g. the Scholarship
# row has 1 in the Learn Mode column while the Learn Mode row has 2 in the
# Scholarship column (a reciprocal pair would multiply to 1). Confirm these
# entries against the source spreadsheet.
pairwise_matrix = np.array([
    # AdmYear
    [1,   1,   1,   1,   1/2, 1/2, 1/2, 1,   1/2, 1/2, 1/2, 1/2, 1/2, 1/2, 1,   1/2, 1/3, 1/3, 1/2, 1],
    # Age
    [1,   1,   1/2, 1/2, 1/2, 1/3, 1/3, 1/2, 1/3, 1/3, 1/4, 1/3, 1/3, 1/2, 1,   1/2, 1/4, 1/5, 1/3, 1/2],
    # HSCYear
    [1,   2,   1,   1,   1/2, 1/2, 1/2, 1,   1/2, 1/2, 1/2, 1/2, 1/2, 1/2, 1,   1/2, 1/3, 1/3, 1/2, 1],
    # Semester
    [1,   2,   1,   1,   1/2, 1/2, 1/2, 1,   1/2, 1/2, 1/2, 1/2, 1/2, 1/2, 1,   1/2, 1/3, 1/3, 1/2, 1],
    # Scholarship
    [2,   2,   2,   2,   1,   1/2, 1,   1,   1,   1,   1/2, 1/2, 1,   1,   2,   1,   1/2, 1/2, 1,   1],
    # Study Hours
    [2,   3,   2,   2,   2,   1,   1,   2,   1,   1,   1,   1,   1,   1,   3,   1,   1/2, 1/2, 1,   2],
    # Study Frequency
    [2,   3,   2,   2,   1,   1,   1,   2,   1,   1,   1/2, 1,   1,   1,   3,   1,   1/2, 1/2, 1,   2],
    # Learn Mode
    [1,   2,   1,   1,   2,   1/2, 1/2, 1,   1/2, 1/2, 1/2, 1/2, 1/2, 1/2, 1,   1/2, 1/3, 1/3, 1/2, 1],
    # Social Media
    [2,   3,   2,   2,   1,   1,   1,   2,   1,   1,   1/2, 1,   1,   1,   3,   1,   1/2, 1/2, 1,   2],
    # English Skill
    [2,   3,   2,   2,   1,   1,   1,   2,   1,   1,   1/2, 1,   1,   1,   3,   1,   1/2, 1/2, 1,   2],
    # Attendance
    [2,   4,   2,   2,   2,   1,   2,   2,   2,   2,   1,   1,   1,   2,   4,   2,   1,   1,   2,   2],
    # Probation
    [2,   3,   2,   2,   1,   1,   1,   2,   1,   1,   1,   1,   1,   1,   3,   1,   1/2, 1/2, 1,   2],
    # Consultancy
    [2,   2,   1,   1,   1,   1,   1,   2,   1,   1,   1/2, 1,   1,   1,   3,   1,   1/2, 1/2, 1,   2],
    # Skill Hours
    [2,   2,   2,   2,   1,   1,   1,   2,   1,   1,   1/2, 1,   1,   1,   2,   1,   1/2, 1/2, 1,   1],
    # Co Curricular
    [1,   1,   1,   1,   1/2, 1/3, 1/3, 1/2, 1/3, 1/3, 1/4, 1/3, 1/3, 1/2, 1,   1/2, 1/4, 1/5, 1/3, 1/2],
    # Health Issues
    [2,   2,   2,   2,   2,   1/2, 1,   2,   1,   1,   1/2, 1/2, 1,   1,   2,   1,   1/2, 1/2, 1,   1],
    # SGPA
    [3,   4,   3,   3,   2,   1,   2,   3,   2,   2,   1,   1,   2,   2,   4,   2,   1,   1,   2,   3],
    # CGPA
    [3,   3,   3,   3,   2,   2,   2,   3,   2,   2,   1,   2,   2,   2,   4,   2,   1,   1,   2,   3],
    # Credits
    [2,   3,   2,   2,   1,   1,   1,   2,   1,   1,   1/2, 1,   1,   1,   3,   1,   1/2, 1/2, 1,   2],
    # Family Income
    [1,   2,   1,   1,   1,   1/2, 1,   2,   1,   1,   1/2, 1/2, 1,   1,   2,   1,   1/2, 1/2, 1,   1],
], dtype=float)

n = len(pairwise_matrix)

# Column normalisation: divide every entry by the sum of its column.
col_sums = np.sum(pairwise_matrix, axis=0)
norm_A = np.divide(pairwise_matrix, col_sums)

# Priority vector = row means of the normalised matrix, rescaled so the
# weights are guaranteed to add up to 1.
weights = np.mean(norm_A, axis=1)
weights = np.divide(weights, np.sum(weights))

ahp_weights_df = pd.DataFrame(
    {"Weight": weights},
    index=pd.Index(criteria_cols, name="Kriteria"),
)

print("\n=== BOBOT AHP (DARI MATRIKS PERBANDINGAN) ===")
print(ahp_weights_df)

# Consistency check: lambda_max is estimated as the mean of (A w)_i / w_i,
# CI = (lambda_max - n) / (n - 1), CR = CI / RI.
Aw = np.matmul(pairwise_matrix, weights)
lambda_max = np.mean(np.divide(Aw, weights))
CI = (lambda_max - n) / (n - 1)
# Random index used for n=20.
# NOTE(review): 1.59 is the value Saaty's table commonly lists for n=15;
# confirm the random index intended for a 20 x 20 matrix.
RI = 1.59
CR = CI / RI

print("\n=== AHP CONSISTENCY CHECK ===")
print(f"lambda_max = {lambda_max:.4f}")
print(f"CI         = {CI:.6f}")
print(f"CR         = {CR:.6f}")

def score_excel_style(row):
    """Convert one encoded alternative into banded scores between 50 and 90.

    ``row`` must provide a value for each of the 20 criteria names used
    below. Every criterion has an ordered list of bands; the first band
    whose comparison holds supplies the score, otherwise the criterion's
    fallback score is used. Returns a Series keyed by criterion name, in
    the order of the rule table.
    """

    def _holds(value, op, bound):
        # Evaluate one band condition.
        if op == "<=":
            return value <= bound
        if op == "<":
            return value < bound
        if op == ">=":
            return value >= bound
        return value == bound  # op == "=="

    gpa_bands = (("<=", 2, 50), ("<", 2.5, 60), ("<", 3, 70), ("<", 3.5, 80))

    # (criterion, ordered bands as (operator, bound, score), fallback score)
    rules = (
        ("AdmYear", (("<=", 2019, 50), ("==", 2020, 70)), 90),
        ("Age", ((">=", 23, 50), ("==", 22, 70)), 90),
        ("HSCYear", (("<=", 2018, 50), ("<=", 2020, 70)), 90),
        ("Semester", (("<=", 2, 60), ("<=", 4, 75), ("<=", 6, 85)), 90),
        ("Scholarship", (("==", 1, 90),), 50),
        ("Study Hours", (("<=", 1, 50), ("<=", 2, 60), ("<=", 3, 70), ("<=", 4, 80)), 90),
        ("Study Frequency", (("==", 1, 60), ("==", 2, 75), ("==", 3, 85)), 90),
        # 1 = Offline, 0.5 = Hybrid, anything else is scored as Online
        ("Learn Mode", (("==", 1, 90), ("==", 0.5, 80)), 70),
        ("Social Media", (("<", 1, 90), ("<", 2, 80), ("<", 3, 70), ("<", 5, 60)), 50),
        # >= 3 Advance, 2 Intermediate, anything else is scored as Basic
        ("English Skill", ((">=", 3, 90), ("==", 2, 80)), 60),
        ("Attendance", (("<=", 20, 50), ("<=", 40, 60), ("<=", 60, 70), ("<=", 80, 80)), 90),
        ("Probation", (("==", 1, 50),), 90),
        ("Consultancy", (("==", 1, 85),), 60),
        ("Skill Hours", (("==", 0, 60), ("<=", 1, 70), ("<=", 2, 80)), 90),
        ("Co Curricular", (("==", 1, 85),), 70),
        ("Health Issues", (("==", 1, 60),), 90),
        ("SGPA", gpa_bands, 90),
        # CGPA uses the same bands as SGPA
        ("CGPA", gpa_bands, 90),
        ("Credits", (("<", 30, 60), ("<", 40, 75), ("<", 60, 85)), 90),
        ("Family Income", (("<=", 20000, 60), ("<=", 35000, 70), ("<=", 60000, 80)), 90),
    )

    scores = {}
    for criterion, bands, fallback in rules:
        value = row[criterion]
        chosen = fallback
        for op, bound, points in bands:
            if _holds(value, op, bound):
                chosen = points
                break
        scores[criterion] = chosen

    return pd.Series(scores)


# =========================================================
# 7. TOPSIS – ONLY ALTERNATIVES LABELLED "BUTUH BIMBINGAN TINGGI"
# =========================================================

# Select alternatives by LABEL rather than by raw cluster id.
label_target = "Butuh Bimbingan Tinggi"
high_alts = alt_cluster_df.index[alt_cluster_df["Label"] == label_target].tolist()

print(f"\n=== ALTERNATIF DENGAN LABEL '{label_target}' ===")
print(high_alts)

# Encoded (un-normalised) data of the selected alternatives only.
high_data = data.loc[high_alts, criteria_cols].copy()

print("\n=== DATA ENCODED – ALTERNATIF LABEL BUTUH BIMBINGAN TINGGI ===")
print(high_data)

# Banded 50-90 score of every criterion, one row per selected alternative.
score_df = high_data.apply(score_excel_style, axis=1)

# The number of selected alternatives depends on the clustering result, so
# the header reports the actual count instead of a hard-coded "16".
print(f"\n=== MATRiks SKOR (50–90) UNTUK TOPSIS ({len(high_alts)} ALT) ===")
print(score_df)

# Criterion direction used by TOPSIS ("benefit": larger weighted score is
# the positive ideal; "cost": smaller weighted score is the positive ideal).
# NOTE(review): score_excel_style already gives its highest score to a low
# Social Media value and to Probation/Health Issues == 0, while most other
# criteria are treated as "cost" here; confirm that marking these three as
# "benefit" produces the intended ranking direction.
topsis_attr = {
    "AdmYear": "benefit",
    "Age": "cost",
    "HSCYear": "benefit",
    "Semester": "cost",
    "Scholarship": "cost",
    "Study Hours": "cost",
    "Study Frequency": "cost",
    "Learn Mode": "cost",
    "Social Media": "benefit",
    "English Skill": "cost",
    "Attendance": "cost",
    "Probation": "benefit",
    "Consultancy": "cost",
    "Skill Hours": "cost",
    "Co Curricular": "cost",
    "Health Issues": "benefit",
    "SGPA": "cost",
    "CGPA": "cost",
    "Credits": "cost",
    "Family Income": "cost",
}

# Boolean mask aligned with criteria_cols: True where the criterion is a benefit.
benefit_flags = np.array([topsis_attr[col] == "benefit" for col in criteria_cols])

def topsis_from_scores(score_df, weights, benefit_flags):
    """Rank alternatives with TOPSIS from a score matrix.

    Parameters
    ----------
    score_df : DataFrame (or 2-D array-like) of shape (m, n), one row per
        alternative and one column per criterion.
    weights : sequence of n criterion weights (list, ndarray or Series).
    benefit_flags : sequence of n booleans; True marks a benefit criterion
        (larger weighted value is ideal), False a cost criterion.

    Returns
    -------
    (C, D_pos, D_neg, R, V, ideal_pos, ideal_neg) where C is the closeness
    coefficient per alternative, D_pos / D_neg are the Euclidean distances
    to the positive / negative ideal solution, R is the vector-normalised
    matrix, V the weighted normalised matrix, and ideal_pos / ideal_neg
    the ideal solutions per criterion. With zero alternatives the
    per-alternative outputs are empty and both ideals are filled with NaN.

    Raises
    ------
    ValueError
        If the score matrix is not 2-D or the number of weights / flags
        does not match the number of criteria.
    """
    X = np.asarray(score_df, dtype=float)  # m x n
    if X.ndim != 2:
        raise ValueError("score_df must be two-dimensional (alternatives x criteria)")
    m, n = X.shape

    # Accept any 1-D sequence; the original `.reshape` only worked on ndarrays.
    w = np.asarray(weights, dtype=float).reshape(1, -1)
    flags = np.asarray(benefit_flags, dtype=bool).ravel()
    if w.shape[1] != n or flags.shape[0] != n:
        raise ValueError(
            f"expected {n} weights and {n} benefit flags, "
            f"got {w.shape[1]} and {flags.shape[0]}"
        )

    if m == 0:
        # Nothing to rank: max/min over zero rows would raise.
        nothing = np.empty(0, dtype=float)
        undefined = np.full(n, np.nan)
        return (nothing, nothing.copy(), nothing.copy(),
                X.copy(), X.copy(), undefined, undefined.copy())

    # 1. vector normalisation (a zero-norm column is left as zeros)
    norm = np.sqrt((X ** 2).sum(axis=0))
    norm[norm == 0] = 1
    R = X / norm

    # 2. apply the criterion weights
    V = R * w

    # 3. ideal solutions
    ideal_pos = np.where(flags, V.max(axis=0), V.min(axis=0))
    ideal_neg = np.where(flags, V.min(axis=0), V.max(axis=0))

    # 4. distance to the ideal solutions
    D_pos = np.sqrt(((V - ideal_pos) ** 2).sum(axis=1))
    D_neg = np.sqrt(((V - ideal_neg) ** 2).sum(axis=1))

    # 5. closeness coefficient; the epsilon avoids 0/0 when both distances are 0
    C = D_neg / (D_pos + D_neg + 1e-12)

    return C, D_pos, D_neg, R, V, ideal_pos, ideal_neg

scores, D_pos, D_neg, R, V, ideal_pos, ideal_neg = topsis_from_scores(
    score_df, weights, benefit_flags
)

# The number of ranked alternatives depends on the clustering result, so the
# table headers report the actual count instead of a hard-coded "16".
alt_count_tag = f"({len(high_alts)} ALT)"

# TOPSIS working tables for the selected alternatives.
R_df = pd.DataFrame(R, index=high_alts, columns=criteria_cols)
V_df = pd.DataFrame(V, index=high_alts, columns=criteria_cols)

print(f"\n=== MATRiks NORMALISASI (R_ij) – TOPSIS {alt_count_tag} ===")
print(R_df)

print(f"\n=== MATRiks TERTIMBANG (V_ij = w_j * R_ij) – TOPSIS {alt_count_tag} ===")
print(V_df)

ideal_df = pd.DataFrame(
    [ideal_pos, ideal_neg],
    index=["Ideal_Pos (+)", "Ideal_Neg (-)"],
    columns=criteria_cols,
)
print(f"\n=== SOLUSI IDEAL POSITIF & NEGATIF {alt_count_tag} ===")
print(ideal_df)

# Distances to both ideal solutions plus the resulting preference value.
dist_df = pd.DataFrame({
    "Alternatif": high_alts,
    "D_pos": D_pos,
    "D_neg": D_neg,
})
dist_df["Preferensi"] = scores
print(f"\n=== JARAK KE SOLUSI IDEAL & NILAI PREFERENSI {alt_count_tag} ===")
print(dist_df.set_index("Alternatif"))

# Final TOPSIS ranking of the selected alternatives; equal scores share a
# rank and ranks stay consecutive (dense ranking).
rank_df = pd.DataFrame({
    "Alternatif": high_alts,
    "TOPSIS_Score": scores,
})
rank_df["Rank"] = rank_df["TOPSIS_Score"].rank(
    ascending=False, method="dense"
).astype(int)
rank_df = rank_df.sort_values("TOPSIS_Score", ascending=False).set_index("Alternatif")

print("\n=== RANKING AKHIR TOPSIS (HANYA LABEL BUTUH BIMBINGAN TINGGI) ===")
print(rank_df)

# Attach cluster and label so the final table is self-explanatory.
final_table = rank_df.join(alt_cluster_df[["Cluster", "Label"]], how="left")
print(f"\n=== TABEL FINAL: TOPSIS + CLUSTER + LABEL {alt_count_tag} ===")
print(final_table)

# =========================================================
# 8. DOMINANT FACTORS PER CLUSTER (BASED ON RISK)
# =========================================================

# Final centroids as a table: one row per cluster, one column per criterion.
centroid_df = pd.DataFrame(data=centroids, columns=criteria_cols, index=cluster_names)

# Risk of every criterion in every cluster, using the same rule as the
# Need Index: benefit -> 1 - value, cost -> value, neutral -> 0.5.
risk_detail_rows = []
for cname in cluster_names:
    centroid = centroid_df.loc[cname].to_numpy()
    for col, val in zip(criteria_cols, centroid):
        attr = need_attr[col]
        risk = (1.0 - val) if attr == "benefit" else (val if attr == "cost" else 0.5)
        risk_detail_rows.append(
            dict(Cluster=cname, Kriteria=col, Centroid_Val=val, Risk_Score=risk)
        )

risk_detail_df = pd.DataFrame(risk_detail_rows)

print("\n=== DETAIL RISK SCORE PER KRITERIA & CLUSTER ===")
print(risk_detail_df)

# Keep the TOP_N criteria with the highest Risk_Score in each cluster.
TOP_N = 5
dominant_factors = [
    risk_detail_df.loc[risk_detail_df["Cluster"] == cname]
    .sort_values("Risk_Score", ascending=False)
    .head(TOP_N)
    for cname in cluster_names
]

dominant_factors_df = pd.concat(dominant_factors, ignore_index=True)

print(f"\n=== {TOP_N} FAKTOR RISIKO DOMINAN PER CLUSTER ===")
print(dominant_factors_df)

# =========================================================
# 9. CLUSTER COMPOSITION PER K-MEANS ITERATION
# =========================================================

total_alt = len(norm_df)
iter_stats = []

# The loop variable is deliberately NOT called dist_df: that name already
# holds the TOPSIS distance table and must not be overwritten here.
for it, iter_dist_df in kmeans_history:
    # Member count per cluster; clusters without members count as 0.
    counts = iter_dist_df["Cluster"].value_counts().reindex(cluster_names, fill_value=0)
    row = {"Iterasi": it}
    for cname in cluster_names:
        count = int(counts[cname])
        pct = 100.0 * count / total_alt
        row[f"{cname}_Count"] = count
        row[f"{cname}_Pct"] = pct
    iter_stats.append(row)

iter_stats_df = pd.DataFrame(iter_stats)

print("\n=== PERUBAHAN KOMPOSISI CLUSTER PER ITERASI K-MEANS ===")
print(iter_stats_df)

# =========================================================
# 10. CENTROID PER ITERATION (AFTER NORMALISATION)
# =========================================================

centroid_history_rows = []

for it, iter_dist_df in kmeans_history:
    labels_iter = iter_dist_df["Cluster"]  # C1/C2/C3 for every alternative
    for cname in cluster_names:
        members_idx = labels_iter[labels_iter == cname].index
        if len(members_idx) == 0:
            # An empty cluster has no member mean; it gets no row.
            continue
        # Centroid of this iteration = mean of the normalised rows of the
        # alternatives assigned to the cluster in this iteration.
        centroid_iter = norm_df.loc[members_idx, criteria_cols].mean()
        row = {
            "Iterasi": it,
            "Cluster": cname,
        }
        for col in criteria_cols:
            row[col] = centroid_iter[col]
        centroid_history_rows.append(row)

centroid_history_df = pd.DataFrame(centroid_history_rows)

print("\n=== CENTROID PER ITERASI (SETELAH NORMALISASI) ===")
print(centroid_history_df)

# =========================================================
# 11. CLUSTER & LABEL COMPOSITION (COUNT + PERCENTAGE)
# =========================================================

# Cluster composition (C1/C2/C3); clusters without members are listed with 0.
cluster_counts = alt_cluster_df["Cluster"].value_counts()
cluster_counts = cluster_counts.reindex(cluster_names, fill_value=0)
cluster_comp = cluster_counts.rename_axis("Cluster").reset_index(name="Count")
cluster_comp["Pct"] = cluster_comp["Count"].div(total_alt).mul(100.0)

print("\n=== KOMPOSISI CLUSTER (JUMLAH & PERSEN) ===")
print(cluster_comp)

# Label composition (Butuh Bimbingan Tinggi/Sedang/Rendah).
label_counts = alt_cluster_df["Label"].value_counts()
label_comp = label_counts.rename_axis("Label").reset_index(name="Count")
label_comp["Pct"] = label_comp["Count"].div(total_alt).mul(100.0)

print("\n=== KOMPOSISI LABEL BUTUH BIMBINGAN (JUMLAH & PERSEN) ===")
print(label_comp)


=== DATASET SETELAH ENCODING (20 ALT x 20 KRITERIA) ===
     AdmYear   Age  HSCYear  Semester  Scholarship  Study Hours  \
A1    2018.0  24.0   2016.0      12.0          1.0          3.0   
A2    2021.0  22.0   2020.0       4.0          1.0          3.0   
A3    2020.0  21.0   2019.0       5.0          0.0          3.0   
A4    2021.0  20.0   2020.0       4.0          1.0          1.0   
A5    2021.0  22.0   2019.0       4.0          1.0          3.0   
A6    2021.0  20.0   2020.0       4.0          1.0          2.0   
A7    2021.0  22.0   2018.0       4.0          1.0          2.0   
A8    2021.0  22.0   2019.0       4.0          1.0          2.0   
A9    2021.0  20.0   2020.0       4.0          1.0          3.0   
A10   2021.0  22.0   2020.0       4.0          1.0          5.0   
A11   2021.0  22.0   2019.0       4.0          1.0          3.0   
A12   2021.0  22.0   2019.0       4.0          0.0          3.0   
A13   2021.0  22.0   2020.0       4.0          0.0          1.0   
A14  