# Analisis Clustering Marketing Campaign

Notebook ini melakukan clustering pada data marketing campaign **berdasarkan dua fitur**:
- **Pendapatan Tahunan (`Income`)**
- **Total Belanja (`Total_Mnt`)** = sum seluruh kolom yang diawali "Mnt..."

Kolom response/respon dan fitur lain yang tidak relevan dihapus (lihat langkah 3) sesuai instruksi asprak.

In [None]:
# 1. Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans, DBSCAN
from sklearn.metrics import silhouette_score, davies_bouldin_score

In [None]:
# 2. Load the dataset
# sep=None makes pandas sniff the delimiter (this requires the python
# engine), so the file loads into proper columns whether it is comma- or
# tab-delimited instead of collapsing into a single column.
# NOTE(review): public copies of marketing_campaign.csv are commonly
# tab-delimited -- confirm against the local file.
df = pd.read_csv("marketing_campaign.csv", sep=None, engine="python")

In [None]:
# 3. Remove columns that are not needed: ID, date, response columns, etc.
identifier_cols = ["ID", "Dt_Customer"]
outcome_cols = [
    "Response",
    "AcceptedCmp1",
    "AcceptedCmp2",
    "AcceptedCmp3",
    "AcceptedCmp4",
    "AcceptedCmp5",
    "Complain",
]
drop_cols = identifier_cols + outcome_cols
# errors="ignore" skips any listed column that is absent from the frame.
df = df.drop(columns=drop_cols, errors="ignore")

In [None]:
# 4. Clean the rows: discard records without an Income value, then discard
#    rows that exactly duplicate an earlier row.
df = df.dropna(subset=["Income"]).drop_duplicates()

In [None]:
# 5. Compute total spending (row-wise sum of every column whose name starts
#    with "Mnt") and build the two-feature frame used for clustering.
mnt_cols = [name for name in df.columns if name.startswith("Mnt")]
df["Total_Mnt"] = df[mnt_cols].sum(axis="columns")
# astype returns a new frame, so cluster_data does not share state with df.
cluster_data = df[["Income", "Total_Mnt"]].astype(float)

In [None]:
# 6. Descriptive statistics for each clustering feature
for heading, feature in (
    ("Income stats:\n", "Income"),
    ("Total Belanja stats:\n", "Total_Mnt"),
):
    print(heading, cluster_data[feature].describe())

In [None]:
# 7. Histograms (with a KDE overlay) of both features, side by side
fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(13, 5))
panels = [
    ("Income", "Distribusi Pendapatan ('Income')"),
    ("Total_Mnt", "Distribusi Total Belanja ('Total_Mnt')"),
]
for ax, (feature, title) in zip(axs, panels):
    sns.histplot(cluster_data[feature], kde=True, ax=ax)
    ax.set_title(title)
fig.tight_layout()
plt.show()

In [None]:
# 8. Scatter plot of income against total spending
scatter_fig, scatter_ax = plt.subplots(figsize=(7, 5))
sns.scatterplot(data=cluster_data, x="Income", y="Total_Mnt", ax=scatter_ax)
scatter_ax.set_title("Income vs Total Belanja")
plt.show()

In [None]:
# 9. Stand-alone KDE curve for each feature, one figure per feature
for col in ("Income", "Total_Mnt"):
    kde_fig, kde_ax = plt.subplots(figsize=(5, 3))
    sns.kdeplot(data=cluster_data, x=col, ax=kde_ax)
    kde_ax.set_title(f'Distribusi KDE {col}')
    plt.show()

In [None]:
# 10. Standardize the two clustering features
# The features are selected by name rather than passing cluster_data whole:
# the clustering cells below add label columns to cluster_data, so re-running
# this cell afterwards would otherwise scale those labels as well and then
# fail when building the two-column frame.
feature_cols = ["Income", "Total_Mnt"]
scaler = StandardScaler()
X = scaler.fit_transform(cluster_data[feature_cols])
X_df = pd.DataFrame(X, columns=feature_cols)

In [None]:
# 11. Pairplot of the two features after scaling
sns.pairplot(data=X_df)
plt.show()

In [None]:
# 12. Correlation between the two clustering features only
# The feature columns are selected explicitly so the heatmap stays 2x2 even
# when this cell is re-run after the clustering cells have added their label
# columns to cluster_data.
feature_corr = cluster_data[["Income", "Total_Mnt"]].corr()
plt.figure(figsize=(5,4))
sns.heatmap(feature_corr, annot=True, cmap="coolwarm")
plt.title("Korelasi Income dan Total Belanja")
plt.show()

In [None]:
# 13. Elbow method for KMeans (two features only)
# Fit KMeans for each candidate K and record the within-cluster sum of
# squared errors (inertia_) to look for the "elbow" in the curve.
sse = []
K = range(2, 10)
for k in K:
    # n_init is pinned explicitly: the library default changed between
    # scikit-learn releases (10 -> "auto"), which would make the SSE curve
    # depend on the installed version and triggers a FutureWarning on some.
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    kmeans.fit(X)
    sse.append(kmeans.inertia_)
plt.figure(figsize=(7,5))
plt.plot(K, sse, marker='o')
plt.xlabel('Jumlah Cluster (K)')
plt.ylabel('SSE')
plt.title('Elbow Method KMeans (Income - Total Belanja)')
plt.show()

In [None]:
# 14. KMeans clustering + evaluation
k_opt = 4
# n_init is pinned explicitly: the library default changed between
# scikit-learn releases (10 -> "auto"), so leaving it implicit makes the
# final clustering and its scores depend on the installed version.
kmeans_final = KMeans(n_clusters=k_opt, n_init=10, random_state=42)
labels = kmeans_final.fit_predict(X)
# labels is a plain array, so it is assigned row-by-row in order.
cluster_data['cluster_kmeans'] = labels

# Both metrics are computed on the standardized features the model was fit on.
print('KMeans Silhouette Score:', silhouette_score(X, labels))
print('KMeans Davies-Bouldin Index:', davies_bouldin_score(X, labels))
# Plot the clusters in the original (unscaled) feature units
plt.figure(figsize=(7,5))
sns.scatterplot(x=cluster_data["Income"], y=cluster_data["Total_Mnt"], hue=labels, palette="tab10")
plt.title("Clustering KMeans: Income vs Total Belanja")
plt.show()

In [None]:
# 15. DBSCAN clustering + evaluation
dbscan = DBSCAN(eps=1.5, min_samples=10)
labels_db = dbscan.fit_predict(X)
cluster_data['cluster_dbscan'] = labels_db

# Evaluation: the label -1 is excluded before scoring, and the scores are
# only computed when more than one cluster remains; otherwise both stay None.
mask = labels_db != -1
sil_score_db = None
db_index_db = None
kept_points = X[mask]
kept_labels = labels_db[mask]
if np.unique(kept_labels).size > 1:
    sil_score_db = silhouette_score(kept_points, kept_labels)
    db_index_db = davies_bouldin_score(kept_points, kept_labels)
print("DBSCAN Silhouette Score:", sil_score_db)
print("DBSCAN Davies-Bouldin Index:", db_index_db)

# Plot every point (including label -1) in the original feature units
plt.figure(figsize=(7,5))
sns.scatterplot(x=cluster_data["Income"], y=cluster_data["Total_Mnt"], hue=labels_db, palette='tab10')
plt.title("Clustering DBSCAN: Income vs Total Belanja")
plt.show()