In [None]:
# -*- coding: utf-8 -*-
"""Pertemuan10_Association_Rules_&_Anomaly_Detection.ipynb

Automatically generated by Colab.

**Studi Kasus 1: Market Basket Analysis dengan Association Rules**
**Studi Kasus 2: Deteksi Anomali Transaksi dengan Isolation Forest**

Langkah 0: Install & Import Library
"""

# Install library yang diperlukan
!pip install mlxtend

# Import semua library yang akan digunakan
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from mlxtend.frequent_patterns import apriori, association_rules
from mlxtend.preprocessing import TransactionEncoder
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, classification_report
import warnings
warnings.filterwarnings('ignore')

print("Semua library berhasil diimport!")

In [None]:
# ---------------------------------------------
# CASE STUDY 1: MARKET BASKET ANALYSIS
# ---------------------------------------------

# Section banner: the title followed by a 50-character rule, one per line.
print("STUDI KASUS 1: MARKET BASKET ANALYSIS", "=" * 50, sep="\n")

In [None]:
# Step 1: upload the transaction CSV and load it into `df_grocery`.
from google.colab import files
import io

print(" SILAHKAN UPLOAD FILE CSV DATASET TRANSAKSI:")
print("Format file CSV harus memiliki kolom: Transaction_ID dan Item")
print("Contoh format:")
print("Transaction_ID,Item")
print("1,whole milk")
print("1,yogurt")
print("2,bread")
print("2,butter")
print("...")

# Open the upload dialog; the result is indexed by file name and holds the
# raw bytes of each uploaded file.
uploaded = files.upload()

# Nothing uploaded (e.g. the dialog was cancelled): fail with a clear message
# instead of an IndexError on the first-key lookup below.
if not uploaded:
    raise ValueError("Tidak ada file yang diupload. Jalankan ulang cell ini dan pilih file CSV.")

# Only the first uploaded file is used.
file_name = next(iter(uploaded))
print(f"File '{file_name}' berhasil diupload!")
print(f"Size file: {len(uploaded[file_name])} bytes")

# Load the dataset from the in-memory bytes.
df_grocery = pd.read_csv(io.BytesIO(uploaded[file_name]))

# Map the alternative column names (Member_number / itemDescription) onto the
# names the rest of the notebook expects. Columns that are absent are simply
# left alone by rename().
df_grocery = df_grocery.rename(columns={'Member_number': 'Transaction_ID', 'itemDescription': 'Item'})

# Every later cell indexes these two columns, so verify them up front rather
# than failing further down with a bare KeyError.
required_columns = ['Transaction_ID', 'Item']
missing_columns = [col for col in required_columns if col not in df_grocery.columns]
if missing_columns:
    raise ValueError(
        f"Kolom wajib tidak ditemukan: {missing_columns}. "
        f"Kolom yang tersedia: {list(df_grocery.columns)}"
    )

# Dataset overview.
print("\n INFORMASI DATASET:")
print(f"Shape: {df_grocery.shape}")
print(f"\n5 Data Teratas:")
print(df_grocery.head())
print(f"\nJumlah Transaksi Unik: {df_grocery['Transaction_ID'].nunique()}")
print(f"Jumlah Item Unik: {df_grocery['Item'].nunique()}")

# Ten most frequently occurring items.
print(f"\n 10 Item Paling Populer:")
top_items = df_grocery['Item'].value_counts().head(10)
print(top_items)

In [None]:
# Step 2: exploratory analysis of the transaction data.
# Two side-by-side panels drawn through explicit Axes objects.
fig, (ax_popular, ax_basket) = plt.subplots(1, 2, figsize=(15, 5))

# Left panel: the 20 most frequent items as a horizontal bar chart.
top_20_items = df_grocery['Item'].value_counts().head(20)
sns.barplot(x=top_20_items.values, y=top_20_items.index, palette='viridis', ax=ax_popular)
ax_popular.set_title('Top 20 Items Paling Populer')
ax_popular.set_xlabel('Jumlah Kemunculan')

# Right panel: distribution of the number of item rows per transaction.
items_per_transaction = df_grocery.groupby('Transaction_ID')['Item'].count()
sns.histplot(items_per_transaction, bins=30, kde=True, ax=ax_basket)
ax_basket.set_title('Distribusi Jumlah Item per Transaksi')
ax_basket.set_xlabel('Jumlah Item per Transaksi')
ax_basket.set_ylabel('Frekuensi')

fig.tight_layout()
plt.show()

# Summary statistics of the basket sizes.
basket_mean = items_per_transaction.mean()
basket_median = items_per_transaction.median()
basket_max = items_per_transaction.max()
basket_min = items_per_transaction.min()

print(" STATISTIK TRANSAKSI:")
print(f"Rata-rata item per transaksi: {basket_mean:.2f}")
print(f"Median item per transaksi: {basket_median}")
print(f"Maksimum item per transaksi: {basket_max}")
print(f"Minimum item per transaksi: {basket_min}")

In [None]:
# Step 3: reshape the data into the one-hot format used for association rules.

print("PREPROCESSING DATA UNTUK ASSOCIATION RULES")

# Collect the items of each transaction into a list, giving a list of lists
# (one inner list per Transaction_ID).
items_by_transaction = df_grocery.groupby('Transaction_ID')['Item'].apply(list)
transactions = items_by_transaction.values.tolist()

print(" Contoh 3 Transaksi Pertama:")
for position, basket in enumerate(transactions[:3], start=1):
    print(f"Transaksi {position}: {basket}")

# Encode the transactions as a boolean matrix: one row per transaction,
# one column per distinct item.
te = TransactionEncoder()
te.fit(transactions)
te_array = te.transform(transactions)
df_encoded = pd.DataFrame(te_array, columns=te.columns_)

print("\n Data Berhasil Diencode")
print(f"Shape Data Encoded: {df_encoded.shape}")
print(f"Jumlah Item Unik: {df_encoded.shape[1]}")

# Support of a single item = fraction of transactions that contain it,
# i.e. the mean of its boolean column.
item_support = df_encoded.mean().sort_values(ascending=False)
print("\n SUPPORT TOP 10 ITEMS:")
print(item_support.head(10))

In [None]:
# Step 4: mine frequent itemsets with the Apriori algorithm.

# Minimum support threshold (tune to the dataset): an itemset must occur in
# at least 2% of the transactions.
min_support = 0.02

print(f" MENCARI FREQUENT ITEMSETS DENGAN MIN_SUPPORT = {min_support}")

# Run Apriori on the one-hot encoded transactions.
frequent_itemsets = apriori(df_encoded,
                           min_support=min_support,
                           use_colnames=True,
                           max_len=3)  # at most 3 items per itemset

print(" FREQUENT ITEMSETS YANG DITEMUKAN:")
print(f"Jumlah Frequent Itemsets: {len(frequent_itemsets)}")
print(f"\n10 Frequent Itemsets dengan Support Tertinggi:")
print(frequent_itemsets.sort_values('support', ascending=False).head(10))

# Bar chart of the 15 itemsets with the highest support.
plt.figure(figsize=(12, 6))

# Sort ascending and keep the last 15 rows: these are the 15 largest supports,
# already ordered so that the largest bar ends up at the top of the barh chart.
# (Taking head(15) of the unsorted frame would just pick the first 15 rows.)
top_itemsets = frequent_itemsets.sort_values('support', ascending=True).tail(15)

# Readable, deterministic labels: items are stringified and sorted because a
# frozenset has no defined iteration order.
labels = [" + ".join(sorted(map(str, itemset))) for itemset in top_itemsets['itemsets']]

plt.barh(range(len(top_itemsets)), top_itemsets['support'])
plt.yticks(range(len(top_itemsets)), labels, fontsize=10)
plt.xlabel('Support')
plt.title('Top 15 Frequent Itemsets')
plt.grid(axis='x', alpha=0.3)
plt.tight_layout()
plt.show()

In [None]:
# Step 5: derive association rules from the frequent itemsets.

print(" MEMBANGUN ASSOCIATION RULES")

# Minimum confidence threshold (20%).
min_confidence = 0.2

# Generate all rules that reach the minimum confidence.
rules = association_rules(frequent_itemsets,
                         metric="confidence",
                         min_threshold=min_confidence)

print(" ASSOCIATION RULES AWAL:")
print(f"Jumlah Rules Awal: {len(rules)}")
print(f"Minimum Confidence: {min_confidence}")

# Keep only the "strong" rules, defined here as lift > 1.2.
strong_rules = rules[rules['lift'] > 1.2]

print(f"\n STRONG RULES (Lift > 1.2):")
print(f"Jumlah Strong Rules: {len(strong_rules)}")

# The 15 strong rules with the highest lift. This is computed unconditionally
# so that `best_rules` always exists (possibly empty) for the following cells,
# and the index is reset so that row position equals rank.
best_rules = (
    strong_rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']]
    .sort_values('lift', ascending=False)
    .head(15)
    .reset_index(drop=True)
)

print(f"\n 15 RULES TERBAIK (Berdasarkan Lift):")
if len(best_rules) > 0:
    # Number the rules by rank (1 = highest lift), not by their original
    # index label in `rules`.
    for rank, (_, row) in enumerate(best_rules.iterrows(), start=1):
        antecedents = list(row['antecedents'])
        consequents = list(row['consequents'])
        print(f"Rule {rank:2d}: {antecedents} -> {consequents}")
        print(f"        Support: {row['support']:.3f}, Confidence: {row['confidence']:.3f}, Lift: {row['lift']:.3f}")
else:
    print("Tidak ada strong rules yang ditemukan. Coba turunkan minimum confidence atau support.")

In [None]:
# Step 6: visualise and interpret the association rules.

print("VISUALISASI ASSOCIATION RULES")

# `best_rules` is only created by the previous step when strong rules exist,
# so every use of it below is guarded by this flag.
has_strong_rules = len(strong_rules) > 0

plt.figure(figsize=(15, 10))

if has_strong_rules:
    # Panel 1: support vs confidence, coloured by lift.
    plt.subplot(2, 2, 1)
    scatter = plt.scatter(strong_rules['support'],
                         strong_rules['confidence'],
                         c=strong_rules['lift'],
                         cmap='viridis',
                         s=100,
                         alpha=0.7)
    plt.colorbar(scatter, label='Lift')
    plt.xlabel('Support')
    plt.ylabel('Confidence')
    plt.title('Association Rules: Support vs Confidence\n(Color = Lift)')
    plt.grid(True, alpha=0.3)

    # Panel 2: the ten best rules by lift, labelled by rank.
    plt.subplot(2, 2, 2)
    top_10_rules = best_rules.head(10)
    y_pos = range(len(top_10_rules))

    plt.barh(y_pos, top_10_rules['lift'])
    plt.yticks(y_pos, [f"Rule {i+1}" for i in y_pos])
    # barh draws position 0 at the bottom; flip the axis so Rule 1 is on top.
    plt.gca().invert_yaxis()
    plt.xlabel('Lift')
    plt.title('Top 10 Rules by Lift')
    plt.grid(axis='x', alpha=0.3)

    # Panel 3: support vs lift.
    plt.subplot(2, 2, 3)
    plt.scatter(strong_rules['support'], strong_rules['lift'], alpha=0.6)
    plt.xlabel('Support')
    plt.ylabel('Lift')
    plt.title('Support vs Lift')
    plt.grid(True, alpha=0.3)

    # Panel 4: confidence vs lift.
    plt.subplot(2, 2, 4)
    plt.scatter(strong_rules['confidence'], strong_rules['lift'], alpha=0.6)
    plt.xlabel('Confidence')
    plt.ylabel('Lift')
    plt.title('Confidence vs Lift')
    plt.grid(True, alpha=0.3)

else:
    # No rules to plot: show a placeholder message instead.
    plt.subplot(2, 2, 1)
    plt.text(0.5, 0.5, 'Tidak ada strong rules\nyang ditemukan',
             ha='center', va='center', transform=plt.gca().transAxes, fontsize=12)
    plt.title('Tidak Ada Strong Rules')

plt.tight_layout()
plt.show()

# Plain-language interpretation of the five best rules.
print("\n INTERPRETASI RULES TERBAIK:")
print("=" * 50)

# Check `has_strong_rules` first so `best_rules` is never touched when it was
# not created (avoids a NameError).
if has_strong_rules and len(best_rules) > 0:
    # Number by rank so the text matches the "Rule N" labels in panel 2.
    for rank, (_, row) in enumerate(best_rules.head(5).iterrows(), start=1):
        antecedents = list(row['antecedents'])
        consequents = list(row['consequents'])
        print(f"\nRule {rank}: JIKA membeli {antecedents} → MAKA juga membeli {consequents}")
        print(f"   • Support: {row['support']:.3f} (muncul di {row['support']*100:.1f}% transaksi)")
        print(f"   • Confidence: {row['confidence']:.3f} ({row['confidence']*100:.1f}% kemungkinan)")
        print(f"   • Lift: {row['lift']:.3f} (x{row['lift']:.1f} lebih sering dari ekspektasi)")
else:
    print("Tidak ada strong rules yang ditemukan untuk diinterpretasi.")

In [None]:
# Step 7: business recommendations derived from the association rules.

print("REKOMENDASI BISNIS UNTUK SUPERMARKET:")
print("=" * 50)

if len(strong_rules) > 0:
    # 1. Product bundling: top rules with confidence above 30%.
    print("\n1. **PRODUCT BUNDLING & PROMO:**")
    bundling_rules = best_rules[best_rules['confidence'] > 0.3].head(3)
    if len(bundling_rules) > 0:
        for _, row in bundling_rules.iterrows():
            antecedents = list(row['antecedents'])
            consequents = list(row['consequents'])
            print(f"   • BUAT PAKET PROMO: {antecedents} + {consequents}")
            print(f"     (Tingkat keberhasilan: {row['confidence']*100:.1f}%)")
    else:
        print("   • Cari rules dengan confidence > 30% untuk bundling efektif")

    # 2. Store layout: top rules with support above 3%.
    print("\n2. **STORE LAYOUT OPTIMIZATION:**")
    layout_rules = best_rules[best_rules['support'] > 0.03].head(3)
    if len(layout_rules) > 0:
        for _, row in layout_rules.iterrows():
            antecedents = list(row['antecedents'])
            consequents = list(row['consequents'])
            print(f"   • Tempatkan {antecedents} dan {consequents} berdekatan")
            print(f"     (Dampak: {row['support']*100:.1f}% transaksi terpengaruh)")

    # 3. Inventory management: top rules with lift above 1.8.
    print("\n3. **INVENTORY MANAGEMENT:**")
    inventory_rules = best_rules[best_rules['lift'] > 1.8].head(3)
    if len(inventory_rules) > 0:
        for _, row in inventory_rules.iterrows():
            antecedents = list(row['antecedents'])
            consequents = list(row['consequents'])
            print(f"   • Stok {consequents} harus diperhatikan ketika {antecedents} laku keras")
            print(f"     (Korelasi: {row['lift']:.1f}x lebih sering dari normal)")

    print(f"\n SUMMARY:")
    print(f"   • Jumlah patterns kuat: {len(strong_rules)} rules")
    print(f"   • Rules terbaik: Lift = {best_rules['lift'].max():.2f}")
    print(f"   • Coverage: Support maksimal = {best_rules['support'].max()*100:.1f}%")

else:
    print("Tidak ada strong rules yang ditemukan.")
    print("Saran:")
    # These two lines must be f-strings; without the prefix the placeholders
    # were printed literally instead of the current threshold values.
    print(f"1. Turunkan minimum support (saat ini: {min_support})")
    print(f"2. Turunkan minimum confidence (saat ini: {min_confidence})")
    print("3. Pastikan dataset cukup besar (> 100 transaksi)")

In [None]:
# ---------------------------------------------
# CASE STUDY 2: TRANSACTION ANOMALY DETECTION
# ---------------------------------------------

# Section banner: the title followed by a 50-character rule, one per line.
print("\n\n STUDI KASUS 2: DETEKSI ANOMALI TRANSAKSI", "=" * 50, sep="\n")

In [None]:
# Step 8: upload the CSV used for anomaly detection and load it into
# `df_ecommerce`. Relies on `files` and `io` imported by the first upload cell.
print(" SILAHKAN UPLOAD FILE CSV UNTUK ANOMALI DETECTION:")
print("Format file CSV harus memiliki kolom numerik seperti: amount, quantity, dll")
print("Contoh format:")
print("transaction_id,amount,quantity,time_of_day")
print("1,50.0,2,14")
print("2,1500.0,1,3")
print("3,75.5,3,16")
print("...")

# Open the upload dialog; the result is indexed by file name.
uploaded_anomaly = files.upload()

# Nothing uploaded (e.g. the dialog was cancelled): fail with a clear message
# instead of an IndexError on the first-key lookup below.
if not uploaded_anomaly:
    raise ValueError("Tidak ada file yang diupload. Jalankan ulang cell ini dan pilih file CSV.")

# Only the first uploaded file is used.
file_name_anomaly = next(iter(uploaded_anomaly))
print(f" File '{file_name_anomaly}' berhasil diupload!")

# Load the dataset from the in-memory bytes.
df_ecommerce = pd.read_csv(io.BytesIO(uploaded_anomaly[file_name_anomaly]))

# Dataset overview.
print("\n INFORMASI DATASET E-COMMERCE:")
print(f"Shape: {df_ecommerce.shape}")
print(f"\n5 Data Teratas:")
print(df_ecommerce.head())
print(f"\n Statistik Dataset:")
print(df_ecommerce.describe())

# Numeric columns are the candidate features for the detector.
numeric_columns = df_ecommerce.select_dtypes(include=[np.number]).columns.tolist()
print(f"\n Kolom Numerik yang Tersedia: {numeric_columns}")

In [None]:
# Step 9: choose the feature columns for anomaly detection.

print("PILIH FITUR UNTUK ANOMALI DETECTION:")

# Preferred schema: 'amount' and 'quantity' (plus 'time_of_day' when present).
# If either of the two core columns is missing, fall back to the first three
# numeric columns found when the dataset was loaded.
available_columns = df_ecommerce.columns
has_core_columns = all(col in available_columns for col in ('amount', 'quantity'))

if not has_core_columns:
    features = numeric_columns[:3]
else:
    features = ['amount', 'quantity']
    if 'time_of_day' in available_columns:
        features.append('time_of_day')

print(f"Fitur yang dipilih: {features}")

# Feature matrix handed to the scaler / model in the next step.
X = df_ecommerce[features]

print("\n STATISTIK FITUR YANG DIPILIH:")
print(X.describe())

In [None]:
# Step 10: scale the features and detect anomalies with an Isolation Forest.

print("PREPROCESSING & ANOMALI DETECTION")

# Standardise the selected features.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

print("Data berhasil di-scaling")

# Expected share of anomalies in the data (5%).
contamination = 0.05

# Configure the model; the fixed random_state keeps runs repeatable.
iso_forest = IsolationForest(
    n_estimators=100,
    contamination=contamination,
    random_state=42,
)

# Fit and predict in one pass.
y_pred = iso_forest.fit_predict(X_scaled)

# Store the raw prediction and a readable label next to the original rows
# (1 is mapped to 'Normal', -1 to 'Anomali').
prediction_names = {1: 'Normal', -1: 'Anomali'}
df_ecommerce['anomaly_pred'] = y_pred
df_ecommerce['anomaly_label'] = df_ecommerce['anomaly_pred'].map(prediction_names)

# Headline numbers: how many rows were flagged and what share that is.
anomaly_count = (df_ecommerce['anomaly_pred'] == -1).sum()
anomaly_percentage = (anomaly_count / len(df_ecommerce)) * 100

print("HASIL ISOLATION FOREST:")
print(f"Parameter: contamination={contamination}")
print(f"Jumlah data terdeteksi sebagai Anomali: {anomaly_count}")
print(f"Persentase Anomali: {anomaly_percentage:.2f}%")

In [None]:
# Step 11: visualise the anomaly-detection result.

print("VISUALISASI HASIL ANOMALI DETECTION")

plt.figure(figsize=(15, 10))

# One explicit label -> colour mapping shared by every panel. Passing a bare
# colour list to seaborn assigns colours by order of appearance in the data,
# which swaps blue/red whenever the first row happens to be an anomaly.
colors = {'Normal': 'blue', 'Anomali': 'red'}
present_labels = [label for label in colors
                  if (df_ecommerce['anomaly_label'] == label).any()]

# Panel 1: scatter of the first two features, coloured by label.
if len(features) >= 2:
    plt.subplot(2, 2, 1)
    plt.scatter(df_ecommerce[features[0]], df_ecommerce[features[1]],
               c=df_ecommerce['anomaly_label'].map(colors), alpha=0.6)
    plt.xlabel(features[0])
    plt.ylabel(features[1])
    plt.title(f'Deteksi Anomali: {features[0]} vs {features[1]}')
    # Proxy handles, because a single scatter call yields no per-label legend.
    plt.legend(handles=[plt.Line2D([0], [0], marker='o', color='w',
                                  markerfacecolor=color, markersize=8, label=label)
                       for label, color in colors.items()])

# Panel 2: distribution of the first feature, split by label.
plt.subplot(2, 2, 2)
sns.histplot(data=df_ecommerce, x=features[0], hue='anomaly_label',
             hue_order=present_labels, bins=30, alpha=0.6, palette=colors)
plt.title(f'Distribusi {features[0]}: Normal vs Anomali')

# Panel 3: distribution of the second feature, split by label.
if len(features) >= 2:
    plt.subplot(2, 2, 3)
    sns.histplot(data=df_ecommerce, x=features[1], hue='anomaly_label',
                 hue_order=present_labels, bins=30, alpha=0.6, palette=colors)
    plt.title(f'Distribusi {features[1]}: Normal vs Anomali')

# Panel 4: boxplot of the first two features, pooled per label.
# The standardised matrix from the detection step is used so that features on
# different scales can share one axis and the y-label below is accurate.
# X_scaled has one row per row of df_ecommerce, in the same order.
plt.subplot(2, 2, 4)
scaled_pair = np.asarray(X_scaled)[:, :2]
is_anomaly = (df_ecommerce['anomaly_pred'] == -1).to_numpy()
is_normal = (df_ecommerce['anomaly_pred'] == 1).to_numpy()

box_data = [scaled_pair[is_normal].ravel(), scaled_pair[is_anomaly].ravel()]
plt.boxplot(box_data)
# Tick labels are set explicitly instead of through boxplot's `labels`
# keyword, which newer matplotlib versions deprecate.
plt.xticks([1, 2], ['Normal', 'Anomali'])
plt.title('Perbandingan Nilai Fitur: Normal vs Anomali')
plt.ylabel('Nilai Fitur (Standardized)')

plt.tight_layout()
plt.show()

In [None]:
# Step 12: inspect the flagged transactions and print recommendations.

print("ANALISIS TRANSAKSI ANOMALI:")
print("=" * 40)

# Split the rows by predicted class (-1 = anomaly, 1 = normal).
predicted_class = df_ecommerce['anomaly_pred']
anomalous_transactions = df_ecommerce[predicted_class == -1]
normal_transactions = df_ecommerce[predicted_class == 1]

print(f"Jumlah transaksi anomali: {len(anomalous_transactions)}")
print(f"Jumlah transaksi normal: {len(normal_transactions)}")

has_anomalies = len(anomalous_transactions) > 0

if has_anomalies:
    print("\n STATISTIK TRANSAKSI ANOMALI:")
    print(anomalous_transactions[features].describe())

    print("\n CONTOH TRANSAKSI ANOMALI:")
    print(anomalous_transactions.head().to_string(index=False))

    # Compare the per-feature mean of anomalous vs normal transactions.
    print("\n KARAKTERISTIK ANOMALI:")
    for feature in features:
        anomaly_mean = anomalous_transactions[feature].mean()
        normal_mean = normal_transactions[feature].mean()
        print(f"• {feature}: Anomali {anomaly_mean:.2f} vs Normal {normal_mean:.2f}")

print("\n REKOMENDASI BISNIS:")
print("=" * 30)
if not has_anomalies:
    print("Tidak terdeteksi anomali. Sistem berjalan normal.")
else:
    recommendations = [
        "1. **INVESTIGASI SEGERA:** Periksa {} transaksi anomali".format(len(anomalous_transactions)),
        "2. **FRAUD DETECTION:** Cek transaksi dengan nilai ekstrem",
        "3. **REVIEW SYSTEM:** Tinjau aturan bisnis untuk transaksi tidak biasa",
        "4. **MONITORING:** Implementasi alert untuk pola serupa di masa depan",
    ]
    for recommendation in recommendations:
        print(recommendation)

print("\n ANALISIS SELESAI!")