# Data Mining Project

In [2]:
# ===============================
# * Import Required Libraries
# ===============================
import numpy as np
import pandas as pd
# from sklearn import datasets => iris = datasets.load_iris()
from sklearn.datasets import load_iris # => iris = load_iris()
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from mlxtend.frequent_patterns import apriori, association_rules
from mlxtend.preprocessing import TransactionEncoder
import matplotlib.pyplot as plt
import seaborn as sns


In [4]:
# ===============================
# #. Load the Iris Dataset
# ===============================
df = load_iris()
df = pd.DataFrame(iris.data, columns=iris.feature_names)
df['species'] = iris.target
df['species'] = df['species'].map({0: 'setosa', 1: 'versicolor', 2: 'virginica'})
df.head()


NameError: name 'iris' is not defined

In [None]:
# ===============================
# 1. Normalization & Standardization
# ===============================
minmax_scaler = MinMaxScaler()
zscore_scaler = StandardScaler()

df_minmax = pd.DataFrame(minmax_scaler.fit_transform(df.iloc[:, :-1]), columns=df.columns[:-1])
df_zscore = pd.DataFrame(zscore_scaler.fit_transform(df.iloc[:, :-1]), columns=df.columns[:-1])

df_minmax['species'] = df['species']
df_zscore['species'] = df['species']

print("Min-Max Normalized Data:")
display(df_minmax.head())

print("\nZ-Score Standardized Data:")
display(df_zscore.head())
# Compare original, min-max, and z-score distributions for one feature
fig, ax = plt.subplots(1, 3, figsize=(15, 4))

sns.histplot(df['sepal length (cm)'], kde=True, ax=ax[0], color="skyblue")
ax[0].set_title("Original Sepal Length")

sns.histplot(df_minmax['sepal length (cm)'], kde=True, ax=ax[1], color="orange")
ax[1].set_title("Min-Max Normalized")

sns.histplot(df_zscore['sepal length (cm)'], kde=True, ax=ax[2], color="green")
ax[2].set_title("Z-Score Standardized")

plt.suptitle("Normalization and Standardization Comparison", fontsize=14)
plt.show()

In [None]:
# ===============================
# 3. K-Means Clustering (1D & 2D)
# ===============================
# 1D Clustering (Sepal Length)
X1 = df[['sepal length (cm)']]
kmeans_1d = KMeans(n_clusters=3, random_state=42)
df['Cluster_1D'] = kmeans_1d.fit_predict(X1)

plt.scatter(X1, [0.05] * len(X1), c=df['Cluster_1D'], cmap='viridis')
plt.xlabel('Sepal Length (cm)')
plt.title('1D K-Means Clustering')
plt.show()

# 2D Clustering (Sepal Length & Width)
X2 = df[['sepal length (cm)', 'sepal width (cm)']]
kmeans_2d = KMeans(n_clusters=3, random_state=42)
df['Cluster_2D'] = kmeans_2d.fit_predict(X2)

plt.scatter(X2.iloc[:, 0], X2.iloc[:, 1], c=df['Cluster_2D'], cmap='rainbow')
plt.xlabel('Sepal Length (cm)')
plt.ylabel('Sepal Width (cm)')
plt.title('2D K-Means Clustering')
plt.show()


In [None]:
# ===============================
# 4. Classification â€” KNN, NB, Decision Tree
# ===============================
X = df.iloc[:, :-3]
y = df['species']

classifiers = {
    'KNN': KNeighborsClassifier(n_neighbors=5),
    'Naive Bayes': GaussianNB(),
    'Decision Tree': DecisionTreeClassifier(random_state=42)
}

results = []

for name, model in classifiers.items():
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    cm = confusion_matrix(y_test, y_pred, labels=['setosa', 'versicolor', 'virginica'])
    TN = cm[0][0]
    FP = cm[0][1] + cm[0][2]
    FN = cm[1][0] + cm[2][0]
    TP = cm[1][1] + cm[2][2]
    
    TPR = TP / (TP + FN)
    FPR = FP / (FP + TN)
    TNR = TN / (TN + FP)
    FNR = FN / (FN + TP)
    
    results.append({
        'Model': name,
        'Accuracy': accuracy_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred, average='macro'),
        'Recall': recall_score(y_test, y_pred, average='macro'),
        'F1-Score': f1_score(y_test, y_pred, average='macro'),
        'TPR': TPR, 'FPR': FPR, 'TNR': TNR, 'FNR': FNR
    })

pd.DataFrame(results)


In [None]:
# ===============================
# 5. Apriori Algorithm
# ===============================
# Convert dataset to transaction-like format
transactions = []
for _, row in df.iterrows():
    transaction = [
        f"SepalLength={round(row['sepal length (cm)'], 1)}",
        f"SepalWidth={round(row['sepal width (cm)'], 1)}",
        f"PetalLength={round(row['petal length (cm)'], 1)}",
        f"PetalWidth={round(row['petal width (cm)'], 1)}",
        f"Class={row['species']}"
    ]
    transactions.append(transaction)

# Encode transactions
te = TransactionEncoder()
te_ary = te.fit(transactions).transform(transactions)
df_apriori = pd.DataFrame(te_ary, columns=te.columns_)

# Run Apriori with two different thresholds
freq_items_1 = apriori(df_apriori, min_support=0.5, use_colnames=True)
rules_1 = association_rules(freq_items_1, metric='confidence', min_threshold=0.75)
print("Apriori (Support=50%, Confidence=75%)")
display(rules_1[['antecedents', 'consequents', 'support', 'confidence', 'lift']])

freq_items_2 = apriori(df_apriori, min_support=0.6, use_colnames=True)
rules_2 = association_rules(freq_items_2, metric='confidence', min_threshold=0.6)
print("\nApriori (Support=60%, Confidence=60%)")
display(rules_2[['antecedents', 'consequents', 'support', 'confidence', 'lift']])
