# In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, roc_curve, auc
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, adjusted_rand_score, normalized_mutual_info_score

# Load the scikit-learn breast-cancer dataset into a feature frame and a
# label series.
data = load_breast_cancer()
X = pd.DataFrame(data=data.data, columns=data.feature_names)
y = pd.Series(data=data.target, name="target")

# Two-stage stratified split: 70% train / 10% validation / 20% test.
# Stage 1 holds out 20% of all samples as the test set.
X_temp, X_test, y_temp, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
    stratify=y,
)

# Stage 2 takes 12.5% of the remaining 80% (i.e. 10% of the total) as the
# validation set; what is left is the training set.
X_train, X_val, y_train, y_val = train_test_split(
    X_temp,
    y_temp,
    test_size=0.125,
    random_state=42,
    stratify=y_temp,
)

# Sanity check of the resulting split sizes.
print(*(part.shape for part in (X_train, X_val, X_test)))


def evaluate_model(model, X_tr, y_tr, X_v, y_v):
    """Fit ``model`` on one split and score its predictions on another.

    The model is (re)fitted in place on ``(X_tr, y_tr)`` and then evaluated
    on ``(X_v, y_v)``.

    Parameters
    ----------
    model
        Estimator exposing ``fit`` and ``predict`` plus either
        ``predict_proba`` or ``decision_function``.
    X_tr, y_tr
        Features and labels used for fitting.
    X_v, y_v
        Features and labels used for scoring.

    Returns
    -------
    dict
        Metric values keyed by "Accuracy", "Precision", "Recall", "F1" and
        "ROC-AUC".

    Raises
    ------
    AttributeError
        If the model offers neither ``predict_proba`` nor
        ``decision_function``.
    """
    model.fit(X_tr, y_tr)
    y_pred = model.predict(X_v)

    # ROC-AUC needs a continuous score. Prefer column 1 of predict_proba
    # (the original behaviour); fall back to decision_function so that
    # classifiers without probability estimates can still be evaluated.
    if hasattr(model, "predict_proba"):
        y_score = model.predict_proba(X_v)[:, 1]
    else:
        y_score = model.decision_function(X_v)

    return {
        "Accuracy": accuracy_score(y_v, y_pred),
        "Precision": precision_score(y_v, y_pred),
        "Recall": recall_score(y_v, y_pred),
        "F1": f1_score(y_v, y_pred),
        "ROC-AUC": roc_auc_score(y_v, y_score),
    }

# Baseline: scaled KNN with a fixed k=5, scored on the validation split.
knn_pipeline = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("model", KNeighborsClassifier(n_neighbors=5)),
])
# NOTE: the variable name is kept unchanged for compatibility.
kne_baseline = evaluate_model(knn_pipeline, X_train, y_train, X_val, y_val)

# Search space: odd k from 1 to 31, both weighting schemes, and the
# Minkowski powers p=1 and p=2.
param_grid_knn = {
    "model__n_neighbors": list(range(1, 32, 2)),
    "model__weights": ["uniform", "distance"],
    "model__p": [1, 2],
}

# 5-fold grid search on the training split, optimising F1.
grid_knn = GridSearchCV(
    estimator=Pipeline(steps=[
        ("scaler", StandardScaler()),
        ("model", KNeighborsClassifier()),
    ]),
    param_grid=param_grid_knn,
    scoring="f1",
    cv=5,
    n_jobs=-1,
)
grid_knn.fit(X_train, y_train)

# Score the best configuration on the held-out validation split.
best_knn = grid_knn.best_estimator_
knn_val = evaluate_model(best_knn, X_train, y_train, X_val, y_val)

# Average the mean CV score over all other hyperparameters for each k.
results = pd.DataFrame(grid_knn.cv_results_)
scores_by_k = results.groupby("param_model__n_neighbors")["mean_test_score"]
k_f1 = scores_by_k.mean()

# Plot the score as a function of k.
fig_knn, ax_knn = plt.subplots()
ax_knn.plot(k_f1.index.astype(int), k_f1.values, marker="o")
ax_knn.set_xlabel("n_neighbors (k)")
ax_knn.set_ylabel("Validation F1")
ax_knn.set_title("KNN: k vs Validation F1")
ax_knn.grid(True)
plt.show()
# ------------------------ Linear SVM ------------------------
# Scaled linear-kernel SVC; probability=True enables predict_proba, which
# evaluate_model calls.
linear_svm_pipeline = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("model", SVC(kernel="linear", probability=True, random_state=42)),
])

# 5-fold grid search over the regularisation strength C, optimising F1.
svm_linear = GridSearchCV(
    estimator=linear_svm_pipeline,
    param_grid={"model__C": [0.01, 0.1, 1, 10, 100]},
    scoring="f1",
    cv=5,
    n_jobs=-1,
)
svm_linear.fit(X_train, y_train)

results_linear = pd.DataFrame(svm_linear.cv_results_)

# Mean CV score against C on a logarithmic x-axis.
fig_linear, ax_linear = plt.subplots()
ax_linear.semilogx(
    results_linear["param_model__C"].astype(float),
    results_linear["mean_test_score"],
    marker="o",
)
ax_linear.set_xlabel("C")
ax_linear.set_ylabel("Validation F1 Score")
ax_linear.set_title("Linear SVM: C – Validation F1 İlişkisi")
ax_linear.grid(True)
plt.show()

# Score the best configuration on the held-out validation split.
best_svm_linear = svm_linear.best_estimator_
svm_linear_val = evaluate_model(
    best_svm_linear, X_train, y_train, X_val, y_val
)
# ------------------------ RBF SVM ------------------------
# Scaled RBF-kernel SVC; probability=True enables predict_proba, which
# evaluate_model calls.
rbf_svm_pipeline = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("model", SVC(kernel="rbf", probability=True, random_state=42)),
])
rbf_param_grid = {
    "model__C": [0.01, 0.1, 1, 10, 100],
    "model__gamma": ["scale", "auto", 0.01, 0.1, 1],
}

# 5-fold grid search over (C, gamma), optimising F1.
svm_rbf = GridSearchCV(
    estimator=rbf_svm_pipeline,
    param_grid=rbf_param_grid,
    scoring="f1",
    cv=5,
    n_jobs=-1,
)
svm_rbf.fit(X_train, y_train)

results_rbf = pd.DataFrame(svm_rbf.cv_results_)

# Arrange the mean CV scores as a C (rows) x gamma (columns) grid.
pivot = results_rbf.pivot_table(
    index="param_model__C",
    columns="param_model__gamma",
    values="mean_test_score",
)

# Heatmap of the score grid, with the parameter values as tick labels.
fig_rbf, ax_rbf = plt.subplots()
heatmap = ax_rbf.imshow(pivot, aspect="auto")
fig_rbf.colorbar(heatmap, ax=ax_rbf, label="Validation F1 Score")
ax_rbf.set_xticks(range(len(pivot.columns)))
ax_rbf.set_xticklabels(pivot.columns)
ax_rbf.set_yticks(range(len(pivot.index)))
ax_rbf.set_yticklabels(pivot.index)
ax_rbf.set_xlabel("Gamma")
ax_rbf.set_ylabel("C")
ax_rbf.set_title("RBF SVM: C–Gamma Validation F1 Heatmap")
plt.show()

# Score the best configuration on the held-out validation split.
best_svm_rbf = svm_rbf.best_estimator_
svm_rbf_val = evaluate_model(best_svm_rbf, X_train, y_train, X_val, y_val)
# ------------------------ MLP classification ------------------------
# Scaled multi-layer perceptron with early stopping enabled.
mlp_pipeline = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("model", MLPClassifier(
        max_iter=500,
        early_stopping=True,
        random_state=42,
    )),
])
mlp_param_grid = {
    "model__hidden_layer_sizes": [(50,), (100,), (50, 50), (100, 50)],
    "model__activation": ["relu", "tanh"],
    "model__alpha": [1e-5, 1e-4, 1e-3, 1e-2],
    "model__learning_rate_init": [0.001, 0.01],
}

# 5-fold grid search over architecture, activation, L2 penalty and
# initial learning rate, optimising F1.
grid_mlp = GridSearchCV(
    estimator=mlp_pipeline,
    param_grid=mlp_param_grid,
    scoring="f1",
    cv=5,
    n_jobs=-1,
)
grid_mlp.fit(X_train, y_train)

# Score the best configuration on the held-out validation split.
best_mlp = grid_mlp.best_estimator_
mlp_val = evaluate_model(best_mlp, X_train, y_train, X_val, y_val)

# Training loss per iteration of the best network, as left by the refit
# inside evaluate_model.
fig_mlp, ax_mlp = plt.subplots()
ax_mlp.plot(best_mlp.named_steps["model"].loss_curve_)
ax_mlp.set_xlabel("Epoch")
ax_mlp.set_ylabel("Loss")
ax_mlp.set_title("MLP Loss Curve")
ax_mlp.grid(True)
plt.show()

# Report the winning hyperparameters of every grid search.
print("========== BEST HYPERPARAMETERS ==========")
for search_label, search in (
    ("Best KNN:", grid_knn),
    ("Best SVM Linear:", svm_linear),
    ("Best SVM RBF:", svm_rbf),
    ("Best MLP:", grid_mlp),
):
    print(search_label, search.best_params_)
print("==========================================")

# ------------------------ Validation comparison table ------------------------
# One row per tuned model: the model name followed by its validation metrics.
validation_table = pd.DataFrame([
    {"Model": model_label, **scores}
    for model_label, scores in (
        ("KNN", knn_val),
        ("SVM-Linear", svm_linear_val),
        ("SVM-RBF", svm_rbf_val),
        ("MLP", mlp_val),
    )
])

validation_table
# Round the metric columns (everything after "Model") to 3 decimal places.
validation_table_rounded = validation_table.copy()
metric_columns = validation_table_rounded.columns[1:]
validation_table_rounded[metric_columns] = (
    validation_table_rounded[metric_columns].round(3)
)

from IPython.display import display
print("=== Validation Set Performance Comparison (Rounded) ===")
display(validation_table_rounded)


# ------------------------ Final model & test results ------------------------
# Pick the model with the highest F1 on the validation split.
best_model_name = validation_table.sort_values("F1", ascending=False).iloc[0]["Model"]

# Map the names used in the validation table to the tuned estimators.
model_dict = {
    "KNN": best_knn,
    "SVM-Linear": best_svm_linear,
    "SVM-RBF": best_svm_rbf,
    "MLP": best_mlp
}

final_model = model_dict[best_model_name]

# evaluate_model refits final_model on the training split and scores it on
# the test split.
test_results = evaluate_model(final_model, X_train, y_train, X_test, y_test)

# A bare `test_results` expression in the middle of a script or cell displays
# nothing, so report the selected model and its test metrics explicitly.
print("Final model:", best_model_name)
print("Test results:", test_results)
# ------------------------ ROC curve (test set) ------------------------

from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt

# Positive-class probability estimates on the test set.
y_test_proba = final_model.predict_proba(X_test)[:, 1]

# Compute the ROC curve and the area under it.
fpr, tpr, thresholds = roc_curve(y_test, y_test_proba)
roc_auc = auc(fpr, tpr)

# ROC plot with the chance diagonal for reference.
plt.figure(figsize=(5, 4))
plt.plot(fpr, tpr, label=f"AUC = {roc_auc:.3f}")
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel("False Positive Rate (FPR)")
plt.ylabel("True Positive Rate (TPR)")
# The final model is chosen dynamically, so the title uses the selected name
# instead of a hard-coded one.
plt.title(f"ROC Curve – Final Model ({best_model_name})")
plt.legend(loc="lower right")
plt.grid(True)
plt.tight_layout()
plt.show()




# ------------------------ Confusion matrix (test set) ------------------------
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

# Test-set predictions.
y_test_pred = final_model.predict(X_test)

# Confusion matrix.
cm = confusion_matrix(y_test, y_test_pred)

# Visualisation (heatmap).
disp = ConfusionMatrixDisplay(
    confusion_matrix=cm,
    display_labels=["Malignant", "Benign"]
)

# ConfusionMatrixDisplay.plot creates its own figure unless it is given an
# axes object, so build the 5x4 figure first and draw into it. Calling
# plt.figure(figsize=...) beforehand would only leave an empty extra figure.
fig_cm, ax_cm = plt.subplots(figsize=(5, 4))
disp.plot(cmap="Blues", values_format="d", ax=ax_cm)
# The final model is chosen dynamically, so the title uses the selected name
# instead of a hard-coded one.
ax_cm.set_title(f"Confusion Matrix – Final Model ({best_model_name})")
fig_cm.tight_layout()
plt.show()

# ------------------------ KMeans clustering ------------------------

# Standardise all features before clustering.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Collect inertia and silhouette score for every k from 2 to 10.
k_values = range(2, 11)
inertia = []
silhouette = []
for k in k_values:
    km = KMeans(n_clusters=k, random_state=42, n_init=10)
    labels = km.fit_predict(X_scaled)
    inertia.append(km.inertia_)
    silhouette.append(silhouette_score(X_scaled, labels))

# Elbow plot: inertia against k.
fig_elbow, ax_elbow = plt.subplots()
ax_elbow.plot(k_values, inertia, marker="o")
ax_elbow.set_title("Elbow Method")
ax_elbow.set_xlabel("k")
ax_elbow.set_ylabel("Inertia")
plt.show()

# Silhouette score against k.
fig_sil, ax_sil = plt.subplots()
ax_sil.plot(k_values, silhouette, marker="o")
ax_sil.set_title("Silhouette Score")
ax_sil.set_xlabel("k")
ax_sil.set_ylabel("Score")
plt.show()

# Final clustering with k=2, compared against the true labels.
kmeans = KMeans(n_clusters=2, random_state=42, n_init=10)
cluster_labels = kmeans.fit_predict(X_scaled)

ari_score = adjusted_rand_score(y, cluster_labels)
nmi_score = normalized_mutual_info_score(y, cluster_labels)
print("ARI:", ari_score)
print("NMI:", nmi_score)

# ------------------ KMeans cluster profile ------------------
# Per-cluster mean of every original (unscaled) feature.
X_clustered = X.assign(Cluster=cluster_labels)
cluster_profile = X_clustered.groupby("Cluster").mean()

# Show the first 5 features for the report.
cluster_profile.iloc[:, :5]


# ================== SHAP ANALYSIS ==================


import shap
import pandas as pd
import numpy as np
shap.initjs()


# -------------------------------------------------
# Callable wrapper around the fitted pipeline
# -------------------------------------------------
def model_predict_proba(X):
    """Return ``final_model.predict_proba`` for a raw feature array.

    The input is wrapped in a DataFrame carrying the training column names
    before it is passed to the pipeline.
    """
    X_df = pd.DataFrame(X, columns=X_train.columns)
    return final_model.predict_proba(X_df)


# -------------------------------------------------
# Background sample (small, fixed seed)
# -------------------------------------------------
background = X_train.sample(50, random_state=42)

explainer = shap.KernelExplainer(
    model_predict_proba,
    background
)

# -------------------------------------------------
# Samples to explain
# -------------------------------------------------
X_explain = X_val.copy()
shap_values = explainer.shap_values(X_explain)

# -------------------------------------------------
# SHAP values for the positive class (index 1).
# Older SHAP releases return one array per class in a list; newer releases
# return a single array with the model outputs on the last axis. Handle both
# so summary_plot always receives a 2-D (samples, features) matrix.
# -------------------------------------------------
if isinstance(shap_values, list):
    shap_vals = shap_values[1]
elif np.ndim(shap_values) == 3:
    shap_vals = shap_values[..., 1]
else:
    shap_vals = shap_values

# -------------------------------------------------
import matplotlib.pyplot as plt

# Summary (beeswarm) plot
plt.figure()
shap.summary_plot(
    shap_vals,
    X_explain,
    feature_names=X_train.columns,
    show=False
)
plt.title("SHAP Summary Plot (Beeswarm)")
plt.show()

# Bar plot of feature importance
plt.figure()
shap.summary_plot(
    shap_vals,
    X_explain,
    feature_names=X_train.columns,
    plot_type="bar",
    show=False
)
plt.title("SHAP Feature Importance (Bar Plot)")
plt.show()



# ------------------ KMeans 2D visualisation ------------------

# The two features used for the 2D view.
feat_x = "mean radius"
feat_y = "mean texture"
X_2d = X.loc[:, [feat_x, feat_y]]

# Standardise the two features, then cluster them with k=2.
scaler_2d = StandardScaler()
X_2d_scaled = scaler_2d.fit_transform(X_2d)

kmeans_2d = KMeans(n_clusters=2, random_state=42, n_init=10)
clusters_2d = kmeans_2d.fit_predict(X_2d_scaled)

# --- Scatter plot coloured by cluster assignment ---
fig_clusters, ax_clusters = plt.subplots()
ax_clusters.scatter(
    X_2d[feat_x],
    X_2d[feat_y],
    c=clusters_2d,
    cmap="viridis",
    alpha=0.7,
)
ax_clusters.set_xlabel(feat_x)
ax_clusters.set_ylabel(feat_y)
ax_clusters.set_title("KMeans Clusters (k=2)")
plt.show()

# --- Scatter plot coloured by the true class labels ---
fig_truth, ax_truth = plt.subplots()
ax_truth.scatter(
    X_2d[feat_x],
    X_2d[feat_y],
    c=y,
    cmap="coolwarm",
    alpha=0.7,
)
ax_truth.set_xlabel(feat_x)
ax_truth.set_ylabel(feat_y)
ax_truth.set_title("True Class Labels")
plt.show()

