# Week 8 Take-Home Assignment

This notebook fits a binary logistic regression on the `glass.csv` dataset and runs k-means clustering on both the `glass.csv` and `iris.csv` datasets.

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_curve, auc
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
import seaborn as sns

# Read the two assignment CSVs (glass first, then iris) from the working directory
glass_df, iris_df = (pd.read_csv(csv_name) for csv_name in ("glass.csv", "iris.csv"))

## Logistic Regression on Glass Dataset (Binary Classification)

In [None]:
# Convert to binary classification: Type == 1 as 1, others as 0
glass_df['TypeBinary'] = (glass_df['Type'] == 1).astype(int)

# Feature and target split
X = glass_df.drop(columns=['Type', 'TypeBinary'])
y = glass_df['TypeBinary']

# Train/test split happens BEFORE scaling so the held-out rows never influence
# the scaler's mean/variance. The row assignment depends only on the number of
# samples and random_state, so the same rows land in each split as before.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Normalize: learn the scaling from the training rows only, then apply that
# same transform to the test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Full feature matrix under the train-fitted scaling; kept so the name
# X_scaled is still defined for any code that refers to it.
X_scaled = scaler.transform(X)

# Fit model
model = LogisticRegression()
model.fit(X_train, y_train)

# Predict probabilities of the positive class (TypeBinary == 1)
y_probs = model.predict_proba(X_test)[:, 1]

# Threshold analysis over 0.1, 0.2, ..., 0.8.
# The grid is built from integers because np.arange with a float step
# accumulates rounding error (e.g. 0.30000000000000004).
thresholds = np.arange(1, 9) / 10
results = []

for threshold in thresholds:
    y_pred = (y_probs >= threshold).astype(int)
    acc = accuracy_score(y_test, y_pred)
    # zero_division=0 on both metrics: report 0 instead of warning when a
    # threshold yields no predicted (or no actual) positives.
    prec = precision_score(y_test, y_pred, zero_division=0)
    rec = recall_score(y_test, y_pred, zero_division=0)
    results.append((threshold, acc, prec, rec))

pd.DataFrame(results, columns=["Threshold", "Accuracy", "Precision", "Recall"])

In [None]:
# ROC curve of the test-set probabilities, with its area under the curve
fpr, tpr, _ = roc_curve(y_test, y_probs)
roc_auc = auc(fpr, tpr)

# Explicit figure/axes handles instead of the implicit pyplot state machine
fig, ax = plt.subplots(figsize=(6, 4))
ax.plot(fpr, tpr, label=f"ROC curve (AUC = {roc_auc:.2f})")
ax.plot([0, 1], [0, 1], 'k--')  # dashed diagonal from (0, 0) to (1, 1)
ax.set_title("ROC Curve")
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.legend()
ax.grid(True)
plt.show()

## Clustering on Iris and Glass Datasets

In [None]:
def run_clustering(df, name, k_values=(2, 3, 4, 5, 6), random_state=42):
    """Run k-means on the numeric columns of ``df`` and tabulate fit quality.

    K-means is fitted once per (scaled, k) combination: first on the
    StandardScaler-transformed data for every k, then on the raw values for
    every k.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data; only its numeric columns are clustered.
    name : str
        Label written to the "Dataset" column of every result row.
    k_values : iterable of int, optional
        Cluster counts to try, in order. Defaults to 2 through 6.
    random_state : int, optional
        Seed passed to ``KMeans``. Defaults to 42.

    Returns
    -------
    pandas.DataFrame
        One row per (scaled, k) pair with columns "Dataset", "Scaled", "k",
        "Inertia" and "Silhouette".
    """
    columns = ["Dataset", "Scaled", "k", "Inertia", "Silhouette"]
    numeric = df.select_dtypes(include=[np.number])
    # Materialize once so a one-shot iterator can be reused for both passes.
    k_values = list(k_values)

    records = []
    for scale in (True, False):
        data = StandardScaler().fit_transform(numeric) if scale else numeric.values
        for k in k_values:
            kmeans = KMeans(n_clusters=k, n_init=10, random_state=random_state)
            labels = kmeans.fit_predict(data)
            records.append({
                "Dataset": name,
                "Scaled": scale,
                "k": k,
                "Inertia": kmeans.inertia_,
                # Silhouette is computed on the same matrix the model was fit on
                "Silhouette": silhouette_score(data, labels),
            })

    # Explicit columns keep the schema stable even when k_values is empty.
    return pd.DataFrame(records, columns=columns)

# Remove non-feature columns before clustering: 'Name' from iris (presumably
# the species label; confirm against the CSV) and the two target columns
# from glass.
iris_results = run_clustering(iris_df.drop(columns=['Name']), "Iris")
glass_results = run_clustering(glass_df.drop(columns=['Type', 'TypeBinary']), "Glass")

# ignore_index=True gives the combined table one continuous 0..n-1 index
# instead of repeating each frame's own row labels.
pd.concat([iris_results, glass_results], ignore_index=True)