In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# Never truncate the column list when a DataFrame is displayed.
pd.options.display.max_columns = None

In [None]:
# Read the raw dataset (path is relative to the notebook's working directory).
df = pd.read_csv(filepath_or_buffer='graduation_dataset.csv')

### Dropper target-kolonnen for unsupervised clustering; den kan eventuelt brukes senere til validering

In [None]:
# Keep the label column aside as `y`; every remaining column becomes a feature in `X`.
y = df["Target"]
X = df.drop("Target", axis=1)

In [None]:
# Summary statistics of the feature frame, shown as the cell output.
X.describe()

### Må gjøre kolonnene om til kategoriske for one-hot encoding, da de ble tolket som kontinuerlige.

In [None]:
# Columns that hold codes/flags describing the student and the application.
_student_cols = [
    'Marital status',
    'Application mode',
    'Application order',
    'Course',
    'Daytime/evening attendance',
    'Previous qualification',
    'Nacionality',
    "Mother's qualification",
    "Father's qualification",
    "Mother's occupation",
    "Father's occupation",
    'Displaced',
    'Educational special needs',
    'Debtor',
    'Tuition fees up to date',
    'Gender',
    'Scholarship holder',
    'International',
]

# Per-semester curricular-unit columns: all 1st-semester columns first, then the
# 2nd-semester ones, each in the same order of measures.
_unit_cols = [
    f'Curricular units {semester} sem ({measure})'
    for semester in ('1st', '2nd')
    for measure in ('credited', 'enrolled', 'evaluations', 'approved', 'without evaluations')
]

categorical_cols = _student_cols + _unit_cols

# Convert all listed columns to the pandas 'category' dtype in a single call so
# that the later one-hot encoding step treats them as categorical.
X = X.astype({name: 'category' for name in categorical_cols})


### Scaling av datasettet

In [None]:
from sklearn.preprocessing import StandardScaler

# Step 1: one-hot encode the frame (categorical columns become dummy columns).
X_encoded = pd.get_dummies(X)

# Step 2: fit the scaler on the encoded frame, then apply it to the same frame.
scaler = StandardScaler()
scaler.fit(X_encoded)
X_scaled = scaler.transform(X_encoded)


In [None]:
# Put the scaled array back into a DataFrame. Both the column names and the row
# index of the encoded frame are carried over, so rows stay aligned with `df`/`y`
# even if the data ever gets a non-default index (e.g. after filtering rows).
X_scaled_df = pd.DataFrame(X_scaled, columns=X_encoded.columns, index=X_encoded.index)
X_scaled_df

In [None]:
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

# Fit a PCA with all components to see how the explained variance accumulates.
pca_full = PCA().fit(X_scaled)
cumulative_variance = np.cumsum(pca_full.explained_variance_ratio_)

# Plot against 1..n so the x-axis really is the number of components;
# plotting the array alone would use 0-based positions and shift the curve by one.
component_counts = np.arange(1, len(cumulative_variance) + 1)
plt.plot(component_counts, cumulative_variance)
plt.xlabel("Number of Components")
plt.ylabel("Cumulative Explained Variance")
plt.title("PCA Explained Variance (Full Dataset)")
plt.grid(True)
plt.show()


## PCA

In [None]:
from sklearn.decomposition import PCA
# Note from earlier experiments (translated): "8, 18, 36 decent, 45 in the end".
# NOTE(review): this cell does not show which setting those numbers refer to — confirm.
randomstate = 42
num_k = 4

# Ask for 150 components, but never more than the data can provide: PCA rejects
# n_components larger than min(n_samples, n_features).
n_pca_components = min(150, *X_scaled_df.shape)

pca = PCA(n_components=n_pca_components, random_state=randomstate)
X_pca = pca.fit_transform(X_scaled_df)
print("PCA components:", X_pca.shape[1])
# Fixed the misspelled label ("Randiom") in the printed message.
print(f'Random state: {randomstate}')

In [None]:
# Total share of the variance captured by the components kept in `pca`.
explained_variance = np.sum(pca.explained_variance_ratio_)
print(explained_variance)

## CLUSTERING

### K-means

In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, davies_bouldin_score
import matplotlib.pyplot as plt

In [None]:
# Candidate cluster counts for the elbow/silhouette sweep. The range ends at 11
# (exclusive) so that k=10, which is part of the `k_values` visualised later,
# is evaluated here as well instead of being plotted without any score.
K = range(2, 11)
inertia = []      # within-cluster sum of squares (km.inertia_) per k
sil_scores = []   # silhouette score per k

for k in K:
    km = KMeans(n_clusters=k, random_state=randomstate)
    labels = km.fit_predict(X_pca)
    inertia.append(km.inertia_)
    sil_scores.append(silhouette_score(X_pca, labels))

### Elbow plot and silhouette plot

In [None]:
# Two panels side by side: inertia (elbow) on the left, silhouette on the right.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 5))
ax_elbow, ax_silhouette = axes

ax_elbow.plot(K, inertia, marker='o')
ax_elbow.set(xlabel="k", ylabel="Inertia", title="Elbow Method")

ax_silhouette.plot(K, sil_scores, marker='o')
ax_silhouette.set(xlabel="k", ylabel="Silhouette Score", title="Silhouette Method")

fig.tight_layout()
plt.show()

### Choose K based on plots

In [None]:
# Cluster counts that are fitted and drawn side by side in the K-means scatter grid below.
k_values = [2, 3, 4, 6, 8, 10]

In [None]:
# Final K-means model with the chosen number of clusters; `clusters` holds the
# cluster label assigned to every row of X_pca.
kmeans = KMeans(n_clusters=num_k, random_state=randomstate).fit(X_pca)
clusters = kmeans.labels_

In [None]:
# Report the quality of the final K-means clustering.
print(f'Med k={num_k} og randomstate={randomstate}')

final_silhouette = silhouette_score(X_pca, clusters)
print("Final silhouette score:", final_silhouette)

final_davies_bouldin = davies_bouldin_score(X_pca, clusters)
print("Davies-Bouldin score:", final_davies_bouldin)

### PCA2 for visualization

In [None]:
# Only the first two principal components are used as plot coordinates;
# the clustering itself is still fitted on all of X_pca.
x1 = X_pca[:, 0]
x2 = X_pca[:, 1]

rows = 2
cols = (len(k_values) + 1) // 2  # enough columns to fit every k in two rows
plt.figure(figsize=(15, 8))

for i, k in enumerate(k_values, 1):
    # Use names local to this visualisation so the final `kmeans` model fitted
    # with `num_k` (and the `labels` from earlier cells) are not overwritten
    # by the last k in this loop.
    viz_model = KMeans(n_clusters=k, random_state=randomstate)
    viz_labels = viz_model.fit_predict(X_pca)

    plt.subplot(rows, cols, i)
    plt.scatter(x1, x2, c=viz_labels, s=10, cmap='tab20')
    plt.title(f'K-means (k={k})')
    plt.xlabel("PCA 1")
    plt.ylabel("PCA 2")

plt.tight_layout()
plt.show()

## Andre algoritmer

### DBSCAN

In [None]:
# -----------------------------------------------------------
# 1. K-DISTANCE PLOT (to inspect elbow around eps ≈ 20)
# -----------------------------------------------------------
from sklearn.neighbors import NearestNeighbors
import matplotlib.pyplot as plt
import numpy as np

k = 5  # typical k for DBSCAN k-distance plot
nn = NearestNeighbors(n_neighbors=k).fit(X_pca)
distances, _ = nn.kneighbors(X_pca)

# Last column = distance to the last of the k returned neighbours, sorted ascending.
# NOTE(review): the query set is the fitted set, so each point is presumably
# returned as its own nearest neighbour — confirm this is the intended k convention.
kdist = distances[:, -1].copy()
kdist.sort()

plt.figure(figsize=(7, 4))
plt.plot(kdist)
plt.title(f"{k}-Distance Plot")
plt.xlabel("Points sorted by distance")
plt.ylabel("k-distance")
plt.grid(True)
plt.show()


In [None]:
# -----------------------------------------------------------
# 2. PARAMETER RANGES
# -----------------------------------------------------------
# DBSCAN grid: min_samples 3..8 and eps 10..30 in steps of 5.
min_samples_list = [3, 4, 5, 6, 7, 8]
eps_values = list(range(10, 31, 5))

from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_score


In [None]:
# -----------------------------------------------------------
# 3. GRID-SEARCH WITH METRICS
#    - silhouette score (computed on non-noise points only)
#    - cluster size distribution
#    - core ratio / noise ratio
# -----------------------------------------------------------

results = []

for ms in min_samples_list:
    for eps in eps_values:
        db = DBSCAN(min_samples=ms, eps=eps).fit(X_pca)
        labels = db.labels_

        # Label -1 marks noise and is not a cluster. Count only real clusters,
        # otherwise "one cluster + noise" passes the check as two labels and the
        # silhouette ends up comparing the cluster against the noise points.
        clustered_mask = labels != -1
        n_clusters = len(set(labels[clustered_mask]))

        # Skip invalid silhouette (all noise or fewer than 2 real clusters)
        if n_clusters < 2:
            sil = -1
        else:
            sil = silhouette_score(X_pca[clustered_mask], labels[clustered_mask])

        # Sizes per label (noise included under -1), as plain ints for readable output.
        unique, counts = np.unique(labels, return_counts=True)
        cluster_sizes = {int(label): int(count) for label, count in zip(unique, counts)}

        # Share of points that are core samples / that were labelled as noise.
        core_ratio = len(db.core_sample_indices_) / len(labels)
        noise_ratio = float(1 - clustered_mask.mean())

        results.append({
            "min_samples": ms,
            "eps": eps,
            "silhouette": sil,
            "cluster_sizes": cluster_sizes,
            "core_ratio": core_ratio,
            # Extra context for judging a silhouette that ignores noise points.
            "n_clusters": n_clusters,
            "noise_ratio": noise_ratio,
        })

        print(f"min_samples={ms}, eps={eps}, silhouette={sil}, core_ratio={core_ratio}")
        print(f"Cluster sizes: {cluster_sizes}\n")


In [None]:
# -----------------------------------------------------------
# 4. SELECT BEST CONFIGURATION (MAX SILHOUETTE)
# -----------------------------------------------------------
# Rank a copy of the grid-search records, highest silhouette first; the stable
# sort keeps the original grid order between records with equal scores.
results_sorted = list(results)
results_sorted.sort(key=lambda record: record["silhouette"], reverse=True)
best = results_sorted[0]

best


In [None]:
# -----------------------------------------------------------
# 5. FINAL DBSCAN RUN WITH BEST SETTINGS
# -----------------------------------------------------------
best_ms, best_eps = best["min_samples"], best["eps"]

db_final = DBSCAN(eps=best_eps, min_samples=best_ms)
db_final.fit(X_pca)
labels_final = db_final.labels_

# Boolean mask over all points: True where the point is a core sample.
core_samples_mask = np.zeros(labels_final.shape, dtype=bool)
core_samples_mask[db_final.core_sample_indices_] = True

# Label -1 is noise, so it is excluded from the cluster count and tallied separately.
n_clusters_final = len(set(labels_final) - {-1})
n_noise_final = int(np.count_nonzero(labels_final == -1))

print("Final clusters:", n_clusters_final)
print("Final noise points:", n_noise_final)


In [None]:
# -----------------------------------------------------------
# 6. 2D PLOT OF FINAL CLUSTERS
# -----------------------------------------------------------
plt.figure(figsize=(6, 5))
unique_labels = set(labels_final)
colors = plt.cm.tab20(np.linspace(0, 1, len(unique_labels)))

for k, col in zip(unique_labels, colors):
    # Noise (label -1) is drawn in black. Cluster colours come out of the
    # colormap as NumPy rows; pass them on as plain tuples so the marker
    # colour is a simple colour spec rather than an array.
    face_color = "k" if k == -1 else tuple(col)

    class_member_mask = (labels_final == k)

    # Core samples get larger markers than the remaining points of the same label.
    xy_core = X_pca[class_member_mask & core_samples_mask]
    xy_border = X_pca[class_member_mask & ~core_samples_mask]

    plt.plot(xy_core[:, 0], xy_core[:, 1], "o", markerfacecolor=face_color,
             markeredgecolor="k", markersize=6)
    plt.plot(xy_border[:, 0], xy_border[:, 1], "o", markerfacecolor=face_color,
             markeredgecolor="k", markersize=4)

plt.xlabel("PCA 1")
plt.ylabel("PCA 2")
plt.title(f"DBSCAN Final Model (eps={best_eps}, min_samples={best_ms})")
plt.grid(True)
plt.show()


In [None]:
# -----------------------------------------------------------
# 7. 3D VISUALIZATION (ROTATING VIEWPOINTS)
# -----------------------------------------------------------
from mpl_toolkits.mplot3d import Axes3D

fig = plt.figure(figsize=(7, 5))
ax = fig.add_subplot(111, projection='3d')

# One colour per label; the noise label (-1) is overridden with black below.
colors = plt.cm.tab20(np.linspace(0, 1, len(unique_labels)))

for cluster_id, rgba in zip(unique_labels, colors):
    point_color = "k" if cluster_id == -1 else rgba
    members = X_pca[labels_final == cluster_id]

    # The first three principal components serve as x, y and z.
    ax.scatter(members[:, 0], members[:, 1], members[:, 2], s=15, c=[point_color])

ax.set(xlabel="PCA 1", ylabel="PCA 2", zlabel="PCA 3", title="DBSCAN 3D View")
plt.show()


In [None]:
# -----------------------------------------------------------
# 8. MULTIPLE VIEW ANGLES OF THE SAME 3D SCATTER
# -----------------------------------------------------------
fig = plt.figure(figsize=(14, 4))

angles = [0, 45, 90]
for panel, angle in enumerate(angles, start=1):
    ax = fig.add_subplot(1, 3, panel, projection='3d')

    # Same scatter in every panel (`unique_labels`/`colors` come from the
    # earlier plotting cells); only the azimuth of the camera differs.
    for cluster_id, rgba in zip(unique_labels, colors):
        point_color = "k" if cluster_id == -1 else rgba
        members = X_pca[labels_final == cluster_id]
        ax.scatter(members[:, 0], members[:, 1], members[:, 2], s=12, c=[point_color])

    ax.view_init(elev=20, azim=angle)
    ax.set_title(f"View angle: {angle}°")

fig.tight_layout()
plt.show()


In [None]:
# -----------------------------------------------------------
# Multiple 2D PCA pair projections
# -----------------------------------------------------------
import itertools
# Pairs of (zero-based) principal-component indices, one pair per panel.
pairs = [(0,1), (0,2), (1,2), (2,3), (3,4), (4,5)]

fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(15, 10))

for ax, (i, j) in zip(axes.flat, pairs):
    for cluster_id in set(labels_final):
        members = labels_final == cluster_id
        # Noise is forced to black; clusters fall back to matplotlib's colour cycle.
        if cluster_id == -1:
            point_color = "k"
        else:
            point_color = None
        ax.scatter(X_pca[members, i], X_pca[members, j], s=8,
                   label=str(cluster_id), c=point_color)
    ax.set(xlabel=f"PCA {i+1}", ylabel=f"PCA {j+1}", title=f"PCA {i+1} vs PCA {j+1}")

fig.tight_layout()
plt.show()


In [None]:
# -----------------------------------------------------------
# t-SNE 2D
# -----------------------------------------------------------
from sklearn.manifold import TSNE

# t-SNE is stochastic: seed it with the notebook-wide `randomstate` so the
# embedding (and therefore this figure) is the same on every run.
tsne_2d = TSNE(
    n_components=2,
    learning_rate="auto",
    init="pca",
    random_state=randomstate,
).fit_transform(X_pca)

plt.figure(figsize=(6,5))
for lab in set(labels_final):
    mask = labels_final == lab
    # Noise (label -1) in black; clusters use matplotlib's default colour cycle.
    col = "k" if lab == -1 else None
    plt.scatter(tsne_2d[mask,0], tsne_2d[mask,1], s=8, c=col)
plt.xlabel("t-SNE 1")
plt.ylabel("t-SNE 2")
plt.title("t-SNE 2D view")
plt.show()
