In [57]:
pip install scikit-learn matplotlib seaborn pandas numpy

Note: you may need to restart the kernel to use updated packages.


In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from matplotlib import pyplot as plt
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import seaborn as sns


In [None]:
# Read the dataset from the CSV file (path is relative to the working
# directory) and preview its first rows.
csv_path = 'degrees-that-pay-back.csv'
df = pd.read_csv(csv_path)
df.head()

In [None]:
# Salary columns that are cleaned here and later selected as the clustering
# features.
columns_to_clean = [
    'Starting Median Salary',
    'Mid-Career Median Salary',
    'Mid-Career 10th Percentile Salary',
    'Mid-Career 25th Percentile Salary',
    'Mid-Career 75th Percentile Salary',
    'Mid-Career 90th Percentile Salary',
]

# Strip '$' and ',' characters from each salary column and cast it to float.
# The pattern is a raw string: in a normal literal '\$' is an invalid escape
# sequence, which emits a SyntaxWarning on recent Python versions. The regex
# seen by pandas is unchanged.
for col in columns_to_clean:
    df[col] = df[col].replace(r'[\$,]', '', regex=True).astype(float)

# Drop every row that has a missing value in any column (not only the salary
# columns). Rebinding instead of inplace=True keeps the step explicit.
df = df.dropna()
df.head()

In [None]:
# Keep only the cleaned salary columns as the inputs for scaling/clustering,
# then display the resulting frame.
features = df.loc[:, columns_to_clean]
features

In [None]:
# Standardize the feature columns with StandardScaler, then plot the first two
# scaled columns against each other (columns 0 and 1 follow the order of
# `columns_to_clean`: starting vs. mid-career median salary).
scaler = StandardScaler()
scaler.fit(features)
scaled_features = scaler.transform(features)
plt.scatter(scaled_features[:, 0], scaled_features[:, 1])

In [None]:
# Fit a seeded 2-cluster KMeans model on the scaled features.
km = KMeans(n_clusters=2, random_state=42).fit(scaled_features)

# Scatter the first two scaled columns, coloured by the assigned cluster label.
fig, ax = plt.subplots()
ax.scatter(
    scaled_features[:, 0],
    scaled_features[:, 1],
    c=km.labels_,
    cmap='viridis',
)
ax.set_title('KMeans Clustering of Scaled Features')
ax.set_xlabel('Feature 1')
ax.set_ylabel('Feature 2')
plt.show()

In [None]:
# Elbow method: fit KMeans for k = 1..10 and record each model's inertia so the
# "elbow" in the curve can be used to pick the number of clusters.
k_values = range(1, 11)

# random_state is fixed so the curve is reproducible across runs. The models
# are built inside the comprehension so the `km` model fitted earlier in the
# notebook is not overwritten by the last (k=10) fit.
inertias = [
    KMeans(n_clusters=k, random_state=42).fit(scaled_features).inertia_
    for k in k_values
]

plt.plot(k_values, inertias, marker='o')
plt.xlabel('Number of clusters')
plt.ylabel('Inertia')
plt.title('Elbow Method for Optimal k')
plt.show()


In [None]:
# Number of clusters shown in this figure; the title below is derived from it
# so the label and the plotted clustering cannot drift apart.
n_clusters = 4

# Fit the model explicitly here instead of reusing whatever `km` an earlier
# cell left behind (previously the last model fitted by the elbow loop, which
# did not have 4 clusters). Seeded for reproducible cluster assignments.
km = KMeans(n_clusters=n_clusters, random_state=42).fit(scaled_features)

# Project the scaled features onto their first two principal components purely
# for 2-D visualisation; clustering itself was done on all scaled features.
pca = PCA(n_components=2)
reduced = pca.fit_transform(scaled_features)

plt.figure(figsize=(8, 6))
plt.scatter(reduced[:, 0], reduced[:, 1], c=km.labels_, cmap='viridis', s=50)
plt.title(f'KMeans Clustering Visualized with PCA (k={n_clusters})')
plt.xlabel('PCA Component 1')
plt.ylabel('PCA Component 2')
plt.colorbar(label='Cluster')
plt.grid(True)
plt.show()