In [1]:
# Import the required libraries and dependencies
import pandas as pd
import hvplot.pandas
from pathlib import Path
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [2]:
# Read the CSV file into a Pandas DataFrame
data = Path('compensation_experience')
df = pd.read_csv(data)
df

Unnamed: 0,totalyearlycompensation,basesalary,Masters_Degree,Bachelors_Degree,Doctorate_Degree,Highschool,Some_College,yearsofexperience,yearsatcompany
0,127000,107000.0,0,0,0,0,0,1.5,1.5
1,100000,0.0,0,0,0,0,0,5.0,3.0
2,310000,155000.0,0,0,0,0,0,8.0,0.0
3,372000,157000.0,0,0,0,0,0,7.0,5.0
4,157000,0.0,0,0,0,0,0,5.0,3.0
...,...,...,...,...,...,...,...,...,...
62637,327000,155000.0,0,0,0,0,0,10.0,1.0
62638,237000,146900.0,0,0,0,0,0,2.0,2.0
62639,220000,157000.0,0,0,0,0,0,14.0,12.0
62640,280000,194688.0,0,0,0,0,0,8.0,4.0


In [3]:
df.shape

(62642, 9)

In [4]:
df.columns

Index(['totalyearlycompensation', 'basesalary', 'Masters_Degree',
       'Bachelors_Degree', 'Doctorate_Degree', 'Highschool', 'Some_College',
       'yearsofexperience', 'yearsatcompany'],
      dtype='object')

In [5]:
df_scaled = StandardScaler().fit_transform(
    df[["totalyearlycompensation", "basesalary", "yearsofexperience", "yearsatcompany"]])

In [6]:
# Create a DataFrame with the scaled data only
df_scaled = pd.DataFrame(
    df_scaled,
    columns=['totalyearlycompensation', 'basesalary',
       'yearsofexperience', 'yearsatcompany']
)

# Display sample data
df_scaled.head()

Unnamed: 0,totalyearlycompensation,basesalary,yearsofexperience,yearsatcompany
0,-0.646951,-0.483752,-0.97668,-0.36833
1,-0.842557,-2.227309,-0.377399,0.091281
2,0.678822,0.298404,0.136271,-0.827941
3,1.127991,0.330994,-0.034953,0.704096
4,-0.429611,-2.227309,-0.377399,0.091281


In [7]:
#df with educational columns
education_df = pd.DataFrame(df, columns=['Masters_Degree',
       'Bachelors_Degree', 'Doctorate_Degree', 'Highschool', 'Some_College'])
education_df.head()

Unnamed: 0,Masters_Degree,Bachelors_Degree,Doctorate_Degree,Highschool,Some_College
0,0,0,0,0,0
1,0,0,0,0,0
2,0,0,0,0,0
3,0,0,0,0,0
4,0,0,0,0,0


In [8]:
#concat both dfs
# Concatenate the "EnergyType" variables with the scaled data DataFrame.
df_concat = pd.concat([df_scaled, education_df], axis=1)

# Display the sample data
df_concat

Unnamed: 0,totalyearlycompensation,basesalary,yearsofexperience,yearsatcompany,Masters_Degree,Bachelors_Degree,Doctorate_Degree,Highschool,Some_College
0,-0.646951,-0.483752,-0.976680,-0.368330,0,0,0,0,0
1,-0.842557,-2.227309,-0.377399,0.091281,0,0,0,0,0
2,0.678822,0.298404,0.136271,-0.827941,0,0,0,0,0
3,1.127991,0.330994,-0.034953,0.704096,0,0,0,0,0
4,-0.429611,-2.227309,-0.377399,0.091281,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...
62637,0.801981,0.298404,0.478717,-0.521534,0,0,0,0,0
62638,0.149962,0.166416,-0.891069,-0.215126,0,0,0,0,0
62639,0.026803,0.330994,1.163610,2.848947,0,0,0,0,0
62640,0.461482,0.945118,0.136271,0.397688,0,0,0,0,0


In [9]:
# Initialize the K-Means model with n_clusters=3
model = KMeans(n_clusters=3)

In [10]:
# Fit the model for the concat DataFrame
model.fit(df_concat)

  super()._check_params_vs_input(X, default_n_init=10)


In [11]:
# Predict the model segments (clusters)
concat_clusters = model.predict(df_concat)

# View the stock segments
print(concat_clusters)

[2 2 0 ... 1 0 2]


In [12]:
# Create a new column in the DataFrame with the predicted clusters
df_concat["cluster"] = concat_clusters

# Review the DataFrame
df_concat.head()

Unnamed: 0,totalyearlycompensation,basesalary,yearsofexperience,yearsatcompany,Masters_Degree,Bachelors_Degree,Doctorate_Degree,Highschool,Some_College,cluster
0,-0.646951,-0.483752,-0.97668,-0.36833,0,0,0,0,0,2
1,-0.842557,-2.227309,-0.377399,0.091281,0,0,0,0,0,2
2,0.678822,0.298404,0.136271,-0.827941,0,0,0,0,0,0
3,1.127991,0.330994,-0.034953,0.704096,0,0,0,0,0,0
4,-0.429611,-2.227309,-0.377399,0.091281,0,0,0,0,0,2


In [13]:
#scatter plot to visualize the "Cluster" using  
#"yearsofexperience" as the x-variable and "totalyearlycompensation" as the y-variable.  

df_concat.hvplot.scatter(
    x="yearsofexperience",
    y="totalyearlycompensation",
    by="cluster", 
    title = "Scatter Plot by Segment - k=3"
)


In [14]:
# Create the PCA model instance where n_components=2
pca = PCA(n_components=2)

In [15]:
# Fit the df_concat data to the PCA
pca_data = pca.fit_transform(df_concat)

# Review the first five rows of the PCA data
# using bracket notation ([0:5])
pca_data[:5]

array([[-1.40247328, -0.18730223],
       [-1.92478402,  1.13023199],
       [ 0.91774397, -1.08849117],
       [ 1.53351063, -0.17542811],
       [-1.71701667,  0.98886515]])

In [16]:
# Calculate the explained variance
pca.explained_variance_ratio_

array([0.51255341, 0.2140416 ])

In [17]:
# Creating a DataFrame with the PCA data
df_pca = pd.DataFrame(pca_data, columns=["PCA1", "PCA2"])
df_pca.head()

Unnamed: 0,PCA1,PCA2
0,-1.402473,-0.187302
1,-1.924784,1.130232
2,0.917744,-1.088491
3,1.533511,-0.175428
4,-1.717017,0.988865


In [18]:
# Initialize the K-Means model with n_clusters=3
model = KMeans(n_clusters=3)

# Fit the model for the df_stocks_pca DataFrame
model.fit(df_pca)

# Predict the model segments (clusters)
clusters_predict = model.predict(df_pca)

# Print the stock segments
print(clusters_predict)

  super()._check_params_vs_input(X, default_n_init=10)


[0 0 1 ... 2 1 0]


In [19]:
# Create a copy of the df_stocks_pca DataFrame and name it as df_stocks_pca_predictions
df_pca_predictions = df_pca.copy()

# Create a new column in the DataFrame with the predicted clusters
df_pca_predictions["Cluster"] = clusters_predict

# Review the DataFrame
df_pca_predictions.head()

Unnamed: 0,PCA1,PCA2,Cluster
0,-1.402473,-0.187302,0
1,-1.924784,1.130232,0
2,0.917744,-1.088491,1
3,1.533511,-0.175428,1
4,-1.717017,0.988865,0


In [20]:
# Create the scatter plot with x="PC1" and y="PC2"
df_pca_predictions.hvplot.scatter(
    x="PCA1",
    y="PCA2",
    by="Cluster",
    title = "Scatter Plot by Cluster- PCA=2"
)