In [None]:
# Import dependencies
import pandas as pd
from pathlib import Path
import hvplot.pandas
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

In [None]:
# Load the institution-level CSV into a pandas DataFrame
csv_path = Path("data/Most-Recent-Cohorts-Institution.csv")
data_df = pd.read_csv(csv_path)

# Preview the first rows
data_df.head()

In [None]:
# Keep only the columns used in the rest of the notebook
desired_columns = [
    'UNITID',
    'STABBR',
    'HIGHDEG',
    'CONTROL',
    'ADM_RATE_ALL',
    'COSTT4_A',
    'TUITIONFEE_IN',
    'TUITIONFEE_OUT',
    'MD_EARN_WNE_P10',
    'COMPL_RPY_3YR_RT',
]
only_cols = data_df[desired_columns]

# Preview the reduced DataFrame
only_cols.head()

In [None]:
# Summarize the selected columns: dtype and non-null count for each one
only_cols.info()

In [None]:
# Drop rows that are missing a value in any of the selected columns
column_names = ['UNITID', 'STABBR', 'HIGHDEG', 'CONTROL', 'ADM_RATE_ALL', 'COSTT4_A', 'TUITIONFEE_IN', 'TUITIONFEE_OUT', 'MD_EARN_WNE_P10', 'COMPL_RPY_3YR_RT']

# dropna(subset=...) removes every row that has a NaN in any listed column in
# one call and returns a new DataFrame, so 'only_cols' itself is left unchanged.
# This replaces filtering the frame once per column in a Python loop.
cleaned_df = only_cols.dropna(subset=column_names)


In [None]:
# Drop rows where COMPL_RPY_3YR_RT holds the literal string 'PrivacySuppressed'
not_suppressed = cleaned_df['COMPL_RPY_3YR_RT'].ne('PrivacySuppressed')
cleaned_df = cleaned_df.loc[not_suppressed]
cleaned_df.head()

In [None]:
# Count the distinct values in each column
cleaned_df.nunique()

In [None]:
# Cast COMPL_RPY_3YR_RT to float; every other column keeps its dtype
cleaned_df = cleaned_df.astype({'COMPL_RPY_3YR_RT': float})
cleaned_df.head()

In [None]:
# Re-check dtypes and non-null counts after the cleaning steps
cleaned_df.info()

In [None]:
# Count the NaN values remaining in each column of cleaned_df
Nan_values = cleaned_df.isna().sum()
Nan_values

In [None]:
# Number of rows left in cleaned_df
len(cleaned_df)

In [None]:
# Standardize the numeric feature columns with StandardScaler
feature_columns = [
    'HIGHDEG',
    'CONTROL',
    'ADM_RATE_ALL',
    'COSTT4_A',
    'TUITIONFEE_IN',
    'TUITIONFEE_OUT',
    'MD_EARN_WNE_P10',
    'COMPL_RPY_3YR_RT',
]
scaler = StandardScaler()
cleaned_df_scaled = scaler.fit_transform(cleaned_df[feature_columns])

# Put the scaled values into a DataFrame under the same column names
df_transformed = pd.DataFrame(cleaned_df_scaled, columns=feature_columns)

# Display sample data
df_transformed.head()

In [None]:
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

In [None]:
# Create the PCA model, keeping two principal components
pca=PCA(n_components=2)

In [None]:
# Fit the PCA model on the scaled feature DataFrame and project it onto the components
data_pca = pca.fit_transform(df_transformed)

# Review the first 5 rows of the transformed data
data_pca[:5]

In [None]:
# Show the fraction of the total variance explained by each principal component
pca.explained_variance_ratio_
# Author's note: about 68% of the total variance is condensed into the 2 PCA variables.
# NOTE(review): this figure comes from an earlier run; confirm it against this cell's output.

In [None]:
# Wrap the PCA output in a DataFrame with one named column per component
pca_column_labels = ["PCA1", "PCA2"]
data_pca_df = pd.DataFrame(data_pca, columns=pca_column_labels)

# Preview the PCA DataFrame
data_pca_df.head()

In [None]:
# Candidate cluster counts (1 through 14) and an empty list for their inertia values
k = [n_clusters for n_clusters in range(1, 15)]
inertia = list()

In [None]:
# Compute the K-means inertia for every candidate value of k.
# 'inertia' is reset here so that re-running this cell on its own does not
# append a second set of values and leave it longer than 'k', which would make
# the elbow DataFrame built from both lists fail.
inertia = []
for i in k:
    # Fit on the PCA DataFrame; random_state=0 keeps the runs repeatable
    k_model = KMeans(n_clusters=i, random_state=0)
    k_model.fit(data_pca_df)
    # Record the fitted model's `inertia_` attribute for this k
    inertia.append(k_model.inertia_)

In [None]:
# Pair each k with its inertia and load the pairs into a DataFrame
elbow_data = dict(k=k, inertia=inertia)
df_elbow = pd.DataFrame(elbow_data)

# Show up to the first 15 rows
df_elbow.head(15)

In [None]:
# Draw inertia against k as a line plot to locate the elbow
elbow_plot_options = {"x": "k", "y": "inertia", "title": "Elbow Curve", "xticks": k}
df_elbow.hvplot.line(**elbow_plot_options)


In [None]:
# Build the K-means model with the chosen number of clusters (k=4) and fit it on the PCA data
model = KMeans(n_clusters=4, random_state=0).fit(data_pca_df)

# Predict a cluster label for every row
k_4 = model.predict(data_pca_df)

# Copy the PCA DataFrame and attach the labels as a "cluster" column
data_pca_predictions_df = data_pca_df.assign(cluster=k_4)

In [None]:
# Scatter the two PCA components, grouped by cluster label
cluster_scatter_options = dict(x="PCA1", y="PCA2", by="cluster")
data_pca_predictions_df.hvplot.scatter(**cluster_scatter_options)

In [None]:
# Save the cleaned data to 'cleaned_data.csv' in the working directory.
# NOTE(review): to_csv writes the DataFrame index as an unnamed first column by
# default; pass index=False if that column is not wanted in the file.
cleaned_df.to_csv('cleaned_data.csv')