In [7]:
#Import dependencies
import hvplot.pandas
import matplotlib.pyplot as plt
import pandas as pd 
from pathlib import Path
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [8]:
# Load the cleaned student data set into a pandas DataFrame
csv_path = Path("./cleaned_student_mat.csv")
student_df = pd.read_csv(csv_path)

# Preview the first rows to confirm the file loaded as expected
student_df.head()

Unnamed: 0,school,sex,age,famsize,Parent_status,Mother_Edu,Father_Edu,Mjob,Fjob,reason,...,activities,nursery,higher,internet,romantic,freetime,goout,health,absences,final_grade
0,GP,F,18,GT3,A,4,4,at_home,teacher,course,...,0,1,1,0,0,3,4,3,6,6
1,GP,F,17,GT3,T,1,1,at_home,other,course,...,0,0,1,1,0,3,3,3,4,6
2,GP,F,15,LE3,T,1,1,at_home,other,other,...,0,1,1,1,0,3,2,3,10,10
3,GP,F,15,GT3,T,4,2,health,services,home,...,1,1,1,1,1,2,2,5,2,15
4,GP,F,16,GT3,T,3,3,other,other,home,...,0,1,1,0,0,3,2,5,4,10


**Preparing the Data**

In [9]:
# Count how often each combination of values occurs across the
# string-valued (categorical) columns
categorical_columns = ["school", "sex", "Parent_status", "Mjob", "Fjob", "reason"]
student_df[categorical_columns].value_counts()

school  sex  Parent_status  Mjob     Fjob     reason    
GP      F    T              other    other    reputation    19
        M    T              other    other    course        18
        F    T              other    other    home          14
        M    T              other    other    home          14
        F    T              at_home  other    course        10
                                                            ..
        M    A              other    other    reputation     1
                                              home           1
                            health   health   other          1
        F    T              teacher  teacher  reputation     1
MS      M    T              teacher  teacher  home           1
Name: count, Length: 172, dtype: int64

In [10]:
# One-hot encode the categorical columns: each category value becomes its
# own indicator (dummy) column
columns_to_encode = ["school", "sex", "Parent_status", "Mjob", "Fjob", "reason"]
categorical_encoded = pd.get_dummies(student_df[columns_to_encode])

# Inspect the last rows of the encoded frame
categorical_encoded.tail()

Unnamed: 0,school_GP,school_MS,sex_F,sex_M,Parent_status_A,Parent_status_T,Mjob_at_home,Mjob_health,Mjob_other,Mjob_services,Mjob_teacher,Fjob_at_home,Fjob_health,Fjob_other,Fjob_services,Fjob_teacher,reason_course,reason_home,reason_other,reason_reputation
390,False,True,False,True,True,False,False,False,False,True,False,False,False,False,True,False,True,False,False,False
391,False,True,False,True,False,True,False,False,False,True,False,False,False,False,True,False,True,False,False,False
392,False,True,False,True,False,True,False,False,True,False,False,False,False,True,False,False,True,False,False,False
393,False,True,False,True,False,True,False,False,False,True,False,False,False,True,False,False,True,False,False,False
394,False,True,False,True,False,True,False,False,True,False,False,True,False,False,False,False,True,False,False,False


In [11]:
# Standardize the numeric (non-categorical) columns with StandardScaler.
# NOTE(review): the preview of student_df shows a `goout` column that is not
# in this list - confirm whether leaving it out of the model is intentional.
numeric_columns = [
    "age", "Mother_Edu", "Father_Edu", "traveltime", "Study_Time_Hours",
    "failures", "schoolsup", "famsup", "paid", "activities",
    "nursery", "higher", "internet", "romantic", "freetime",
    "health", "absences", "final_grade",
]
scaler = StandardScaler()
scaled_student_df = scaler.fit_transform(student_df[numeric_columns])

# fit_transform returns a plain array here; show its first five rows
scaled_student_df[:5]


array([[ 1.02304645,  1.14385567,  1.36037064,  0.79225076, -0.04228585,
        -0.44994364,  2.59713266, -1.25765629, -0.91967081, -1.01788137,
         0.50789938,  0.23094011, -2.23267743, -0.70844982, -0.2360102 ,
        -0.39928949,  0.03642446, -0.96493392],
       [ 0.23837976, -1.60000865, -1.39997047, -0.64324947, -0.04228585,
        -0.44994364, -0.38504002,  0.7951298 , -0.91967081, -1.01788137,
        -1.96889391,  0.23094011,  0.44789274, -0.70844982, -0.2360102 ,
        -0.39928949, -0.21379577, -0.96493392],
       [-1.33095364, -1.60000865, -1.39997047, -0.64324947, -0.04228585,
         3.58932316,  2.59713266, -1.25765629,  1.08734559, -1.01788137,
         0.50789938,  0.23094011,  0.44789274, -0.70844982, -0.2360102 ,
        -0.39928949,  0.53686493, -0.0907392 ],
       [-1.33095364,  1.14385567, -0.47985677, -0.64324947,  1.15077909,
        -0.44994364, -0.38504002,  0.7951298 ,  1.08734559,  0.98243276,
         0.50789938,  0.23094011,  0.44789274,  1.411

In [12]:
# Wrap the scaled array in a DataFrame so every column carries its original
# label again (same order as the list passed to the scaler)
scaled_column_labels = [
    "age", "Mother_Edu", "Father_Edu", "traveltime", "Study_Time_Hours",
    "failures", "schoolsup", "famsup", "paid", "activities",
    "nursery", "higher", "internet", "romantic", "freetime",
    "health", "absences", "final_grade",
]
scaled_student_df = pd.DataFrame(data=scaled_student_df, columns=scaled_column_labels)

scaled_student_df

Unnamed: 0,age,Mother_Edu,Father_Edu,traveltime,Study_Time_Hours,failures,schoolsup,famsup,paid,activities,nursery,higher,internet,romantic,freetime,health,absences,final_grade
0,1.023046,1.143856,1.360371,0.792251,-0.042286,-0.449944,2.597133,-1.257656,-0.919671,-1.017881,0.507899,0.23094,-2.232677,-0.708450,-0.236010,-0.399289,0.036424,-0.964934
1,0.238380,-1.600009,-1.399970,-0.643249,-0.042286,-0.449944,-0.385040,0.795130,-0.919671,-1.017881,-1.968894,0.23094,0.447893,-0.708450,-0.236010,-0.399289,-0.213796,-0.964934
2,-1.330954,-1.600009,-1.399970,-0.643249,-0.042286,3.589323,2.597133,-1.257656,1.087346,-1.017881,0.507899,0.23094,0.447893,-0.708450,-0.236010,-0.399289,0.536865,-0.090739
3,-1.330954,1.143856,-0.479857,-0.643249,1.150779,-0.449944,-0.385040,0.795130,1.087346,0.982433,0.507899,0.23094,0.447893,1.411533,-1.238419,1.041070,-0.464016,1.002004
4,-0.546287,0.229234,0.440257,-0.643249,-0.042286,-0.449944,-0.385040,0.795130,1.087346,-1.017881,0.507899,0.23094,-2.232677,-0.708450,-0.236010,1.041070,-0.213796,-0.090739
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
390,2.592380,-0.685387,-0.479857,-0.643249,-0.042286,2.242901,-0.385040,0.795130,1.087346,-1.017881,0.507899,0.23094,-2.232677,-0.708450,1.768808,0.320890,0.661975,-0.309288
391,0.238380,0.229234,-1.399970,0.792251,-1.235351,-0.449944,-0.385040,-1.257656,-0.919671,-1.017881,-1.968894,0.23094,0.447893,-0.708450,0.766399,-1.119469,-0.338906,1.220553
392,3.377047,-1.600009,-1.399970,-0.643249,-1.235351,3.589323,-0.385040,-1.257656,-0.919671,-1.017881,-1.968894,0.23094,-2.232677,-0.708450,1.768808,-0.399289,-0.338906,-0.746385
393,1.023046,0.229234,-0.479857,2.227751,-1.235351,-0.449944,-0.385040,-1.257656,-0.919671,-1.017881,-1.968894,0.23094,0.447893,-0.708450,0.766399,1.041070,-0.714236,-0.090739


In [13]:
# Concatenate the scaled numeric columns and the one-hot encoded categorical
# columns side by side (rows are matched on the index)
student_df_scaled = pd.concat(
    objs=[scaled_student_df, categorical_encoded],
    axis="columns",
)

# Inspect the last rows of the combined feature frame
student_df_scaled.tail()

Unnamed: 0,age,Mother_Edu,Father_Edu,traveltime,Study_Time_Hours,failures,schoolsup,famsup,paid,activities,...,Mjob_teacher,Fjob_at_home,Fjob_health,Fjob_other,Fjob_services,Fjob_teacher,reason_course,reason_home,reason_other,reason_reputation
390,2.59238,-0.685387,-0.479857,-0.643249,-0.042286,2.242901,-0.38504,0.79513,1.087346,-1.017881,...,False,False,False,False,True,False,True,False,False,False
391,0.23838,0.229234,-1.39997,0.792251,-1.235351,-0.449944,-0.38504,-1.257656,-0.919671,-1.017881,...,False,False,False,False,True,False,True,False,False,False
392,3.377047,-1.600009,-1.39997,-0.643249,-1.235351,3.589323,-0.38504,-1.257656,-0.919671,-1.017881,...,False,False,False,True,False,False,True,False,False,False
393,1.023046,0.229234,-0.479857,2.227751,-1.235351,-0.449944,-0.38504,-1.257656,-0.919671,-1.017881,...,False,False,False,True,False,False,True,False,False,False
394,1.807713,-1.600009,-1.39997,-0.643249,-1.235351,-0.449944,-0.38504,-1.257656,-0.919671,-1.017881,...,False,True,False,False,False,False,True,False,False,False


**Elbow Method**

In [14]:
# Set up the elbow analysis: the candidate cluster counts (1 through 10) and
# an empty list that will collect one inertia value per candidate
k = [*range(1, 11)]
inertia = list()

In [15]:
# Fit one K-Means model per candidate k and record its inertia.
# The list is rebuilt from scratch inside this cell so that re-running the
# cell does not append a second set of values, which would leave `inertia`
# longer than `k` and break the DataFrame built from both.
inertia = []
for i in k:
    k_model = KMeans(n_clusters=i, random_state=1)
    k_model.fit(student_df_scaled)
    inertia.append(k_model.inertia_)

In [16]:
# Pair every candidate k with its inertia and load the pairs into a DataFrame
elbow_data = dict(k=k, inertia=inertia)
df_elbow = pd.DataFrame(data=elbow_data)

# Preview the first rows
df_elbow.head()

Unnamed: 0,k,inertia
0,1,8280.405063
1,2,7723.959456
2,3,7124.183801
3,4,6836.404431
4,5,6588.766623


In [17]:
# Draw the elbow curve: inertia as a function of k, one tick per candidate k
elbow_plot_options = dict(
    x="k",
    y="inertia",
    title="Elbow Curve",
    xticks=k,
)
df_elbow.hvplot.line(**elbow_plot_options)

The optimal K value appears to be K = 6: after this point the graph has not completely plateaued, but it also no longer drops significantly. This indicates that most of the useful learning is done by K = 6, and that stopping at any smaller K would not be optimal for the model.

**Fit + Predict**

In [18]:
# Initialize the K-Means model with 6 clusters, max_iter set to 800,
# the 'elkan' algorithm, and a fixed random seed
model = KMeans(
    n_clusters=6,
    max_iter=800,
    algorithm='elkan',
    random_state=1,
)

In [19]:
# Fit the 6-cluster model on the combined scaled + encoded feature frame
model.fit(X=student_df_scaled)


In [20]:
# Assign every student to a cluster with the fitted model; the result is an
# array holding one cluster label per row
student_clusters = model.predict(X=student_df_scaled)

# Show the predicted labels
print(student_clusters)

[4 5 4 1 5 2 5 4 2 2 2 5 2 2 5 2 2 4 3 2 2 2 2 5 4 5 5 2 4 1 2 2 2 2 5 3 2
 1 4 4 1 2 2 4 4 4 2 2 2 4 5 2 3 4 2 5 2 2 4 2 2 4 4 4 4 2 1 4 4 5 5 2 4 3
 4 2 5 5 0 2 4 4 5 2 5 1 5 2 3 2 1 2 4 2 5 4 4 5 2 4 4 2 2 4 2 4 4 2 1 1 2
 4 4 2 3 2 2 2 3 2 5 5 1 2 5 4 4 0 3 2 1 3 1 2 3 1 3 3 3 2 4 3 2 5 3 5 3 5
 0 3 0 3 1 0 4 4 3 3 3 1 0 4 3 0 0 0 0 1 5 1 3 5 2 3 2 2 5 3 1 1 2 1 2 1 5
 1 5 5 1 3 2 3 3 2 3 1 2 2 2 2 2 4 5 5 5 1 5 1 5 1 1 1 4 5 2 5 1 1 5 5 5 3
 4 5 2 3 3 5 4 3 1 5 4 2 5 1 5 3 3 0 1 2 3 2 5 3 3 0 1 5 0 4 0 3 3 3 1 1 3
 1 1 2 3 3 1 2 1 2 1 0 1 1 5 5 1 4 0 2 4 2 1 3 5 4 5 5 5 5 5 2 1 2 3 5 5 2
 1 1 1 1 1 2 5 5 1 1 3 1 3 4 3 0 3 1 3 1 5 1 2 2 1 5 5 5 5 2 2 3 2 1 5 1 3
 3 3 2 5 1 1 5 1 2 1 3 5 1 1 1 1 3 3 2 0 3 1 1 2 3 3 5 3 0 1 1 3 5 1 5 1 1
 3 0 3 3 2 3 1 2 1 5 2 3 3 3 3 5 1 3 5 3 3 3 3 3 3]


In [21]:
# Build a copy of the combined feature frame with the predicted cluster
# labels appended as a new 'Student_Clusters' column (the source frame is
# left untouched)
scaled_pred = student_df_scaled.assign(Student_Clusters=student_clusters)

scaled_pred.head()

Unnamed: 0,age,Mother_Edu,Father_Edu,traveltime,Study_Time_Hours,failures,schoolsup,famsup,paid,activities,...,Fjob_at_home,Fjob_health,Fjob_other,Fjob_services,Fjob_teacher,reason_course,reason_home,reason_other,reason_reputation,Student_Clusters
0,1.023046,1.143856,1.360371,0.792251,-0.042286,-0.449944,2.597133,-1.257656,-0.919671,-1.017881,...,False,False,False,False,True,True,False,False,False,4
1,0.23838,-1.600009,-1.39997,-0.643249,-0.042286,-0.449944,-0.38504,0.79513,-0.919671,-1.017881,...,False,False,True,False,False,True,False,False,False,5
2,-1.330954,-1.600009,-1.39997,-0.643249,-0.042286,3.589323,2.597133,-1.257656,1.087346,-1.017881,...,False,False,True,False,False,False,False,True,False,4
3,-1.330954,1.143856,-0.479857,-0.643249,1.150779,-0.449944,-0.38504,0.79513,1.087346,0.982433,...,False,False,False,True,False,False,True,False,False,1
4,-0.546287,0.229234,0.440257,-0.643249,-0.042286,-0.449944,-0.38504,0.79513,1.087346,-1.017881,...,False,False,True,False,False,False,True,False,False,5


In [22]:
# Scatter the (scaled) parents' education levels, one colour per cluster
cluster_plot_options = dict(
    x="Mother_Edu",
    y="Father_Edu",
    by="Student_Clusters",
    title="Scatter Plot by Student Clusters - k = 6",
)
scaled_pred.hvplot.scatter(**cluster_plot_options)


In [23]:
# Export the scaled data set, including its cluster labels, as a CSV file.
# to_csv does not create missing folders, so make sure the output directory
# exists first; otherwise a fresh checkout without Resources/ fails here.
kmeans_output_dir = Path("Resources")
kmeans_output_dir.mkdir(parents=True, exist_ok=True)
scaled_pred.to_csv(kmeans_output_dir / "KMeans_data.csv", index=False, header=True)

**PCA: Reducing Factors**

In [24]:
# Configure PCA to keep two principal components
n_pca_components = 2
pca = PCA(n_components=n_pca_components)

In [25]:
# Fit PCA on the combined feature frame and project every row onto the
# two principal components
reduced_data = pca.fit_transform(student_df_scaled)

# Label the two components and hold them in a DataFrame
pca_column_labels = ["PCA1", "PCA2"]
reduced_df = pd.DataFrame(data=reduced_data, columns=pca_column_labels)
reduced_df

Unnamed: 0,PCA1,PCA2
0,0.165603,-0.676828
1,1.679710,-1.488141
2,1.718653,-1.464333
3,-2.235782,-0.082752
4,-0.663533,-1.042419
...,...,...
390,2.069415,0.983910
391,1.301539,-0.404570
392,5.341076,0.391117
393,1.947234,-0.060925


In [26]:
# Set up the elbow analysis for the PCA-reduced data: the candidate cluster
# counts (1 through 10) and an empty list for their inertia values
k_PCA = [*range(1, 11)]
inertia_PCA = list()


In [27]:
# Fit one K-Means model per candidate k on the PCA-reduced data and record
# its inertia.
# The list is rebuilt from scratch inside this cell so that re-running the
# cell does not append a second set of values, which would leave
# `inertia_PCA` longer than `k_PCA` and break the DataFrame built from both.
# NOTE(review): this loop seeds KMeans with random_state=0 while the other
# KMeans calls in this notebook use 1 - confirm that is intended.
inertia_PCA = []
for i in k_PCA:
    k_model2 = KMeans(n_clusters=i, random_state=0)
    k_model2.fit(reduced_df)
    inertia_PCA.append(k_model2.inertia_)


In [28]:
# Pair every candidate k with its inertia and load the pairs into a DataFrame
PCA_elbow = dict(k=k_PCA, inertia=inertia_PCA)
PCA_df = pd.DataFrame(data=PCA_elbow)

# Draw the elbow curve for the PCA-reduced data, one tick per candidate k
pca_elbow_plot_options = dict(
    x="k",
    y="inertia",
    title="Elbow Curve",
    xticks=k_PCA,
)
PCA_df.hvplot.line(**pca_elbow_plot_options)

The optimal K value appears to be K = 5: after this point the graph has not completely plateaued, but it also no longer drops significantly. This indicates that most of the useful learning is done by K = 5, and that stopping at any smaller K would not be optimal for the model.

**Segmenting PCA with K-Means**

In [29]:
# Initialize the K-Means model with 5 clusters (the k chosen from the PCA
# elbow curve) and a fixed random seed
model_2 = KMeans(n_clusters=5, random_state=1)

# Fit on the two PCA components, then label every row with its cluster
model_2.fit(X=reduced_df)
k_3 = model_2.predict(X=reduced_df)

# Copy of the PCA frame with the labels appended as 'Student_Clusters'
# (reduced_df itself is left untouched)
PCA_clusters = reduced_df.assign(Student_Clusters=k_3)

In [32]:
# Scatter the two principal components, one colour per cluster
pca_cluster_plot_options = dict(
    x="PCA1",
    y="PCA2",
    by="Student_Clusters",
    title="Scatter Plot by Student Clusters - k = 5",
)
PCA_clusters.hvplot.scatter(**pca_cluster_plot_options)

In [34]:
# Export the PCA-reduced data set, including its cluster labels, as a CSV file.
# to_csv does not create missing folders, so make sure the output directory
# exists first; otherwise a fresh checkout without Resources/ fails here.
pca_output_dir = Path("Resources")
pca_output_dir.mkdir(parents=True, exist_ok=True)
PCA_clusters.to_csv(pca_output_dir / "PCA_data.csv", index=False, header=True)