In [None]:
#dependencies
import pandas as pd
from pathlib import Path
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt



#Part 1: Prepare the data

In [None]:
# Read the myopia dataset from the Resources folder into a DataFrame
file_path = Path("Resources/myopia.csv")
df = pd.read_csv(filepath_or_buffer=file_path)
# Preview the first five rows as the cell output
df.head(n=5)

In [None]:
# Build the feature table by leaving out the MYOPIC column
# (presumably the known class label, which an unsupervised analysis should
# not see -- confirm against the dataset description). `df` is left untouched.
df_myopia = df.drop("MYOPIC", axis="columns")
# Preview the remaining columns
df_myopia.head(n=5)

In [None]:
# Standardize the dataset so that columns that contain larger values do not
# influence the outcome more than columns with smaller values.
scaler = StandardScaler()
# Learn the per-column scaling statistics, then apply them to the same data
scaler.fit(df_myopia)
scaled_data = scaler.transform(df_myopia)

#Part 2: Apply Dimensionality Reduction

In [None]:
# A fractional n_components asks PCA to keep as many principal components as
# are needed to explain that share (here 90%) of the variance.
pca = PCA(n_components=0.9)
# Fit on the standardized features and project them onto those components
myopia_pca = pca.fit_transform(X=scaled_data)
# Total fraction of variance actually retained (shown as the cell output)
pca.explained_variance_ratio_.sum()

In [None]:
# Further reduce the PCA output with t-SNE.
# t-SNE is stochastic: without a fixed seed every run yields a different
# embedding, which changes the scatter plot, the elbow curve, and therefore
# the recommended number of clusters. The seed is pinned so the notebook is
# reproducible under Restart & Run All; 0 matches the seed used for KMeans
# in the elbow analysis.
tsne = TSNE(learning_rate=250, random_state=0)
tsne_features = tsne.fit_transform(myopia_pca)
# Show (rows, embedding dimensions) of the result as the cell output
tsne_features.shape

In [None]:
# Scatter plot of the first two t-SNE dimensions to look for visible groupings.
# Axis labels and a title are set so the figure is readable on its own.
plt.scatter(tsne_features[:, 0], tsne_features[:, 1])
plt.xlabel('t-SNE dimension 1')
plt.ylabel('t-SNE dimension 2')
plt.title('t-SNE embedding of the PCA-reduced myopia data')
plt.show()

#Part 3: Perform a cluster analysis with k-means

In [None]:
# Elbow analysis: fit k-means on the t-SNE features for every candidate
# number of clusters and record the inertia of each fit.

# Candidate cluster counts, 1 through 10
k = list(range(1, 11))

# One seeded model per candidate k. fit() returns the estimator itself, so
# inertia_ can be read directly off the call chain.
inertia = [
    KMeans(n_clusters=n_clusters, random_state=0).fit(tsne_features).inertia_
    for n_clusters in k
]

# Collect the results in a DataFrame for plotting
elbow_data = {"k": k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)

# Draw the elbow curve with matplotlib, one x tick per candidate k
plt.plot(df_elbow["k"], df_elbow["inertia"])
plt.xticks(k)
plt.xlabel('Number of clusters')
plt.ylabel('Inertia')
plt.show()

#Part 4: Make a recommendation
Based on the t-SNE scatter plot and the k-means elbow plot, the patients can be grouped into 3 to 5 distinct clusters.