In [None]:
import pandas as pd
from pathlib import Path
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

In [None]:
# Part 1: Prepare the Data

In [None]:
# Load the myopia dataset (a CSV next to the notebook) into a DataFrame
# and preview its first five rows.
file_path = Path("myopia.csv")
myopia_df = pd.read_csv(filepath_or_buffer=file_path)
myopia_df.head(n=5)

In [None]:
# List the column labels of the loaded dataset (the available features).
myopia_df.columns

In [None]:
# Remove the MYOPIC column from the dataset.
# errors="ignore" keeps this cell idempotent: because the result is assigned
# back to myopia_df, re-running the cell would otherwise raise a KeyError once
# the column has already been dropped.
myopia_df = myopia_df.drop(columns=["MYOPIC"], errors="ignore")
myopia_df.head()

In [None]:
# Standardize the dataset so that columns containing larger values do not
# dominate the outcome.
scaler = StandardScaler()
df_scaled = scaler.fit_transform(myopia_df)
# Preview only the first rows rather than printing the entire array.
df_scaled[:5]

In [None]:
# Wrap the scaled array in a DataFrame, carrying over the column labels and
# index of myopia_df so each standardized feature stays identifiable
# (without them the columns would just be numbered 0..N-1).
transformed_df = pd.DataFrame(
    df_scaled,
    columns=myopia_df.columns,
    index=myopia_df.index,
)
transformed_df.head()

In [None]:
## Part 2: Apply Dimensionality Reduction


In [None]:
# Dimensionality reduction, step 1: PCA.
# A fractional n_components asks PCA to keep as many components as are
# needed to explain that share (here 90%) of the variance.
pca = PCA(n_components=0.90)
df_pca = pca.fit_transform(X=df_scaled)
# The second entry of the shape shows how many features remain after PCA.
df_pca.shape

In [None]:
# Further reduce the dataset dimensions with t-SNE and visually inspect the
# results.
# Initialize the t-SNE model. random_state is fixed so the embedding (and the
# clustering built on top of it) is reproducible from run to run.
tsne = TSNE(learning_rate=250, random_state=42)
# Reduce dimensions starting from the PCA output, so that the PCA step
# actually feeds into t-SNE instead of being discarded.
tsne_features = tsne.fit_transform(df_pca)
# Check results
tsne_features.shape

In [None]:
# Scatter the two t-SNE output columns against each other to look for
# visible groupings.
fig, ax = plt.subplots()
x_vals = tsne_features[:, 0]
y_vals = tsne_features[:, 1]
ax.scatter(x_vals, y_vals)
plt.show()

In [None]:
### Part 3: Perform Cluster Analysis with K-means


In [None]:
# Build the data for an elbow plot to identify the best number of clusters:
# fit K-means on the t-SNE features for k = 1..10 and record each inertia.
inertia = []
k = list(range(1, 11))

# Calculate the inertia for the range of k values.
for i in k:
    # n_init is set explicitly so the number of restarts (and therefore the
    # recorded inertia) does not depend on the installed scikit-learn
    # version's default.
    km = KMeans(n_clusters=i, n_init=10, random_state=42)
    km.fit(tsne_features)
    inertia.append(km.inertia_)

# Collect the results in a DataFrame for plotting.
elbow_data = {"k": k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)
# Display all ten rows; head() would hide k = 6..10.
df_elbow

In [None]:
# Draw the elbow curve: inertia as a function of the number of clusters.
fig, ax = plt.subplots()
ax.plot(df_elbow['k'], df_elbow['inertia'])
ax.set_xticks(range(1, 11))
ax.set_xlabel('Number of Clusters')
ax.set_ylabel('Inertia')
ax.set_title('Elbow Curve for Myopia')
plt.show()

In [None]:
#elbow of the curve appears to be at a k value of 3

In [None]:
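#### Make a Recommendation
# Based on my findings, the patients can be clustered into what looks like 4 or 5 groupings.
# NOTE(review): the elbow-curve reading above puts the elbow at k = 3; reconcile the two estimates before finalizing this recommendation.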