In [None]:
# Prevent memory leak bug: limit OpenMP to a single thread via OMP_NUM_THREADS.
# NOTE(review): presumably this targets scikit-learn's KMeans memory-leak
# warning (Windows + MKL) — confirm. This cell sits before the cell that
# imports the numeric libraries, so the variable is already set when they load.
import os
os.environ["OMP_NUM_THREADS"] = '1'

In [None]:
# Import initial dependencies
import pandas as pd
from pathlib import Path
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

# Data Prep

In [None]:
# Load data: read the myopia CSV (path is relative to the working directory)
# into a DataFrame
file_path = Path("Resources/myopia.csv")
myopia_df = pd.read_csv(file_path)

In [None]:
# View data: preview the first rows of the raw dataset (MYOPIC column still present)
myopia_df.head()

In [None]:
# Unsupervised learning must not see the label, so drop the MYOPIC classifier column
new_myopia_df = myopia_df.drop(columns=["MYOPIC"])

In [None]:
# View data: preview the feature-only DataFrame (MYOPIC has been dropped)
new_myopia_df.head()

In [None]:
# Scale every feature column with a StandardScaler fitted on the same data
scaler = StandardScaler()
X = scaler.fit_transform(new_myopia_df)

In [None]:
# Inspect the first five rows of the scaled array
print(X[:5])

# Apply Dimensionality Reduction

In [None]:
# PCA step: keep only as many principal components as are needed to
# preserve 90% of the explained variance.
pca = PCA(n_components=0.9)

# Fit on the scaled myopia data and project it onto the retained components
X_pca = pca.fit_transform(X)

In [None]:
# Report how many principal components PCA kept versus the scaled feature count
print(f"Reduced columns to {X_pca.shape[1]} principal components from {X.shape[1]} from the scaled dataset.")

In [None]:
# Further reduce the dimensions using t-SNE.

# Initialize t-SNE model.
# t-SNE is stochastic: without a seed the embedding (and therefore the scatter
# plot and the K-Means results built on it) changes on every run. Fixing the
# seed makes a "Restart & Run All" reproduce the same figures; 0 matches the
# random_state already used for KMeans in this notebook.
# If the seed is changed, re-check the downstream choice of k.
tsne = TSNE(random_state=0)

In [None]:
# Reduce dimensions: fit t-SNE on the PCA output and keep the resulting embedding
tsne_features = tsne.fit_transform(X_pca)

In [None]:
# Compare the column count of the t-SNE output with that of the PCA output
print(f"Reduced columns to {tsne_features.shape[1]} principal components from {X_pca.shape[1]} from the pca dataset.")

# Reduced Features Plot
The plot below may look different on each run unless t-SNE is given a fixed random seed, but for the most part one can pick out 4-5 distinct clusters in the plot.

In [None]:
# Plot reduced features: first t-SNE output column against the second.
# Title and axis labels are set so the figure can be read on its own.
plt.scatter(tsne_features[:, 0], tsne_features[:, 1])
plt.title('t-SNE Reduced Features')
plt.xlabel('t-SNE feature 1')
plt.ylabel('t-SNE feature 2')
plt.show()

# Cluster Analysis Using K-Means

In [None]:
### Code below taken from 20-Unsupervised-Machine-Learning/1/Activities/06-Ins_PCA/Solved/Ins_PCA.ipynb

# Finding best value for k: fit K-Means on the t-SNE features for each
# k in 1..10 and record the resulting inertia.
inertia = []
k = list(range(1, 11))

# Calculate the inertia for the range of k values
for i in k:
    # n_init is set explicitly so the number of restarts does not depend on
    # the installed scikit-learn version (the default differs between releases).
    km = KMeans(n_clusters=i, random_state=0, n_init=10)
    km.fit(tsne_features)
    inertia.append(km.inertia_)

# Create the Elbow Curve
elbow_data = {"k": k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)

plt.plot(df_elbow['k'], df_elbow['inertia'])
# Tick exactly the evaluated k values (1-10); 0 is not a valid cluster count.
plt.xticks(k)
plt.title('Elbow Curve')
plt.xlabel('Number of clusters')
plt.ylabel('Inertia')
plt.show()

In [None]:
# From the graph above, we choose k = 5, since inertia still improves
# noticeably beyond k = 3.

# Initialize dataframe holding the two t-SNE output columns.
# NOTE: the column names say "principal_component", but the values come from
# tsne_features (t-SNE output), not from PCA. The names are kept because the
# plotting cell below references them.
tsne_df = pd.DataFrame(data=tsne_features, columns=["principal_component_1", "principal_component_2"])

# Initialize the k-means model. n_init is pinned so the number of restarts
# does not depend on the installed scikit-learn version's default.
model = KMeans(n_clusters=5, random_state=0, n_init=10)

# Fit the model and obtain each row's cluster label in a single pass
# (replaces a separate fit() followed by an unused predict() on the same data).
predictions = model.fit_predict(tsne_df)

# Add the predicted class column to the dataframe
tsne_df["class"] = predictions
tsne_df.head()

In [None]:
# Visualize the clusters: scatter the two feature columns of tsne_df,
# coloring each point by its K-Means class label
plt.scatter(tsne_df['principal_component_1'], tsne_df['principal_component_2'], c=tsne_df['class'])
plt.title('Myopia Clusters')
plt.xlabel('Principal component 1')
plt.ylabel('Principal component 2')
plt.show()

# Analysis and Conclusion