In [None]:
# Import necessary modules.
import xlrd
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler



In [None]:
# Location of the Excel workbook and the worksheet that holds the measurements.
file_path = 'Olivine_May17_2023_Iolite_EXP_Full.xlsx'
sheet_name = 'Data'

# Only the "Th_ppm" and "U_ppm" columns are loaded from the sheet.
ppm_columns = ['Th_ppm', 'U_ppm']
df = pd.read_excel(file_path, sheet_name=sheet_name, usecols=ppm_columns)

# Discard every row in which at least one of the two columns is NaN.
df = df.dropna()


def _strip_less_than(value):
    """Return ``value`` with every "<" removed when it is a string, otherwise unchanged."""
    if isinstance(value, str):
        return value.replace('<', '')
    return value


# Strip the "<" marker from string entries, then convert each column
# from "object" to "float".
for column in ppm_columns:
    df[column] = df[column].apply(_strip_less_than)
for column in ppm_columns:
    df[column] = df[column].astype(float)

In [None]:
# Feature matrix for clustering: one row per sample, columns (Th_ppm, U_ppm).
X = df[['Th_ppm', 'U_ppm']].values

# Number of KMeans clusters (k).
k = 3

# Initialize and fit the KMeans model.
# random_state pins the centroid initialisation so that cluster labels and
# centroids are identical on every run (Restart & Run All reproducibility).
# n_init is set explicitly because its default differs between scikit-learn
# versions.
kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
kmeans.fit(X)

# Store the cluster label of every sample next to its measurements.
df['cluster'] = kmeans.labels_

In [None]:
# Samples coloured by their KMeans label, with the fitted centroids overlaid.
centroids = kmeans.cluster_centers_
ax = sns.scatterplot(data=df, x="Th_ppm", y="U_ppm", hue=kmeans.labels_)
ax.scatter(
    centroids[:, 0],
    centroids[:, 1],
    marker="X",
    c="r",
    s=80,
    label="centroids",
)
ax.legend()
plt.show()

In [None]:
# Plot the distortion score elbow.
from yellowbrick.cluster import KElbowVisualizer

# Seeded so the elbow curve is the same on every run.
model = KMeans(n_init=10, random_state=42)

# Fit on the two measurement columns only. By this point `df` also carries
# the 'cluster' label column (and possibly others added later); passing the
# whole frame would feed those labels back in as clustering features.
# With a 2-tuple, yellowbrick evaluates k over np.arange(1, 12), i.e. 1..11.
visualizer = KElbowVisualizer(model, k=(1, 12)).fit(df[['Th_ppm', 'U_ppm']])
visualizer.show()

In [None]:
# Import additional libraries for hierarchical clustering
from sklearn.cluster import AgglomerativeClustering
from scipy.cluster.hierarchy import dendrogram, linkage

# ... Previous code ...

# Perform KMeans clustering on the feature matrix X.
# Seeded so the labels are reproducible from run to run; n_init is explicit
# because its default differs between scikit-learn versions.
kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
kmeans.fit(X)

# Add cluster labels to the DataFrame
df['cluster'] = kmeans.labels_

# Visualize KMeans clusters
sns.scatterplot(data=df, x="Th_ppm", y="U_ppm", hue=kmeans.labels_)
plt.scatter(kmeans.cluster_centers_[:,0], kmeans.cluster_centers_[:,1],
            marker="X", c="r", s=80, label="centroids")
plt.legend()
plt.show()

# Start from an empty column so that re-running this cell never leaves
# labels from a previous run behind.
df['hierarchical_cluster'] = np.nan

# Apply hierarchical clustering within each KMeans cluster
for cluster_label in range(k):
    in_cluster = df['cluster'] == cluster_label
    cluster_data = df.loc[in_cluster, ['Th_ppm', 'U_ppm']]

    # A cluster with fewer than two samples cannot be split into two
    # sub-clusters; AgglomerativeClustering would raise. Put its samples
    # in sub-cluster 0 and move on.
    if len(cluster_data) < 2:
        df.loc[in_cluster, 'hierarchical_cluster'] = 0
        continue

    # Perform hierarchical clustering within the current KMeans cluster
    hierarchical_cluster = AgglomerativeClustering(n_clusters=2, linkage='ward')
    hierarchical_labels = hierarchical_cluster.fit_predict(cluster_data)

    # Add hierarchical cluster labels to the DataFrame
    df.loc[in_cluster, 'hierarchical_cluster'] = hierarchical_labels

# Visualize hierarchical clusters within each KMeans cluster.
# The sub-cluster labels (0/1) repeat in every KMeans cluster, so colour
# alone cannot tell the parent clusters apart; marker style encodes the
# parent KMeans cluster.
sns.scatterplot(data=df, x="Th_ppm", y="U_ppm",
                hue='hierarchical_cluster', style='cluster')
plt.show()


In [None]:
# Import the necessary module for hierarchical clustering.
from sklearn.cluster import AgglomerativeClustering

# ...

# Continue from where the KMeans clustering left off
# The KMeans clusters are already assigned in the 'cluster' column of the DataFrame 'df'.

# Extract the data of each cluster: KMeans label -> array of that cluster's rows of X.
cluster_data = {}
for cluster_label in range(k):
    # Convert the boolean Series to a plain array so X is indexed purely by
    # position (X was built from df's rows in the same order).
    in_cluster = (df['cluster'] == cluster_label).to_numpy()
    cluster_data[cluster_label] = X[in_cluster]

# Create a dictionary to store hierarchical clustering labels for each cluster
hierarchical_labels = {}

# Perform hierarchical clustering within each KMeans cluster
for cluster_label in range(k):
    points = cluster_data[cluster_label]

    # Fewer than two samples cannot be split into two sub-clusters
    # (AgglomerativeClustering would raise); label them all 0.
    if len(points) < 2:
        hierarchical_labels[cluster_label] = np.zeros(len(points), dtype=int)
        continue

    hierarchical_clusterer = AgglomerativeClustering(n_clusters=2)  # You can specify the number of clusters within each KMeans cluster
    hierarchical_cluster_labels = hierarchical_clusterer.fit_predict(points)
    hierarchical_labels[cluster_label] = hierarchical_cluster_labels

# Add hierarchical cluster labels to the DataFrame.
# The column is reset first so stale labels never survive a re-run.
df['hierarchical_cluster'] = np.nan

for cluster_label in range(k):
    df.loc[df['cluster'] == cluster_label, 'hierarchical_cluster'] = hierarchical_labels[cluster_label]

# Scatter plot the hierarchical clusters within the KMeans clusters
sns.scatterplot(data=df, x="Th_ppm", y="U_ppm", hue="hierarchical_cluster")
plt.legend()
plt.show()


In [None]:
import matplotlib.pyplot as plt
from sklearn.cluster import AgglomerativeClustering
from yellowbrick.cluster import KElbowVisualizer
from scipy.spatial.distance import cdist
import numpy as np

# ...

# Continue from where the KMeans clustering left off
# The KMeans clusters are already assigned in the 'cluster' column of the DataFrame 'df'.

# One distortion score per KMeans cluster: the mean Euclidean distance from
# each sample to the centroid of the hierarchical sub-cluster it was assigned to.
distortion_scores = []

# Estimator handed to the KElbowVisualizer below.
model = AgglomerativeClustering()

# Columns used as clustering features; label columns are deliberately excluded.
feature_columns = ['Th_ppm', 'U_ppm']

# Number of hierarchical sub-clusters fitted inside each KMeans cluster.
n_sub_clusters = 2

# Perform hierarchical clustering within each KMeans cluster and calculate the distortion score
for cluster_label in range(k):
    # Rebuild this cluster's samples from df so the cell does not depend on a
    # `cluster_data` variable left behind by an earlier cell.
    hierarchical_cluster_data = df.loc[df['cluster'] == cluster_label, feature_columns].to_numpy()

    # Fewer than n_sub_clusters samples cannot be split; such a cluster has
    # no spread to measure, so record a score of 0.
    if len(hierarchical_cluster_data) < n_sub_clusters:
        distortion_scores.append(0.0)
        continue

    hierarchical_clusterer = AgglomerativeClustering(n_clusters=n_sub_clusters)
    hierarchical_cluster_labels = hierarchical_clusterer.fit_predict(hierarchical_cluster_data)

    # Centroid of every sub-cluster (row i belongs to sub-cluster label i).
    centroids = np.vstack([
        hierarchical_cluster_data[hierarchical_cluster_labels == sub_label].mean(axis=0)
        for sub_label in range(n_sub_clusters)
    ])

    # Distance of every sample to every centroid, then pick the column of the
    # centroid the sample was actually assigned to. (Measuring the samples
    # against themselves, as before, always yields 0 because each point's
    # nearest neighbour in that matrix is itself.)
    distances = cdist(hierarchical_cluster_data, centroids, 'euclidean')
    assigned_distances = distances[np.arange(len(hierarchical_cluster_data)), hierarchical_cluster_labels]
    hierarchical_distortion_score = assigned_distances.mean()

    distortion_scores.append(hierarchical_distortion_score)

# Elbow plot for agglomerative clustering over the whole data set.
# Fit on the measurement columns only: including 'hierarchical_cluster'
# would feed previously computed labels back in as a feature.
visualizer = KElbowVisualizer(model, k=(1, 12))
visualizer.fit(df[feature_columns])
visualizer.show()
