# In [None]:
# Import required libraries and dependencies
import pandas as pd
import hvplot.pandas  # Assuming you have hvplot installed; if not, you might use matplotlib or seaborn
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display, HTML
from pathlib import Path
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler


# Read the market data CSV, using the coin_id column as the row index
df_market_data = pd.read_csv(Path("Resources/crypto_market_data.csv"), index_col="coin_id")

# Fit a StandardScaler on every column and rebuild a labelled DataFrame
# from the transformed array, reusing the original row index and column names
scaler = StandardScaler()
df_market_data_scaled = pd.DataFrame(
    scaler.fit_transform(df_market_data),
    index=df_market_data.index,
    columns=df_market_data.columns,
)

# Elbow analysis on the scaled data: fit one K-means model per candidate k
# (1 through 11) and record its inertia
k_range = range(1, 12)
inertia = []

for k in k_range:
    # fit() returns the fitted estimator, so construction and fitting chain
    kmeans = KMeans(n_clusters=k, random_state=42).fit(df_market_data_scaled)
    inertia.append(kmeans.inertia_)

# Tabulate the curve (k vs. inertia) for later inspection
df_elbow_original = pd.DataFrame({'k': list(k_range), 'inertia': inertia})

# Draw the elbow curve; 'bo-' = blue, circle markers, solid line
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(k_range, inertia, 'bo-')
ax.set(
    title='Elbow Method For Optimal k',
    xlabel='Number of clusters, k',
    ylabel='Inertia',
)
ax.set_xticks(k_range)
ax.grid(True)
plt.show()

display(HTML('<h2 style="font-size: 24px; font-weight: bold;">The best K value is 4</h2>'))



# Cluster the scaled (non-PCA) market data with K-means, k=4
kmeans = KMeans(n_clusters=4, random_state=42)
kmeans.fit(df_market_data_scaled)

# Cluster label for every row of the scaled data
clustersA = kmeans.predict(df_market_data_scaled)
print(clustersA)  # This prints the array of cluster labels

# Work on a copy so the original DataFrame is left unmodified,
# then attach the labels as a new 'ClusterA' column
df_market_data_with_clustersA = df_market_data.copy()
df_market_data_with_clustersA['ClusterA'] = clustersA

# Review the DataFrame with the new 'ClusterA' column
print(df_market_data_with_clustersA.head())

# Project the scaled data onto three principal components
pca = PCA(n_components=3)
pca_results = pca.fit_transform(df_market_data_scaled)

# Wrap the component scores in a DataFrame that shares the market data's index
df_pca = pd.DataFrame(
    pca_results,
    index=df_market_data.index,
    columns=['PC1', 'PC2', 'PC3'],
)

# Fit a four-cluster K-means model on the three component columns
kmeans = KMeans(n_clusters=4, random_state=42).fit(df_pca)

# Labels must be predicted before the label column is added, so the model
# only ever sees PC1..PC3
clustersB = kmeans.predict(df_pca)
df_pca['ClusterB'] = clustersB

# Scatter PC1 against PC2, colouring each point by its cluster label
fig, ax = plt.subplots(figsize=(10, 6))
points = ax.scatter(
    df_pca['PC1'],
    df_pca['PC2'],
    c=df_pca['ClusterB'],
    cmap='viridis',
    marker='o',
    alpha=0.7,
)
ax.set(
    title='Cryptocurrency Clusters based on PCA Components',
    xlabel='Principal Component 1 (PC1)',
    ylabel='Principal Component 2 (PC2)',
)
fig.colorbar(points, ax=ax, label='Cluster')
plt.show()


# Elbow analysis on the PCA-transformed data: fit one K-means model per
# candidate k (1 through 11) and record its inertia
k_range = range(1, 12)
inertia_pca = []

for k in k_range:
    kmeans_pca = KMeans(n_clusters=k, random_state=42)
    kmeans_pca.fit(pca_results)
    inertia_pca.append(kmeans_pca.inertia_)

# Tabulate the PCA elbow curve. This must use inertia_pca (the values
# computed in the loop above), not the inertia list from the original data.
df_elbow_pca = pd.DataFrame({
    'k': list(k_range),
    'inertia': inertia_pca
})

# Plot inertia against k for the PCA data
plt.figure(figsize=(10, 6))
plt.plot(k_range, inertia_pca, marker='o', linestyle='-', color='b')
plt.title('Elbow Method For Optimal k (PCA Data)')
plt.xlabel('Number of clusters, k')
plt.ylabel('Inertia')
plt.xticks(k_range)
plt.grid(True)
plt.show()


display(HTML('<h2 style="font-size: 24px; font-weight: bold;">The best K value is 4 for both the original and PCA-transformed data.</h2>'))


# Fit the final four-cluster K-means model on the PCA-transformed array
kmeans_pca = KMeans(n_clusters=4, random_state=42).fit(pca_results)

# Assign every row of the PCA data to a cluster and show the label array
clusters_pca = kmeans_pca.predict(pca_results)
print(clusters_pca)

# Component scores plus a 'Cluster' column, indexed like the market data
df_pca_clusters = pd.DataFrame(
    pca_results,
    index=df_market_data.index,
    columns=['PC1', 'PC2', 'PC3'],
).assign(Cluster=clusters_pca)

# Heading for the interactive plot
display(HTML('<h2 style="font-size: 24px; font-weight: bold;">Visualize the clusters based on PCA components using hvPlot</h2>'))

# Interactive PC1-vs-PC2 scatter, one series per cluster, with coin_id
# requested as a hover column. Kept as the final bare expression so a
# notebook cell renders the returned plot.
df_pca_clusters.hvplot.scatter(
    x='PC1',
    y='PC2',
    by='Cluster',
    hover_cols=['coin_id'],
    colormap='viridis',
)
