# Clustering Crypto

In [None]:
# Initial imports
import pandas as pd
import hvplot.pandas
from path import Path
import plotly.express as px
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans


In [None]:
# Entry point of this notebook: load the X_encoded.csv feature table that was
# written out by crypto_clustering_data_cleaning.ipynb.
# index_col=False tells pandas not to use the first column as the index here.
X_encoded = pd.read_csv(
    filepath_or_buffer="./Data/X_encoded.csv",
    index_col=False,
)
X_encoded.head()

In [None]:
# Move the saved "Unnamed: 0" column into the index.
# The membership check keeps this cell safe to re-run: once the column has become
# the index, a second set_index("Unnamed: 0") would raise KeyError.
if "Unnamed: 0" in X_encoded.columns:
    X_encoded = X_encoded.set_index("Unnamed: 0")
X_encoded.head()

In [None]:
# Inspect the index of X_encoded (set from the "Unnamed: 0" column in the previous cell).
X_encoded.index

### Standardize the features in X_encoded
using the StandardScaler `fit_transform()` method

In [None]:
# Standardize the feature matrix with a StandardScaler instance.
scaler = StandardScaler()

# Fit the scaler on X_encoded and transform X_encoded in a single call.
X_scaled = scaler.fit_transform(X=X_encoded)

# Show the first scaled row as a quick sanity check.
X_scaled[:1]

### Deliverable 2: Reducing Data Dimensions Using PCA

In [None]:
# Reduce the scaled feature matrix to three principal components.
pca = PCA(n_components=3)

# Fit the PCA model on X_scaled and project X_scaled onto the three components
# (the three leading directions of variation in the scaled data).
X_pca = pca.fit_transform(X=X_scaled)

X_pca

In [None]:
# Wrap the three principal components in a DataFrame that shares X_encoded's index,
# so each row can still be matched back to its cryptocurrency.
X_pca_df = pd.DataFrame(X_pca, index=X_encoded.index, columns=["PC 1", "PC 2", "PC 3"])

In [None]:
# Display the principal-component DataFrame built in the previous cell.
X_pca_df

In [None]:
# Examine the explained variance ratio: the fraction of the total variance
# attributed to each of the three fitted principal components.
pca.explained_variance_ratio_

### Deliverable 3: Clustering Cryptocurrencies Using K-Means

#### Finding the Best Value for `k` Using the Elbow Curve

In [None]:
# Create an elbow curve to find the best value for K.
# Fit only the principal-component columns: the K-Means cell further down appends a
# "class" label column to X_pca_df, and re-running this cell afterwards would
# otherwise feed those labels back into the inertia calculation.
pc_features = X_pca_df[["PC 1", "PC 2", "PC 3"]]

inertia = []
k = list(range(1, 11))

# Calculate the inertia for each candidate number of clusters
for i in k:
    km = KMeans(n_clusters=i, random_state=0)
    km.fit(pc_features)
    inertia.append(km.inertia_)

# Create the elbow curve
elbow_data = {"k": k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)
df_elbow.hvplot.line(x="k", y="inertia", xticks=k, title="Elbow Curve")


Running K-Means with `k=4`

In [None]:
# Cluster on the three principal components only. Selecting them explicitly keeps
# this cell idempotent: the "class" column written at the end must not become a
# fourth feature when the cell is re-run.
pc_features = X_pca_df[["PC 1", "PC 2", "PC 3"]]

# Initialize the K-Means model.
model = KMeans(n_clusters=4, random_state=0)

# Fit the model and obtain the cluster label of every row in one pass
# (fit_predict returns the fitted model's labels_).
predictions = model.fit_predict(pc_features)

# Add the predicted class column
X_pca_df["class"] = predictions
X_pca_df.head()

In [None]:
# Count how many rows were assigned to each cluster label.
X_pca_df["class"].value_counts()

In [None]:
# Deliverable 3 calls for a `clustered_df` that combines the cryptocurrency features,
# the principal components, the coin names, and the predicted cluster.
#
# The inputs it needs (clean_crypto.csv and crypto_names.csv) are only loaded in the
# "Deliverable 4" cells below, and that is where `clustered_df` is assembled:
#   1. crypto_df and X_pca_df are concatenated column-wise;
#   2. the "CoinName" column is added from crypto_names.
# The predicted cluster is already carried by X_pca_df's "class" column.
#
# No preview is printed here: `clustered_df` does not exist yet at this point, and
# referencing it raised NameError on a fresh kernel (Restart & Run All).

### Deliverable 4: Visualizing Cryptocurrencies Results

#### 3D-Scatter with Clusters

In [None]:
# Load the cleaned table of tradable cryptocurrencies from disk.
crypto_df = pd.read_csv(filepath_or_buffer="./Data/clean_crypto.csv")
crypto_df.head()

In [None]:
# Move the saved "Unnamed: 0" column into the index.
# The membership check keeps this cell safe to re-run: after the first run the column
# no longer exists and an unconditional set_index would raise KeyError.
if "Unnamed: 0" in crypto_df.columns:
    crypto_df = crypto_df.set_index("Unnamed: 0")
crypto_df.head()

In [None]:
# Concatenate crypto_df with X_pca_df column-wise; rows are aligned on the index.
clustered_df = pd.concat(objs=[crypto_df, X_pca_df], axis="columns")
clustered_df.head()


In [None]:
# Load the coin names from crypto_names.csv.
crypto_names = pd.read_csv(filepath_or_buffer="./Data/crypto_names.csv")
crypto_names.head()

In [None]:
# Move the saved "Unnamed: 0" column into the index.
# Reassigning (instead of inplace=True) and checking for the column first keeps the
# cell safe to re-run: a second set_index("Unnamed: 0") would raise KeyError.
if "Unnamed: 0" in crypto_names.columns:
    crypto_names = crypto_names.set_index("Unnamed: 0")
crypto_names

In [None]:
# Preview the "CoinName" column that is copied into clustered_df in the next cell.
crypto_names["CoinName"]

In [None]:
# Attach the coin names to clustered_df; pandas aligns the values on the shared index.
clustered_df = clustered_df.assign(CoinName=crypto_names["CoinName"])
clustered_df.head()

In [None]:
# Print the total number of tradable cryptocurrencies.
# shape[0] is the number of rows in clustered_df.
print(f"The total number of tradable cryptocurrencies: {clustered_df.shape[0]}")

In [None]:
# 3D scatter of the three principal components; both the colour and the marker
# symbol encode the cluster label stored in the "class" column.
fig = px.scatter_3d(
    data_frame=clustered_df,
    x="PC 1",
    y="PC 2",
    z="PC 3",
    color="class",
    symbol="class",
    width=800,
)
# Pin the legend to the top-left corner of the figure.
fig.update_layout(legend={"x": 0, "y": 1})
fig.show()


In [None]:
# Preview the first rows of clustered_df.
clustered_df.head()

In [None]:
# Rescale the two supply columns with a MinMaxScaler for the 2D scatter plot
# of tradable cryptocurrencies.
mm_scaler = MinMaxScaler()
plot_data = mm_scaler.fit_transform(
    clustered_df.loc[:, ["TotalCoinsMined", "TotalCoinSupply"]]
)

# Show the first five scaled rows.
plot_data[:5]

In [None]:
# Build the plotting frame: the two min-max-scaled supply columns, indexed like
# clustered_df so the columns added below align with it row for row.
plot_df = pd.DataFrame(
    plot_data, columns=["TotalCoinsMined", "TotalCoinSupply"], index=clustered_df.index
)

# Add the "CoinName" column from the clustered_df DataFrame to the new DataFrame.
plot_df["CoinName"] = clustered_df["CoinName"]

# clustered_df holds the K-Means label under the lowercase "class" column (it comes
# from X_pca_df); looking up "Class" there raised KeyError. The column keeps the
# "Class" name on plot_df.
plot_df["Class"] = clustered_df["class"]

plot_df.head(10)

In [None]:
# Create a hvplot.scatter plot using x="TotalCoinsMined" and y="TotalCoinSupply".
# Points are grouped by the predicted "Class" and show the "CoinName" on hover.
plot_df.hvplot.scatter(
    x="TotalCoinsMined",
    y="TotalCoinSupply",
    by="Class",
    hover_cols=["CoinName"],
)


In [None]:
# Display clustered_df before it is written to disk in the next cell.
clustered_df

In [None]:
# Persist clustered_df (including its index) as a CSV file in the Data folder.
clustered_df.to_csv(path_or_buf="./Data/crypto_clustered_.csv")

In [None]:
# create a table with the following columns
# column_names = ["C", "A", "B"]
#  df = df.reindex(columns=column_names)

