# Unsupervised Learning
```{admonition} Revised
31 May 2023
```

---

## Programming Environment

In [10]:
import numpy             as np
np.set_printoptions(suppress=True, formatter={'float_kind' : '{:0.8f}'.format})
import pandas            as pd
pd.set_option('display.float_format', lambda x: f'{x:0.8f}')
import matplotlib        as mpl
import matplotlib.pyplot as plt

import sklearn
from   sklearn.cluster       import KMeans
from   sklearn.decomposition import PCA
from   sklearn.preprocessing import StandardScaler

import hvplot
import hvplot.pandas

from   pathlib  import Path

import warnings
warnings.filterwarnings('ignore')

import datetime
from   importlib.metadata import version
import os
import platform as p
import sys

# Print a short report of the execution environment (timestamp, platform,
# environment name, interpreter, key package versions) so the notebook's
# outputs can be tied to the setup that produced them.
pad = 20  # width of the left-aligned label column

# CONDA_DEFAULT_ENV is only present inside an activated conda environment.
# Use .get() so a missing variable falls through to the fallback instead of
# raising KeyError, and derive the fallback from sys.prefix (the interpreter's
# installation/environment directory) rather than splitting the executable
# path on '/', which breaks on Windows-style paths.
env_name = os.environ.get('CONDA_DEFAULT_ENV') or os.path.basename(sys.prefix)

print(  f"\n{'Executed' : <{pad}} : {datetime.datetime.now().astimezone().strftime('%Y-%m-%d %H:%M:%S %z %Z')}"
        f"\n{'Platform' : <{pad}} : {p.platform(aliased = False, terse = False)}"
        f"\n{'Conda'    : <{pad}} : {env_name}"
        f"\n{'Python'   : <{pad}} : {p.python_implementation()} {p.python_version()} {sys.executable}")
# One line per key package: distribution name and installed version.
print(*[f'{name : <{pad}} : {version(name)}'
        for name in ['hvPlot', 'Matplotlib', 'NumPy', 'Pandas', 'Scikit-Learn']], sep = '\n')


Executed             : 2023-09-04 17:34:08 -0400 EDT
Platform             : macOS-13.5.1-arm64-arm-64bit
Conda                : ml
Python               : CPython 3.11.5 /Users/df/anaconda3/envs/ml/bin/python
hvPlot               : 0.8.4
Matplotlib           : 3.7.2
NumPy                : 1.23.5
Pandas               : 2.1.0
Scikit-Learn         : 1.3.0


---

### Data Import

In [11]:
# Load the market data, using the `coin_id` column as the row index,
# then preview the first rows.
df_market_data = pd.read_csv('data/crypto_market_data.csv', index_col='coin_id')
df_market_data.head()

Unnamed: 0_level_0,price_change_percentage_24h,price_change_percentage_7d,price_change_percentage_14d,price_change_percentage_30d,price_change_percentage_60d,price_change_percentage_200d,price_change_percentage_1y
coin_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
bitcoin,1.08388,7.60278,6.57509,7.67258,-3.25185,83.5184,37.51761
ethereum,0.22392,10.38134,4.80849,0.13169,-12.8889,186.77418,101.96023
tether,-0.21173,0.04935,0.0064,-0.04237,0.28037,-0.00542,0.01954
ripple,-0.37819,-0.60926,2.24984,0.23455,-17.55245,39.53888,-16.60193
bitcoin-cash,2.90585,17.09717,14.75334,15.74903,-13.71793,21.66042,14.49384


In [12]:
# Summary statistics, transposed so that each feature is a row.
df_market_data.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
price_change_percentage_24h,41.0,-0.26968634,2.6947932,-13.52786,-0.60897,-0.06341,0.61209,4.84033
price_change_percentage_7d,41.0,4.49714732,6.37521822,-6.09456,0.04726,3.29641,7.60278,20.69459
price_change_percentage_14d,41.0,0.18578707,8.37693935,-18.1589,-5.02662,0.10974,5.51074,24.23919
price_change_percentage_30d,41.0,1.54569341,26.34421795,-34.70548,-10.43847,-0.04237,4.57813,140.7957
price_change_percentage_60d,41.0,-0.09411854,47.36580318,-44.82248,-25.90799,-7.54455,0.65726,223.06437
price_change_percentage_200d,41.0,236.53743171,435.22530433,-0.3921,21.66042,83.9052,216.17761,2227.92782
price_change_percentage_1y,41.0,347.66795561,1247.84288433,-17.56753,0.40617,69.69195,168.37251,7852.0897


In [13]:
# Line plot of every column of the raw market data; x tick labels are
# rotated 90 degrees.
df_market_data.hvplot.line(width=800, height=400, rot=90)

### Data Preparation

In [14]:
# Standardize every column with StandardScaler and wrap the resulting array
# in a DataFrame that keeps the original column names and `coin_id` index.
scaled_values = StandardScaler().fit_transform(df_market_data)

df_market_data_scaled = pd.DataFrame(
  data   =scaled_values,
  columns=df_market_data.columns,
  index  =df_market_data.index,
)
df_market_data_scaled.head()

Unnamed: 0_level_0,price_change_percentage_24h,price_change_percentage_7d,price_change_percentage_14d,price_change_percentage_30d,price_change_percentage_60d,price_change_percentage_200d,price_change_percentage_1y
coin_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
bitcoin,0.50852937,0.49319307,0.77220043,0.23545963,-0.0674951,-0.35595348,-0.25163688
ethereum,0.18544589,0.93444504,0.55869212,-0.05434093,-0.27348273,-0.11575947,-0.19935211
tether,0.02177396,-0.70633685,-0.02168042,-0.06103015,0.00800452,-0.55024692,-0.28206051
ripple,-0.04076438,-0.81092807,0.24945797,-0.05038797,-0.37316402,-0.45825882,-0.29554614
bitcoin-cash,1.19303608,2.00095907,1.76061001,0.54584206,-0.29120287,-0.49984776,-0.27031695


### k-Means Clustering

In [15]:
# --- Elbow method -----------------------------------------------------------
# Fit one k-means model per candidate cluster count (1..10) on the scaled
# data and record its inertia, so the curve can be inspected for an elbow.
k       = list(range(1, 11))
inertia = [
  KMeans(n_clusters=i, n_init='auto', random_state=0)
    .fit(X=df_market_data_scaled)
    .inertia_
  for i in k
]

df_elbow = pd.DataFrame(data={
  'k'       : k,
  'inertia' : inertia,
})

elbow_plot = df_elbow.hvplot.line(
  x     ='k',
  y     ='inertia',
  title ='Elbow Curve',
  xticks=k,
)
display(elbow_plot)

# --- Final model ------------------------------------------------------------
# n_clusters=4 is presumably the elbow read off the curve above (TODO confirm).
# random_state is pinned to the same seed as the elbow loop; without it the
# centroid initialisation, and therefore the cluster labels and plot colours,
# can change on every run.
model = KMeans(n_clusters  =4,
               n_init      ='auto',
               random_state=0)
model.fit(X=df_market_data_scaled)

crypto_clusters = model.predict(X=df_market_data_scaled)

# Attach the labels to a copy so the scaled frame itself stays unmodified.
df_market_data_scaled_predictions                   = df_market_data_scaled.copy()
df_market_data_scaled_predictions['crypto_cluster'] = crypto_clusters

# Scatter of two of the scaled features, one marker/colour per cluster.
clusters_plot = df_market_data_scaled_predictions.hvplot.scatter(
  x         ='price_change_percentage_24h',
  y         ='price_change_percentage_7d',
  by        ='crypto_cluster',
  hover_cols=['coin_id'],
  marker    =['hex','square','cross','inverted_triangle'],
  title     ='Cryptocurrency Clusters',
)
clusters_plot

### k-Means Clustering (PCA)

In [16]:
# --- Dimensionality reduction -------------------------------------------------
# Project the scaled features onto the first three principal components.
pca             = PCA(n_components=3)
market_pca_data = pca.fit_transform(X=df_market_data_scaled)

print(market_pca_data[:5])

print()
print('Explained variance ratio:')
print(pca.explained_variance_ratio_)

# Wrap the component scores in a DataFrame indexed by `coin_id`.
df_market_data_pca = pd.DataFrame(
  data   =market_pca_data,
  columns=['PC1','PC2','PC3'],
  index  =df_market_data.index,
)

print()
print(df_market_data_pca.head())

# --- Elbow method on the PCA data ---------------------------------------------
# PCA-specific names are used so this cell does not overwrite `inertia`,
# `df_elbow` and `model` produced by the k-means cell on the scaled data.
k           = list(range(1, 11))
inertia_pca = [
  KMeans(n_clusters=i, n_init='auto', random_state=0)
    .fit(X=df_market_data_pca)
    .inertia_
  for i in k
]

df_elbow_pca = pd.DataFrame(data={
  'k'       : k,
  'inertia' : inertia_pca,
})

elbow_plot_pca = df_elbow_pca.hvplot.line(
  x     ='k',
  y     ='inertia',
  title ='Elbow Curve (PCA)',
  xticks=k,
)
display(elbow_plot_pca)

# --- Final model on the PCA data ----------------------------------------------
# n_clusters=4 is presumably the elbow read off the curve above (TODO confirm).
# random_state is pinned to the same seed as the elbow loop; without it the
# cluster labels and plot colours can change on every run.
model_pca = KMeans(n_clusters  =4,
                   n_init      ='auto',
                   random_state=0)
model_pca.fit(X=df_market_data_pca)

crypto_clusters_pca = model_pca.predict(X=df_market_data_pca)

# Attach the labels to a copy so the PCA frame itself stays unmodified.
df_market_data_pca_predictions                   = df_market_data_pca.copy()
df_market_data_pca_predictions['crypto_cluster'] = crypto_clusters_pca

# Scatter of the first two principal components, one marker/colour per cluster.
clusters_plot_pca = df_market_data_pca_predictions.hvplot.scatter(
  x         ='PC1',
  y         ='PC2',
  by        ='crypto_cluster',
  hover_cols=['coin_id'],
  marker    =['hex','square','cross','inverted_triangle'],
  title     ='Cryptocurrency Clusters (PCA)',
)
clusters_plot_pca

[[-0.60066733 0.84276006 0.46159457]
 [-0.45826071 0.45846566 0.95287678]
 [-0.43306981 -0.16812638 -0.64175193]
 [-0.47183495 -0.22266008 -0.47905316]
 [-1.15779997 2.04120919 1.85971527]]

Explained variance ratio:
[0.37198560 0.34700813 0.17603793]

                     PC1         PC2         PC3
coin_id                                         
bitcoin      -0.60066733  0.84276006  0.46159457
ethereum     -0.45826071  0.45846566  0.95287678
tether       -0.43306981 -0.16812638 -0.64175193
ripple       -0.47183495 -0.22266008 -0.47905316
bitcoin-cash -1.15779997  2.04120919  1.85971527


In [17]:
# Compose the elbow curve for the scaled data and the one for the PCA data
# into a single output with `+`, so the two can be compared directly.
elbow_plot + elbow_plot_pca

In [18]:
# Compose the cluster scatter for the scaled data and the one for the PCA
# data into a single output with `+`, so the two can be compared directly.
clusters_plot + clusters_plot_pca

---

## Terms

* [[W](https://en.wikipedia.org/wiki/Cluster_analysis)] Cluster Analysis
* [[W](https://en.wikipedia.org/wiki/Data_pre-processing)] Data Preprocessing
* [[W](https://en.wikipedia.org/wiki/Elbow_method_(clustering))] Elbow Method
* [[W](https://en.wikipedia.org/wiki/Explained_variation)] Explained Variation
* [[W](https://en.wikipedia.org/wiki/Feature_scaling)] Feature Scaling
* [[W](https://en.wikipedia.org/wiki/K-means_clustering)] k-Means Clustering
* [[W](https://en.wikipedia.org/wiki/Determining_the_number_of_clusters_in_a_data_set)] Determining the Number of Clusters in a Data Set
* [[W](https://en.wikipedia.org/wiki/Principal_component_analysis)] Principal Component Analysis (PCA)
* [[W](https://en.wikipedia.org/wiki/Variation_ratio)] Variation Ratio

---