In [10]:
# Import required libraries and dependencies

import pandas as pd
import hvplot.pandas
from path import Path
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler


In [11]:
# Read the CSV file into a Pandas DataFrame, using the `state` column as the index

data_df = pd.read_csv(Path("data/neural_network_df.csv"), index_col="state")


# Preview the first ten rows

data_df.head(n=10)


Unnamed: 0_level_0,temperature,precip,humidity,visibility,pressure,cloudcover,heatindex,dewpoint,windchill,windgust,feelslike,mintemp,maxtemp,avgtemp,totalsnow,sunhour,blmn
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
Florida,18.5311,0.124402,60.258373,9.966507,1018.114833,28.937799,17.062201,8.641148,16.239234,17.559809,16.37799,14.363636,18.5311,16.593301,0.0,7.513397,18.34
Arizona,17.733333,0.0,31.533333,10.0,1019.066667,8.266667,13.133333,-3.933333,12.666667,7.133333,12.666667,8.733333,17.733333,13.066667,0.0,8.7,18.34
South Carolina,12.875,0.0,67.291667,10.0,1017.041667,21.708333,9.666667,3.75,8.583333,12.916667,8.583333,7.083333,12.875,10.166667,0.0,8.2875,18.34
Georgia,12.032258,0.0,65.935484,10.0,1019.129032,9.225806,7.354839,1.096774,5.322581,15.322581,5.322581,3.741935,12.032258,7.870968,0.0,8.7,18.34
Nevada,11.625,2.2375,46.25,9.5,1018.0,29.375,7.25,-5.0,6.25,11.375,6.25,2.375,11.625,8.0,1.225,7.4,18.34
Florida,18.732057,0.0,55.483254,10.0,1018.382775,0.669856,15.473684,6.282297,15.07177,18.449761,15.07177,12.444976,18.732057,15.866029,0.0,8.7,19.05
South Carolina,13.333333,0.008333,66.541667,9.958333,1015.791667,23.125,9.333333,3.0,7.75,17.208333,7.75,6.291667,13.333333,9.833333,0.0,8.3,19.05
Nevada,12.75,0.5,47.625,10.0,1021.125,9.875,7.625,-4.625,6.5,12.5,6.5,2.125,12.75,8.125,0.0,8.7,19.05
Georgia,11.806452,0.070968,68.709677,9.645161,1018.032258,28.516129,8.677419,2.645161,6.774194,16.741935,6.774194,6.193548,11.806452,9.064516,0.0,7.732258,19.05
Arizona,20.133333,0.0,31.933333,10.0,1017.933333,12.4,15.066667,-2.2,14.866667,7.2,14.866667,10.933333,20.133333,15.8,0.0,8.7,19.05


In [12]:
# Standardize every column of `data_df` with scikit-learn's `StandardScaler`:
# fit the scaler on the data, then transform that same data.
# NOTE: despite its name, `scaled_df` holds the scaler's array output; it is
# wrapped in a DataFrame in the next cell.

scaled_df = StandardScaler().fit(data_df).transform(data_df)


In [13]:
# Create a DataFrame with the scaled data.
# The original column names and the original `state` index are passed straight
# to the constructor, so every scaled row keeps its state label without first
# adding a temporary `state` column and then promoting it to the index.

df_data_scaled = pd.DataFrame(
    scaled_df,
    columns=data_df.columns,
    index=data_df.index
)


# Display sample data

df_data_scaled.head()


Unnamed: 0_level_0,temperature,precip,humidity,visibility,pressure,cloudcover,heatindex,dewpoint,windchill,windgust,feelslike,mintemp,maxtemp,avgtemp,totalsnow,sunhour,blmn
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
Florida,0.125232,-0.323726,0.207152,0.505145,-0.083318,-0.117824,0.632379,0.668157,0.623923,0.148144,0.614232,0.857644,0.125232,0.469833,-0.119349,-0.367592,-1.374349
Arizona,-0.014518,-0.348823,-1.182897,0.551185,0.114792,-0.911427,-0.085586,-0.87743,0.02711,-1.506408,0.010501,-0.158127,-0.014518,-0.186122,-0.119349,0.278116,-1.374349
South Carolina,-0.865584,-0.348823,0.547505,0.551185,-0.306682,-0.395376,-0.719087,0.066964,-0.655028,-0.588665,-0.653746,-0.455805,-0.865584,-0.725523,-0.119349,0.053648,-1.374349
Georgia,-1.013213,-0.348823,0.481877,0.551185,0.127772,-0.874604,-1.141552,-0.259157,-1.199751,-0.206876,-1.184182,-1.058632,-1.013213,-1.152523,-0.119349,0.278116,-1.374349
Nevada,-1.084555,0.102575,-0.470735,-0.136127,-0.107219,-0.101039,-1.160711,-1.008539,-1.044822,-0.833308,-1.033316,-1.305242,-1.084555,-1.128523,1.533474,-0.429299,-1.374349


In [14]:
# Candidate numbers of clusters to try: the integers 1 through 10
# (`range(1, 11)` excludes its upper bound)

k = [*range(1, 11)]


In [15]:
# Start with an empty list that will collect one inertia value per k

inertia = list()

In [17]:
# Compute the K-Means inertia for each candidate value in `k`.
# The list is reset at the top of this cell so that re-running the cell does
# not append a second batch of values (which would make `inertia` longer than
# `k` and break the elbow DataFrame built from the two lists).
# Inside the loop:
# 1. Create a KMeans model using the loop value for n_clusters
# 2. Fit the model to the scaled data in `df_data_scaled`
# 3. Append the model.inertia_ to the inertia list

inertia = []

for n_clusters in k:
    model = KMeans(n_clusters=n_clusters, random_state=0)
    model.fit(df_data_scaled)
    inertia.append(model.inertia_)

inertia


  f"KMeans is known to have a memory leak on Windows "


[6765.999999999998,
 4661.141369741431,
 3859.982305273202,
 3292.115410132177,
 2919.8471901828593,
 2665.7350460616167,
 2454.518660832577,
 2267.798608789935,
 2153.651013986613,
 2020.1474682515425]

In [18]:
# Pair each k with its inertia in a dictionary ("k" -> cluster counts,
# "i" -> inertia values) for the Elbow curve

elbow_data = dict(k=k, i=inertia)


# Turn the dictionary into a DataFrame with one row per value of k

df_elbow = pd.DataFrame(data=elbow_data)

df_elbow


Unnamed: 0,k,i
0,1,6766.0
1,2,4661.14137
2,3,3859.982305
3,4,3292.11541
4,5,2919.84719
5,6,2665.735046
6,7,2454.518661
7,8,2267.798609
8,9,2153.651014
9,10,2020.147468


In [19]:
# Draw the Elbow curve: inertia ("i") on the y-axis against the number of
# clusters ("k") on the x-axis, to visually pick the optimal value for k.

df_elbow_plot = df_elbow.hvplot.line(title="Elbow Curve", x="k", y="i")

df_elbow_plot


In [20]:
# Initialize the K-Means model using the chosen value for k.
# Based on the elbow curve above, the best k could be 2 or 4; 4 is used for this model.
# `random_state=0` matches the seed used in the elbow loop, so the cluster
# assignments are reproducible from one run of the notebook to the next.

model = KMeans(n_clusters=4, random_state=0)


In [21]:
# Train the K-Means model on the scaled DataFrame

model.fit(X=df_data_scaled)


KMeans(n_clusters=4)

In [23]:
# Assign every row of the scaled data to a cluster using the fitted model

data_clusters = model.predict(X=df_data_scaled)


# Show the resulting array of cluster labels

data_clusters


array([3, 3, 1, 1, 1, 3, 1, 1, 1, 3, 3, 1, 1, 1, 3, 1, 1, 0, 3, 2, 0, 0,
       1, 2, 3, 0, 2, 1, 0, 1, 1, 1, 1, 2, 1, 1, 1, 1, 2, 3, 3, 3, 1, 1,
       1, 1, 2, 1, 1, 3, 1, 3, 3, 1, 1, 1, 1, 1, 3, 3, 3, 1, 0, 0, 0, 3,
       0, 1, 0, 2, 0, 0, 0, 2, 0, 0, 2, 1, 2, 2, 0, 0, 2, 0, 1, 1, 1, 3,
       1, 0, 1, 0, 1, 3, 0, 2, 3, 1, 0, 0, 1, 1, 1, 1, 3, 1, 1, 1, 1, 3,
       1, 1, 3, 3, 1, 0, 1, 2, 3, 0, 1, 3, 1, 2, 1, 0, 1, 3, 2, 1, 3, 1,
       0, 0, 2, 1, 3, 2, 0, 0, 2, 0, 0, 0, 3, 0, 1, 1, 1, 2, 1, 2, 1, 1,
       1, 1, 1, 0, 0, 2, 2, 0, 1, 1, 3, 0, 2, 0, 3, 1, 3, 2, 1, 1, 1, 3,
       3, 2, 3, 1, 3, 1, 2, 3, 3, 0, 2, 1, 0, 3, 2, 1, 1, 2, 2, 1, 0, 2,
       3, 1, 3, 0, 0, 1, 2, 1, 3, 3, 2, 3, 3, 3, 3, 1, 2, 3, 1, 3, 1, 1,
       1, 3, 1, 1, 2, 3, 1, 1, 3, 2, 3, 1, 2, 3, 1, 2, 3, 1, 0, 3, 3, 2,
       3, 3, 3, 3, 3, 3, 2, 3, 3, 3, 3, 3, 2, 2, 3, 2, 3, 2, 2, 3, 3, 3,
       2, 3, 3, 3, 2, 1, 2, 3, 3, 3, 3, 2, 3, 3, 3, 3, 3, 2, 3, 2, 3, 1,
       3, 2, 3, 3, 2, 0, 0, 3, 1, 1, 2, 3, 1, 3, 1,

In [24]:
# Work on an independent (deep) copy so `df_data_scaled` itself is left unchanged

df_data_segments = df_data_scaled.copy(deep=True)


In [25]:
# Store the K-Means cluster label of each row in a new `PricePredictions` column
# (the values are the cluster labels held in `data_clusters`)

df_data_segments["PricePredictions"] = data_clusters


# Preview the first five rows, now including the cluster label

df_data_segments.head(n=5)


Unnamed: 0_level_0,temperature,precip,humidity,visibility,pressure,cloudcover,heatindex,dewpoint,windchill,windgust,feelslike,mintemp,maxtemp,avgtemp,totalsnow,sunhour,blmn,PricePredictions
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
Florida,0.125232,-0.323726,0.207152,0.505145,-0.083318,-0.117824,0.632379,0.668157,0.623923,0.148144,0.614232,0.857644,0.125232,0.469833,-0.119349,-0.367592,-1.374349,3
Arizona,-0.014518,-0.348823,-1.182897,0.551185,0.114792,-0.911427,-0.085586,-0.87743,0.02711,-1.506408,0.010501,-0.158127,-0.014518,-0.186122,-0.119349,0.278116,-1.374349,3
South Carolina,-0.865584,-0.348823,0.547505,0.551185,-0.306682,-0.395376,-0.719087,0.066964,-0.655028,-0.588665,-0.653746,-0.455805,-0.865584,-0.725523,-0.119349,0.053648,-1.374349,1
Georgia,-1.013213,-0.348823,0.481877,0.551185,0.127772,-0.874604,-1.141552,-0.259157,-1.199751,-0.206876,-1.184182,-1.058632,-1.013213,-1.152523,-0.119349,0.278116,-1.374349,1
Nevada,-1.084555,0.102575,-0.470735,-0.136127,-0.107219,-0.101039,-1.160711,-1.008539,-1.044822,-0.833308,-1.033316,-1.305242,-1.084555,-1.128523,1.533474,-0.429299,-1.374349,1


In [34]:
# Create a scatter plot using hvPlot with the scaled `precip` values on the
# x-axis and the scaled `temperature` values on the y-axis.
# Points are grouped (and therefore colored) by the K-Means label stored in
# `PricePredictions`, and `state` is added to `hover_cols` so each data point
# can be identified when hovering over it.

df_data_segments_plot = df_data_segments.hvplot.scatter(
    title="Weather/Stock Price Clusters",
    x="precip",
    xlabel="precip",
    y="temperature",
    ylabel="temp",
    by="PricePredictions",
    hover_cols="state"
)

df_data_segments_plot

