## Imports

In [13]:
import numpy as np 
import pandas as pd 
import plotly.express as px 
import plotly.graph_objs as go 
import sqlite3 

from sklearn.cluster import KMeans,AgglomerativeClustering
from sklearn.preprocessing import StandardScaler



## Load Data

In [2]:
def connect2db (path:str, tb:str):
    """Read every row of one table from a SQLite database into a DataFrame.

    Parameters
    ----------
    path : str
        Filesystem path of the SQLite database file.
    tb : str
        Name of the table to read. It is interpolated directly into the SQL
        text (a table name cannot be bound as a query parameter), so it must
        come from a trusted source.

    Returns
    -------
    pd.DataFrame
        Result of ``SELECT * FROM <tb>``.

    Raises
    ------
    Exception
        Whatever ``pd.read_sql`` raises for a failing query (for example an
        unknown table); the connection is closed before it propagates.
    """
    conn = sqlite3.connect(path)
    try:
        return pd.read_sql(sql = f"SELECT * FROM {tb}", con = conn)
    finally:
        # Always release the connection, including when the query fails,
        # so the database file handle is never leaked.
        conn.close()

In [6]:
# Read the "points" and "metrics" tables, each from its own SQLite file
points, metrics = (
    connect2db(path = dbFile, tb = table)
    for dbFile, table in (
        ("D:/GEE_Project/Databases/database.db", "points"),
        ("D:/GEE_Project/Databases/polyfitsMets.db", "metrics"),
    )
)

# Combine both tables on their shared "id" column
ptsMet = points.merge(metrics, on = "id")


## Explore CC2011 dataset

In [30]:
# Keep only the rows of the CC2011 eruption from the LSSR.NDVI.CDI dataset.
# .copy() makes cc an independent frame, so adding columns to it later does
# not raise SettingWithCopyWarning and cannot be confused with ptsMet.
cc = ptsMet.query("eruption == 'CC2011' & dataset == 'LSSR.NDVI.CDI'").copy()
cc.head()

Unnamed: 0,id,lat,lon,label,distance,heading,eruption,elevation,slope,aspect,...,delayV,preconT,preconTS,preconV,improvDeclT,improvDeclTS,improvDeclV,budgetT,budgetTS,budgetV
1189,308308553992445489352924561631568000084,-39.4557,-71.3664,"compo=1m,LS=LE7",141.3,27.1,CC2011,1180,9.030243,307.672363,...,,,,,,,,,,
1198,314363253535504424972969257561612139910,-39.8304,-71.1597,"compo=1m,LS=LE7",117.3,44.2,CC2011,909,13.45892,298.260681,...,,739.829503,2013-06-12,-0.512793,1474.071031,2015-06-17,-0.052934,,,
1207,222941773661361278045265061893625964793,-41.3661,-70.5747,"compo=1m,LS=LE7",155.8,124.2,CC2011,1245,5.65418,240.625854,...,,498.917616,2012-10-14,-0.488236,,,,1147.664082,2014-07-25,0.0
1216,271831421774115499318642335443404264562,-40.6426,-71.5689,"compo=1m,LS=LE7",46.7,97.4,CC2011,1743,29.02319,297.827454,...,,1552.537982,2015-09-03,-5.539992,,,,,,
1225,307809111278932268455037052411994127818,-40.9754,-71.8836,"compo=1m,LS=LE7",47.1,155.4,CC2011,1291,38.252384,218.718491,...,,,,,,,,,,


## KMeans Clustering

In [34]:
def clusterFeat(df:pd.DataFrame,feat:list,nClust:int,randomState = 42):
    """Fit a k-means model on standardised feature columns.

    Parameters
    ----------
    df : pd.DataFrame
        Table holding the feature columns.
    feat : list
        Names of the columns of ``df`` to cluster on; their values must be
        castable to float.
    nClust : int
        Number of clusters passed to ``KMeans``.
    randomState : int or None, default 42
        Seed forwarded to ``KMeans(random_state=...)`` so that repeated runs
        produce the same labels and inertia. Pass ``None`` to get sklearn's
        unseeded behaviour.

    Returns
    -------
    KMeans
        The fitted model; callers read ``labels_`` and ``inertia_`` from it.
    """
    # Cast once, then fit and apply the scaler on that same float matrix
    X = StandardScaler().fit_transform(df[feat].astype(float))

    model = KMeans(n_clusters = nClust, random_state = randomState)
    model.fit(X)
    return model


In [45]:
# Select the non-categorical features (this mainly excludes landcover)
selectedFeatures = ["distance","heading","elevation","slope","aspect","CHILI"]

# Assess best option : elbow plot
# A single list of cluster counts drives both the fits and the x axis, so
# every inertia score is plotted against the k that produced it.
clusterCounts = list(range(1,21))
inertiaScores = []
for i in clusterCounts:
    model = clusterFeat(df = cc, feat = selectedFeatures, nClust = i)
    inertiaScores.append(model.inertia_)

fig = go.Figure()
fig.add_trace(go.Scatter(
    x = clusterCounts,
    y = inertiaScores,
    mode = "lines+markers",
    line = dict(color = "slategray")
))
fig.update_layout(xaxis_title = "no of cluster", yaxis_title = "inertia score", template = "plotly_white")
fig.show()

## Select five clusters: explore trends


In [49]:
# Fit the 5-cluster model and attach each row's cluster label.
# assign() builds a new frame instead of writing into the filtered result of
# ptsMet.query(...), which avoids SettingWithCopyWarning and keeps this cell
# safe to re-run.
kmeans5 = clusterFeat(df = cc, feat = selectedFeatures, nClust = 5)
cc = cc.assign(Kmeans5_Pred = kmeans5.labels_)

In [71]:
# Color plot 
fig = px.scatter(
    x = cc.lat,
    y = cc.lon,
    color = cc.Kmeans5_Pred.astype("str"),
    color_discrete_sequence = px.colors.qualitative.D3
)
fig.update_layout(template = "plotly_white", xaxis_title = "lat", yaxis_title = "lon")
fig.show()

In [65]:
def plotHist(df:pd.DataFrame, xvals:str, colorCol:str = "Kmeans5_Pred"):
    """Build a histogram of one column, coloured by a label column.

    Parameters
    ----------
    df : pd.DataFrame
        Table holding both the value column and the label column.
    xvals : str
        Name of the column to histogram; also used as the x-axis title.
    colorCol : str, default "Kmeans5_Pred"
        Name of the column whose values determine the bar colours. The
        default keeps the original behaviour of colouring by the 5-cluster
        k-means labels.

    Returns
    -------
    plotly figure
        The figure is returned rather than shown, so the caller decides how
        to display it.
    """
    fig = px.histogram(x = df[xvals], color = df[colorCol], color_discrete_sequence = px.colors.qualitative.D3)
    fig.update_layout(template = "plotly_white", xaxis_title = xvals)
    return fig

In [76]:
# Histogram of the preconV column; the returned figure is the cell output
plotHist(cc, xvals = "preconV")