In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

%load_ext autotime

from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.cluster import KMeans, DBSCAN, OPTICS
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import  silhouette_score
from sklearn import datasets
from scipy.cluster.hierarchy import dendrogram, linkage
from yellowbrick.cluster import SilhouetteVisualizer, KElbowVisualizer

from hdbscan import HDBSCAN
from joblib import Memory, dump

In [None]:
# Widen the notebook's DataFrame rendering limits.
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_colwidth", 50)
pd.set_option("display.max_rows", 200)

In [None]:
# Load the prepared dataset; the first CSV column is used as the index.
df3 = pd.read_csv(
    '../data/data_final.csv',
    index_col=0,
)

In [None]:
# Show the loaded frame as this cell's output.
df3

### KMeans

In [None]:
def get_kmeans(dframe, clusters=4, scaler=None, random_state=None):
    """Fit KMeans on ``dframe`` and return a labelled copy plus the fitted model.

    Parameters
    ----------
    dframe : pandas.DataFrame (assumed; it is copied and gets a new column)
        Feature table to cluster. It is never modified: clustering and
        labelling both work on copies.
    clusters : int, default 4
        Forwarded to ``KMeans(n_clusters=...)``.
    scaler : object or None, default None
        Optional transformer exposing ``fit_transform``. When given, the
        model is fitted on the transformed data; the returned frame still
        holds the original (untransformed) values.
    random_state : int or None, default None
        Forwarded to ``KMeans(random_state=...)``. Pass an integer to make the
        cluster assignment reproducible between runs; ``None`` keeps the
        previous unseeded behaviour.

    Returns
    -------
    tuple
        ``(labelled, km)`` where ``labelled`` is a copy of ``dframe`` with an
        extra ``'kmeans'`` column holding ``km.labels_`` and ``km`` is the
        fitted estimator.
    """
    # Work on a copy so a scaler that transforms in place cannot touch the input.
    features = dframe.copy()
    if scaler is not None:
        features = scaler.fit_transform(features)

    km = KMeans(n_clusters=clusters, random_state=random_state)
    km.fit(features)

    # Attach labels to the original values, not to the scaled ones.
    labelled = dframe.copy()
    labelled['kmeans'] = km.labels_

    return labelled, km
    
    

In [None]:
# Min-max scale the features before clustering.
# (Set `scaler = None` instead to cluster the raw values.)
scaler = MinMaxScaler()
dff, model = get_kmeans(df3, clusters=3, scaler=scaler)

In [None]:
# Count the non-null `months_ago` entries in each cluster.
(
    dff
    .groupby('kmeans')['months_ago']
    .count()
)

In [None]:
def q90(x):
    """Return the 0.90 quantile of ``x`` (anything exposing ``.quantile``)."""
    ninetieth = x.quantile(q=0.9)
    return ninetieth

In [None]:
# Mean, standard deviation and 90th percentile of every column, per cluster.
group = (
    dff
    .groupby('kmeans')
    .agg(['mean', 'std', q90])
    .reset_index()
)

In [None]:
# Show the per-cluster summary table as this cell's output.
group

In [None]:
# The same three statistics, restricted to a hand-picked set of columns.
stats = ['mean', 'std', q90]
to_agg = {
    column: stats
    for column in [
        'value_over_price',
        'months_ago',
        'variant',
        'print_nr',
        'searched',
        'issues_total',
        'pub_issues_total',
        'special',
        'event',
        'volume_type_ongoing',
    ]
}
dff.groupby('kmeans').agg(to_agg)

In [None]:
# Distribution of `value_over_price` within cluster 1, as percentages.
sns.histplot(
    data=dff[dff['kmeans'] == 1],
    x='value_over_price',
    stat='percent',
)

In [None]:
# sns.pairplot(data=dff.sample(10000), hue='kmeans', vars=['value_over_price', 'months_ago', 'variant', 'special', 
#                                                          'volume_type_limited-series', 'volume_type_ongoing', ''])

In [None]:
# `group` comes from `.agg([...])` and therefore has MultiIndex columns, which
# plotly express does not accept (and "kmeans" is then the tuple ('kmeans', '')).
# Plot a copy whose column labels are flattened, e.g.
# ('months_ago', 'mean') -> 'months_ago_mean' and ('kmeans', '') -> 'kmeans'.
group_flat = group.copy()
group_flat.columns = [
    '_'.join(str(level) for level in col if str(level) != '')
    if isinstance(col, tuple) else col
    for col in group.columns
]

# range_color spans the three cluster labels 0..2 produced above.
fig2 = px.parallel_coordinates(group_flat, color="kmeans", width=2000, range_color=[0, 2])
fig2.show()

In [None]:
# Min-max scale the full dataset for the density-based / silhouette experiments.
scaler = MinMaxScaler()
scaled = scaler.fit(df3).transform(df3)

In [None]:
# Show the scaled data as this cell's output.
scaled

In [None]:
# Configure OPTICS; intermediate results are cached under ./cache/ and all
# available cores are used (n_jobs=-1).
opt = OPTICS(
    min_samples=100,
    min_cluster_size=20000,
    max_eps=100,
    memory='./cache/',
    n_jobs=-1,
)

In [None]:
# Fit OPTICS on the scaled data.
opt.fit(scaled)

In [None]:
# visualizer = KElbowVisualizer(KMeans(n_clusters=4, random_state=0), colors='yellowbrick')
# visualizer.fit(scaled)
# visualizer.show()  

In [None]:
# Silhouette plot for a seeded 5-cluster KMeans on the scaled data.
# `colors` takes a palette/colormap name or a list of valid colours; the
# previous value `['']` was a one-element list holding an empty string, which is
# not a usable colour. Use the 'yellowbrick' palette, as in the elbow cell above.
visualizer = SilhouetteVisualizer(KMeans(n_clusters=5, random_state=0), colors='yellowbrick')
visualizer.fit(scaled)
visualizer.show()

### Hierarchical Clustering

In [None]:
# Create Dendrogram

def plot_dendrogram(model, **kwargs):
    """Draw a dendrogram for a fitted hierarchical clustering ``model``.

    Reads ``model.children_``, ``model.distances_`` and ``model.labels_``,
    assembles a linkage matrix with columns (child a, child b, distance,
    samples under the node) and hands it to scipy's ``dendrogram``.
    Any ``kwargs`` are forwarded to ``dendrogram`` unchanged.
    """
    merges = model.children_
    n_leaves = len(model.labels_)

    # Number of original samples under each merge node. A child index below
    # n_leaves is a single sample; anything else refers to an earlier merge row.
    node_sizes = np.zeros(merges.shape[0])
    for row, pair in enumerate(merges):
        node_sizes[row] = sum(
            1 if child < n_leaves else node_sizes[child - n_leaves]
            for child in pair
        )

    columns = [merges, model.distances_, node_sizes]
    linkage_matrix = np.column_stack(columns).astype(float)

    dendrogram(linkage_matrix, **kwargs)


# Setting distance_threshold=0 (with n_clusters=None) ensures we compute the full tree.
model = AgglomerativeClustering(distance_threshold=0, memory='./agglo_cache/', n_clusters=None,
                                linkage='ward')

# Take a 1000-row sample of the loaded data. This previously sampled from
# `df_dum`, a name that is never defined in this notebook (NameError on a fresh
# kernel); `df3` is the frame loaded above and scaled in the earlier sections.
sample = df3.sample(1000)
# Normalize data
scaler = MinMaxScaler()
scaled_sampled = scaler.fit_transform(sample)
model = model.fit(scaled_sampled)

plt.title("Hierarchical Clustering Dendrogram")
# truncate_mode="level" with p=100 shows at most 100 levels of the tree
plot_dendrogram(model, truncate_mode="level", p=100)
plt.xlabel("Number of points in node (or index of point if no parenthesis).")
plt.show()