In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%load_ext autotime

from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn import datasets
from sklearn.cluster import DBSCAN, AgglomerativeClustering, KMeans
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from yellowbrick.cluster import KElbowVisualizer, SilhouetteVisualizer

In [None]:
# pandas display limits used when frames are rendered in cell output.
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_colwidth', 50)
pd.set_option('display.max_rows', 200)

In [None]:
# Load the featurized dataset; the first CSV column becomes the row index.
data_path = '../data/data_featurized.csv'
df = pd.read_csv(filepath_or_buffer=data_path, index_col=0)

In [None]:
# One-hot encode the `volume_type` and `special` columns.
# get_dummies(columns=...) prefixes each dummy column with its source column name and
# drops the originals in one step. Unlike joining separate dummy frames on the index,
# it cannot duplicate rows when the index contains repeated values.
df_dum = pd.get_dummies(df, columns=['volume_type', 'special'])

In [None]:
# Keep only rows whose value_over_price is below 1000.
# .copy() makes df1 an independent frame, so adding columns to it later does not
# raise SettingWithCopyWarning.
df1 = df_dum[df_dum.value_over_price < 1000].copy()

In [None]:
# Min-max scale every column of df1 except value_over_price, which is kept out of
# the feature matrix.
features = df1.drop(columns='value_over_price')
scaler = MinMaxScaler().fit(features)
scaled = scaler.transform(features)

### Hierarchical Clustering

In [None]:
# Create Dendrogram

def plot_dendrogram(model, **kwargs):
    """Draw a dendrogram for a fitted hierarchical clustering model.

    Builds a SciPy-style linkage matrix from the model's ``children_``,
    ``distances_`` and ``labels_`` attributes and hands it to
    ``scipy.cluster.hierarchy.dendrogram``.

    Parameters
    ----------
    model : fitted estimator
        Must expose ``children_`` (one row of two node ids per merge),
        ``distances_`` (one distance per merge) and ``labels_`` (one entry
        per sample).
    **kwargs
        Forwarded unchanged to ``dendrogram``.

    Returns
    -------
    None
    """
    merges = model.children_
    n_leaves = len(model.labels_)

    # Number of original samples under each merge node. Node ids below
    # n_leaves are single samples; larger ids refer to an earlier merge row,
    # whose size has already been filled in.
    subtree_sizes = np.zeros(merges.shape[0])
    for row, pair in enumerate(merges):
        subtree_sizes[row] = sum(
            1 if node < n_leaves else subtree_sizes[node - n_leaves]
            for node in pair
        )

    # Columns: child a, child b, merge distance, samples under the node.
    stacked = np.column_stack([merges, model.distances_, subtree_sizes])
    dendrogram(stacked.astype(float), **kwargs)


# distance_threshold=0 together with n_clusters=None makes the model compute the
# full merge tree, which plot_dendrogram needs (it reads model.distances_).
model = AgglomerativeClustering(distance_threshold=0, memory='./agglo_cache/', n_clusters=None,
                                linkage='ward')

# Cluster a fixed-seed sample so the dendrogram is reproducible across runs; the size
# is capped so a frame with fewer than 1000 rows does not make sample() raise.
# NOTE(review): this samples df_dum, so it still contains `value_over_price` and the
# rows filtered out of df1 -- confirm that is intended, since `scaled` is built from
# df1 without that column.
sample = df_dum.sample(min(1000, len(df_dum)), random_state=0)
# Use a dedicated scaler: rebinding `scaler` here would silently replace the one
# already fitted on df1.
sample_scaler = MinMaxScaler()
scaled_sampled = sample_scaler.fit_transform(sample)
model = model.fit(scaled_sampled)

plt.title("Hierarchical Clustering Dendrogram")
# truncate_mode="level" with p=100 shows at most 100 levels of the tree
plot_dendrogram(model, truncate_mode="level", p=100)
plt.xlabel("Number of points in node (or index of point if no parenthesis).")
plt.show()

### KMeans

In [None]:
# Seed the initialisation so cluster labels, and every table and plot derived from
# them, are reproducible across Restart & Run All.
km = KMeans(n_clusters=3, init='k-means++', random_state=0)

In [None]:
# Fit the KMeans model on the scaled feature matrix.
km.fit(scaled)

In [None]:
# Shape of the fitted label array -- expected to hold one label per row of `scaled`.
km.labels_.shape

In [None]:
# Elbow plot over the visualizer's default range of k.
# NOTE(review): KElbowVisualizer sweeps its own k values, so the n_clusters=4 given
# to the estimator is presumably overridden -- confirm against the yellowbrick docs.
elbow_estimator = KMeans(n_clusters=4, random_state=0)
visualizer = KElbowVisualizer(elbow_estimator, colors='yellowbrick')
visualizer.fit(scaled)
visualizer.show()

In [None]:
# visualizer = SilhouetteVisualizer(KMeans(n_clusters=5, random_state=0), colors='yellowbrick')
# visualizer.fit(scaled)
# visualizer.show()   

In [None]:
df1.groupby('kmeans')['months_ago'].count()

In [None]:
df1['kmeans'] = km.labels_

In [None]:
# Per-cluster mean of every column, with the cluster label kept as a regular column.
group = (
    df1
    .groupby('kmeans')
    .mean()
    .reset_index()
)

In [None]:
# parallel_coordinates requires the name of the class column; draw one line per
# cluster, identified by its 'kmeans' label.
pd.plotting.parallel_coordinates(group, 'kmeans')

In [None]:
# Mean, median and standard deviation of selected columns within each cluster.
stats = ['mean', 'median', 'std']
to_agg = {
    column: stats
    for column in ('value_over_price', 'months_ago', 'issue_nr', 'print_nr', 'searched')
}
df1.groupby('kmeans').agg(to_agg)

In [None]:
# Clusters in the value_over_price / searched plane, with the y-axis limited to 0-50000.
ax = sns.scatterplot(x='value_over_price', y='searched', hue='kmeans', data=df1)
ax.set_ylim(0, 50000)

In [None]:
# KMeans.score needs the data to score; evaluate the model on the matrix it was fitted
# on (the result is the negative of the K-means objective on that data).
km.score(scaled)