In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%load_ext autotime

from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.cluster import AgglomerativeClustering, KMeans, DBSCAN
from sklearn.metrics import  silhouette_score
from sklearn import datasets
from scipy.cluster.hierarchy import dendrogram, linkage
from yellowbrick.cluster import SilhouetteVisualizer, KElbowVisualizer

In [None]:
# Notebook-wide pandas display limits: columns shown, characters per cell, rows shown.
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_colwidth', 50)
pd.set_option('display.max_rows', 200)

In [None]:
# Load the featurized dataset; the first CSV column is used as the row index.
df = pd.read_csv('../data/data_featurized.csv', index_col=0)

Because current values differ enormously between issues, we will only consider issues whose current value is less than 1000 times their initial price. Unfortunately, it is extremely unlikely that we will ever come across issues that cost hundreds of thousands or millions of dollars.

In [None]:
# Keep only issues whose value_over_price is below 1000.
# .copy() gives df1 its own data, so the column assignments made on df1 in the following
# cells do not write into a slice of df (which triggers SettingWithCopyWarning).
df1 = df[df.value_over_price < 1000].copy()

Some very skewed features can be combined into a single feature so that we get a more balanced distribution.
Since `special`, `event` and some categories of `volume_type` tend to end up in the same clusters, we can combine them.

In [None]:
# Recode `special` as a 0/1 flag: 0 for regular ('not') editions, 1 for every other value
# (e.g. "limited", "incentive" or "deluxe").
# The column is rebuilt with assign() instead of being written through .loc[:, 'special']:
# recent pandas versions set values in place through .loc, which keeps the column's object
# dtype and leaves a trailing astype(int) without effect.
# Matching on both 'not' and 0 keeps the cell safe to re-run on an already recoded column.
df1 = df1.assign(special=(~df1['special'].isin(['not', 0])).astype(int))

In [None]:
# Collapse the three event columns into a single `event` flag:
# 1 when their sum is non-zero (some special event is recorded), 0 otherwise.
event_total = df1['first_appear_event'] + df1['death_event'] + df1['origin_event']
df1.loc[:, 'event'] = event_total.astype(bool).astype(int)

In [None]:
# The individual event columns are no longer needed once `event` summarises them.
df2 = df1.drop(columns=['first_appear_event', 'death_event', 'origin_event'])

In [None]:
# Merge the volume_type categories 'fanzine', 'magazine', 'Prepack', 'fcbd' and 'four-color'
# into a single category called 'other'.
df2.loc[:, 'volume_type'] = df2['volume_type'].replace(
    to_replace=['fanzine', 'magazine', 'Prepack', 'fcbd', 'four-color'],
    value='other',
)

In [None]:
# Summarise df2: column dtypes and non-null counts.
df2.info()

In [None]:
# One-hot encode volume_type: the original column is removed and one indicator column per
# category, prefixed with 'volume_type', is appended at the end of the frame.
# get_dummies(columns=...) encodes in a single step; the previous index-based join would
# multiply rows if the index ever contained duplicate labels.
df3 = pd.get_dummies(df2, columns=['volume_type'], prefix='volume_type')

In [None]:
# Display the encoded frame.
df3

### KMeans

In [None]:
def get_kmeans(dframe, clusters=4, scaler=None, random_state=None):
    """Cluster the rows of ``dframe`` with KMeans and attach the cluster labels.

    Parameters
    ----------
    dframe : pandas.DataFrame
        Feature table to cluster. It is not modified.
    clusters : int, default 4
        Number of clusters, passed to ``KMeans(n_clusters=...)``.
    scaler : object or None, default None
        Optional transformer exposing ``fit_transform``. When given, KMeans is
        fitted on the transformed features; when None, on ``dframe`` as is.
    random_state : int, numpy RandomState or None, default None
        Forwarded to ``KMeans`` so a run can be made reproducible. The default
        None keeps the previous unseeded behaviour.

    Returns
    -------
    tuple
        ``(labelled, km)``: a copy of ``dframe`` with an added ``'kmeans'``
        column holding ``km.labels_``, and the fitted KMeans estimator.
    """
    if scaler is None:
        data = dframe
    else:
        # The scaler gets its own copy so a transformer working in place cannot alter dframe.
        data = scaler.fit_transform(dframe.copy())

    km = KMeans(n_clusters=clusters, random_state=random_state)
    km.fit(data)

    labelled = dframe.copy()
    labelled['kmeans'] = km.labels_
    return labelled, km
    
    

In [None]:
# Scale every feature to the [0, 1] range before clustering into 3 groups.
# (Passing scaler=None instead would cluster on the unscaled features.)
scaler = MinMaxScaler()
dff, model = get_kmeans(df3, clusters=3, scaler=scaler)

In [None]:
# Size of each cluster, counted as the non-null months_ago values per kmeans label.
dff.groupby('kmeans')['months_ago'].count()

In [None]:
# Per-cluster feature means, with the cluster label kept as a regular column.
group = dff.groupby('kmeans').mean().reset_index()

In [None]:
# Show the table of per-cluster means.
group

In [None]:
# Draw one line per cluster across all feature means.
pd.plotting.parallel_coordinates(frame=group, class_column='kmeans')

In [None]:
# Mean, median and standard deviation of the main numeric features for each cluster.
# The statistics are computed on dff, the frame that carries the 'kmeans' label column;
# df1 has no such column, so grouping it by 'kmeans' fails.
stats = ['mean', 'median', 'std']
to_agg = dict.fromkeys(['value_over_price', 'months_ago', 'issue_nr', 'print_nr', 'searched'], stats)
dff.groupby('kmeans').agg(to_agg)

In [None]:
# Pairwise feature plots coloured by cluster, drawn on a random subsample of at most
# 10000 rows. `hue` belongs to pairplot (not to sample), and the data must come from dff,
# the frame that holds the 'kmeans' column.
sns.pairplot(data=dff.sample(min(10000, len(dff))), hue='kmeans')

In [None]:
# Display the frame with the kmeans label column attached.
dff

In [None]:
# One boxplot per feature, split by cluster label. The feature list is taken from the
# per-cluster mean table, so the grouping column itself is left out.
columns = dff.groupby('kmeans').mean().columns.tolist()
for col in columns:
    ax = sns.boxplot(data=dff, x='kmeans', y=col)
    plt.show()

In [None]:
# visualizer = KElbowVisualizer(KMeans(n_clusters=4, random_state=0), colors='yellowbrick')
# visualizer.fit(scaled)
# visualizer.show()  

In [None]:
# visualizer = SilhouetteVisualizer(KMeans(n_clusters=5, random_state=0), colors='yellowbrick')
# visualizer.fit(scaled)
# visualizer.show()   

### Hierarchical Clustering

In [None]:
# Create Dendrogram

def plot_dendrogram(model, **kwargs):
    """Build a linkage matrix from a fitted hierarchical model and draw its dendrogram.

    ``model`` must expose ``children_``, ``distances_`` and ``labels_``.
    Any extra keyword arguments are passed straight to ``dendrogram``.
    """
    merges = model.children_
    n_leaves = len(model.labels_)

    # Number of original samples below each merge node: an index smaller than n_leaves is
    # a single leaf, any other index refers to an earlier merge whose count is already known.
    counts = np.zeros(merges.shape[0])
    for node, children in enumerate(merges):
        counts[node] = sum(
            1 if child < n_leaves else counts[child - n_leaves]
            for child in children
        )

    # Linkage matrix columns: the merged children, the merge distance, the sample count.
    linkage_matrix = np.column_stack((merges, model.distances_, counts)).astype(float)

    dendrogram(linkage_matrix, **kwargs)


# distance_threshold=0 together with n_clusters=None makes the model build the full tree
# and store the merge distances that plot_dendrogram reads from model.distances_.
model = AgglomerativeClustering(distance_threshold=0, memory='./agglo_cache/', n_clusters=None,
                                linkage='ward')

# Cluster a random sample of at most 1000 rows of the one-hot encoded frame.
# NOTE(review): the original sampled an undefined `df_dum`; df3 is assumed to be the
# intended encoded frame - confirm.
sample = df3.sample(n=min(1000, len(df3)), random_state=0)
# Scale every feature to the [0, 1] range
scaler = MinMaxScaler()
scaled_sampled = scaler.fit_transform(sample)
model = model.fit(scaled_sampled)

plt.title("Hierarchical Clustering Dendrogram")
# truncate the dendrogram to at most 100 levels
plot_dendrogram(model, truncate_mode="level", p=100)
plt.xlabel("Number of points in node (or index of point if no parenthesis).")
plt.show()