In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import KMeans
from sklearn.metrics import  silhouette_score
from sklearn import datasets
from scipy.cluster.hierarchy import dendrogram, linkage
from yellowbrick.cluster import SilhouetteVisualizer
from yellowbrick.cluster import KElbowVisualizer

# Notebook-wide pandas settings: silence the chained-assignment warning and
# raise the number of columns/rows rendered when a frame is displayed.
pd.set_option('mode.chained_assignment', None)
pd.set_option('display.max_columns', 50)
pd.set_option('display.max_rows', 200)

In [None]:
# Load the cleaned dataset; the first CSV column becomes the row index.
data = pd.read_csv(
    '../data/data_clean.csv',
    index_col=[0],
)

In [None]:
# Build the working frame from `data` without modifying it: dropna() returns
# a new frame in which every row containing a missing value has been removed.
df = data.dropna()

In [None]:
# Convert the true/false flag columns to 1/0 integers.
flag_columns = ['variant', 'first_appear_event', 'death_event', 'origin_event']
for flag in flag_columns:
    df[flag] = df[flag].map(int)

In [None]:
# months_ago: number of months between the issue date and January 2022.
# The year is read from characters 0-3 of `date`, the month from characters 5-6.
issue_year = df['date'].str[:4].map(int)
issue_month = df['date'].str[5:7].map(int)
df['months_ago'] = (2022 - issue_year) * 12 + (1 - issue_month)

In [None]:
# create volume_type
# A purely numeric volume string marks a regular run and is labelled
# 'ongoing'; any other entry (e.g. 'TPB/SC', 'HC', 'GN') is kept verbatim as
# the volume type.
# str.isdecimal() is used instead of the lexicographic test `> '9'`, which
# wrongly classifies numeric volumes such as '90'-'99' as a volume type.
is_numeric_volume = df['volume'].str.isdecimal()
df['volume_type'] = df['volume'].where(~is_numeric_volume, 'ongoing')

In [None]:
# Exclude rows whose volume_type is 'TPB/SC', 'HC' or 'GN'.
excluded_volume_types = ['TPB/SC', 'HC', 'GN']
keep_row = ~df['volume_type'].isin(excluded_volume_types)
df = df[keep_row]

In [None]:
# Replace non-numeric volume entries with '0', then cast the column to int.
# Series.where builds the new column in one step instead of the chained
# assignment df['volume'][mask] = '0', which is not guaranteed to write back
# into df (and never does under pandas Copy-on-Write).
# str.isdecimal() replaces the lexicographic test `> '9'`, which also zeroed
# numeric volumes such as '90'-'99'.
is_numeric_volume = df['volume'].str.isdecimal()
df['volume'] = df['volume'].where(is_numeric_volume, '0').map(int)

In [None]:
# Keep only rows whose cover price is strictly below 9.
price_below_limit = df['cover_price'].lt(9)
df = df.loc[price_below_limit]

In [None]:
# Derive the year from the first four characters of `date`, then keep only
# rows with a year after 1935 that is not 2022.
df['year'] = df['date'].str[:4].map(int)
year_in_range = df['year'].gt(1935) & df['year'].ne(2022)
df = df[year_in_range]

In [None]:
# Replace zero cover prices with the mean cover price of the same year.
# Only cover_price is aggregated: a frame-wide groupby('year').mean() raises
# TypeError on the text columns (e.g. `date`) in pandas >= 2.0.
# NOTE(review): zero-priced rows still take part in each yearly mean, exactly
# as before — confirm whether they should be excluded from the average.
d = dict(df.groupby('year')['cover_price'].mean())
is_zero_price = df['cover_price'] == 0
# mask() returns a new column, avoiding the chained assignment
# df['cover_price'][...] = ..., which may write to a temporary object.
df['cover_price'] = df['cover_price'].mask(is_zero_price, df['year'].map(d))

In [None]:
# Where current_value is 0, fall back to the cover price; then express the
# current value as a multiple of the cover price.
# mask() replaces the chained assignment df['current_value'][...] = ...,
# which is not guaranteed to modify df (and never does under Copy-on-Write).
is_zero_value = df['current_value'] == 0
df['current_value'] = df['current_value'].mask(is_zero_value, df['cover_price'])
df['value_over_price'] = df['current_value'] / df['cover_price']

In [None]:
# pub_issues_per_title: ratio of pub_issues_total to pub_titles_total.
issues_total = df['pub_issues_total']
titles_total = df['pub_titles_total']
df['pub_issues_per_title'] = issues_total.div(titles_total)

In [None]:
# One-hot encode volume_type and special; each indicator column is prefixed
# with the name of the column it was derived from.
for categorical in ('volume_type', 'special'):
    indicators = pd.get_dummies(df[categorical], prefix=categorical)
    df = df.join(indicators)

In [None]:
# Columns excluded from the clustering feature frame df_keep.
# (For a quicker experiment, subsample first, e.g. df = df.sample(10000).)
columns_to_drop = [
    'year', 'cover_price', 'current_value', 'pub_titles_total',
    'pub_name', 'title', 'title_id', 'issues_total',
    'volume_type', 'special', 'date',
]
df_keep = df.drop(columns=columns_to_drop)

In [None]:
# Fit the scaler on df_keep and produce X, the standardised feature matrix.
ss = StandardScaler()
X = ss.fit_transform(df_keep)

In [None]:
# Fit k-means with 4 clusters on the scaled features, using a fixed seed.
kmeans_params = {'n_clusters': 4, 'random_state': 526}
model_kmeans = KMeans(**kmeans_params).fit(X)

In [None]:
# Silhouette score of the k-means assignment; shown as the cell output.
cluster_labels = model_kmeans.labels_
silhouette_score(X, cluster_labels)

In [None]:
# Silhouette plot for the fitted k-means model.
visualizer = SilhouetteVisualizer(model_kmeans, colors='yellowbrick')
visualizer.fit(X)
# show() is the current Yellowbrick method; poof() is its deprecated alias,
# and the elbow plot in this notebook already calls show().
visualizer.show()

In [None]:
# Elbow plot; Yellowbrick expands the tuple k=(2,10) to k = 2..9.
# KElbowVisualizer refits the estimator it is given once per k, so it gets its
# own KMeans with the same hyperparameters. Passing model_kmeans itself would
# leave that model refit with the last k tried, and model_kmeans.labels_ would
# no longer hold the 4-cluster assignment used for the cluster exploration.
elbow_model = KMeans(**model_kmeans.get_params())
visualizer = KElbowVisualizer(elbow_model, k=(2,10))
visualizer.fit(X)
visualizer.show()

# Cluster exploration

In [None]:
# Global plot parameters, applied in this order: default figure size,
# seaborn font scale and style, then matplotlib's dark background style sheet.
plt.rcParams.update({'figure.figsize': (10, 8)})
sns.set(style='dark', font_scale=1.5)
plt.style.use('dark_background')

In [None]:
# Store the k-means cluster id of every row in df as the `label` column.
y = model_kmeans.labels_
df = df.assign(label=y)

In [None]:
# Number of rows per cluster label, ordered by label.
label_counts = df['label'].value_counts()
label_counts.sort_index()

In [None]:
# Per-cluster mean of every numeric column.
# numeric_only=True is needed on pandas >= 2.0, where mean() no longer drops
# the text columns silently and raises TypeError instead.
df.groupby('label').mean(numeric_only=True)

In [None]:
# One box plot per numeric column, split by cluster label.
# numeric_only=True keeps the column discovery working on pandas >= 2.0,
# where a frame-wide mean() raises TypeError on the text columns.
columns = list(df.groupby('label').mean(numeric_only=True).columns)
for col in columns:
    ax = sns.boxplot(x='label', y=col, data=df)
    plt.show()

In [None]:
# For each cluster label, plot the mean current_value per year.
labels = list(df['label'].value_counts().sort_index().index)
for label in labels:
    # Select current_value before aggregating: a frame-wide mean() raises
    # TypeError on the text columns in pandas >= 2.0 and averages columns
    # that are never plotted.
    df_to_plot = df[df['label']==label].groupby('year')['current_value'].mean()
    title = f'Label: {label} Average current value per year published'

    df_to_plot.plot()
    plt.title(title, size=20)
    plt.show()