In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import KMeans
from sklearn.metrics import  silhouette_score
from sklearn import datasets
from scipy.cluster.hierarchy import dendrogram, linkage
from yellowbrick.cluster import SilhouetteVisualizer
from yellowbrick.cluster import KElbowVisualizer

# Notebook-wide pandas settings: silence chained-assignment warnings and
# widen the limits used when rendering DataFrames.
pd.set_option('mode.chained_assignment', None)
pd.set_option('display.max_columns', 50)
pd.set_option('display.max_rows', 200)

In [None]:
# Load the cleaned dataset; the first CSV column is used as the row index.
data = pd.read_csv(
    '../data/data_clean.csv',
    index_col=[0],
)

In [None]:
# Work on an independent frame with every row containing a missing value
# removed; `data` itself is left untouched.
df = data.dropna().copy()

In [None]:
# Convert the true/false flag columns to 1/0 integers.
for flag_col in ('variant', 'first_appear_event', 'death_event', 'origin_event'):
    df[flag_col] = df[flag_col].apply(int)

In [None]:
# Create the months_ago column: the number of months between each row's date
# and January 2022.
def _months_before_jan_2022(stamp):
    """Return the month offset of `stamp` from January 2022.

    Reads the year from characters 0-3 and the month from characters 5-6
    (presumably a 'YYYY-MM...' string -- TODO confirm against the data).
    """
    year = int(stamp[:4])
    month = int(stamp[5:7])
    return (2022 - year) * 12 + (1 - month)


df['months_ago'] = df['date'].map(_months_before_jan_2022)

In [None]:
# Create volume_type: rows whose 'volume' string sorts after '9' keep that
# string as their type; all remaining rows are labelled 'ongoing'.
is_label = df['volume'] > '9'
df['volume_type'] = df.loc[is_label, 'volume']
df['volume_type'] = df['volume_type'].fillna('ongoing')

In [None]:
# Replace the 'volume' strings that sort after '9' with '0', then cast the
# whole column to int.
# A single .loc assignment is used instead of chained indexing
# (df['volume'][mask] = ...): chained assignment may write to a temporary
# object and is a silent no-op under pandas Copy-on-Write.
# NOTE(review): the comparison is lexicographic, so a numeric string such as
# '91' also sorts after '9' -- confirm the data contains no such volumes.
is_label = df['volume'] > '9'
df.loc[is_label, 'volume'] = '0'
df['volume'] = df['volume'].apply(int)

In [None]:
# One-hot encode volume_type and special, appending the indicator columns
# (prefixed with the source column name) to df.
for source_col in ('volume_type', 'special'):
    indicators = pd.get_dummies(df[source_col], prefix=source_col)
    df = df.join(indicators)

In [None]:
# Build the feature frame by removing the columns that are not wanted for
# clustering (including the originals of the one-hot encoded columns).
unwanted_columns = [
    'pub_name',
    'title',
    'title_id',
    'issues_total',
    'volume_type',
    'special',
    'date',
]
df_num = df.drop(columns=unwanted_columns)

In [None]:
# Display the feature frame for a visual check before sampling and scaling.
df_num

In [None]:
# Draw a reproducible random subset of at most 10000 rows for clustering.
# The fixed random_state keeps the sample (and every downstream score/plot)
# stable across Restart & Run All; capping n at the frame length avoids the
# ValueError that sample() raises when asked for more rows than exist.
df_sample = df_num.sample(n=min(10000, len(df_num)), random_state=0)

In [None]:
# Create X: the sampled features standardised by a StandardScaler fitted on
# that same sample. The fitted scaler stays available as `ss`.
ss = StandardScaler()
X = ss.fit_transform(df_sample)

In [None]:
# Fit a 3-cluster k-means model on the scaled sample (seeded for repeatability).
model_kmeans = KMeans(n_clusters=3, random_state=0)
model_kmeans.fit(X)

In [None]:
# Mean silhouette coefficient of the k-means cluster assignment on X.
silhouette_score(X, labels=model_kmeans.labels_)

In [None]:
# Silhouette plot for the k-means model on the scaled sample.
# show() replaces the deprecated poof() alias and matches the elbow cell.
visualizer = SilhouetteVisualizer(model_kmeans, colors='yellowbrick')
visualizer.fit(X)
visualizer.show()

In [None]:
# Elbow plot over the candidate cluster counts given by k=(2, 10).
# A fresh, identically seeded KMeans is passed instead of model_kmeans:
# KElbowVisualizer resets n_clusters on the estimator it is given and refits
# it for every k, which would silently overwrite the fitted 3-cluster model.
visualizer = KElbowVisualizer(KMeans(random_state=0), k=(2, 10))
visualizer.fit(X)
visualizer.show()