In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import KMeans
from sklearn.metrics import  silhouette_score
from sklearn import datasets
from scipy.cluster.hierarchy import dendrogram, linkage
from yellowbrick.cluster import SilhouetteVisualizer
from yellowbrick.cluster import KElbowVisualizer
import datetime
%load_ext autotime

In [None]:
# Notebook-wide pandas settings: silence chained-assignment warnings and
# widen the DataFrame display limits.
pd.set_option('mode.chained_assignment', None)
pd.set_option('display.max_columns', 50)
pd.set_option('display.max_rows', 200)

In [None]:
# Load the full dataset (parquet) and the cleaned dataset (CSV; its first
# column becomes the index).
data = pd.read_parquet(path='../data/all_data.parquet')
df = pd.read_csv(filepath_or_buffer='../data/data_clean.csv', index_col=[0])

In [None]:
# data.info()

In [None]:
# Overwrite data['date'] with the corrected dates from proper_date.csv.
# The values are assigned positionally (no index alignment), so the file is
# presumed to be row-aligned with `data` -- TODO confirm.
proper_date = pd.read_csv(filepath_or_buffer='../data/proper_date.csv')
data['date'] = proper_date['proper_date'].values

In [None]:
# Extract the unique numeric ids of titles and issues from their links.
# expand=False makes str.extract return a Series for the single capture group,
# and no copy of the whole frame is needed just to read one column.
data['title_id'] = data['title_link'].str.extract(pat=r'/series.(\d+)/\d.+', expand=False)
data['issue_id'] = data['issue_link'].str.extract(pat=r'/series/\d+/(\d+)/.+', expand=False)
# Keep the relevant columns. issue_id is retained here: it was extracted just
# above but previously discarded again by this selection.
data = data[['pub_name', 'title', 'title_id', 'issue_id', 'variant_of', 'volume', 'cover_date',
             'years', 'date', 'cover_price', 'current_value', 'searched',
             'owned', 'issues_total', 'pub_titles_total', 'pub_issues_total',
             'contributors_names', 'contributors_roles', 'characters', 'synopsis']]

In [None]:
# Convert the true/false flag columns to 1/0 integers.
for flag_col in ['variant', 'first_appear_event', 'death_event', 'origin_event']:
    df[flag_col] = df[flag_col].astype(int)

# Create the months_ago column: age of each issue relative to "now".
# np.timedelta64(1, 'M') is not a fixed duration and recent pandas rejects it
# as a divisor, so divide by an explicit average Gregorian month instead.
# NOTE: because this uses the current time, the values change between runs.
df['date'] = pd.to_datetime(df['date'])
average_month = pd.Timedelta(days=365.2425 / 12)
months = (datetime.datetime.now() - df['date']) / average_month
df['months_ago'] = months

# Create volume_type. `volume` holds strings; anything that sorts after '9'
# is treated as a non-numeric label (e.g. 'TPB/SC', 'HC', 'GN'), while numeric
# volumes are classed as 'ongoing'.
is_labelled = df['volume'] > '9'
df['volume_type'] = df['volume'].where(is_labelled, 'ongoing')

# Set the labelled (non-numeric) volumes to '0' so the column can become an
# integer. .loc is used because chained indexing (df['volume'][mask] = ...)
# may write to a temporary copy and leave df unchanged.
df.loc[is_labelled, 'volume'] = '0'
df['volume'] = df['volume'].astype(int)

In [None]:
# Keep only issues dated from 1936 through 2021.
issue_year = df['date'].dt.year
in_period = issue_year.between(1936, 2021)
# Exclude TPB/SC, HC and GN editions. Because their data is incomplete, a
# cover-price cap (<= 9) is applied as an additional filter.
within_price_cap = df['cover_price'] <= 9
is_collected_edition = df['volume_type'].isin(['TPB/SC', 'HC', 'GN'])
df = df[in_period & within_price_cap & ~is_collected_edition]


In [None]:
# Replace non-positive cover prices with the mean cover price of the same
# year, where the mean is taken over that year's non-zero prices.
df['year'] = df['date'].dt.year
priced_rows = df.loc[df['cover_price'] != 0]
avrg_cp = priced_rows.groupby('year')['cover_price'].mean().to_dict()
yearly_mean_price = df['year'].map(avrg_cp)
df['cover_price'] = df['cover_price'].where(df['cover_price'] > 0, yearly_mean_price, axis=0)

In [None]:
# Fill issues_total from the counts of the dataset: wherever it is 0 or
# missing, use the number of rows (with a non-null months_ago) that share the
# same title_id.
issue_counts = df.groupby('title_id')['months_ago'].count().to_dict()
counted_issues = df['title_id'].map(issue_counts)
df['issues_total'] = (
    df['issues_total']
    .mask(df['issues_total'] == 0, counted_issues)
    .fillna(counted_issues)
)

In [None]:
# Wherever current_value is 0, fall back to the cover_price.
has_no_value = df['current_value'] == 0
df['current_value'] = df['current_value'].mask(has_no_value, df['cover_price'])

In [None]:
# Add a column with the ratio current_value / cover_price.
df['value_over_price'] = df['current_value'].div(df['cover_price'])

In [None]:
# Inspect column dtypes and non-null counts after adding value_over_price.
df.info()

In [None]:
# Ratio of a publisher's total issues to its total titles.
df['pub_issues_over_titles'] = df['pub_issues_total'].div(df['pub_titles_total'])

In [None]:
# Inspect column dtypes and non-null counts after adding pub_issues_over_titles.
df.info()

In [None]:
# Columns carried forward as model features; df1 is an independent copy so
# later changes to it do not touch df.
feature_columns = [
    'issue_nr', 'variant', 'volume', 'volume_type', 'print_nr', 'months_ago',
    'value_over_price', 'special', 'searched', 'owned', 'first_appear_event',
    'death_event', 'origin_event', 'issues_total', 'pub_issues_total',
    'pub_issues_over_titles',
]
df1 = df[feature_columns].copy()

In [None]:
# Display the feature frame for a visual check.
df1

In [None]:
# Persist the feature frame (index included) for later use.
df1.to_csv(path_or_buf='../data/data_featurized.csv')

In [None]:
# One-hot encode volume_type and special, then drop the original
# categorical columns so only the indicator columns remain.
volume_type_dummies = pd.get_dummies(df['volume_type'], prefix='volume_type')
special_dummies = pd.get_dummies(df['special'], prefix='special')
df_dum = (
    df1
    .join(volume_type_dummies)
    .join(special_dummies)
    .drop(columns=['volume_type', 'special'])
)

In [None]:
# Drop every row of df1 that has a missing value in any column.
# NOTE: df_dum was built from df1 before this point and is not affected.
df1 = df1.dropna()

In [None]:
# from pandas_profiling import ProfileReport
# prof = ProfileReport(df1)
# prof.to_file(output_file='output.html')

In [None]:
# Sanity check: count the issues priced above 9. df1 holds only the selected
# feature columns (no cover_price), so the check has to run on df.
(df.cover_price > 9).sum()

In [None]:
# Total current value of the issues priced above 9. cover_price and
# current_value are columns of df, not of the feature-only df1.
df.loc[df.cover_price > 9, 'current_value'].sum()

In [None]:
# Draw a reproducible sample of the one-hot encoded features for clustering.
# df_dum is the encoded frame (the name df_num is never defined). Rows with
# missing values are removed first because KMeans rejects NaN input, and the
# sample size is capped so a frame with fewer than 10000 rows still works.
clusterable = df_dum.dropna()
df_sample = clusterable.sample(n=min(10000, len(clusterable)), random_state=0)

In [None]:
# Build X: the sampled features standardised with a scaler fitted on that
# same sample.
ss = StandardScaler()
X = ss.fit_transform(df_sample)

In [None]:
# Fit a 3-cluster KMeans model on the scaled sample (fixed seed).
model_kmeans = KMeans(n_clusters=3, random_state=0)
model_kmeans.fit(X)

In [None]:
# Mean silhouette coefficient of the KMeans cluster assignment.
silhouette_score(X, labels=model_kmeans.labels_)

In [None]:
# Silhouette plot for the KMeans model.
visualizer = SilhouetteVisualizer(model_kmeans, colors='yellowbrick')
visualizer.fit(X)
# show() is the current name of the render call; poof() is its deprecated
# predecessor and may be missing in newer Yellowbrick releases.
visualizer.show()

In [None]:
# Elbow plot over the candidate cluster counts given by k=(2, 10).
# A fresh estimator is passed on purpose: the visualizer changes n_clusters on
# the estimator it is given and refits it, which would otherwise overwrite the
# already fitted 3-cluster model_kmeans.
visualizer = KElbowVisualizer(KMeans(random_state=0), k=(2, 10))
visualizer.fit(X)
visualizer.show()