In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Raise the display limits so wide / long frames are shown without truncation.
pd.set_option('display.max_columns', 50)
pd.set_option('display.max_rows', 200)

In [None]:
# Load the main dataset (parquet) from the data directory next to the notebook folder.
data = pd.read_parquet(path='../data/all_data.parquet')

In [None]:
# Print the column overview (dtypes, non-null counts); memory_usage='deep'
# makes pandas measure the real memory held by object columns instead of
# estimating it.
data.info(memory_usage='deep')

In [None]:
# Load the titles list; the first CSV column is used as the index.
titles = pd.read_csv(
    '../data/titles_list_data.csv',
    index_col=[0],
    low_memory=False,
)

In [None]:
# Count repeated vs. first-seen title names. Identical titles exist, so titles
# have to be connected with a unique id for convenience.
titles['title'].duplicated().value_counts()

In [None]:
# Show one raw issue link (row label 0) -- presumably to check the URL layout
# that the id-extraction patterns depend on.
data['issue_link'][0]

In [None]:
# Derive the unique title / issue ids from the link columns; in each pattern
# the single capture group is the id segment of the URL path.
# NOTE(review): the title pattern has `.` (any character) after `/series`,
# whereas the issue pattern has a literal `/` -- confirm this is intended.
title_id_pattern = r'/series.(.+\d)/\d.+'
issue_id_pattern = r'/series/.+\d/(.+\d)/.+'
data['title_id'] = data['title_link'].str.extract(pat=title_id_pattern)
data['issue_id'] = data['issue_link'].str.extract(pat=issue_id_pattern)

In [None]:
# keep relevant columns
# .copy() makes `df` an independent frame rather than a slice of `data`, so
# later column assignments on `df` modify `df` itself and do not raise
# SettingWithCopyWarning.
relevant_columns = [
    'pub_name', 'title', 'title_id', 'issue', 'variant_of', 'cover_date',
    'years', 'cover_price', 'current_value', 'searched',
    'owned', 'volume', 'issues_total',
    'pub_titles_total', 'pub_issues_total',
    'contributors_names', 'contributors_roles', 'characters',
]
df = data[relevant_columns].copy()

In [None]:
# turn prices into numerical
# Strip '$' and thousands separators and map 'Free' to 0, then parse with
# pd.to_numeric rather than eval(): eval would execute any expression found in
# the data and raises on missing values, while to_numeric only parses numbers
# (and still raises on text that is not a number).
# assign() returns a new frame, so the column really gets a numeric dtype (on
# recent pandas a `df.loc[:, col] = ...` write can keep the old object dtype)
# and no SettingWithCopyWarning is triggered.
cover_price_text = (
    df['cover_price']
    .str.replace('[$,]', '', regex=True)
    .str.replace('Free', '0', regex=False)
)
df = df.assign(cover_price=pd.to_numeric(cover_price_text))

In [None]:
# Convert current values to numbers: strip '$' and thousands separators and map
# 'Free' to 0, then parse with pd.to_numeric rather than eval(). eval would
# execute any expression found in the data and raises on missing values, while
# to_numeric only parses numbers (and still raises on text that is not a number).
# assign() returns a new frame, so the column really gets a numeric dtype (on
# recent pandas a `df.loc[:, col] = ...` write can keep the old object dtype)
# and no SettingWithCopyWarning is triggered.
current_value_text = (
    df['current_value']
    .str.replace('[$,]', '', regex=True)
    .str.replace('Free', '0', regex=False)
)
df = df.assign(current_value=pd.to_numeric(current_value_text))

In [None]:
# Issues with a cover price of 0 are kept: listing the five with the highest
# current value shows that many of them are worth a lot.
(
    df.loc[df['cover_price'] == 0]
    .sort_values(by='current_value', ascending=False)
    .head(5)
)

### `issue` column

There are several types of issues, as indicated by their numbering, including many unusual numberings:

* Simple ascending numbering # 1-...
* 'Ashcan' Issues which are traditionally used to promote a new series (see [wiki](https://en.wikipedia.org/wiki/Ashcan_comic)). Ashcan comics can be quite rare and valuable, especially from the [Golden Age](https://en.wikipedia.org/wiki/Golden_Age_of_Comic_Books) (30s-50s)
* Issues with `nn` meaning 'non-numbered'. Typically 0th issues or one-shots
* Issues with numbering of the format: # (number)(variant-cover/ special ed./ convention ed. etc.): Normally these issues should be indicated as "variant of" (issue number) in the `variant_of` column
* Issues in the `Vol ** # **` format (where `**` stands for a number), i.e. two enumerations are given: one for the volume and one for the issue number
* A date is given instead of an issue number, e.g. "Spring 2007" or simply "1989"
* Numbering which includes letters e.g. "C-2" or "x"
* Just the name of the protagonist or in general something explanatory e.g. "Batwoman"
* The print run is indicated e.g. '# 2 - 2nd print'


In [None]:
# Let's get rid of 