In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
import re

In [None]:
# Raise pandas' display limits so wide and long frames are shown without truncation
pd.set_option('display.max_columns', 50)
pd.set_option('display.max_rows', 200)

In [None]:
# Import the os module
import os

# Look up the directory the kernel is running in; the relative data paths
# used by the loading cells are resolved against it
cwd = os.getcwd()

# Report it
print(f"Current working directory: {cwd}")

In [None]:
# Load the full dataset from the parquet file (path is relative to the working directory)
data = pd.read_parquet('../data/all_data.parquet')

In [None]:
# Summarize columns, dtypes and non-null counts; 'deep' measures the real memory used by object columns
data.info(memory_usage='deep')

In [None]:
# Load the list of titles, using the first CSV column as the index
titles = pd.read_csv('../data/titles_list_data.csv', low_memory=False, index_col=[0])

In [None]:
# Count rows whose title name already appeared earlier in the list (True) versus first occurrences (False)
titles.title.duplicated().value_counts()
# Title names are not unique, so titles have to be identified by a unique id rather than by name

In [None]:
# Peek at a sample issue link to see the URL structure the id extraction relies on
data.issue_link[0]

In [None]:
# Pull the unique ids of titles and issues out of their link URLs.
# Each pattern has exactly one capture group; expand=False yields that group as a Series.
title_link_pattern = r'/series.(.+\d)/\d.+'
issue_link_pattern = r'/series/.+\d/(.+\d)/.+'
data['title_id'] = data['title_link'].str.extract(title_link_pattern, expand=False)
data['issue_id'] = data['issue_link'].str.extract(issue_link_pattern, expand=False)

In [None]:
# keep relevant columns
relevant_columns = ['pub_name', 'title', 'title_id', 'issue', 'variant_of', 'cover_date',
                    'years', 'cover_price', 'current_value', 'searched',
                    'owned', 'volume', 'issues_total',
                    'pub_titles_total', 'pub_issues_total',
                    'contributors_names', 'contributors_roles', 'characters']
# .copy() makes `df` an independent frame. The following cells assign into it, and
# writing into a selection of `data` raises SettingWithCopyWarning.
df = data[relevant_columns].copy()

In [None]:
# turn prices into numerical
# Strip '$' and thousands separators and treat 'Free' as 0 before parsing.
cover_price_text = (
    df['cover_price']
    .str.replace('[$,]', '', regex=True)
    .str.replace('Free', '0', regex=False)
)
# pd.to_numeric replaces the earlier `eval`, which executed whatever text the data file
# contained as Python code. Like `eval`, it raises on values that are not plain numbers;
# unlike `eval`, it lets missing values pass through as NaN.
df['cover_price'] = pd.to_numeric(cover_price_text)

In [None]:
# Turn the current market value into a number, same cleaning as for the cover price:
# strip '$' and thousands separators and treat 'Free' as 0 before parsing.
current_value_text = (
    df['current_value']
    .str.replace('[$,]', '', regex=True)
    .str.replace('Free', '0', regex=False)
)
# pd.to_numeric replaces the earlier `eval`, which executed whatever text the data file
# contained as Python code. Like `eval`, it raises on values that are not plain numbers;
# unlike `eval`, it lets missing values pass through as NaN.
df['current_value'] = pd.to_numeric(current_value_text)

In [None]:
# Many issues with cover price 0 have high current value so we keep them
free_issues = df.loc[df['cover_price'] == 0]
free_issues.sort_values(by='current_value', ascending=False).head(5)

### `issue` column

The `issue` column mixes several numbering schemes, some of them quite unusual:

* Simple ascending numbering # 1-...
* 'Ashcan' Issues which are traditionally used to promote a new series (see [wiki](https://en.wikipedia.org/wiki/Ashcan_comic)). Ashcan comics can be quite rare and valuable, especially from the [Golden Age](https://en.wikipedia.org/wiki/Golden_Age_of_Comic_Books) (30s-50s)
* Issues with `nn` meaning 'non-numbered'. Typically 0th issues or one-shots
* Issues with numbering of the format: # (number)(variant-cover/ special ed./ convention ed. etc.): Normally these issues should be indicated as "variant of" (issue number) in the `variant_of` column
* Issues in the format `Vol N # M` (where N and M are numbers), i.e. two enumerations are given: one for the volume and one for the issue number
* A date is given instead of an issue number, e.g. "Spring 2007" or simply "1989"
* Numbering which includes letters e.g. "C-2" or "x"
* Just the name of the protagonist or in general something explanatory e.g. "Batwoman"
* The print run is indicated e.g. '# 2 - 2nd print'


In [None]:
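# Let's get rid of the season / part-of-year qualifiers (e.g. 'Winter ', 'Early ') in the cover dates
# NOTE(review): the original comment was left unfinished; wording inferred from the cells that follow — confirm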

In [None]:
# Work on a separate column so the raw `cover_date` text stays untouched
df['proper_date'] = df['cover_date'].copy()

In [None]:
# Remove season / part-of-year qualifiers (with their trailing space) wherever they occur.
# The tuple keeps the order of the original one-by-one replacements.
date_qualifiers = ('Winter ', 'Early ', 'Spring ', 'Holiday ', 'Summer ', 'Late ', 'Fall ', 'Mid ')
for qualifier in date_qualifiers:
    df['proper_date'] = df['proper_date'].str.replace(qualifier, '', regex=False)

In [None]:
# Values that consist only of a qualifier or an "unknown" marker carry no date: mark them missing
undated_markers = ['Spring', 'Unda', 'Unkn', 'Winter', 'Early', 'Late', 'No D']
df.loc[df['proper_date'].isin(undated_markers), 'proper_date'] = np.nan

# A lone '199' is mapped to '1990.1'
df.loc[df['proper_date'] == '199', 'proper_date'] = '1990.1'

In [None]:
# convert yy to yyyy

# (apostrophe form, four-digit year) pairs, in the same order as the three original loops:
#   '30 .. '99 -> 1930 .. 1999
#   '00 .. '09 -> 2000 .. 2009
#   '10 .. '22 -> 2010 .. 2022
year_rewrites = (
    [("'" + str(yy), '19' + str(yy)) for yy in range(30, 100)]
    + [("'0" + str(y), '200' + str(y)) for y in range(0, 10)]
    + [("'" + str(yy), '20' + str(yy)) for yy in range(10, 23)]
)

for short_form, full_year in year_rewrites:
    df['proper_date'] = df['proper_date'].str.replace(short_form, full_year, regex=False)

In [None]:
# save checkpoint
# Keep an independent copy of the cleaned column so it can be restored without re-running the cells above
cp = df['proper_date'].copy()

In [None]:
# load checkpoint
# Restores `proper_date` from the saved copy; on a straight top-to-bottom run the values are unchanged
df['proper_date'] = cp.copy()

In [None]:
# Rows whose date is still missing, with just the columns used to estimate one
needs_estimate = df['proper_date'].isnull()
df2 = df.loc[needs_estimate, ['issue', 'title_id', 'volume', 'years']].copy()

In [None]:
# Normalize the publication-years text: an open-ended run ends in 2022,
# and apostrophes and every letter 's' are dropped
years_text = df2['years']
for old_text, new_text in (('Present', '2022'), ('present', '2022'), ("'", ''), ("s", '')):
    years_text = years_text.str.replace(old_text, new_text, regex=False)
df2['years'] = years_text

# Markers without any year information become missing
df2.loc[df2['years'].isin(['No Date', 'Undated', 'Unknown']), 'years'] = np.nan

# Hand-corrected individual values
for bad_value, fixed_value in (('Late 1960', '1968'), ('199', '1990'), ('1986-187', '1986-1987')):
    df2.loc[df2['years'] == bad_value, 'years'] = fixed_value

In [None]:
# First four characters are taken as the start year, last four as the end year
df2['start_year'] = df2['years'].str.slice(stop=4)
df2['end_year'] = df2['years'].str.slice(start=-4)

In [None]:
# Missing issue / volume labels default to '1'.
# Assign the result back instead of calling fillna(inplace=True) on the selected column:
# the in-place call works on an intermediate object and does not update df2 under
# pandas Copy-on-Write.
df2['issue'] = df2['issue'].fillna('1')
df2['volume'] = df2['volume'].fillna('1')

In [None]:
# Join volume and issue text so all their numbers can be extracted in one pass.
# The space keeps the two apart: without it, volume '1' followed by issue '12' reads as the
# single number '112' and cannot be told from volume '11', issue '2'.
df2['issue_list'] = df2['volume'] + ' ' + df2['issue']

In [None]:
# Replace the combined text by the list of digit runs it contains, in order of appearance.
# The pattern is a raw string: '\d' in a normal string literal is an invalid escape sequence
# and triggers a warning on recent Python versions.
df2['issue_list'] = df2['issue_list'].str.findall(r'\d+')

In [None]:
# Spread the first four number tokens over columns issue_0 .. issue_3, each prefixed with '1'.
# A position beyond the end of a row's list yields a missing value.
for position in range(4):
    df2['issue_' + str(position)] = '1' + df2['issue_list'].str[position]

In [None]:
# Token positions that did not exist for a row default to '1'.
# Assign the result back: fillna(inplace=True) on a `.loc` selection acts on an intermediate
# object and is not guaranteed to write into df2.
for token_col in ('issue_0', 'issue_1', 'issue_2', 'issue_3'):
    df2[token_col] = df2[token_col].fillna('1')

In [None]:
# Convert the token strings to integers.
# The strings are '1' followed by digits only, so int() gives exactly what `eval` returned,
# without executing the text as Python code.
for token_col in ('issue_0', 'issue_1', 'issue_2', 'issue_3'):
    df2[token_col] = df2[token_col].map(int)

In [None]:
# Fold the four tokens into one sortable number; earlier tokens carry more weight
order_key = df2['issue_0'] * 1000000000
order_key = order_key + df2['issue_1'] * 1000000
order_key = order_key + df2['issue_2'] * 1000
df2['order'] = order_key + df2['issue_3']

In [None]:
# Distinct title ids among the rows to estimate (missing ids are not included)
title_ids = df2['title_id'].value_counts().index.tolist()

In [None]:
# Map '<order>-<title_id>' to the relative position of that issue within its title:
# 0.0 for the lowest order value, 1.0 for the highest, evenly spaced in between.
# A title with a single row gets 0.0.
# One groupby pass replaces re-filtering the whole frame once per title; like value_counts,
# groupby leaves out rows without a title id.
d = {}
for t, title_orders in df2.groupby('title_id')['order']:
    L = sorted(title_orders)
    if len(L) > 1:
        # duplicate order values share one key, so the last position wins
        d.update({str(v) + '-' + t: i / (len(L) - 1) for i, v in enumerate(L)})
    else:
        d.update({str(L[0]) + '-' + t: 0.0})

In [None]:
# Build each row's lookup key in the same '<order>-<title_id>' form used by the dict `d`
df2['multiplier'] = df2['order'].map(str) + '-' + df2['title_id']

In [None]:
# Swap each lookup key for its relative position from `d`.
# Assign the result back: replace(inplace=True) on the selected column acts on an
# intermediate object and does not update df2 under pandas Copy-on-Write.
df2['multiplier'] = df2['multiplier'].replace(d)

In [None]:
# Rows without year information get '0' as a placeholder.
# Assign the result back: fillna(inplace=True) on a `.loc` selection acts on an intermediate
# object and is not guaranteed to write into df2.
df2['start_year'] = df2['start_year'].fillna('0')
df2['end_year'] = df2['end_year'].fillna('0')

In [None]:
# Turn the 4-character year strings into numbers.
# NOTE(review): `eval` executes each string as Python code. Besides being unsafe on text that
# comes from a data file, a value that is not a plain number is computed instead of rejected
# (e.g. '0-41' evaluates to -41). Consider pd.to_numeric once the `years` strings have been
# verified to be plain numbers.
df2.loc[:, 'start_year'] = df2.loc[:, 'start_year'].apply(eval)
df2.loc[:, 'end_year'] = df2.loc[:, 'end_year'].apply(eval)

In [None]:
# Place each issue between its title's start and end year according to its relative position
year_span = df2['end_year'] - df2['start_year']
df2['est_date'] = df2['start_year'] + year_span * df2['multiplier']

In [None]:
def year_func(a):
    """Convert a fractional year into a 'YYYY.M.1' date string.

    The integer part of ``a`` is the year; the fractional part is mapped onto
    months 1-12 (fraction 0 -> month 1). The day is always 1.

    Parameters
    ----------
    a : float or int
        Fractional year, e.g. 1990.5.

    Returns
    -------
    str or float
        Date text such as '1990.7.1'. If ``a`` is None or NaN, NaN is returned
        instead of raising, so a missing estimate stays missing.
    """
    # math.floor raises on NaN, so handle missing input first
    if a is None or math.isnan(a):
        return float('nan')
    year = int(math.floor(a))
    month = math.floor((a % 1) * 12) + 1
    return str(year) + '.' + str(month) + '.1'

In [None]:
# Turn every fractional-year estimate into its date text via year_func
df2['est_date'] = df2['est_date'].map(year_func)

In [None]:
# Fill the still-missing dates with the estimates; values are matched on the index labels.
# A single .loc call writes into df itself. The chained form `df.loc[:, col][mask] = ...`
# assigns into an intermediate object and is not guaranteed to update df.
missing_date = df['proper_date'].isnull()
df.loc[missing_date, 'proper_date'] = df2['est_date']

In [None]:
# Parse the date text; anything unparseable becomes NaT.
# Plain column assignment replaces the column, so it takes the datetime dtype.
# `df.loc[:, col] = ...` sets the values in place and can leave the column as `object`.
# NOTE(review): the column may mix date formats (original cover dates and 'YYYY.M.1'
# estimates) — check how many rows end up NaT on the installed pandas version.
df['proper_date'] = pd.to_datetime(df['proper_date'], errors='coerce')

In [None]:
# Persist just the cleaned date column, without the index
proper_date_column = df['proper_date']
proper_date_column.to_csv('../data/proper_date.csv', index=False)