# Prep data

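Keep only rows where annotation type == "Main Description" and annotation source == "Ingram". Each BISAC code is then reduced to its alphabetic characters (e.g. `FIC`), and a book is kept only if every one of its BISAC categories has at least 500 books.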

In [23]:
import ast
import re
import pandas as pd

# Minimum number of books a BISAC category must have to be kept.
MIN_BOOKS_PER_BISAC = 500

# Matches every character that is not an ASCII letter.
NON_ALPHA = re.compile(r"[^a-zA-Z]")

df = pd.read_csv("data.csv")
print(f"original dataframe shape: {df.shape}")

# Restrict to "Main Description" annotations whose source is "Ingram".
# .copy() detaches the result so the column assignments below do not
# operate on a view of the full frame.
df = df[
    (df["annot_type"] == "Main Description") & (df["annot_source"] == "Ingram")
].copy()
# The BISAC column is read from CSV as text; parse each cell back into a
# Python object (presumably a list of code strings -- verify against data.csv).
df["BISAC"] = df["BISAC"].apply(ast.literal_eval)
print(f'"Main Description", "Ingram" dataframe shape: {df.shape}')

# Strip every non-letter character from each BISAC code and drop duplicates.
# sorted() fixes the order of each row's list: list(set(...)) alone yields an
# order that depends on string hashing and can differ between kernel restarts.
df["BISAC"] = df["BISAC"].apply(
    lambda codes: sorted({NON_ALPHA.sub("", code) for code in codes})
)

# Count, per cleaned BISAC, how many books carry it (one row per book/BISAC
# pair after explode), and keep the labels meeting the threshold.
bisac_counts = df["BISAC"].explode().value_counts()
bisac_counts = bisac_counts[bisac_counts >= MIN_BOOKS_PER_BISAC].index
bisacs_500 = set(bisac_counts)
print("bisacs with at least 500 books:")
print(bisacs_500)

# Keep only books whose BISACs are all in bisacs_500.
# NOTE(review): all() over an empty list is True, so a book with no BISAC
# codes passes this filter -- confirm whether such rows should be kept.
df = df[
    df["BISAC"].apply(lambda codes: all(code in bisacs_500 for code in codes))
].copy()
print(
    f"dataframe with only books that have all of their bisacs in bisacs_500 shape: {df.shape}"
)

original dataframe shape: (148409, 5)
"Main Description", "Ingram" dataframe shape: (59633, 5)
bisacs with at least 500 books:
{'FIC', 'JUV', 'REL', 'ART', 'BUS', 'REF', 'SPO', 'HEA', 'PHO', 'MUS', 'MED', 'DRA', 'BIO', 'LAN', 'LIT', 'HIS', 'HUM', 'TRV', 'YAF', 'TEC', 'SCI', 'LCO', 'SOC', 'CKB', 'PER', 'CRA', 'SEL', 'COM', 'NAT', 'FAM', 'TRU', 'PHI', 'POE', 'PSY', 'POL', 'OCC'}
dataframe with only books that have all of their bisacs in bisacs_500 shape: (55280, 5)


# Classification