# Extracted Catalogue Entry Analysis

Analyse catalogue entries extracted by main.py or extract_catalogue_entries.ipynb.

In [None]:
import sys
if "../" not in sys.path:
    sys.path.append("../")
import glob
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Glob pattern for the per-volume catalogue entry CSVs: one
# "catalogue_entries.csv" inside each "BMC_<digits...>" folder on the share.
csv_path = r"\\ad\collections\TwoCenturies\TwoCenturies IV\Incunabula\split_data\BMC_[0-9]*\catalogue_entries.csv"

# Materialise every path that matches the pattern into a list.
entry_csv_paths = list(glob.iglob(csv_path))

In [None]:
import ast


def _parse_entry_lines(raw):
    """Convert the stringified list stored in the ``entry`` column back to a list.

    The column appears to hold the ``str()`` of a Python list of strings
    (TODO confirm against the extraction code). Splitting on ``', '`` by hand
    breaks as soon as an element contains an apostrophe (its repr is then
    double-quoted) or an escape sequence, so parse the literal properly and
    only fall back to the old split when the cell is not a valid list literal.
    """
    try:
        parsed = ast.literal_eval(raw)
    except (ValueError, SyntaxError):
        parsed = None
    if isinstance(parsed, list):
        return parsed
    # Legacy behaviour: strip the leading "['" / trailing "']" and split on "', '".
    return raw[2:-2].split("', '")


# Map each volume folder name (the parent directory of the CSV, e.g. "BMC_1")
# to the DataFrame loaded from that volume's CSV.
entry_csvs = {
    p.split("\\")[-2]: pd.read_csv(p, converters={"entry": _parse_entry_lines})
    for p in entry_csv_paths
}

In [None]:
# Tag every row of each volume's frame with its volume number, taken from the
# text after the last underscore in the folder name.
for vol, df in entry_csvs.items():
    vol_suffix = vol.rpartition("_")[2]
    df["vol"] = int(vol_suffix)

In [None]:
# Stack the per-volume frames into one table. The original per-volume row
# index is kept as the "volume_entry_num" column and replaced by a global
# running index.
#
# The frames arrive in glob/dict order, which is not guaranteed to be numeric
# (e.g. "BMC_10" can come before "BMC_2"). Later cells place volume separators
# and labels assuming rows are in ascending volume order, so sort by "vol"
# first. The sort is stable so the entry order within each volume is kept.
entry_df = (
    pd.concat(list(entry_csvs.values()))
    .rename_axis(index="volume_entry_num")
    .sort_values("vol", kind="mergesort")
    .reset_index()
)

In [None]:
# Length of each entry's text, computed element by element with len().
entry_df["entry_length"] = entry_df["entry_text"].map(len)

In [None]:
# Preview the first five rows of the combined table.
entry_df.head(n=5)

In [None]:
# Centred 100-entry moving average of entry length across the whole table.
entry_length_window = entry_df["entry_length"].rolling(100, center=True)
ma = entry_length_window.mean()

# Mean entry length per volume, indexed by volume number.
length_by_vol = entry_df.groupby("vol")["entry_length"]
mean = length_by_vol.mean()

In [None]:
# mean.rename_axis("Volume").rename("Mean Entry Length").to_csv("..\\data\\processed\\mean_lengths.csv")

In [None]:
# Number of entries in each volume, indexed by volume number.
n_entrys = entry_df.groupby("vol")["vol"].count()

# Add an empty pseudo-volume 0 so the cumulative sum starts from zero, then
# restore ascending volume order.
n_entrys.loc[0] = 0
n_entrys = n_entrys.sort_index()

# Cumulative entry count at the end of each volume; the midpoint of each
# volume's span is that end position minus half the volume's width.
# The pseudo-volume 0 gets NaN because it has no preceding value to diff against.
vol_end_positions = n_entrys.cumsum()
x_locs = vol_end_positions - vol_end_positions.diff() / 2

In [None]:
fig, ax = plt.subplots(figsize=(12, 8))

# Raw entry lengths with the moving average drawn on top.
ax.plot(entry_df["entry_length"], lw=1)
ax.plot(ma, "black", label="Moving average")

ax.set_title("Catalogue Entry Length For Incunabula Volumes 1-10", fontsize='x-large')
ax.set_xlabel("Catalogue Entry Number (across all volumes)", fontsize='x-large')
ax.set_ylabel("Entry length (characters)", fontsize='x-large')
ax.tick_params(labelsize='large')

# Dashed separators at the cumulative entry count where each volume ends.
# Their height uses the auto-scaled y-limit, read before the limits are
# overridden below.
ax.vlines(n_entrys.cumsum(), 0, ax.get_ylim()[1], colors="black", linestyles="--")
ax.set_xlim(0, len(entry_df))
ax.set_ylim(0, entry_df["entry_length"].max() + 100)

# Label positions are in data units; 10600 / 9100 are presumably hand-tuned
# for this dataset's y-range (NOTE(review): confirm if the data changes).
# "\\mu" keeps the literal backslash for mathtext without relying on the
# invalid "\m" escape sequence, which newer Python versions warn about.
label_locs = x_locs.dropna()  # drops the NaN of the pseudo-volume 0

# First eight volumes: horizontal two-line label centred on the volume's span.
# The volume number comes from the index rather than the loop counter, so the
# label and mean stay correct even if volume numbers are not exactly 1..8.
for vol, x in label_locs.iloc[:8].items():
    ax.text(x, 10600, f"BMC {vol}\n$\\mu$: {mean.loc[vol]:.0f}", ha="center")

# Volumes 9 and 10 get vertical labels with the mean underneath, presumably
# because their spans are too narrow for horizontal text.
for vol in (9, 10):
    ax.text(x_locs.loc[vol], 10600, f"BMC {vol}", rotation="vertical", ha="center")
    ax.text(x_locs.loc[vol], 9100, f"$\\mu$: {mean.loc[vol]:.0f}", rotation="vertical", ha="center")
ax.legend()

In [None]:
# fig.savefig("..\\reports\\figures\\entry_length.png", dpi=300, bbox_inches="tight")