# Combine the BISAC and summaries datasets

In [8]:
import pandas as pd

file1 = pd.read_csv("original-data/Dartmount_subjects.txt", sep="|")
file1["isbn13"] = file1["isbn13"].astype(str)

file2 = pd.read_csv("original-data/Dartmount_summaries.txt", sep="|")
file2["isbn13"] = file2["isbn13"].astype(str)

merged_df = pd.merge(file1, file2, on="isbn13")

merged_df["BISAC"] = merged_df["BISAC"].str.split("#")

In [9]:
print(file1.shape)
print(file2.shape)
print(merged_df.shape)

(149387, 2)
(153904, 4)
(148409, 5)


the one that causes the error: `print(html_to_text(merged_df["annot"][11138]))`

In [10]:
from bs4 import BeautifulSoup
import pandas as pd


def get_text(content):
    soup = BeautifulSoup(content)
    return soup.get_text(separator="\n", strip=True)


merged_df["annot"] = merged_df["annot"].apply(get_text)
merged_df.rename(columns={"isbn13": "isbn", "annot": "summary"}, inplace=True)

  soup = BeautifulSoup(content)


# Add a chunk of 2000 words from the full text to each row

In [11]:
import pandas as pd
import ast
from tqdm import tqdm

df = pd.read_csv("isbns.csv")

titles = []
isbns = []
for i, row in tqdm(df.iterrows(), total=len(df)):
    titles.append(row["filename"])
    isbn_1d = ast.literal_eval(row["isbns"])
    isbn = ""
    if len(isbn_1d) == 1 or len(str(isbn_1d[0])) == 13:
        isbns.append(isbn_1d[0])
    else:
        isbn_1d = [str(isbn) for isbn in isbn_1d]
        isbn_1d.sort(key=len, reverse=True)
        isbns.append(int(isbn_1d[0]))

isbn_to_title = {isbn: title for isbn, title in zip(isbns, titles)}

100%|██████████| 174918/174918 [00:11<00:00, 15220.04it/s]


### Only take summaries that are of type "Main Description" and from "Publisher", "Ingram", or "Ingram Syndetics"

In [12]:
merged_df = merged_df[(merged_df["annot_type"] == "Main Description")]
merged_df = merged_df[
    merged_df["annot_source"].isin(["Publisher", "Ingram", "Ingram Syndetics"])
]
merged_df = merged_df[["isbn", "BISAC", "summary"]]
merged_df

Unnamed: 0,isbn,BISAC,summary
0,9780001847118,"[JUV037000, YAF019000, JUV001000, JUV000000, J...",This is a fantasy adventure by Alan Garner.
1,9780002006781,[HEA042000],"In those first heady days of a relationship, c..."
2,9780002007580,"[TRU001000, HIS037070]","In 1972, 11 Israeli athletes were murdered at ..."
3,9780002007801,[CKB000000],Watch a video trailer for The Devil's Picnic\n...
4,9780002189613,[SPO000000],In a world where it is becoming increasingly d...
...,...,...,...
148402,9789888273454,"[TRV003050, HIS021000]",A breathtaking romp through the city's Tokyo's...
148403,9789888422609,"[HIS021000, HIS003000]",Somebody knew. Who knew? Did Winston Churchill...
148405,9789927101892,[FIC019000],A Sudanese writer begins to suspect that one o...
148407,9789992194287,[FIC019000],When an opulent palace is built on the Jeddah ...


In [14]:
import multiprocessing
from tqdm import tqdm
import random

full_text_dir = "Bibliotik"


def get_chunk(isbn):
    try:
        title = isbn_to_title[int(isbn)]
        with open(f"{full_text_dir}/{title}", "r") as f:
            text = f.read()
        words = text.split()
        n_words = len(words)
        if n_words < 2000:
            return None
        start = random.randint(0, n_words - 2000)
        return " ".join(words[start : start + 2000])
    except Exception as e:
        print(f"Error processing ISBN {isbn}: {e}")
        return None


isbns = merged_df["isbn"].tolist()

num_processes = multiprocessing.cpu_count() - 1

pool = multiprocessing.Pool(processes=num_processes)

with multiprocessing.Pool(processes=num_processes) as pool:
    chunks = list(tqdm(pool.imap(get_chunk, isbns), total=len(isbns)))

pool.close()
pool.join()

 44%|████▍     | 62085/141696 [06:21<17:25, 76.11it/s]  

Error processing ISBN 9780919626102: 9780919626102


100%|██████████| 141696/141696 [19:32<00:00, 120.81it/s]


In [24]:
merged_df["text"] = chunks
merged_df.to_csv("data.csv", index=False)

# Remove rows with empty summaries, text, or BISAC codes

In [3]:
import pandas as pd

df = pd.read_csv("../data.csv")
df.dropna(subset=["summary", "text", "BISAC"], inplace=True)
df.to_csv("../data.csv", index=False)

# Merge the BISAC and summaries dataset with the KR dataset

In [5]:
df = pd.read_csv("../data.csv")
df_kr = pd.read_csv("kr.csv")

In [8]:
df["isbn"] = df["isbn"].astype(str)
df_kr["isbn"] = df_kr["ISBN"].astype(str).apply(lambda s: s.replace("-", ""))
df_kr["isbn"]

df_merged = pd.merge(df, df_kr, on="isbn")
df_merged = df_merged[
    [
        "isbn",
        "BISAC",
        "summary",
        "text",
        "Title",
        "Kirkus Star",
        "Author",
        "Genre",
        "Publisher",
    ]
]
df_merged.rename(
    columns={
        "Title": "title",
        "Kirkus Star": "kirkus_star",
        "Author": "author",
        "Genre": "genre",
        "Publisher": "publisher",
    },
    inplace=True,
)
df_merged.to_csv("../data_kr.csv", index=False)