# Dewey Scraper

Code to get fine-grained Dewey Decimal Classification (DDC) categories (categories past the decimal place) from the MDS table on [LibraryThing.com](https://www.librarything.com/mds/). Used to help generate a tree representation of the DDC for the paper "Quantifying Bias in Hierarchical Category Systems."

The number of collected categories may change as the website is updated. On March 23, 2022, a total of 32,661 fine-grained categories were collected.

## Imports

In [None]:
import requests 
import pickle
import re
from bs4 import BeautifulSoup

## Functions

In [None]:
def get_dewey(soup):
    """
    Extract the last row of the table containing DDC categories.

    Input is a BeautifulSoup object representing the contents of a page.
    Returns the final 'tr' element of the first table with class "ddc",
    or an empty list when the page has no such table or the table has
    no rows.
    """
    tables = soup.find_all('table', class_="ddc")
    if not tables:
        return []
    # only the first matching table is inspected
    rows = tables[0].find_all('tr')
    return rows[-1] if rows else []

def get_data(ddc):
    """
    Extract categories from a DDC table row that have both a name and
    a number.

    Input is the row element produced by get_dewey, or the empty list
    get_dewey returns when a page has no usable table.
    Returns a list of (number, name) tuples, where number is the first
    integer or decimal found in a cell's 'onclick' attribute and name is
    the text of the cell's first '.word' element. Cells without an
    'onclick' number, without a '.word' element, or whose name is empty
    or made only of dashes are skipped.
    """
    # get_dewey signals "nothing found" with an empty list, which has no
    # .select(); treat that (and None) as "no categories"
    if ddc is None or (isinstance(ddc, list) and not ddc):
        return []
    data = []
    for cell in ddc.select('td'):
        # a cell may lack the onclick attribute, or hold no number in it
        match = re.search(r"\d+(\.\d+)?", cell.get('onclick') or '')
        if match is None:
            continue
        words = cell.select('.word')
        if not words:
            continue
        name = words[0].get_text()
        # placeholder cells carry an empty or dash-only name
        if re.search('^-*$', name):
            continue
        data.append((match[0], name))
    return data

def crawl(new_url, base_url, class_data, timeout=30):
    """
    Crawl a subpage of the MDS table on LibraryThing.com and
    extract DDC category information.

    new_url is the page to fetch, base_url is the prefix that category
    numbers are appended to when building subpage links, and class_data
    is the list of (number, name) tuples collected so far; it is extended
    in place. timeout is the number of seconds passed to requests.get so
    that a stalled connection cannot hang the crawl indefinitely.
    Returns None.
    """
    page = requests.get(new_url, timeout=timeout)
    soup = BeautifulSoup(page.content, 'html.parser')
    ddc = get_dewey(soup)
    # get_dewey returns an empty list when the page has no DDC table or
    # the table has no rows; there is nothing to parse in that case
    if isinstance(ddc, list) and not ddc:
        return
    data = get_data(ddc)
    # if every category on this page was already collected, stop here;
    # this is what ends the recursion
    if all(d in class_data for d in data):
        return
    # parse subcategories of a category
    class_data.extend(data)
    for dat in data:
        new_link = base_url + '/' + dat[0]
        # a link back to the page being crawled would only re-fetch the
        # categories just recorded, so skip it
        if new_link == new_url:
            continue
        crawl(new_link, base_url, class_data, timeout)

def all_valid_ddc(filePath):
    """
    Read in existing DDC categories.

    Opens the UTF-8 text file at filePath, skips its first line, and
    collects the first tab-separated field of every remaining line.
    Returns the distinct values as a list in no particular order.
    """
    with open(filePath, 'r', encoding="utf8") as handle:
        # the first line is skipped, not parsed
        next(handle, None)
        codes = {line.replace('\n', '').split('\t')[0] for line in handle}
    return list(codes)

def get_fine_grained_names(ddc, all_data):
    """
    Extract categories past the decimal place in the DDC from
    LibraryThing.com's MDS table.

    ddc is an iterable of DDC number strings; each one is appended to the
    MDS base URL and crawled. all_data is the list handed to crawl, which
    accumulates the collected categories. Progress is printed before and
    after each crawl. Returns None.
    """
    for count, num in enumerate(ddc, start=1):
        url = 'https://www.librarything.com/mds/' + num
        print(f"crawling for ddc: {num} using url: {url}")
        crawl(url, 'https://www.librarything.com/mds', all_data)
        print(f"finished crawl {count}, list is now length {len(all_data)}")

In [None]:
# read the DDC numbers from the summaries file and crawl them in sorted order
ddc_3dig = sorted(all_valid_ddc('Library Data/DDC/ddc22-summaries-eng.txt'))
all_data = []
get_fine_grained_names(ddc_3dig, all_data)
# report how many categories were collected in total
print(len(all_data))

In [None]:
# Save the collected categories next to the input data. Forward slashes
# are used (as in the input path above) because they work on every
# platform; backslashes are only path separators on Windows.
with open('Library Data/DDC/ddc_fg.pk', 'wb') as f:
    pickle.dump(all_data, f)