In [1]:
import ebooklib
import pandas as pd
from bs4 import BeautifulSoup

def extract_textbook_categories(fpath):
    mapping = {}
    df = pd.read_csv(fpath, names=['href'])
    for item in df.href.to_list():
        soup = BeautifulSoup(item, 'html.parser')    
        a_tag = soup.find('a')
        uname = a_tag['href'].split('/')[-1]
        mapping[uname] = a_tag.text
    return mapping    


data_root = "/Users/jfries/Desktop/subject_area_open_textbooks/"
categories = {
    'biology': extract_textbook_categories(f"{data_root}/biology_open_textbook_library.txt"),
    'medicine': extract_textbook_categories(f"{data_root}/medical_open_textbook_library.txt"),
}


In [None]:
import requests
import os
from bs4 import BeautifulSoup
from urllib.parse import urlparse

def download_and_save_url(name, url, output_folder, overwrite=False, include_domain=False):
    
    if not overwrite: 
        filenames = [
            os.path.join(output_folder, f'{name}.html'),
            os.path.join(output_folder, f'{name}.pdf')
        ]
        for fpath in filenames:
            if os.path.exists(fpath):
                return

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.5",
        "Accept-Encoding": "gzip, deflate",
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1",
        "Sec-Fetch-Dest": "document",
        "Sec-Fetch-Mode": "navigate",
        "Sec-Fetch-Site": "none",
        "Sec-Fetch-User": "?1",
        "Cache-Control": "max-age=0",
    }
    
    
    # Send a GET request to the URL
    response = requests.get(url, headers=headers)
    
    # Check if the request was successful
    if response.status_code == 200:
        
        # Get the final URL after following redirects
        final_url = response.url
        
        # Get the content type from the response headers
        content_type = response.headers.get('Content-Type', '').lower()
        
         # Parse the URL to extract the domain
        parsed_url = urlparse(final_url)
        domain = parsed_url.netloc
        
        # Check if the content type is HTML
        if 'html' in content_type:
            # If it's HTML, save it as .html
            outfpath = f'{domain}_{name}.html' if include_domain else f'{name}.html'
            filename = os.path.join(output_folder, outfpath)
            with open(filename, 'wb') as f:
                f.write(response.content)
        else:
            outfpath = f'{domain}_{name}.pdf' if include_domain else f'{name}.pdf'
            # If it's not HTML, save it as .pdf
            filename = os.path.join(output_folder, outfpath)
            with open(filename, 'wb') as f:
                f.write(response.content)
    else:
        print(f'Failed to download URL. Status code: {name} | {url} | {response.status_code}')

In [111]:
import glob 
import collections
from tqdm import tqdm 

def get_textbook_formats(soup, domain="https://open.umn.edu"):
    formats = {}
    links = soup.find_all('a')
    for a in links:
        formats[a.text] = a['href'] if a['href'][0] != "/" else f"{domain}{a['href']}"
    return formats
    
filelist = glob.glob('/Users/jfries/Desktop/open_textbook_library_final/opentextbook-detail-webpages/*')
filelist  = {fpath.split('/')[-1].split('.')[0]:fpath for fpath in filelist}


urls = []
for category in categories:
    for key in categories[category]:
        # missing some files due to age of cache
        if key not in filelist:
            continue
    
        fpath = filelist[key]
        
        soup = BeautifulSoup(open(fpath, 'r').read(), 'html.parser') 
        # find best copy of available textbooks
        formats = soup.find('section', attrs={'id':'Formats'})
        formats = get_textbook_formats(formats)
    
        license = soup.find('section', attrs={'id':'License'}) 
        license = license.find('span').text

        if 'eBook' in formats:
            fmt = 'eBook'
            
        elif 'Online' in formats:
            fmt =  'Online'
            
        elif 'PDF' in formats:
            fmt = 'PDF'
            
        else:
            freq['missing'] += 1
            continue
            
        urls.append([category, license, key, categories[category][key], fmt, formats[fmt]])

df = pd.DataFrame(urls, columns=['category', 'license', 'key', 'textbook_name', 'format', 'url'])
df

{'MS Word': 'https://open.umn.edu/opentextbooks/formats/1418'}
{'Google Doc': 'https://open.umn.edu/opentextbooks/formats/3049'}


Unnamed: 0,category,license,key,textbook_name,format,url
0,biology,CC BY,introducing-mathematical-biology,Introducing Mathematical Biology,eBook,https://open.umn.edu/opentextbooks/formats/3650
1,biology,CC BY-NC-SA,threshold-concepts-in-biochemistry,Threshold Concepts in Biochemistry,eBook,https://open.umn.edu/opentextbooks/formats/3599
2,biology,CC BY,anatomy-and-physiology-laboratory-manual-for-n...,Anatomy and Physiology Laboratory Manual for N...,eBook,https://open.umn.edu/opentextbooks/formats/3584
3,biology,CC BY-NC,introduction-to-systems-biology-workbook-for-f...,Introduction to Systems Biology: Workbook for ...,PDF,https://open.umn.edu/opentextbooks/formats/3457
4,biology,CC BY-NC,introduction-to-biological-psychology,Introduction to Biological Psychology,eBook,https://open.umn.edu/opentextbooks/formats/3416
...,...,...,...,...,...,...
106,medicine,CC BY,supporting-individuals-with-intellectual-disab...,Supporting Individuals with Intellectual Disab...,eBook,https://open.umn.edu/opentextbooks/formats/1741
107,medicine,CC BY-NC,field-trials-of-health-interventions-a-toolbox,Field Trials of Health Interventions: A Toolbox,Online,https://open.umn.edu/opentextbooks/formats/463
108,medicine,CC BY,clinical-procedures-for-safer-patient-care,Clinical Procedures for Safer Patient Care,eBook,https://open.umn.edu/opentextbooks/formats/1559
109,medicine,CC BY-NC-SA,creative-clinical-teaching-in-the-health-profe...,Creative Clinical Teaching In The Health Profe...,eBook,https://open.umn.edu/opentextbooks/formats/740


In [99]:
import requests
from bs4 import BeautifulSoup

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.5",
    "Accept-Encoding": "gzip, deflate",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
    "Sec-Fetch-Dest": "document",
    "Sec-Fetch-Mode": "navigate",
    "Sec-Fetch-Site": "none",
    "Sec-Fetch-User": "?1",
    "Cache-Control": "max-age=0",
}
    
def is_html(response):
    """
    Check if the response contains HTML content.
    """
    content_type = response.headers.get("Content-Type", "").lower()
    if "html" in content_type:
        return True
    try:
        # Attempt to parse the content with BeautifulSoup
        soup = BeautifulSoup(response.content, 'html.parser')
        if soup.find('html'):
            return True
    except:
        pass
    return False

def determine_file_type(url, headers=None):
    """
    Determine if a URL points to an HTML file or a binary file.
    Returns reponse, file extension
    """
    magic_numbers_to_type = {
        b'\x25\x50\x44\x46': 'pdf',
        b'PK\x03\x04': 'epub',
    }
    try:
        response = requests.get(url, headers=headers)
        if is_html(response):
            return response, 'html'
            
        else:
            for magic_num, ext in magic_numbers_to_type.items():
                if response.content.startswith(magic_num):
                    return response, ext
            return response, "binary"
            
    except Exception as e:
        return f"Error: {e}", "unknown"


In [115]:
def write_file(content, fpath):
    with open(fpath, 'wb') as f:
        f.write(content)  

cache_dir = "/Users/jfries/Desktop/open_textbook_library_final/cache/"
cache = set([x.split('/')[-1].split('.')[0] for x in glob.glob(f'{cache_dir}/binary/*')])
#cache = cache | set([x.split('/')[-1].split('.')[0] for x in glob.glob(f'{cache_dir}/html/*')])

print(len(cache))

62


In [116]:
for row in df.itertuples():
    
    if row.key in cache:
        #print(f"SKIPPING {row.category} | {row.key}")
        continue
    
    print(row.key, row.url)
    # response, ext = determine_file_type(row.url, headers)
    
    # if ext == "html":
    #     # cache
    #     write_file(response.content, f"{cache_dir}/html/{row.key}.{ext}")
    # elif ext in ["pdf", "epub"]:
    #     write_file(response.content, f"{cache_dir}/binary/{row.key}.{ext}")
    # else:
    #     print(f"{row.key}: unrecognized binary file")

biomedical-engineering-for-africa https://open.umn.edu/opentextbooks/formats/3191
introduction-to-genetics https://open.umn.edu/opentextbooks/formats/3188
the-science-of-sleep https://open.umn.edu/opentextbooks/formats/3001
microbiology-a-laboratory-experience https://open.umn.edu/opentextbooks/formats/2570
introduction-to-biosystems-engineering https://open.umn.edu/opentextbooks/formats/2367
quality-assurance-regulatory-affairs-for-the-biosciences https://open.umn.edu/opentextbooks/formats/1794
biochemistry-free-for-all-ahern https://open.umn.edu/opentextbooks/formats/2150
explorations-an-open-invitation-to-biological-anthropology-shook https://open.umn.edu/opentextbooks/formats/1797
biotechnology-foundations https://open.umn.edu/opentextbooks/formats/1437
quantitative-ecology-a-new-unified-approach https://open.umn.edu/opentextbooks/formats/1280
unfolding-the-mystery-of-life-biology-lab-manual-for-non-science-majors https://open.umn.edu/opentextbooks/formats/1253
an-introduction-to-n

## Fetch HTML Links
Some Open Textbook Library links lead to external hosting sites that provide various formats. We try to find the best download from this page.

In [110]:
def get_matched_hrefs(soup, pattern):
    matched_links = []
    # Loop through all `a` tags
    for a in soup.find_all('a'):
        # Check if the text within the entire `a` tag matches the pattern
        if pattern.search(a.get_text()) and 'href' in a.attrs:
            matched_links.append(a['href'])
    return matched_links

# look for epub links in html
pattern = re.compile(r'(ebook|epub)', re.IGNORECASE)

n = 0
filelist = glob.glob(f'{cache_dir}/html/*.html')
for fpath in filelist:
    key = fpath.split('/')[-1].split('.')[0]
   
    soup = BeautifulSoup(open(fpath,'rb').read(), 'html.parser')
    links = get_matched_hrefs(soup, pattern)
    
    if not links:
        n += 1
        continue

    url = links[0]
    print(url)
    response, ext = determine_file_type(url, headers)
    
    if ext in ["pdf", "epub"]:
        write_file(response.content, f"{cache_dir}/binary/{key}.{ext}")
    else:
        print(f"{row.key}: unrecognized binary file")
        

https://newprairiepress.org/ebooks
nursing-care-at-the-end-of-life-what-every-clinician-should-know: unrecognized binary file
https://openoregon.pressbooks.pub/bodyphysics/open/download?type=epub


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


https://openbooks.lib.msu.edu/isb202/open/download?type=epub3


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


https://openoregon.pressbooks.pub/mhccbiology101/open/download?type=epub


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


https://wtcs.pressbooks.pub/nursingfundamentals/open/download?type=epub


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


https://usq.pressbooks.pub/anatomy/open/download?type=epub


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


https://open.oregonstate.education/computationalbiology/open/download?type=epub


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


https://open.oregonstate.education/epidemiology/open/download?type=epub


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


https://openpress.usask.ca/undergradimaging/open/download?type=epub


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


https://press.rebus.community/literaturereviewsedunursing/open/download?type=epub


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


https://open.oregonstate.education/generalmicrobiology/open/download?type=epub


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


https://wtcs.pressbooks.pub/nursingskills/open/download?type=epub


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


https://oercollective.caul.edu.au/threshold-concepts-in-biochemistry/open/download?type=epub


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


/bitstreams/201b50ec-910b-4d6f-8ffb-2b6e4c7897ec/download
nursing-care-at-the-end-of-life-what-every-clinician-should-know: unrecognized binary file
https://openeducationalberta.ca/mlsci/open/download?type=epub


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


https://pressbooks.pub/clinicalteaching/open/download?type=epub
nursing-care-at-the-end-of-life-what-every-clinician-should-know: unrecognized binary file
https://open.oregonstate.education/generalmicrobiology/open/download?type=epub


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


https://opentextbc.ca/caregivers/open/download?type=epub


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


https://pressbooks.uwf.edu/medicalterminology/open/download?type=epub


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


https://openoregon.pressbooks.pub/mhccbiology102/open/download?type=epub


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


https://wtcs.pressbooks.pub/nursingadvancedskills/open/download?type=epub


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


https://openoregon.pressbooks.pub/mhccmajorsbio/open/download?type=epub3


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


https://uta.pressbooks.pub/anatomylab/open/download?type=epub


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


https://openoregon.pressbooks.pub/nutritionscience/open/download?type=epub


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


https://www.facebook.com/sharer/sharer.php?u=https://archive.org/details/cnx-org-col11903
nursing-care-at-the-end-of-life-what-every-clinician-should-know: unrecognized binary file
https://pressbooks.uwf.edu/healthcarecareerfoundations/open/download?type=epub


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


https://wtcs.pressbooks.pub/nursingmhcc/open/download?type=epub


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


https://opentextbc.ca/clinicalskills/open/download?type=epub


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


http://pressbooks.oer.hawaii.edu/humannutrition2/open/download?type=epub


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


https://wtcs.pressbooks.pub/nursingmpc/open/download?type=epub


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


https://iastate.pressbooks.pub/curehumanphysiology/open/download?type=epub


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


https://openoregon.pressbooks.pub/envirobiology/open/download?type=epub3


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


https://openoregon.pressbooks.pub/histologyandembryology/open/download?type=epub3


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


https://wtcs.pressbooks.pub/pharmacology/open/download?type=epub


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


https://pressbooks.uwf.edu/ushealthcaresystem/open/download?type=epub


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


https://sheffield.pressbooks.pub/introducingmathematicalbiology/open/download?type=epub


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


https://pressbooks.bccampus.ca/healthcasestudies/open/download?type=epub


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


http://pressbooks.oer.hawaii.edu/anatomyandphysiology2lab/open/download?type=epub


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


https://openbooks.lib.msu.edu/neuroscience/open/download?type=epub3


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


In [96]:
# def get_matched_hrefs(soup, pattern):
#     matched_links = []
#     # Loop through all `a` tags
#     for a in soup.find_all('a'):
#         # Check if the text within the entire `a` tag matches the pattern
#         if pattern.search(a.get_text()) and 'href' in a.attrs:
#             matched_links.append(a['href'])
#     return matched_links
    
# # Compile a regular expression pattern to match "ebook" or "EPUB" (case-insensitive)
# pattern = re.compile(r'(ebook|epub)', re.IGNORECASE)
    
# fpath = "/Users/jfries/Desktop/open_textbook_library_final/cache/html/threshold-concepts-in-biochemistry.html"
# soup = BeautifulSoup(open(fpath,'rb').read(), 'html.parser') 


# get_matched_hrefs(soup, pattern)

['https://oercollective.caul.edu.au/threshold-concepts-in-biochemistry/open/download?type=epub']

In [65]:
# len(set(df.key))

text_root = "/Users/jfries/Desktop/Sept 12 organize/opentextbooks/text/"

n = 0
for row in df.itertuples():
    fpath = f"{text_root}/{row.key}.txt"
    if os.path.exists(fpath):
        n += 1
        
n, len(df)

# https://open.umn.edu/opentextbooks/formats/3650  
# https://open.umn.edu/opentextbooks/formats/3457 

(68, 111)

In [132]:
import re
import os
from ebooklib import epub
from lxml import html
from bs4 import BeautifulSoup
import html2text
import pickle 
import json

def extract_main_content(soup):
    
    # Remove unwanted elements like scripts and styles
    for script in soup(['script', 'style']):
        script.extract()

    # convert main HTML elments to markdown
    converter = html2text.HTML2Text()
    converter.ignore_links = True  # Ignore converting links to markdown
    converter.body_width = 0  # Disable fixed width wrapping
    converter.use_italics = False
    markdown_content = converter.handle(str(soup))

    # remove extranenous newlines
    markdown_content = re.sub(r'(?<!_)\_(?!\_)', '', markdown_content) 
    markdown_content = re.sub(r'\n{3,}', '\n\n', markdown_content) 
    return markdown_content


def extract_html_from_epub(epub_path, output_dir):
    # Load the EPUB book
    book = epub.read_epub(epub_path)

    # Ensure the output directory exists
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # Extract HTML content from the EPUB book
    for item in book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
        # Get the file name and path for the HTML content
        file_name = os.path.basename(item.file_name)
        output_path = os.path.join(output_dir, file_name)

        # Write the HTML content to a file
        with open(output_path, 'w', encoding='utf-8') as f:
            content = html.tostring(item.content, encoding='unicode', method='html')
            f.write(content)

    print(f"HTML files extracted to {output_dir}")


epubs = glob.glob('/Users/jfries/Desktop/open_textbook_library_final/cache/binary/*.epub')
outputdir = "/Users/jfries/Desktop/open_textbook_library_final/"

# parse ebooks
jsonl = []
for fpath in epubs:
    book = epub.read_epub(fpath)
    sections = {}
    for i,item in enumerate(book.get_items_of_type(ebooklib.ITEM_DOCUMENT)):
        soup = BeautifulSoup(item.content, 'html.parser')
        key = str(item).split(':')[-1].split('.')[0]
        sections[key] = extract_main_content(soup)

    name = fpath.split('/')[-1].split('.')[0]
    
    text = '\n'.join([sections[key] for key in sections])
    with open(f'{outputdir}/markdown/{name}.md','w') as file:
        file.write(text)

    # dump to jsonl
    for key in sections: 
        d = {'id': f"{name}:{key}", 'textbook_name':name, 'text': sections[key]}
        jsonl.append(json.dumps(d))
    
print(len(jsonl))


3845


In [134]:
with open(f'{outputdir}/jsonl/train.jsonl','w') as file:
    for line in jsonl:
        file.write(line + '\n')