# Unpacking epub files contents into plain text

In [1]:
import os, glob, shutil, zipfile, shutil
from bs4 import BeautifulSoup

In [140]:
filenames = glob.glob('*.epub')

In [137]:
tmpdir = 'tmp'
if not os.path.isdir(tmpdir):
    os.mkdir(tmpdir)

In [131]:
def get_files(parse_dir):
    ftype = 'html'
    tobeparsed = glob.glob(parse_dir + '/*/*.' + ftype[:-1] + '*', recursive=True)

    if not tobeparsed:
        ftype = 'xml'
        tobeparsed = glob.glob(parse_dir + '/*/*.' + ftype, recursive=True)

    if not tobeparsed:
        print('nothing found in', parse_dir)
    else:
        print("found files in", ftype, "format:")
        print(*tobeparsed, sep='\n', end='\n---------\n')
    
    return tobeparsed

In [132]:
def parse_soup(tobeparsed):
    
    txt_out = []
    
    for fname in tobeparsed:
        with open(fname) as f:
            soup = BeautifulSoup(f)
            txt_out.append(soup.get_text())
            
    return txt_out

In [133]:
def write_parsed(name_noext, txt_out):
    
    resultdir = 'results'
    file_out = os.path.join(resultdir, name_noext + '.txt')
    
    if not os.path.isdir(resultdir):
        os.mkdir(resultdir)  
        
    with open(file_out, 'w') as out:
        for txt in txt_out:
            out.write(txt)

In [141]:
for filename in filenames:
    
    print('extracting:', filename)
    
    fn_noext = filename[:-5]
    extrdir = os.path.join(tmpdir, fn_noext)
    
    if not os.path.isdir(extrdir):
        os.mkdir(extrdir)        
        
        with zipfile.ZipFile(filename, 'r') as file:
            file.extractall(extrdir)
            
        meta = glob.glob(extrdir + '/META*')
        shutil.rmtree(meta[0])
        
    tobeparsed = get_files(extrdir)
    txt_out = parse_soup(tobeparsed)
    write_parsed(fn_noext, txt_out)

extracting: sarrasine.epub
found files in xml format:
tmp/sarrasine/OEBPS/text-3.xml
tmp/sarrasine/OEBPS/text-6.xml
tmp/sarrasine/OEBPS/text-1.xml
tmp/sarrasine/OEBPS/text-2.xml
tmp/sarrasine/OEBPS/text-5.xml
tmp/sarrasine/OEBPS/text-4.xml
---------
extracting: Adieu.epub
found files in xml format:
tmp/Adieu/OEBPS/text-3.xml
tmp/Adieu/OEBPS/text-6.xml
tmp/Adieu/OEBPS/text-1.xml
tmp/Adieu/OEBPS/text-7.xml
tmp/Adieu/OEBPS/text-2.xml
tmp/Adieu/OEBPS/text-5.xml
tmp/Adieu/OEBPS/text-4.xml
---------


In [135]:
shutil.rmtree(tmpdir)