In [1]:
import requests
import pyperclip
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.action_chains import ActionChains
from pathlib import Path
import os
import re
import time

#stuff for epub extraction
import ebooklib
from ebooklib import epub

In [2]:
def getLinks(driver):
    html = driver.page_source
    soup = BeautifulSoup(html, "html.parser")
    searchResults = soup.find_all('div',{'class':'details'})
    html = ''
    for item in searchResults:
        #print(str(tag))
        tag = item.find('h2')
        pid = tag.parent.parent
        link = str(pid.find('a').get('href'))
        html += "https://www.hindawi.org"+str(link)+":::3\n"
    return html

def waitForAlert(driver):
    try:
        WebDriverWait(driver, 1).until(EC.alert_is_present(),
                                       'Timed out waiting for PA creation ' +
                                       'confirmation popup to appear.')

        alert = driver.switch_to.alert
        alert.accept()
        print("alert accepted")
        waitForAlert(driver)
    except TimeoutException:
        print("no alert")

def getTextFromLink(url, difficulty, level, count):
    dir = 'C:\\Users\\maste\\Desktop\\arabicScraperOutput'
    filename = "hindawi_" + str(level)+"_"+str(count) + ".txt"
    #print(url)
    req = requests.get(url)
    soup = BeautifulSoup(req.content,"html.parser")

    ps = soup.find("div",{"class":"formilized-body"})
    
    if ps != None:
        myFileName = os.path.join(dir, filename)
        fout = open(myFileName, "w+",encoding='utf8')
        fout.write("<DIFFICULTY>"+str(difficulty)+"</DIFFICULTY>\n")
        fout.write("<BODY>\n")
        fout.write(ps.text.replace("\n\n","\n") + "\n")
        fout.write("</BODY>\n")
        fout.close()
    else:
        print("no text found")

def getModules(page,count, driver):
    html = ''
    links = {}
    driver.get(page+count)
    driver.implicitly_wait(1)
    html = getLinks(driver)
    return html

def download_wait(path_to_downloads):
    seconds = 0
    dl_wait = True
    while dl_wait and seconds < 20:
        time.sleep(1)
        dl_wait = False
        for fname in os.listdir(path_to_downloads):
            if fname.endswith('.crdownload'):
                dl_wait = True
        seconds += 1
    return seconds
def topicFromKey(key):
    pieces = key.split('/')
    topic = pieces[5]
    return topic

In [8]:
#sections that can be skipped
SKIP = {'economics':'','business':'','literature':'','travel.literature':'', 'history':'', 'technology':'','geography':'','science.fiction':'', 'novels':'', 'politics':''}

In [9]:
if __name__ == "__main__":
    req = requests.get('https://www.hindawi.org/books/')
    soup = BeautifulSoup(req.content,"html.parser")
    list = soup.find('ul',{'class':'navSide'})
    links = list.find_all('a')
    html = {}
    for link in links:
        if link['href'].find('categories') != -1:
            url = 'https://www.hindawi.org' + link['href']
            html[url] = ''
    for key in html:
        topic = topicFromKey(key)
        print(topic)
        if (topic in SKIP):
            print('-skipped ', topic)
        else:
            downloadDir = "C:\\Users\\maste\\Desktop\\hindawi\\epubs\\" + topic
            chromeOptions = webdriver.ChromeOptions()
            prefs = {"download.default_directory" : downloadDir}
            chromeOptions.add_experimental_option("prefs",prefs)
            driver = webdriver.Chrome(chrome_options=chromeOptions)
            next = 'start'
            count = 1
            while next != '':
                next = getModules(key,str(count), driver)
                html[key] += next + ":::" + topic
                count += 1
                
            urlList = html[key].split('\n')

            print(len(urlList))
            for i in urlList:
                url = i.split(":::")[0]
                if url != '':
                    name = url.split('/')[4] + '.epub'
                    if (os.path.exists(os.path.join(downloadDir, name)) == False):
                        epuburl = url[0:-1] + '.epub'
                        driver.get(epuburl)
                        checkExists = 0
                        while (os.path.exists(os.path.join(downloadDir, name)) == False):
                            time.sleep(1)
                            checkExists += 1
                            if (checkExists == 10):
                                if ((os.path.exists(os.path.join(downloadDir, name+'.crdownload')) == False)):
                                    print('timed out')
                                    break
            driver.close()
    print('finished')

business
-skipped  business
literature
-skipped  literature
travel.literature
-skipped  travel.literature
economics
-skipped  economics
history
-skipped  history
technology
-skipped  technology
geography
-skipped  geography
science.fiction
-skipped  science.fiction
novels
-skipped  novels
politics
-skipped  politics
biographies
96
poetry
58
health
5
psychology
18
science
64
social.sciences
93
environmental.sciences
3
linguistics
15
philosophy
127
arts
24
children.stories
130


KeyboardInterrupt: 

In [10]:
for key in html:
    topic = topicFromKey(key)n
    downloadDir = "C:\\Users\\maste\\Desktop\\hindawi\\epubs\\" + topic
    if os.path.exists(downloadDir):
        print(topic, " exists")
    else:
        print('making dir for ', topic)
        os.mkdir(downloadDir)
    

business  exists
literature  exists
travel.literature  exists
economics  exists
history  exists


In [11]:
epubDir = 'C:\\Users\\maste\\Desktop\\hindawi\\epubs'
outDir = 'C:\\Users\\maste\\Desktop\\hindawi\\out'
for category in os.listdir(epubDir):
    print(category)
    #create output directory if it doesn't exist
    outDirFull = os.path.join(outDir,category)
    print(outDirFull)
    if (os.path.isdir(outDirFull) == False):
        os.mkdir(outDirFull)
    epubDirFull = os.path.join(epubDir, category)
    for filename in os.listdir(epubDirFull):
        output = open(outDirFull+'\\hindawi_'+filename[:-5]+'.txt', 'w+', encoding='utf-8')
        book = epub.read_epub(os.path.join(epubDirFull, filename))
        output.write('<DIFFICULTY></DIFFICULTY>\n')
        output.write('<TOPIC>'+category+'</TOPIC>\n')
        output.write('<BODY>\n\n')
        for item in book.get_items():
            if item.get_type() == ebooklib.ITEM_DOCUMENT:
                output.write('================================================\n')
                text = BeautifulSoup(item.get_body_content()).get_text()
                output.write(text +'\n')
                output.write('================================================\n\n')
        output.write('\n\n</BODY>')
    output.close()

arts
C:\Users\maste\Desktop\hindawi\out\arts
biographies
C:\Users\maste\Desktop\hindawi\out\biographies
business
C:\Users\maste\Desktop\hindawi\out\business
children.stories
C:\Users\maste\Desktop\hindawi\out\children.stories
detective.fiction
C:\Users\maste\Desktop\hindawi\out\detective.fiction
economics
C:\Users\maste\Desktop\hindawi\out\economics
environmental.sciences
C:\Users\maste\Desktop\hindawi\out\environmental.sciences
geography
C:\Users\maste\Desktop\hindawi\out\geography
health
C:\Users\maste\Desktop\hindawi\out\health
history
C:\Users\maste\Desktop\hindawi\out\history
linguistics
C:\Users\maste\Desktop\hindawi\out\linguistics
literary.criticism
C:\Users\maste\Desktop\hindawi\out\literary.criticism
literature
C:\Users\maste\Desktop\hindawi\out\literature
novels
C:\Users\maste\Desktop\hindawi\out\novels
philosophy
C:\Users\maste\Desktop\hindawi\out\philosophy
plays
C:\Users\maste\Desktop\hindawi\out\plays
poetry
C:\Users\maste\Desktop\hindawi\out\poetry
politics
C:\Users\mas

EpubException: 'Bad Zip file'