In [1]:
import hashlib
import os
from datetime import datetime

import numpy as np
import pandas as pd
from IPython.display import Image

# Lift pandas' display limits so DataFrames are rendered in full:
# ``None`` means "no cap" for both the number of columns and of rows.
pd.options.display.max_columns = None
pd.options.display.max_rows = None
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell,
# not only the last one (IPython's default is 'last_expr').
InteractiveShell.ast_node_interactivity = 'all'

#%timeit ,  line_profiler, memory_profiler

## Scraping

In [2]:
class Aux:
    """Namespace of small, stateless helpers used while scraping.

    Every helper is a ``staticmethod``, so it can be called either on the
    class (``Aux.hash_md5(...)``) or on an instance.

    The integer constants are verbosity levels understood by
    :meth:`get_state`; a lower number means more output.
    """

    INFO = 0
    DEBUG = 1
    WARN = 2
    ERROR = 3
    NONE = 4

    @staticmethod
    def timestamp():
        """Return the current POSIX timestamp as a string with the dot removed.

        Example: ``1700000000.123456`` becomes ``'1700000000123456'``.
        """
        now = datetime.now()
        return str(now.timestamp()).replace('.', '')

    @staticmethod
    def string(string, var):
        """Apply printf-style formatting: return ``string % var``."""
        return string % var

    @staticmethod
    def get_state(res, level=4):
        """Print a short human-readable summary of an HTTP response.

        Args:
            res: response-like object exposing ``status_code``, ``headers``
                and ``content``.
            level: verbosity. ``Aux.DEBUG`` or lower also prints the numeric
                status code; exactly ``Aux.INFO`` additionally dumps the
                headers and the raw content. The status class line is printed
                at every level.
        """
        code = res.status_code
        if level <= Aux.DEBUG:
            print("## Status code = "+str(code))

        # The four ranges are mutually exclusive, so at most one line is printed.
        if 200 <= code < 300:
            print("  This is OK")
        elif 300 <= code < 400:
            print("  This is a redirection")
        elif 400 <= code < 500:
            print("  This is a Client problem")
        elif 500 <= code < 600:
            print("  This is a Server problem")

        if level == Aux.INFO:
            print("#HEADERS#", res.headers)
            print("#CONTENT#", res.content)

    @staticmethod
    def hash_md5(string):
        """Return the hexadecimal MD5 digest of ``string`` (encoded as UTF-8).

        Intended for request signatures of the form
        ``md5(ts + privateKey + publicKey)``.
        """
        return hashlib.md5(string.encode('utf-8')).hexdigest()

    @staticmethod
    def image(url):
        """Return a 100x100 ``IPython.display.Image`` for ``url``."""
        return Image(url, width=100, height=100)

In [3]:
import requests
from bs4 import BeautifulSoup
import time

## Spider

In [4]:
import random

def parser2(content):
    """Extract verse labels and verse text fragments from a page.

    Collects the text of every ``<span>`` whose class is ``label`` or
    ``content``, in document order. A span is kept when its text is longer
    than one character or is numeric, so single-digit verse numbers survive
    while one-character markers are dropped.

    Args:
        content: raw HTML of the page (``bytes`` or ``str``).

    Returns:
        list[str]: the kept span texts.
    """
    soup = BeautifulSoup(content, 'lxml')
    fragments = []
    for span in soup.find_all('span', {'class': ['label', 'content']}):
        # ``Tag.string`` is None when the span has several children (or none);
        # fall back to the concatenated text instead of failing on len(None).
        raw = span.string
        probe = span.get_text() if raw is None else str(raw)
        if len(probe) > 1 or probe.isnumeric():
            fragments.append(span.text)
    return fragments

class Spider:
    """Scrape the numbered pages (chapters) of one book into a DataFrame.

    ``url_pattern`` must contain one ``%s`` placeholder for the page number
    and end in ``<BOOK>.%s.<VERSION>`` (for example ``.../GEN.%s.NVI``); the
    three-letter book code and the version are derived from that last path
    segment.

    Attributes:
        df: accumulated verses with the columns listed in ``COLUMNS``.
        book: first three characters of the last path segment.
        version: text after the last ``.`` of the last path segment.
    """

    # Column order of ``self.df``.
    COLUMNS = ['book', 'chapter', 'verse', 'text', 'version']

    # Upper bound (seconds) of the random pause used when sleep_interval < 0.
    MAX_RANDOM_SLEEP = 1.2

    # Pool of User-Agent strings; one is picked at random for each request.
    USER_AGENTS = (
        'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Mobile Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:67.0) Gecko/20100101 Firefox/67.0',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.1.1 Safari/605.1.15',
    )

    def __init__(self, url_pattern, pages_to_scrape=1, sleep_interval=-1,
                 content_parser=None, request_timeout=30):
        """Configure the spider; no network access happens here.

        Args:
            url_pattern: URL containing a ``%s`` placeholder for the page number.
            pages_to_scrape: pages ``1..pages_to_scrape`` are fetched.
            sleep_interval: seconds to wait before each request; a negative
                value selects a fresh random pause for every request.
            content_parser: callable taking the response body and returning a
                list of strings; it must be set before scraping.
            request_timeout: seconds after which a request is abandoned.
        """
        self.url_pattern = url_pattern
        self.pages_to_scrape = pages_to_scrape
        self.sleep_interval = sleep_interval
        self.content_parser = content_parser
        self.request_timeout = request_timeout
        self.results = []
        self.responses = []
        self.df = pd.DataFrame()
        # Last path segment looks like "GEN.%s.NVI".
        _, leaf = url_pattern.rsplit("/", maxsplit=1)
        book, version = leaf.rsplit(".", maxsplit=1)
        self.book = book[:3]
        self.version = version

    def scrape_url(self, url, chapter):
        """Fetch one page, parse it and append its verses to ``self.df``.

        A failed request (timeout, SSL problem, connection error or HTTP
        error status) is reported and the page is skipped, so one bad page
        does not abort the whole run.
        """
        try:
            response = requests.get(url, headers=self.get_random_ua(),
                                    timeout=self.request_timeout)
            # Do not parse error pages as if they were chapter content.
            response.raise_for_status()
        except requests.exceptions.Timeout:
            print('Timeout Error', url)
            return
        except requests.exceptions.SSLError:
            print('SSLError Error', url)
            return
        except requests.exceptions.RequestException as e:
            print('RequestException Error', url, e)
            return

        result = self.content_parser(response.content)
        self.to_df(result, chapter)
        self.output_results(self.book+":"+str(chapter))

    def output_results(self, r):
        """Report progress; currently this just prints ``r``."""
        print(r)

    def kickstart(self):
        """Scrape pages ``1..pages_to_scrape`` in order, pausing before each."""
        for page in range(1, self.pages_to_scrape+1):
            if self.sleep_interval < 0:
                # Draw a new pause for every request, not once per run.
                delay = random.random() * self.MAX_RANDOM_SLEEP
            else:
                delay = self.sleep_interval
            time.sleep(delay)
            self.scrape_url(self.url_pattern % page, page)

    def save(self, name='', pre=''):
        """Write ``self.df`` to a CSV file without the index.

        With no ``name`` the file is ``<pre><book>.csv`` in the working
        directory; otherwise it is ``data/<pre><name>``. A missing target
        directory is created.
        """
        if not name:
            name = str(pre) + self.book + ".csv"
        else:
            name = 'data/'+str(pre) + name
        folder = os.path.dirname(name)
        if folder:
            os.makedirs(folder, exist_ok=True)
        self.df.to_csv(name, index=False)

    def get_random_ua(self):
        """Return request headers carrying a randomly chosen User-Agent."""
        return {'User-Agent': random.choice(self.USER_AGENTS)}

    def to_df(self, result, chapter):
        """Group parsed fragments into verses and append them to ``self.df``.

        ``result`` is a flat list of strings in which a decimal string starts
        a new verse and every following non-numeric string is appended to
        that verse's text. Text seen before the first verse number is
        discarded. If a verse number repeats, the later text replaces the
        earlier one.
        """
        verses = {}
        verse = ''
        text = ''
        for fragment in result:
            # isdecimal() accepts exactly the digit strings int() can parse;
            # isnumeric() would also accept characters such as '²'.
            if fragment.isdecimal():
                if text != '' and verse != '':
                    verses[verse] = [self.book, chapter, verse, text, self.version]
                verse = int(fragment)
                text = ''
            else:
                text += fragment
        # Flush the verse still being built when the input ends; without
        # this the last verse of every chapter would be lost.
        if text != '' and verse != '':
            verses[verse] = [self.book, chapter, verse, text, self.version]

        chapter_df = pd.DataFrame(list(verses.values()), columns=self.COLUMNS)
        if self.df.empty:
            self.df = chapter_df
        elif not chapter_df.empty:
            self.df = pd.concat([self.df, chapter_df], axis=0, ignore_index=True)

In [5]:
# A page URL has the form '<URL_BASE><BOOK>.<chapter>.<VERSION>', e.g.
# URL_PATTERN = 'https://my.bible.com/ca/bible/335/PRO.%s.BCI'
# Catalan edition: URL_BASE = 'https://my.bible.com/ca/bible/335/'
URL_BASE = 'https://my.bible.com/ca/bible/1637/'

# One (book code, number of chapters, output-file prefix) tuple per book.
# Further examples: ('PRO', 31, '20'), ('WIS', 19, '43')
books = [
    ('GEN', 50, '1'),
    ('REV', 22, '1'),
]
version = 'NVI'

def init(books, version):
    """Scrape every listed book and write one CSV file per book.

    Args:
        books: iterable of ``(book code, chapter count, file prefix)`` tuples.
        version: version code appended to each page URL.
    """
    for entry in books:
        code, n_chapters, prefix = entry
        # Page URL template: '<URL_BASE><code>.%s.<version>'.
        pattern = "".join([URL_BASE, code, ".%s.", version])
        crawler = Spider(pattern, n_chapters, content_parser=parser2)
        crawler.kickstart()
        crawler.save(pre=prefix + '_')
        
#init(books, version)

In [6]:
# Run the scrape for all configured books. This performs network requests,
# prints a "BOOK:chapter" line for each scraped chapter and writes one CSV per book.
init(books, version)

GEN:1
GEN:2
GEN:3
GEN:4
GEN:5
GEN:6
GEN:7
GEN:8
GEN:9
GEN:10
GEN:11
GEN:12
GEN:13
GEN:14
GEN:15
GEN:16
GEN:17
GEN:18
GEN:19
GEN:20
GEN:21
GEN:22
GEN:23
GEN:24
GEN:25
GEN:26
GEN:27
GEN:28
GEN:29
GEN:30
GEN:31
GEN:32
GEN:33
GEN:34
GEN:35
GEN:36
GEN:37
GEN:38
GEN:39
GEN:40
GEN:41
GEN:42
GEN:43
GEN:44
GEN:45
GEN:46
GEN:47
GEN:48
GEN:49
GEN:50
REV:1
REV:2
REV:3
REV:4
REV:5
REV:6
REV:7
REV:8
REV:9
REV:10
REV:11
REV:12
REV:13
REV:14
REV:15
REV:16
REV:17
REV:18
REV:19
REV:20
REV:21
REV:22
