In [1]:
import io
import json
import os
import time
from datetime import datetime

import PyPDF2
import requests
from bs4 import BeautifulSoup

from src import common

In [2]:
def _convert_pdf(response):
    
    tmp_file = f'../data/policies/tmp/temp.pdf'
    if os.path.isfile(tmp_file):
        os.remove(tmp_file)
    with open(tmp_file, 'wb') as pdf:
        pdf.write(response.content)
    with open(tmp_file, 'rb') as pdf:
        pdfreader = PyPDF2.PdfFileReader(pdf)
        pdftext = ''
        for n in range(pdfreader.getNumPages()):
            pdfpage = pdfreader.getPage(n)
            pdftext += pdfpage.extractText()
            pdftext += '\n'
    return pdftext

def _conver_html(response):
    
    soup = BeautifulSoup(response.content, 'html.parser')
    htmlbody = soup.find('body')
    for tag in htmlbody.find_all('a'):
        tag.replaceWith('')
    htmltext = htmlbody.get_text(strip=True)
    return htmltext

def scrape_webpage(url, timeout=20, delay=3):
    """Download ``url`` and return its text content and the HTTP status code.

    Parameters
    ----------
    url : str
        Address to fetch with a GET request.
    timeout : float, optional
        Timeout in seconds passed to ``requests.get`` (default 20).
    delay : float, optional
        Seconds to sleep after a successful request, to space out
        consecutive requests (default 3).

    Returns
    -------
    tuple
        ``(text, status_code)``:

        * ``('', -999)`` when the request itself raised;
        * ``('', status_code)`` for any non-200 response;
        * ``(text, 200)`` otherwise, where ``text`` is the converted PDF or
          HTML content, or ``''`` when the content type is neither or the
          conversion raised.

    The function never raises for request or conversion errors; it prints a
    diagnostic and returns an empty text instead.
    """
    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
        "Accept-Language": "en-US,en;q=0.9,en;q=0.8",
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.3 Safari/605.1.15"
    }

    try:
        response = requests.get(url, headers=headers, stream=True, timeout=timeout)
        time.sleep(delay)
    except Exception as e:
        # Deliberately broad: one unreachable site must not abort a whole run.
        print('===> Problem occured during html request')
        print(f'===> {e}')
        return '', -999

    try:
        if response.status_code != 200:
            print(f'===> Response: {response.status_code}')
            return '', response.status_code

        try:
            raw_contenttype = response.headers.get('content-type')
            # The header may be absent, and media types are case-insensitive.
            contenttype = (raw_contenttype or '').lower()
            if 'application/pdf' in contenttype:
                text = _convert_pdf(response)
            elif 'text/html' in contenttype:
                text = _conver_html(response)
            else:
                print(f'unknown content-type: {raw_contenttype}')
                text = ''
        except Exception as e:
            print('===> Problem occured during response to text convertion')
            print(e)
            text = ''
        return text, response.status_code
    finally:
        # With stream=True the connection is only released once the body is
        # consumed or the response is closed; close explicitly so skipped
        # bodies (non-200, unknown type) do not leak connections.
        response.close()
    
def main(ppurls):
    """Scrape every privacy-policy URL and write the texts plus a run log.

    Parameters
    ----------
    ppurls : dict
        Maps a firm name to a dict that contains at least the key
        ``'ppurl'`` (the URL to scrape).

    For each firm the scraped text (possibly empty) is written to
    ``../data/policies/scraped/<firmhash>_privacy_policy.txt``. After all firms
    are processed, a JSON log keyed by firm hash (firm, URL, character count,
    status code) is written to ``../data/policies/tmp/policies_scrape_log.json``.

    Returns
    -------
    None
    """
    logdict = {}

    for k, v in ppurls.items():
        # Project helper; its result is used as the file-name stem and log key.
        firmhash = common.__hash(k)
        ppurl = v['ppurl']
        print(f'==> get request for: {k} ({firmhash}): {ppurl}')
        text, status_code = scrape_webpage(ppurl)
        # Explicit UTF-8: scraped pages routinely contain characters that a
        # platform-default encoding (e.g. cp1252) cannot represent.
        with open(f'../data/policies/scraped/{firmhash}_privacy_policy.txt', 'w', encoding='utf-8') as outfile:
            outfile.write(text)

        logdict[firmhash] = {
            'firm': k,
            'ppurl': ppurl,
            'n_char': len(text),
            'statuscode':  status_code
        }

    with open('../data/policies/tmp/policies_scrape_log.json', 'w', encoding='utf-8') as logstream:
        json.dump(logdict, logstream)

In [3]:
# Use the corrected URL list when it exists; otherwise fall back to the
# 20210316 URL file. Either way the file is loaded once and handed to main().
corrected_urls = '../data/policies/urls/privacy_policy_urls_corrected.json'
fallback_urls = '../data/policies/urls/privacy_policy_urls_20210316.json'
urls_path = corrected_urls if os.path.isfile(corrected_urls) else fallback_urls

with open(urls_path, 'r') as infile:
    ppurls = json.load(infile)

main(ppurls)

==> get request for: Mutual of Omaha Insurance (15927840): https://www.mutualofomaha.com/legal-services
==> get request for: ODP (69573240): https://www.officedepot.com/cm/help/privacy-statement
==> get request for: Boston Scientific (61421604): https://www.bostonscientific.com/en-US/privacy-policy.html
==> get request for: eBay (41255816): https://www.ebay.com/help/policies/member-behaviour-policies/user-privacy-notice-privacy-policy?id=4260
==> get request for: FirstEnergy (11857351): https://www.firstenergycorp.com/corporate/privacy_legal_statement.html
==> get request for: Entergy (58280832): https://www.entergy.com/privacy-policy/
==> get request for: Nvidia (59476916): https://www.nvidia.com/en-us/about-nvidia/privacy-policy/
==> get request for: IQVIA Holdings (37791241): https://www.iqvia.com/about-us/privacy/privacy-policy
==> get request for: Leidos Holdings (28049975): https://www.leidos.com/privacy
==> get request for: Discovery (94835721): https://corporate.discovery.com/p



==> get request for: General Electric (84482324): https://www.ge.com/privacy
==> get request for: Valero Energy (90987102): https://www.valero.com/privacy-statement
==> get request for: Citigroup (96185568): https://www.citigroup.com/citi/privacy.html
==> get request for: Wells Fargo (20224692): https://www.wellsfargo.com/privacy-security/online/
==> get request for: Anthem (31743452): https://www.antheminc.com/Privacy/index.htm
==> get request for: Comcast (82475883): https://www.xfinity.com/privacy/policy
==> get request for: Phillips 66 (60549119): https://hr.phillips66.com/Legal-And-Privacy-Statement.aspx
==> get request for: Home Depot (36053009): https://www.homedepot.com/privacy/Privacy_Security
==> get request for: Bank of America (4802185): https://www.bankofamerica.com/security-center/privacy-overview/
===> Problem occured during html request
===> HTTPSConnectionPool(host='www.bankofamerica.com', port=443): Max retries exceeded with url: /security-center/privacy-overview/ (Ca