In [185]:
import os
import time
import re
import pprint
import csv
import time
import copy

In [186]:
# regex objects
# Raw string literals are used so the regex escapes (\. \/ \? \-) reach the
# regex engine unchanged without relying on Python's handling of invalid
# string escape sequences (which emits a warning on recent interpreters).

# matches the literal text "proxy.bc.edu/login?url=" (case-insensitive)
proxy_re = re.compile(r'proxy\.bc\.edu\/login\?url=', re.IGNORECASE)
# matches "http://" or "https://"; unanchored, so it matches anywhere in a string
remove_protocol_re = re.compile(r'https?:\/\/', re.IGNORECASE)
# after an optional "www." prefix, captures the run of letters, digits, '.', '-'
# and '_' as the named group "dn"; everything after that run is consumed by ".*"
domain_name_re = re.compile(r'(www\.)?(?P<dn>[a-zA-Z0-9.\-_]*).*', re.IGNORECASE)

In [209]:
# Column layout of az_database_list.csv (0-based index above each column name):
#
# 0   1     2            3    4
# ID, Name, Description, URL, Created,
#
# 5       6      7                      8          9
# Vendor, Types, Alt. Names / Keywords, Use Proxy, Friendly URL,
#
# 10        11         12                13   14
# Subjects, More Info, Librarian Review, New, Trial,
#
# 15       16              17      18             19
# Popular, Permitted Uses, Hidden, Internal Note, Owner

# indexes of the only two columns this cell reads
URL_COL = 3
USE_PROXY_COL = 8

# cleaned urls of every row whose "Use Proxy" column is "Yes"
db_url_raw = []
# same as db_url_raw, minus urls containing 'bc.edu' or 'wrds-web.wharton.upenn.edu'
db_url_dn_raw = []
with open('az_database_list.csv', newline='') as csvfile:
    db_reader = csv.reader(csvfile, delimiter=',', quotechar='"')

    # skip the header row
    next(db_reader, None)

    for row in db_reader:
        # csv.reader yields [] for blank lines, and a truncated row may lack
        # the columns read below; skip such rows instead of raising IndexError
        if len(row) <= USE_PROXY_COL:
            continue

        db_url = row[URL_COL]
        db_uses_proxy = row[USE_PROXY_COL]

        db_url_lower = db_url.lower()

        # only rows flagged exactly "Yes" are kept; anything else (e.g. "No") is ignored
        if db_uses_proxy != "Yes":
            continue

        # remove the url protocol (the pattern is unanchored, so every
        # "http://" / "https://" occurrence in the url is removed)
        db_url_clean = remove_protocol_re.sub("", db_url_lower)

        # strip out trailing forward slash(es)
        db_url_clean = db_url_clean.rstrip('/')

        db_url_raw.append(db_url_clean)

        # for db_url_dn_raw, ignore any urls that contain 'bc.edu' or 'wrds-web.wharton.upenn.edu'
        if "bc.edu" not in db_url_lower and "wrds-web.wharton.upenn.edu" not in db_url_lower:
            db_url_dn_raw.append(db_url_clean)

In [210]:
# number of cleaned urls collected from rows that use the proxy (duplicates still included)
len(db_url_raw)

753

In [211]:
# number of collected urls left after excluding those containing 'bc.edu' or 'wrds-web.wharton.upenn.edu'
len(db_url_dn_raw)

747

In [212]:
# inspect the cleaned urls (lower-cased, protocol and trailing slash removed)
db_url_raw

['infotrac.galegroup.com/itweb/mlin_m_bostcoll?db=bncn',
 'history.paratext.com',
 'infotrac.galegroup.com/itweb/mlin_m_bostcoll?db=ahsi&id=boston',
 'dictionaries.brillonline.com/montanari',
 'search.ebscohost.com/login.aspx?authtype=ip,uid&profile=ehost&defaultdb=33h',
 'www.morganclaypool.com/search/advanced',
 'analyticscampus.gallup.com/?ref=auth',
 'search.proquest.com/globalnews/embedded/sv46nqy98qsi5agf?_ga=2.24403762.927166705.1519997644-1600456357.1519997644',
 'www.hstalks.com/business',
 'dlib.eastview.com/browse/books/1670',
 'www.jstor.org/sustainability',
 'trials.proquest.com/trials/trialsummary.action?view=subject&trialbean.token=csrde8xxjivge9gv69un',
 'www.rocksbackpages.com',
 'sk.sagepub.com/video/business-management',
 'microform.digital/boa/search?facettopic=tradeinpeople',
 'microform.digital/boa/search?facettopic=tww',
 'search.proquest.com/usmajordailies/embedded/sv46nqy98qsi5agf?_ga=2.237868168.927166705.1519997644-1600456357.1519997644',
 'search.alexanderst

In [213]:
# inspect the cleaned urls that will be used for domain-name extraction
db_url_dn_raw

['infotrac.galegroup.com/itweb/mlin_m_bostcoll?db=bncn',
 'history.paratext.com',
 'infotrac.galegroup.com/itweb/mlin_m_bostcoll?db=ahsi&id=boston',
 'dictionaries.brillonline.com/montanari',
 'search.ebscohost.com/login.aspx?authtype=ip,uid&profile=ehost&defaultdb=33h',
 'www.morganclaypool.com/search/advanced',
 'analyticscampus.gallup.com/?ref=auth',
 'search.proquest.com/globalnews/embedded/sv46nqy98qsi5agf?_ga=2.24403762.927166705.1519997644-1600456357.1519997644',
 'www.hstalks.com/business',
 'dlib.eastview.com/browse/books/1670',
 'www.jstor.org/sustainability',
 'trials.proquest.com/trials/trialsummary.action?view=subject&trialbean.token=csrde8xxjivge9gv69un',
 'www.rocksbackpages.com',
 'sk.sagepub.com/video/business-management',
 'microform.digital/boa/search?facettopic=tradeinpeople',
 'microform.digital/boa/search?facettopic=tww',
 'search.proquest.com/usmajordailies/embedded/sv46nqy98qsi5agf?_ga=2.237868168.927166705.1519997644-1600456357.1519997644',
 'search.alexanderst

In [214]:
# collapse db_url_raw to its distinct entries; a set has no defined order,
# so the resulting list is unordered until it is sorted
db_url_no_dupes = [*set(db_url_raw)]
len(db_url_no_dupes)

735

In [215]:
# put db_url_no_dupes into ascending order, updating the existing list in place
db_url_no_dupes[:] = sorted(db_url_no_dupes)

In [216]:
# number of distinct cleaned urls
len(db_url_no_dupes)

735

In [217]:
# inspect the distinct cleaned urls
db_url_no_dupes

['aadr.alexanderstreet.com',
 'abcas3.auditedmedia.com/micenter/micenter',
 'academic.mintel.com',
 'access.newspaperarchive.com',
 'access.vault.com/career-insider-login.aspx?aid=148188',
 'acta.chadwyck.com',
 'afi.chadwyck.com',
 'amadeus.bvdinfo.com/ip',
 'analyticscampus.gallup.com/?ref=auth',
 'ancestrylibrary.proquest.com',
 'anthrosource.onlinelibrary.wiley.com',
 'apps.brepolis.net/brepolisportal/default.aspx',
 'apps.brepolis.net/dhge/test/default2.aspx',
 'apps.brepolis.net/lexiema/test/default2.aspx',
 'apps.intelligize.com',
 'archive.irishnewsarchive.com/olive/apa/ina.edu/default.aspx#panel=home',
 'archives.chadwyck.com',
 'artfl-project.uchicago.edu/content/tfa',
 'asp6new.alexanderstreet.com/orhi',
 'asp6new.alexanderstreet.com/sixt',
 'asp6new.alexanderstreet.com/was2/was2.index.map.aspx',
 'atho.alexanderstreet.com',
 'bap.chadwyck.com',
 'bc.app.movie-discovery.com',
 'bc.kanopystreaming.com',
 'bc.policymap.com/maps',
 'bldr.alexanderstreet.com',
 'bltc.alexanderst

In [219]:
# parse out the domain name for every url in db_url_dn_raw
db_url_dn = []
# urls from which no (non-empty) domain name could be extracted; kept for
# inspection so that they are not dropped silently
db_url_dn_unparsed = []
for url in db_url_dn_raw:
    get_dn = domain_name_re.search(url)
    # the "dn" group can be an empty string, e.g. when the url does not start
    # with a character allowed in a domain name
    matched_dn = get_dn.group("dn") if get_dn else None
    if matched_dn:
        db_url_dn.append(matched_dn)
    else:
        db_url_dn_unparsed.append(url)

In [220]:
# collapse the parsed domain names in db_url_dn to their distinct values;
# a set has no defined order, so the resulting list is unordered until it is sorted
db_url_dn_no_dupes = [*set(db_url_dn)]
len(db_url_dn_no_dupes)

335

In [221]:
# put db_url_dn_no_dupes into ascending order, updating the existing list in place
db_url_dn_no_dupes[:] = sorted(db_url_dn_no_dupes)

In [222]:
# inspect the distinct domain names
db_url_dn_no_dupes

['18thcjournals.amdigital.co.uk',
 'aadr.alexanderstreet.com',
 'aapredbook.org',
 'abcas3.auditedmedia.com',
 'academic.marketresearch.com',
 'academic.mintel.com',
 'access.newspaperarchive.com',
 'access.vault.com',
 'accessible-archives.com',
 'accessible.com',
 'accessscience.com',
 'acta.chadwyck.com',
 'afi.chadwyck.com',
 'airitilibrary.com',
 'alacra.com',
 'amadeus.bvdinfo.com',
 'analyticscampus.gallup.com',
 'anb.org',
 'ancestrylibrary.proquest.com',
 'anthrosource.onlinelibrary.wiley.com',
 'apps.brepolis.net',
 'apps.intelligize.com',
 'archive.irishnewsarchive.com',
 'archives.chadwyck.com',
 'archivesdirect.amdigital.co.uk',
 'artfl-project.uchicago.edu',
 'askart.com',
 'asp6new.alexanderstreet.com',
 'atho.alexanderstreet.com',
 'balkaninsight.com',
 'bap.chadwyck.com',
 'bc.app.movie-discovery.com',
 'bc.kanopystreaming.com',
 'bc.policymap.com',
 'bdsl-online.de',
 'beckettarchive.org',
 'bibliorossica.com',
 'biocyc.org',
 'blackwellreference.com',
 'bldr.alexande

In [205]:
# scratch check of the substring test: the string counts as "found" as soon
# as either needle ("bc.edu" or "foo") occurs in it
if "bc.edu" in 'libguides.bc.edu/chant' or "foo" in 'libguides.bc.edu/chant':
    print("found in string")
else:
    print("not found in string")

found in string
