In [30]:
import os
import time
import re
import pprint
import csv
import time
import copy
import json

In [2]:
# regex objects
# Written as raw strings so the backslashes reach the regex engine untouched;
# in a plain string literal, sequences such as '\.' and '\/' are invalid
# string escapes that newer Python versions warn about.

# the literal text "proxy.bc.edu/login?url=" (case-insensitive)
proxy_re = re.compile(r'proxy\.bc\.edu\/login\?url=', re.IGNORECASE)
# an "http://" or "https://" marker anywhere in a string (case-insensitive)
remove_protocol_re = re.compile(r'https?:\/\/', re.IGNORECASE)
# optional leading "www." followed by the run of letters, digits, '.', '-'
# and '_' captured as the named group "dn"; the trailing '.*' consumes the rest
domain_name_re = re.compile(r'(www\.)?(?P<dn>[a-zA-Z0-9.\-_]*).*', re.IGNORECASE)

In [24]:
# Column layout of az_database_list.csv:
#
# 0   1     2            3    4
# ID, Name, Description, URL, Created,
#
# 5       6      7                      8          9
# Vendor, Types, Alt. Names / Keywords, Use Proxy, Friendly URL,
#
# 10        11         12                13   14
# Subjects, More Info, Librarian Review, New, Trial,
#
# 15       16              17      18             19
# Popular, Permitted Uses, Hidden, Internal Note, Owner

# db_url_raw:    cleaned url of every row whose "Use Proxy" column is "Yes"
# db_url_dn_raw: the same urls, minus those containing 'bc.edu' or
#                'wrds-web.wharton.upenn.edu' (input for domain-name parsing)
db_url_raw = []
db_url_dn_raw = []
with open('az_database_list.csv', newline='') as csvfile:
    db_reader = csv.reader(csvfile, delimiter=',', quotechar='"')

    # skip the header row
    next(db_reader, None)

    for row in db_reader:
        # csv.reader yields [] for blank lines, and a truncated row has no
        # "Use Proxy" column (index 8); skip both instead of raising IndexError
        if len(row) <= 8:
            continue

        db_url = row[3]
        db_uses_proxy = row[8]

        db_url_lower = db_url.lower()

        # only rows flagged "Yes" are collected -- anything else is ignored
        if db_uses_proxy == "Yes":
            # remove url protocol
            db_url_clean = remove_protocol_re.sub("", db_url_lower)

            # strip out trailing forward slash(es)
            db_url_clean = db_url_clean.rstrip('/')

            db_url_raw.append(db_url_clean)

            # for db_url_dn_raw, ignore any urls that contain 'bc.edu' and 'wrds-web.wharton.upenn.edu'
            if "bc.edu" not in db_url_lower and "wrds-web.wharton.upenn.edu" not in db_url_lower:
                db_url_dn_raw.append(db_url_clean)

In [25]:
# number of urls collected from rows whose "Use Proxy" value is "Yes" (duplicates still included)
len(db_url_raw)

755

In [26]:
# number of collected urls left after excluding those containing 'bc.edu' or 'wrds-web.wharton.upenn.edu'
len(db_url_dn_raw)

749

In [28]:
# db_url_raw

In [21]:
# db_url_dn_raw

In [29]:
# collapse repeated urls in db_url_raw: a set keeps one copy of each distinct
# string, and unpacking it gives back a plain list (in arbitrary order)
db_url_no_dupes = [*set(db_url_raw)]
len(db_url_no_dupes)

737

In [35]:
# put db_url_no_dupes into ascending order, reordering the existing list object
db_url_no_dupes[:] = sorted(db_url_no_dupes)

In [36]:
# sorting only reorders the list, so this is still the de-duplicated url count
len(db_url_no_dupes)

737

In [44]:
# db_url_no_dupes

In [52]:
# write out to json: the sorted, de-duplicated urls plus a generation timestamp
db_list = {
    "databases": db_url_no_dupes,
    "date_stamp": time.strftime("%Y%m%d-%H%M%S")
}
# ensure_ascii=False writes non-ASCII characters verbatim, so the file encoding
# is pinned to UTF-8 rather than left to the platform default
with open('all_databases.txt', 'w', encoding='utf-8') as outfile:
    json.dump(db_list, outfile, ensure_ascii=False, indent=4)

In [70]:
# write out to javascript: defines the globals db_full_names (array of url
# strings) and db_full_names_date_stamp (generation timestamp)
with open('all_databases.js', 'w', encoding='utf-8') as outfile:
    outfile.write("var db_full_names = [\n")
    for item in db_url_no_dupes:
        # Escape backslashes first (so later escapes are not doubled), then
        # single quotes and line breaks, so every url stays a valid
        # single-quoted JavaScript string literal. Urls without these
        # characters are written exactly as before.
        js_safe_item = (
            item.replace("\\", "\\\\")
            .replace("'", "\\'")
            .replace("\n", "\\n")
            .replace("\r", "\\r")
        )
        outfile.write("\t'%s',\n" % js_safe_item)
    outfile.write("];\n")
    outfile.write("var db_full_names_date_stamp = '{}';\n".format(time.strftime("%Y%m%d-%H%M%S")))

In [53]:
# parse out the domain name for every url in db_url_dn_raw
db_url_dn = []
# urls for which no domain name could be extracted; kept so they can be
# reported instead of being dropped silently
db_url_dn_unparsed = []
for url in db_url_dn_raw:
    get_dn = domain_name_re.search(url)
    # the "dn" group can match an empty string (e.g. the url starts with a
    # character outside the host character class), so test the captured text
    matched_dn = get_dn.group("dn") if get_dn else ""
    if matched_dn:
        db_url_dn.append(matched_dn)
    else:
        db_url_dn_unparsed.append(url)

# surface anything that was skipped so data problems are visible in the output
if db_url_dn_unparsed:
    print("could not extract a domain name from {} url(s):".format(len(db_url_dn_unparsed)))
    for unparsed_url in db_url_dn_unparsed:
        print("\t" + repr(unparsed_url))

In [54]:
# collapse repeated domain names in db_url_dn: a set keeps one copy of each
# distinct string, and unpacking it gives back a plain list (in arbitrary order)
db_url_dn_no_dupes = [*set(db_url_dn)]
len(db_url_dn_no_dupes)

336

In [55]:
# put db_url_dn_no_dupes into ascending order, reordering the existing list object
db_url_dn_no_dupes[:] = sorted(db_url_dn_no_dupes)

In [56]:
# db_url_dn_no_dupes

In [57]:
# serialize the sorted, unique domain names together with a generation
# timestamp as pretty-printed JSON
db_list = dict(
    databases=db_url_dn_no_dupes,
    date_stamp=time.strftime("%Y%m%d-%H%M%S"),
)
with open('all_databases_domain_names.txt', 'w') as outfile:
    json.dump(obj=db_list, fp=outfile, ensure_ascii=False, indent=4)

In [69]:
# emit the unique domain names as a JavaScript file defining the globals
# db_unique_domain_names (array) and db_unique_domain_names_date_stamp
with open('all_databases_domain_names.js', 'w') as outfile:
    outfile.write("var db_unique_domain_names = [\n")
    # one tab-indented, single-quoted array entry per domain name
    outfile.writelines("\t'%s',\n" % item for item in db_url_dn_no_dupes)
    outfile.write("];\n")
    outfile.write(
        "var db_unique_domain_names_date_stamp = '{}';\n".format(
            time.strftime("%Y%m%d-%H%M%S")
        )
    )