# Test notebook for evaluation of Measuring Up privacy leakage analysis

In [20]:
from lxml import html
import requests
import urllib2
from bs4 import BeautifulSoup
import csv
from urlparse import urlsplit, urlunsplit
import datetime
import os
import json

In [16]:
# Notebook configuration

# Membership listing pages of DLF, ARL and OCLC (sources of member library names and URLs)
dataDLF = "https://www.diglib.org/members/"
dataARL = "http://www.arl.org/membership/list-of-arl-members"
dataOCLC = "http://www.oclc.org/research/partnership/roster.html"

# CSV data file listing the libraries whose pages will be retrieved
libraryFile = 'data_all_library.csv'

# Root directories for output: retrieved HTML files and generated tables
writePath, tablePath = 'fileStore', 'tableStore'

# When True, only a partial collection of the libraries in the CSV file is processed
limitedTest = True

In [19]:
# Execution time of this run; used to name the per-run subdirectories below.
# NOTE(review): isoformat() contains ':' characters, which are not valid in
# Windows paths - confirm this notebook is only run on POSIX systems.
runTime = datetime.datetime.now().isoformat()

# Seconds to wait on a single HTTP request before giving up, so that one
# unresponsive host cannot stall the whole run.
requestTimeout = 30

# create the subdirectory where the retrieved files will be saved
# (makedirs also creates the root directory if it does not exist yet)
currentWriteFilePath = writePath + "/" + runTime
if not os.path.isdir(currentWriteFilePath):
    os.makedirs(currentWriteFilePath)

# create the subdirectory where the generated output tables will be saved
tableWriteFilePath = tablePath + "/" + runTime
if not os.path.isdir(tableWriteFilePath):
    os.makedirs(tableWriteFilePath)


def _retrievePage(link, filePrefix, host, writeDir, timeout=requestTimeout):
    """Fetch one page, save its HTML to disk and summarise the response.

    Parameters:
        link       -- full URL to request (redirects are followed).
        filePrefix -- filename prefix, "insecure_" or "secure_".
        host       -- host component of the URL; used in the saved filename.
        writeDir   -- directory in which the HTML file is written.
        timeout    -- seconds passed to requests.get as its timeout.

    Returns:
        dict with the keys 'pageHeaders', 'pageCookies', 'pageRedirectHist',
        'pageRedirectEndURL', 'imageURLs' and 'scriptURLs'.

    Raises:
        Any exception raised by the request, the HTML parsing or the file
        write is propagated to the caller.
    """
    pageRequest = requests.get(link, allow_redirects=True, timeout=timeout)
    pageHTML = pageRequest.content
    pageSoup = BeautifulSoup(pageHTML, 'html.parser')

    # 'src' is None for tags without that attribute (e.g. inline scripts);
    # those entries are kept so the lists have one item per tag found.
    imageURLs = [image.get('src') for image in pageSoup.find_all('img')]
    scriptURLs = [script.get('src') for script in pageSoup.find_all('script')]

    # write out the retrieved HTML file; binary mode because .content is the
    # raw response body and must not go through newline translation
    thisFileFullPathWFilename = writeDir + '/' + filePrefix + host + ".html"
    with open(thisFileFullPathWFilename, 'wb') as htmlFile:
        htmlFile.write(pageHTML)

    # Whether the request ended on HTTPS can be derived from
    # 'pageRedirectEndURL' (urlsplit(...)[0] == 'https'), so no separate
    # flag is stored here.
    return {
        'pageHeaders': pageRequest.headers,
        'pageCookies': dict(pageRequest.cookies),
        'pageRedirectHist': pageRequest.history,
        'pageRedirectEndURL': pageRequest.url,
        'imageURLs': imageURLs,
        'scriptURLs': scriptURLs
    }


# Results of the run, keyed by the requested URL (insecure and secure variants)
outputDict = {}

# Connect to the CSV file containing the list of libraries to test
with open(libraryFile, 'rb') as csvfile:
    libraryReader = csv.DictReader(csvfile, delimiter=",", quotechar='"')
    i = 0
    for libraryRow in libraryReader:
        i += 1
        print("(" + str(i) + ") " + "Processing: " + libraryRow['school'])

        # break the provided URL into components so that secure and insecure requests can be generated
        scheme, host, path, query, fragment = urlsplit(libraryRow['url'])
        insecureLink = urlunsplit(('http', host, path, query, fragment))
        secureLink = urlunsplit(('https', host, path, query, fragment))
        print("Insecure URL: " + insecureLink)
        print("Secure URL: " + secureLink)

        # request the insecure page, then the secure page, and record each result
        for link, filePrefix in ((insecureLink, "insecure_"), (secureLink, "secure_")):
            try:
                outputDict[link] = _retrievePage(link, filePrefix, host, currentWriteFilePath)
            except Exception as e:
                # Best effort: report the failure and continue with the next
                # request. KeyboardInterrupt/SystemExit are deliberately not
                # caught so the run can still be interrupted.
                print("something went wrong - more soon ;) [" + link + "]: " + repr(e))

        print("")
        print("======================================")
        # NOTE(review): with 'i > 10' a limited test stops after the 11th row,
        # not the 10th - confirm which was intended.
        if limitedTest and i > 10:
            break

(1) Processing: University of Alabama Libraries
Insecure URL: http://www.lib.ua.edu/
Secure URL: https://www.lib.ua.edu/
{'http://www.lib.ua.edu/': {'pageHeaders': {'X-Powered-By': 'PHP/5.3.17', 'Transfer-Encoding': 'chunked', 'Vary': 'Accept-Encoding', 'Keep-Alive': 'timeout=15, max=100', 'Server': 'Apache/2.2.12 (Linux/SUSE)', 'Connection': 'Keep-Alive', 'Link': '<https://www.lib.ua.edu/wp-json/>; rel="https://api.w.org/", <https://www.lib.ua.edu/>; rel=shortlink', 'Date': 'Wed, 27 Jul 2016 04:44:27 GMT', 'Content-Type': 'text/html; charset=UTF-8', 'X-Pingback': 'https://www.lib.ua.edu/xmlrpc.php'}, 'scriptURLs': [None, None, u'https://www.lib.ua.edu/wp-content/themes/roots-ualib/assets/js/local.js?ver=7921d642b236f108b809cb1bca816257', None, u'https://www.lib.ua.edu/wp-content/themes/roots-ualib/assets/js/vendor/modernizr.min.js', u'//cdnjs.cloudflare.com/ajax/libs/angular.js/1.2.28/angular.min.js', u'//cdnjs.cloudflare.com/ajax/libs/angular.js/1.2.28/angular-animate.min.js', u'//cd