# Test notebook for evaluation of Measuring Up privacy leakage analysis

In [1]:
from lxml import html
import requests
import urllib2
from bs4 import BeautifulSoup
import csv
from urlparse import urlsplit, urlunsplit
import datetime
import os
import json
import string
import uuid

In [2]:
# Define Variables
# NOTE(review): all three paths below are absolute and machine-specific; they must be
# edited before the notebook can run on another machine.

# data file containing the list of libraries from which to retrieve pages
# (read with csv.DictReader; the 'membership', 'school' and 'url' columns are used)
libraryFile = "/Users/kbene/Repos/RDS-Project-MeasuringUp-Privacy/data_all_library.csv" # Office Desktop location
# libraryFile = "/Users/kbene/Repos/MeasuringUp_library-privacy/data_all_library.csv" # Laptop location

# root path for writing out the retrieved files
# (each run creates a timestamp-named subdirectory under it for the saved HTML pages)
writePath = "/Users/kbene/Box Sync/IMLS-Measuring-Up/IR-Privacy/HTML_Files"
# root path under which each run's result.json, log.txt and sourceList.txt are written,
# again inside a timestamp-named subdirectory
tablePath = "/Users/kbene/Box Sync/IMLS-Measuring-Up/IR-Privacy/JSON_Files"

In [7]:
# Limited test of the partial collection of libraries in the CSV file
# (when True, the crawl loop below stops early instead of processing every row)
limitedTest = False

# set the execution time to use in creating the subdirectory in which the retrieved HTML files will be placed
# (the same timestamp names both the HTML and the JSON/log output directories of this run)
runTime = datetime.datetime.now().isoformat()

# create the subdirectory where the retrieved files will be saved
# os.makedirs (rather than os.mkdir) also creates the root directory when it does not
# exist yet, so a first run on a fresh machine does not fail with OSError
currentWriteFilePath = writePath + "/" + runTime
print("HTML files will be written to: " + currentWriteFilePath)
if not os.path.isdir(currentWriteFilePath):
    os.makedirs(currentWriteFilePath)

# create the subdirectory where the generated output tables will be saved
tableWriteFilePath = tablePath + "/" + runTime
print("JSON, log and source files will be written to: " + tableWriteFilePath)
if not os.path.isdir(tableWriteFilePath):
    os.makedirs(tableWriteFilePath)



# Crawl every library listed in the CSV file.
#
# For each row the home page is requested twice -- once over http and once over
# https -- and for every request that succeeds:
#   * the returned HTML is saved as <requestUUID>.html under currentWriteFilePath
#   * headers, cookies, redirect history, final URL and the image / script /
#     stylesheet URLs found in the page are stored in outputDict[requestUUID]
#   * a "success" line is appended to log.txt
# A failed request only produces an "error" line in log.txt.
# sourceList.txt receives one line per CSV row, and result.json is rewritten after
# every row so that partial results survive an interrupted run.
#
# The three files are opened in a single `with` statement so that all of them are
# closed even if an unexpected exception escapes the loop.
with open(libraryFile, 'rb') as csvfile, \
        open(tableWriteFilePath + "/log.txt", "w") as logfile, \
        open(tableWriteFilePath + "/sourceList.txt", "w") as sourcefile:
    logfile.write("requestNo\tlistSequenceNo\tsourceUUID\trequestUUID\trequestURL\tredirects\tstatus\terror\n")
    sourcefile.write("timestamp\tsourceUUID\tmembership\tschool\turl\n")
    libraryReader = csv.DictReader(csvfile, delimiter=",", quotechar='"')
    i = 0            # 1-based sequence number of the CSV row being processed
    requestNo = 0    # running count of HTTP requests attempted (two per row)
    outputDict = {}  # requestUUID -> metadata of every successful request
    for libraryRow in libraryReader:
        i += 1

        sourceUUID = str(uuid.uuid4())
        org = libraryRow['membership']
        schoolName = libraryRow['school']
        sourceURL = libraryRow['url']
        print("(" + str(i) + ") " + "Processing: " + schoolName)

        sourcefile.write(''.join([datetime.datetime.now().isoformat(),"\t",sourceUUID,"\t",org,"\t",schoolName,"\t",sourceURL,'\n']))

        # break the provided URL into components so that secure and insecure requests can be generated
        scheme, host, path, query, fragment = urlsplit(sourceURL)
        insecureLink = urlunsplit(('http', host, path, query, fragment))
        secureLink = urlunsplit(('https', host, path, query, fragment))
        print("Insecure URL: " + insecureLink)
        print("Secure URL: " + secureLink)

        # request the insecure page first, then the secure one; both are processed identically
        for requestLabel, requestURL in (("insecure", insecureLink), ("secure", secureLink)):
            requestNo += 1
            timestamp = datetime.datetime.now().isoformat()
            requestUUID = str(uuid.uuid4())
            # Reset before every request: the error handler below logs this value, and
            # without the reset it would be undefined on the very first failure or still
            # hold the redirect history of the previous request.
            pageRedirectHist = []
            try:
                # NOTE(review): no timeout is passed, so a server that never answers
                # can stall the whole run -- consider adding one.
                pageRequest = requests.get(requestURL, allow_redirects=True)
                pageHeaders = pageRequest.headers
                pageCookies = dict(pageRequest.cookies)
                pageRedirectHist = pageRequest.history
                pageRedirectEndURL = pageRequest.url
                pageHTML = pageRequest.content
                pageSoup = BeautifulSoup(pageHTML, 'html.parser')

                # One entry per matching tag; a tag lacking the attribute (e.g. an
                # inline <script>) contributes None.
                imageURLs = [image.get('src') for image in pageSoup.find_all('img')]
                scriptURLs = [script.get('src') for script in pageSoup.find_all('script')]
                cssURLs = [link.get('href') for link in pageSoup.find_all('link', rel='stylesheet')]

                # NOTE(review): computed but never written to result.json or log.txt --
                # confirm whether it should be recorded.
                pageSecureRedirect = urlsplit(pageRedirectEndURL)[0] == 'https'

                # write out the retrieved HTML file ('wb' because .content is the raw response bytes)
                thisFileWriteFilename = requestUUID + ".html"
                thisFileFullPathWFilename = currentWriteFilePath + '/' + thisFileWriteFilename
                with open(thisFileFullPathWFilename, 'wb') as f:
                    f.write(pageHTML)

                # generate the outputDict entry for this web site
                thisDict = {requestUUID: {
                        'requestTimestamp': timestamp,
                        'org': org,
                        'schoolName': schoolName,
                        'requestURL': requestURL,
                        'pageHeaders': repr(pageHeaders),
                        'pageCookies': pageCookies,
                        'pageRedirectHist': repr(pageRedirectHist),
                        'pageRedirectEndURL': pageRedirectEndURL,
                        'imageURLs': imageURLs,
                        'scriptURLs': scriptURLs,
                        'cssURLs': cssURLs,
                        'filename': thisFileWriteFilename,
                        'sourceUUID': sourceUUID
                    }}
                outputDict.update(thisDict)
                logstring = ''.join([str(requestNo),"\t",str(i),"\t",sourceUUID,"\t",requestUUID,"\t",requestURL,"\t",repr(pageRedirectHist),"\tsuccess\tnone\n"])
                logfile.write(logstring)
            except Exception as e:
                # Best effort: a failing site is logged and the crawl continues.
                print("")
                print("something went wrong with the " + requestLabel + " request")
                print(e)
                # keep the message on one line and free of tabs so the log stays valid TSV
                errorString = str(e).replace("\t", " ").replace("\r", " ").replace("\n", " ")
                logstring = ''.join([str(requestNo),"\t",str(i),"\t",sourceUUID,"\t",requestUUID,"\t",requestURL,"\t",repr(pageRedirectHist),"\terror","\t",errorString,"\n"])
                logfile.write(logstring)

        # checkpoint: rewrite the complete result table after every library
        with open(tableWriteFilePath + "/result.json", "w") as f:
            f.write(json.dumps(outputDict))
        print("")
        print("======================================")
        # the check runs after the row has been processed, so a limited test covers 11 rows
        if limitedTest and i > 10:
            break

HTML files will be written to: /Users/kbene/Box Sync/IMLS-Measuring-Up/IR-Privacy/HTML_Files/2016-10-05T14:51:11.452293
JSON, log and source files will be written to: /Users/kbene/Box Sync/IMLS-Measuring-Up/IR-Privacy/JSON_Files/2016-10-05T14:51:11.452293
(1) Processing: University of Alabama Libraries
Insecure URL: http://www.lib.ua.edu/
Secure URL: https://www.lib.ua.edu/

(2) Processing: University at Albany, SUNY, Libraries
Insecure URL: http://library.albany.edu/
Secure URL: https://library.albany.edu/

(3) Processing: University of Alberta Libraries
Insecure URL: http://www.library.ualberta.ca/
Secure URL: https://www.library.ualberta.ca/

(4) Processing: University of Arizona Libraries
Insecure URL: http://www.library.arizona.edu/
Secure URL: https://www.library.arizona.edu/

(5) Processing: Arizona State University Libraries
Insecure URL: http://lib.asu.edu/
Secure URL: https://lib.asu.edu/

(6) Processing: Auburn University Libraries
Insecure URL: http://www.lib.auburn.edu/
Se