diff --git a/report/report.lyx b/report/report.lyx index 18d2149..ab3b81a 100644 --- a/report/report.lyx +++ b/report/report.lyx @@ -2589,7 +2589,7 @@ When .SE performs their annual These domains are mostly from the .se zone and cover government, county, municipality, higher education, government-owned corporations, financial service, internet service provider (ISP), domain registrar, and media domains. - Some domains overlap both in and between categories; domains have been + Some domains overlap both within and between categories; domains have been deduplicated. \end_layout @@ -2669,14 +2669,14 @@ numprint{10000} \end_layout \begin_layout Subsection -Random .com, .net, .name domains +Random .com, .net domains \end_layout \begin_layout Standard The maintainers of the .com, .net and .name TLDs, Verisign, allow downloading of the complete zone file under an agreement. The .com zone is the largest one by far, and the .net zone is in the top - 5. + 4. \begin_inset Foot status open @@ -2849,15 +2849,16 @@ These are the final domain lists in use. 
\begin_layout Standard \begin_inset Tabular - + - - - - - + + + + + + - + \begin_inset Text \begin_layout Plain Layout @@ -2866,7 +2867,7 @@ Name \end_inset - + \begin_inset Text \begin_layout Plain Layout @@ -2875,7 +2876,7 @@ Date \end_inset - + \begin_inset Text \begin_layout Plain Layout @@ -2884,7 +2885,7 @@ Total size \end_inset - + \begin_inset Text \begin_layout Plain Layout @@ -2893,18 +2894,27 @@ Selection \end_inset - + \begin_inset Text \begin_layout Plain Layout Selection size \end_layout +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +Unique +\end_layout + \end_inset - + \begin_inset Text \begin_layout Plain Layout @@ -2913,7 +2923,7 @@ Selection size \end_inset - + \begin_inset Text \begin_layout Plain Layout @@ -2922,16 +2932,28 @@ Selection size \end_inset - + \begin_inset Text \begin_layout Plain Layout +\begin_inset ERT +status open + +\begin_layout Plain Layout + + +\backslash +numprint{980} +\end_layout + +\end_inset + \end_layout \end_inset - + \begin_inset Text \begin_layout Plain Layout @@ -2940,7 +2962,16 @@ curated \end_inset - + +\begin_inset Text + +\begin_layout Plain Layout + +\end_layout + +\end_inset + + \begin_inset Text \begin_layout Plain Layout @@ -2963,7 +2994,7 @@ numprint{915} - + \begin_inset Text \begin_layout Plain Layout @@ -2972,7 +3003,7 @@ numprint{915} \end_inset - + \begin_inset Text \begin_layout Plain Layout @@ -2981,34 +3012,34 @@ numprint{915} \end_inset - + \begin_inset Text \begin_layout Plain Layout -\begin_inset ERT +\begin_inset Foot status open \begin_layout Plain Layout +\begin_inset CommandInset href +LatexCommand href +target "https://www.iis.se/domaner/statistik/tillvaxt/?chart=active" + +\end_inset -\backslash -numprint{1318000} \end_layout \end_inset -\begin_inset Foot +\begin_inset ERT status open \begin_layout Plain Layout -\begin_inset CommandInset href -LatexCommand href -target "https://www.iis.se/domaner/statistik/tillvaxt/?chart=active" - -\end_inset +\backslash 
+numprint{1318000} \end_layout \end_inset @@ -3018,7 +3049,7 @@ target "https://www.iis.se/domaner/statistik/tillvaxt/?chart=active" \end_inset - + \begin_inset Text \begin_layout Plain Layout @@ -3027,7 +3058,28 @@ random \end_inset - + +\begin_inset Text + +\begin_layout Plain Layout +\begin_inset ERT +status open + +\begin_layout Plain Layout + + +\backslash +numprint{100000} +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + \begin_inset Text \begin_layout Plain Layout @@ -3050,7 +3102,7 @@ numprint{100000} - + \begin_inset Text \begin_layout Plain Layout @@ -3059,7 +3111,7 @@ numprint{100000} \end_inset - + \begin_inset Text \begin_layout Plain Layout @@ -3068,34 +3120,34 @@ numprint{100000} \end_inset - + \begin_inset Text \begin_layout Plain Layout -\begin_inset ERT +\begin_inset Foot status open \begin_layout Plain Layout +\begin_inset CommandInset href +LatexCommand href +target "https://stats.dk-hostmaster.dk/domains/total_domains/" + +\end_inset -\backslash -numprint{1260000} \end_layout \end_inset -\begin_inset Foot +\begin_inset ERT status open \begin_layout Plain Layout -\begin_inset CommandInset href -LatexCommand href -target "https://stats.dk-hostmaster.dk/domains/total_domains/" - -\end_inset +\backslash +numprint{1260000} \end_layout \end_inset @@ -3105,7 +3157,7 @@ target "https://stats.dk-hostmaster.dk/domains/total_domains/" \end_inset - + \begin_inset Text \begin_layout Plain Layout @@ -3114,7 +3166,28 @@ random \end_inset - + +\begin_inset Text + +\begin_layout Plain Layout +\begin_inset ERT +status open + +\begin_layout Plain Layout + + +\backslash +numprint{10000} +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + \begin_inset Text \begin_layout Plain Layout @@ -3137,7 +3210,7 @@ numprint{10000} - + \begin_inset Text \begin_layout Plain Layout @@ -3146,7 +3219,7 @@ numprint{10000} \end_inset - + \begin_inset Text \begin_layout Plain Layout @@ -3155,7 +3228,7 @@ numprint{10000} \end_inset - + \begin_inset Text 
\begin_layout Plain Layout @@ -3176,7 +3249,7 @@ numprint{114178000} \end_inset - + \begin_inset Text \begin_layout Plain Layout @@ -3185,7 +3258,28 @@ random \end_inset - + +\begin_inset Text + +\begin_layout Plain Layout +\begin_inset ERT +status open + +\begin_layout Plain Layout + + +\backslash +numprint{10000} +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + \begin_inset Text \begin_layout Plain Layout @@ -3208,7 +3302,7 @@ numprint{10000} - + \begin_inset Text \begin_layout Plain Layout @@ -3217,7 +3311,7 @@ numprint{10000} \end_inset - + \begin_inset Text \begin_layout Plain Layout @@ -3226,7 +3320,7 @@ numprint{10000} \end_inset - + \begin_inset Text \begin_layout Plain Layout @@ -3247,7 +3341,7 @@ numprint{15096000} \end_inset - + \begin_inset Text \begin_layout Plain Layout @@ -3256,7 +3350,28 @@ random \end_inset - + +\begin_inset Text + +\begin_layout Plain Layout +\begin_inset ERT +status open + +\begin_layout Plain Layout + + +\backslash +numprint{10000} +\end_layout + +\end_inset + + +\end_layout + +\end_inset + + \begin_inset Text \begin_layout Plain Layout @@ -3279,7 +3394,7 @@ numprint{10000} - + \begin_inset Text \begin_layout Plain Layout @@ -3288,7 +3403,7 @@ reach50.com \end_inset - + \begin_inset Text \begin_layout Plain Layout @@ -3297,7 +3412,7 @@ reach50.com \end_inset - + \begin_inset Text \begin_layout Plain Layout @@ -3318,7 +3433,7 @@ numprint{50} \end_inset - + \begin_inset Text \begin_layout Plain Layout @@ -3327,7 +3442,16 @@ top \end_inset - + +\begin_inset Text + +\begin_layout Plain Layout + +\end_layout + +\end_inset + + \begin_inset Text \begin_layout Plain Layout @@ -3350,7 +3474,7 @@ numprint{50} - + \begin_inset Text \begin_layout Plain Layout @@ -3359,7 +3483,7 @@ Alexa Top 1M \end_inset - + \begin_inset Text \begin_layout Plain Layout @@ -3368,7 +3492,7 @@ Alexa Top 1M \end_inset - + \begin_inset Text \begin_layout Plain Layout @@ -3389,7 +3513,7 @@ numprint{1000000} \end_inset - + \begin_inset Text 
\begin_layout Plain Layout @@ -3398,7 +3522,7 @@ top \end_inset - + \begin_inset Text \begin_layout Plain Layout @@ -3415,13 +3539,34 @@ numprint{10000} \end_inset +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +\begin_inset ERT +status open + +\begin_layout Plain Layout + + +\backslash +numprint{9986} +\end_layout + +\end_inset + + \end_layout \end_inset - + \begin_inset Text \begin_layout Plain Layout @@ -3430,7 +3575,7 @@ numprint{10000} \end_inset - + \begin_inset Text \begin_layout Plain Layout @@ -3439,7 +3584,7 @@ numprint{10000} \end_inset - + \begin_inset Text \begin_layout Plain Layout @@ -3448,7 +3593,7 @@ numprint{10000} \end_inset - + \begin_inset Text \begin_layout Plain Layout @@ -3457,7 +3602,7 @@ random \end_inset - + \begin_inset Text \begin_layout Plain Layout @@ -3474,13 +3619,34 @@ numprint{10000} \end_inset +\end_layout + +\end_inset + + +\begin_inset Text + +\begin_layout Plain Layout +\begin_inset ERT +status open + +\begin_layout Plain Layout + + +\backslash +numprint{9959} +\end_layout + +\end_inset + + \end_layout \end_inset - + \begin_inset Text \begin_layout Plain Layout @@ -3489,7 +3655,7 @@ numprint{10000} \end_inset - + \begin_inset Text \begin_layout Plain Layout @@ -3498,7 +3664,7 @@ numprint{10000} \end_inset - + \begin_inset Text \begin_layout Plain Layout @@ -3507,7 +3673,7 @@ numprint{10000} \end_inset - + \begin_inset Text \begin_layout Plain Layout @@ -3516,7 +3682,16 @@ numprint{10000} \end_inset - + +\begin_inset Text + +\begin_layout Plain Layout + +\end_layout + +\end_inset + + \begin_inset Text \begin_layout Plain Layout @@ -3539,7 +3714,7 @@ numprint{3364} - + \begin_inset Text \begin_layout Plain Layout @@ -3548,7 +3723,7 @@ numprint{3364} \end_inset - + \begin_inset Text \begin_layout Plain Layout @@ -3557,7 +3732,7 @@ numprint{3364} \end_inset - + \begin_inset Text \begin_layout Plain Layout @@ -3566,7 +3741,7 @@ numprint{3364} \end_inset - + \begin_inset Text \begin_layout Plain 
Layout @@ -3575,7 +3750,16 @@ numprint{3364} \end_inset - + +\begin_inset Text + +\begin_layout Plain Layout + +\end_layout + +\end_inset + + \begin_inset Text \begin_layout Plain Layout @@ -3598,16 +3782,18 @@ numprint{2637} - + \begin_inset Text \begin_layout Plain Layout + +\series bold Total \end_layout \end_inset - + \begin_inset Text \begin_layout Plain Layout @@ -3616,10 +3802,12 @@ Total \end_inset - + \begin_inset Text \begin_layout Plain Layout + +\series bold \begin_inset ERT status open @@ -3637,7 +3825,7 @@ numprint{132852050} \end_inset - + \begin_inset Text \begin_layout Plain Layout @@ -3646,10 +3834,12 @@ numprint{132852050} \end_inset - + \begin_inset Text \begin_layout Plain Layout + +\series bold \begin_inset ERT status open @@ -3667,541 +3857,559 @@ numprint{156907} \end_inset - - + +\begin_inset Text -\end_inset +\begin_layout Plain Layout +\series bold +\begin_inset ERT +status open -\end_layout +\begin_layout Plain Layout -\begin_layout Section -External datasets -\end_layout -\begin_layout Subsection -Disconnect's blocking list +\backslash +numprint{156045} \end_layout -\begin_layout Standard -One of the most popular privacy tools is Disconnect, which blocks tracking - sites by running as a browser plugin. - Disconnect was started by ex-Google engineers, and seems to still have - close ties to Google. -\end_layout +\end_inset -\begin_layout Standard -The Disconnect software lets users block/unblock loading resources from - specific third-party domains. - A list of 2143 domains is used as the basis from the blocking. - Each entry belongs to an organization, including a link to the organizations - webpage. - There is also a grouping into categories, here shown with some examples. - Worth noting is that the content category isn't blocked by default. - -\begin_inset Note Greyedout -status open -\begin_layout Plain Layout -Write about top results in datasets. 
\end_layout +\end_inset + + + + \end_inset \end_layout -\begin_layout Subsubsection -Advertising +\begin_layout Subsection +TLD distribution \end_layout \begin_layout Standard -\begin_inset Note Greyedout -status open +These are the top TLDs in the list of unique domains. +\end_layout + +\begin_layout Standard +\begin_inset Tabular + + + + + + + +\begin_inset Text \begin_layout Plain Layout -Add technorati.com, wpp.com? +Rank \end_layout \end_inset + + +\begin_inset Text - +\begin_layout Plain Layout +Count \end_layout -\begin_layout Description -overture.com Yahoo's ad network. -\end_layout +\end_inset + + +\begin_inset Text -\begin_layout Description -omniture.com Adobe's ad network. +\begin_layout Plain Layout +TLD \end_layout -\begin_layout Description -amazon-adsystem.com Amazon's ad delivery network. -\end_layout +\end_inset + + + + +\begin_inset Text -\begin_layout Subsubsection -Analytics -\end_layout +\begin_layout Plain Layout -\begin_layout Description -alexa.com Amazon's web statistics service, considered an authority in web - measurement. +\family roman +\series medium +\shape up +\size normal +\emph off +\bar no +\strikeout off +\uuline off +\uwave off +\noun off +\color none +1 \end_layout -\begin_layout Description -comscore.com Analytics service that also publishes statistics. -\end_layout +\end_inset + + +\begin_inset Text -\begin_layout Description -gaug.es GitHub's analytics service. -\end_layout +\begin_layout Plain Layout +\begin_inset ERT +status open -\begin_layout Description -coremetrics.com Part of IBM's enterprise marketing services. -\end_layout +\begin_layout Plain Layout -\begin_layout Description -newrelic.com A suite of systems monitoring and analytics software, up to - and including browsers. -\end_layout -\begin_layout Description -nielsen.com Consumer studies. +\backslash +numprint{103645} \end_layout -\begin_layout Description -statcounter.com Web statistics tool. 
-\end_layout +\end_inset -\begin_layout Description -webtrends.com Digital marketing analytics and optimization across channels. -\end_layout -\begin_layout Subsubsection -Content \end_layout -\begin_layout Standard -Sites that deliver content. - There is a wide variety of content, from images and videos to A/B testing, - comment and helpdesk services. - This category is not blocked by default. -\end_layout +\end_inset + + +\begin_inset Text -\begin_layout Description -brightcove.com Video hosting/monetization service. -\end_layout +\begin_layout Plain Layout -\begin_layout Description -disqus.com A third party comment service. +\family roman +\series medium +\shape up +\size normal +\emph off +\bar no +\strikeout off +\uuline off +\uwave off +\noun off +\color none +.se \end_layout -\begin_layout Description -flickr.com Flickr is a photo/video hosting site, owned by Yahoo. -\end_layout +\end_inset + + + + +\begin_inset Text -\begin_layout Description -instagram.com Facebook's photo/video sharing site. -\end_layout +\begin_layout Plain Layout -\begin_layout Description -office.com Microsoft's Office suite online. +\family roman +\series medium +\shape up +\size normal +\emph off +\bar no +\strikeout off +\uuline off +\uwave off +\noun off +\color none +2 \end_layout -\begin_layout Description -optimizely.com An A/B testing service. -\end_layout +\end_inset + + +\begin_inset Text -\begin_layout Description -truste.com Provides certification and tools for privacy policies in order - to gain users' trust; “enabling businesses to safely collect and use customer - data across web, mobile, cloud and advertising channels.” This includes - ways to selectively opt-out from cookies features; required, functional - or advertising. -\end_layout +\begin_layout Plain Layout +\begin_inset ERT +status open -\begin_layout Description -tumblr.com A popular blogging platform. 
-\end_layout +\begin_layout Plain Layout -\begin_layout Description -uservoice.com A customer support service. -\end_layout -\begin_layout Description -vimeo.com A video site. +\backslash +numprint{21203} \end_layout -\begin_layout Description -youtube.com Google's video site. -\end_layout +\end_inset -\begin_layout Subsubsection -Disconnect -\end_layout -\begin_layout Standard -A special category for Facebook, Google and Twitter. - It seems to initially have been designed to block their respective like/+1/twee -t buttons, but also contains other known tracking domains belonging to the - same organization. \end_layout -\begin_layout Standard -It's worth noting that adwords.google.com, doubleclick.net, admob.com and several - other of Google's ad networks, are listed here. -\end_layout +\end_inset + + +\begin_inset Text -\begin_layout Subsubsection -Social -\end_layout +\begin_layout Plain Layout -\begin_layout Standard -Site with an emphasis on social aspects. - They often have buttons to vote for, recommend or share with others. +\family roman +\series medium +\shape up +\size normal +\emph off +\bar no +\strikeout off +\uuline off +\uwave off +\noun off +\color none +.com \end_layout -\begin_layout Description -addthis.com A link sharing service aggregator. -\end_layout +\end_inset + + + + +\begin_inset Text -\begin_layout Description -digg.com News aggregator. -\end_layout +\begin_layout Plain Layout -\begin_layout Description -linkedin.com Professional social network. +\family roman +\series medium +\shape up +\size normal +\emph off +\bar no +\strikeout off +\uuline off +\uwave off +\noun off +\color none +3 \end_layout -\begin_layout Description -reddit.com Social new and link sharing, and discussion. 
-\end_layout +\end_inset + + +\begin_inset Text -\begin_layout Subsection -Public suffix list -\begin_inset Foot +\begin_layout Plain Layout +\begin_inset ERT status open \begin_layout Plain Layout -\begin_inset CommandInset href -LatexCommand href -target "https://publicsuffix.org/" - -\end_inset +\backslash +numprint{12610} \end_layout \end_inset -\begin_inset Foot -status open - -\begin_layout Plain Layout -\begin_inset CommandInset href -LatexCommand href -target "https://en.wikipedia.org/wiki/Public_Suffix_List" +\end_layout \end_inset + + +\begin_inset Text +\begin_layout Plain Layout +\family roman +\series medium +\shape up +\size normal +\emph off +\bar no +\strikeout off +\uuline off +\uwave off +\noun off +\color none +.dk \end_layout \end_inset + + + + +\begin_inset Text +\begin_layout Plain Layout +\family roman +\series medium +\shape up +\size normal +\emph off +\bar no +\strikeout off +\uuline off +\uwave off +\noun off +\color none +4 \end_layout -\begin_layout Standard -In the domain name system, it is not always obvious what parts of a domain - name are a public suffix and which are open for registration by Internet - users. - The main example is -\begin_inset Flex Code -status collapsed +\end_inset + + +\begin_inset Text \begin_layout Plain Layout -example.co.uk +\begin_inset ERT +status open + +\begin_layout Plain Layout + + +\backslash +numprint{11012} \end_layout \end_inset -, where the public suffix -\begin_inset Flex Code -status collapsed -\begin_layout Plain Layout -co.uk \end_layout \end_inset - - is to different from the TLD -\begin_inset Flex Code -status collapsed + + +\begin_inset Text \begin_layout Plain Layout -uk + +\family roman +\series medium +\shape up +\size normal +\emph off +\bar no +\strikeout off +\uuline off +\uwave off +\noun off +\color none +.net \end_layout \end_inset - -. 
- Because HTTP cookies are based on domains names, it is important to browser - vendors to be able to recognize which parts are public suffixes to be able - to protect users against supercookies -\begin_inset Foot -status open + + + + +\begin_inset Text \begin_layout Plain Layout -\begin_inset CommandInset href -LatexCommand href -target "https://en.wikipedia.org/wiki/HTTP_cookie#Supercookie" - -\end_inset - +\family roman +\series medium +\shape up +\size normal +\emph off +\bar no +\strikeout off +\uuline off +\uwave off +\noun off +\color none +5 \end_layout \end_inset - -; cookies which are scoped to a public suffix, and therefore readable across - all web sites under that public suffix. - The same dataset is also useful for grouping domains without improperly - counting -\begin_inset Flex Code -status collapsed + + +\begin_inset Text \begin_layout Plain Layout -example.co.uk -\end_layout - -\end_inset - - as a -\emph on -user-owned subdomain -\emph default - of -\begin_inset Flex Code -status collapsed +\begin_inset ERT +status open \begin_layout Plain Layout -co.uk -\end_layout - -\end_inset -, which would then render -\begin_inset Flex Code -status collapsed -\begin_layout Plain Layout -co.uk +\backslash +numprint{650} \end_layout \end_inset - as the most popular domain under the -\begin_inset Flex Code -status collapsed -\begin_layout Plain Layout -uk \end_layout \end_inset - - TLD. 
-\end_layout - -\begin_layout Standard -Swedish examples include second level domains -\begin_inset Flex Code -status collapsed + + +\begin_inset Text \begin_layout Plain Layout -pp.se -\end_layout - -\end_inset - - for privately owned domains and -\begin_inset Flex Code -status collapsed -\begin_layout Plain Layout -tm.se +\family roman +\series medium +\shape up +\size normal +\emph off +\bar no +\strikeout off +\uuline off +\uwave off +\noun off +\color none +.ru \end_layout \end_inset - - for trademarks -\begin_inset Foot -status open + + + + +\begin_inset Text \begin_layout Plain Layout -\begin_inset CommandInset href -LatexCommand href -target "https://www.iis.se/data/barred_domains_list.txt" - -\end_inset - +\family roman +\series medium +\shape up +\size normal +\emph off +\bar no +\strikeout off +\uuline off +\uwave off +\noun off +\color none +6 \end_layout \end_inset + + +\begin_inset Text -. - These second level domains were more important before April 2003 -\begin_inset Foot +\begin_layout Plain Layout +\begin_inset ERT status open \begin_layout Plain Layout -\begin_inset CommandInset href -LatexCommand href -target "https://en.wikipedia.org/wiki/.se#Pre_2003_system" - -\end_inset +\backslash +numprint{639} \end_layout \end_inset -, when first level domain registration rules restricted registration to - nation-wide companies, associations and authorities. -\end_layout - -\begin_layout Standard -The public suffix list specification contains an algorithm for certain wildcard - rules that have exceptions, which this thesis has not implemented fully. - These 10 exception were deemed insignificant, as 7 of them are Japanese - cities grouped by geographic areas and the remaining 3 seem to belong to - ccTLD owner organizations. 
-\end_layout - -\begin_layout Chapter -Retrieving websites and resources -\end_layout - -\begin_layout Section -Computer machines -\end_layout - -\begin_layout Standard -Two computers were used to download web pages - one laptop machine and one - server machine. - The server is significantly more powerful than the laptop, and they downloaded - a different number of web pages at a a time. -\end_layout - -\begin_layout Standard -\begin_inset Tabular - - - - - - - - - - -\begin_inset Text -\begin_layout Plain Layout -Machine \end_layout \end_inset - + \begin_inset Text \begin_layout Plain Layout -OS, version -\end_layout - -\end_inset - - -\begin_inset Text -\begin_layout Plain Layout -CPU architecture +\family roman +\series medium +\shape up +\size normal +\emph off +\bar no +\strikeout off +\uuline off +\uwave off +\noun off +\color none +.org \end_layout \end_inset - + + + \begin_inset Text \begin_layout Plain Layout -CPU cores + +\family roman +\series medium +\shape up +\size normal +\emph off +\bar no +\strikeout off +\uuline off +\uwave off +\noun off +\color none +7 \end_layout \end_inset - + \begin_inset Text \begin_layout Plain Layout -Core speed (MHz) -\end_layout - -\end_inset - - -\begin_inset Text +\begin_inset ERT +status open \begin_layout Plain Layout -Memory (GB) -\end_layout -\end_inset - - - - -\begin_inset Text -\begin_layout Plain Layout -Laptop +\backslash +numprint{619} \end_layout \end_inset - - -\begin_inset Text -\begin_layout Plain Layout -Mac OS X 10.9.2 Mavericks + \end_layout \end_inset - + \begin_inset Text \begin_layout Plain Layout -x86/64 bit + +\family roman +\series medium +\shape up +\size normal +\emph off +\bar no +\strikeout off +\uuline off +\uwave off +\noun off +\color none +.de \end_layout \end_inset - + + + \begin_inset Text \begin_layout Plain Layout -2 + +\family roman +\series medium +\shape up +\size normal +\emph off +\bar no +\strikeout off +\uuline off +\uwave off +\noun off +\color none +8 \end_layout \end_inset - 
+ \begin_inset Text \begin_layout Plain Layout @@ -4212,7 +4420,7 @@ status open \backslash -numprint{2800} +numprint{441} \end_layout \end_inset @@ -4222,54 +4430,116 @@ numprint{2800} \end_inset - + \begin_inset Text \begin_layout Plain Layout -8 + +\family roman +\series medium +\shape up +\size normal +\emph off +\bar no +\strikeout off +\uuline off +\uwave off +\noun off +\color none +.jp \end_layout \end_inset - + \begin_inset Text \begin_layout Plain Layout -Server + +\family roman +\series medium +\shape up +\size normal +\emph off +\bar no +\strikeout off +\uuline off +\uwave off +\noun off +\color none +9 \end_layout \end_inset - + \begin_inset Text \begin_layout Plain Layout -Debian GNU/Linux 8 Jessie/Sid +\begin_inset ERT +status open + +\begin_layout Plain Layout + + +\backslash +numprint{334} +\end_layout + +\end_inset + + \end_layout \end_inset - + \begin_inset Text \begin_layout Plain Layout -x86/64 bit + +\family roman +\series medium +\shape up +\size normal +\emph off +\bar no +\strikeout off +\uuline off +\uwave off +\noun off +\color none +.br \end_layout \end_inset - + + + \begin_inset Text \begin_layout Plain Layout -4 + +\family roman +\series medium +\shape up +\size normal +\emph off +\bar no +\strikeout off +\uuline off +\uwave off +\noun off +\color none +10 \end_layout \end_inset - + \begin_inset Text \begin_layout Plain Layout @@ -4280,7 +4550,7 @@ status open \backslash -numprint{2500} +numprint{316} \end_layout \end_inset @@ -4290,12 +4560,24 @@ numprint{2500} \end_inset - + \begin_inset Text \begin_layout Plain Layout -16 -\end_layout + +\family roman +\series medium +\shape up +\size normal +\emph off +\bar no +\strikeout off +\uuline off +\uwave off +\noun off +\color none +.uk +\end_layout \end_inset @@ -4308,157 +4590,223 @@ numprint{2500} \end_layout \begin_layout Section -Network connection +External datasets \end_layout -\begin_layout Standard -The laptop machine was connected by ethernet to the .SE office network, which - is 
shared with employees' computers. - The server machine was connected to server co-location network, which is - shared with other servers. - The .SE network technicians said load was kept very low, and only a few - percent of the dedicated 100 Mbps per location was used. - Both locations are in Stockholm city, and should therefore be well placed - in regard to web sites hosted in Sweden. +\begin_layout Subsection +Disconnect's blocking list \end_layout -\begin_layout Section -Software considerations +\begin_layout Standard +One of the most popular privacy tools is Disconnect, which blocks tracking + sites by running as a browser plugin. + Disconnect was started by ex-Google engineers, and seems to still have + close ties to Google. \end_layout \begin_layout Standard -To expedite an automated and repeatable process, a custom set of scripts - were written as the project har-heedless. - The scripts are written using standard tools, available as open source - and on multiple platforms. +The Disconnect software lets users block/unblock loading resources from + specific third-party domains. + A list of 2143 domains is used as the basis for the blocking. + Each entry belongs to an organization, including a link to the organization's + webpage. + There is also a grouping into categories, here shown with some examples. + Worth noting is that the content category isn't blocked by default. + +\begin_inset Note Greyedout +status open + +\begin_layout Plain Layout +Write about top results in datasets. \end_layout -\begin_layout Subsection -Dynamic web pages +\end_inset + + +\end_layout + +\begin_layout Subsubsection +Advertising \end_layout \begin_layout Standard -Previous efforts to download and analyze web pages by .SE used a static approach, - analyzing the HTML by means of simple searches for -\begin_inset Flex Code -status collapsed +\begin_inset Note Greyedout +status open \begin_layout Plain Layout -http:// +Add technorati.com, wpp.com? 
\end_layout \end_inset - and -\begin_inset Flex Code -status collapsed -\begin_layout Plain Layout -https:// \end_layout -\end_inset +\begin_layout Description +overture.com Yahoo's ad network. +\end_layout - strings in HTML and CSS. - It had proven hard to maintain, and the software project was abandoned - before the thesis was started, but hadn't yet been replaced. - In order to better handle the dynamic nature of modern web pages, the headless - browser phantomjs was chosen, as it would also download and execute javascript - - a major component in both user interfaces as well as active trackers - and ads. +\begin_layout Description +omniture.com Adobe's ad network. \end_layout -\begin_layout Subsection -Flash files +\begin_layout Description +amazon-adsystem.com Amazon's ad delivery network. +\end_layout + +\begin_layout Subsubsection +Analytics +\end_layout + +\begin_layout Description +alexa.com Amazon's web statistics service, considered an authority in web + measurement. +\end_layout + +\begin_layout Description +comscore.com Analytics service that also publishes statistics. +\end_layout + +\begin_layout Description +gaug.es GitHub's analytics service. +\end_layout + +\begin_layout Description +coremetrics.com Part of IBM's enterprise marketing services. +\end_layout + +\begin_layout Description +newrelic.com A suite of systems monitoring and analytics software, up to + and including browsers. +\end_layout + +\begin_layout Description +nielsen.com Consumer studies. +\end_layout + +\begin_layout Description +statcounter.com Web statistics tool. +\end_layout + +\begin_layout Description +webtrends.com Digital marketing analytics and optimization across channels. +\end_layout + +\begin_layout Subsubsection +Content \end_layout \begin_layout Standard -Flash is a scriptable proprietary cross-platform vector based web technology - owned by Adobe. 
- Several kinds of content, including video players, games and ads, use Flash - because it has historically been better suited than javascript for in-browser - moving graphics and video. - Flash usage has not been considered for this thesis as the technology isn - not available on all popular web browsing platforms, notably Apple's iPad, - and is being phased out by HTML 5 features such as -\begin_inset Flex Code -status collapsed +Sites that deliver content. + There is a wide variety of content, from images and videos to A/B testing, + comment and helpdesk services. + This category is not blocked by default. +\end_layout -\begin_layout Plain Layout - +\begin_layout Description +brightcove.com Video hosting/monetization service. \end_layout -\end_inset +\begin_layout Description +disqus.com A third party comment service. +\end_layout - and -\begin_inset Flex Code -status collapsed +\begin_layout Description +flickr.com Flickr is a photo/video hosting site, owned by Yahoo. +\end_layout -\begin_layout Plain Layout -