From 9f0d7e7442a7f6c706569662d42424be28ba1f33 Mon Sep 17 00:00:00 2001 From: Joel Purra Date: Mon, 2 Feb 2015 20:20:17 +0100 Subject: [PATCH] Rewrote introduction, results --- report/report.lyx | 424 ++++++++++++++++++++++++++++++++-------------- 1 file changed, 301 insertions(+), 123 deletions(-) diff --git a/report/report.lyx b/report/report.lyx index ce10c2a..89d6d0b 100644 --- a/report/report.lyx +++ b/report/report.lyx @@ -347,19 +347,6 @@ tracked Data collection is the new wild west, and you are the new cattle. \end_layout -\begin_layout Standard -\begin_inset Note Greyedout -status open - -\begin_layout Plain Layout -Mention related work? -\end_layout - -\end_inset - - -\end_layout - \begin_layout Standard This thesis uses large-scale measurements to characterize how different kinds of domains in Sweden and internationally use website resources. @@ -383,108 +370,176 @@ numprint{150000} Resources were grouped by mime type, URL protocol, domain, if it matches the domain the request originated from and compared to lists of known trackers and organizations. - The thesis makes three primary contributions. + The thesis makes three primary contributions: \end_layout -\begin_layout Itemize -First, adaption of HTTPS for different domains is characterized from a Swedish - perspective. - The adaption between classes of domains within Sweden, as well as against - popular international domains are compared. +\begin_layout Enumerate +Software for automated, repeatable retrieval and analysis of large amounts + of websites has been developed, and released as open source (see Appendix -\begin_inset Note Greyedout -status open - -\begin_layout Plain Layout -Highlight some key findings and their implications. -\end_layout +\begin_inset CommandInset ref +LatexCommand vref +reference "chap:Software" \end_inset - -\begin_inset Note Greyedout +). 
+ Datasets based on publicly available domain lists have been released for + scientific scrutinization +\begin_inset Foot status open \begin_layout Plain Layout -Write about users' (in-)ability to protect their own privacy using HTTPS. -\end_layout +\begin_inset CommandInset href +LatexCommand href +target "http://joelpurra.com/projects/masters-thesis/" \end_inset - -\begin_inset Note Greyedout -status open -\begin_layout Plain Layout -Insert reference to results chapter. \end_layout \end_inset - +. + The data allows analysis of websites' HTTP/HTTPS requests including the + use of resources internal versus external to the entry domain, which the + most common confirmed tracker organizations are, what spread they have + and how much the average internet user can expect to be tracked by visiting + some of the most important and popular sites in Sweden, Denmark and worldwide. + Downloading and analyzing additional/custom datasets is very easy. \end_layout -\begin_layout Itemize -Second, the use of third-party trackers and other third-party services for - different classes of domains, is analyzed. - Using public lists of potential tracker domains, we analyze and compare - the wide spread adaption of these services across domains within Sweden, - as well as internationally. - -\begin_inset Note Greyedout -status open +\begin_layout Enumerate +Adaption of HTTPS for different domains has been characterized from a Swedish + perspective. + The adaption between classes of domains within Sweden, as well as against + popular international domains, are compared (see Section +\begin_inset CommandInset ref +LatexCommand ref +reference "sec:Results-HTTP,-HTTPS-and-redirects" -\begin_layout Plain Layout -Highlight some key findings and their implications. -\end_layout +\end_inset + +). + HTTPS adoption among globally popular websites (10-30%, 50% for the very + top) and curated lists of Swedish websites (15-50%) is much higher than + for random domains (less than 1%). 
+ This means that most websites in the world are susceptible to passive eavesdrop +ping anywhere along the network path between the client and the server. + But even with HTTPS enabled, traffic data and personally identifiable informati +on is leaked through external resources and third-party trackers, which + are just as prevalent on insecure HTTP as secure HTTPS enabled websites + (see Section +\begin_inset CommandInset ref +LatexCommand ref +reference "sec:Results-Internal-and-external-requests" \end_inset - -\begin_inset Note Greyedout -status open + and +\begin_inset CommandInset ref +LatexCommand ref +reference "sec:Results-Trackers-and-tracker-organizations" -\begin_layout Plain Layout -Insert reference to results chapter. -\end_layout +\end_inset + +). + This means that a secure, encrypted connection protecting against eavesdropping + doesn't automatically lead to +\emph on +privacy +\emph default + -- something which users might be led to believe when it is called a +\begin_inset Quotes eld +\end_inset + +secure connection +\begin_inset Quotes erd +\end_inset + as well as through the use of +\begin_inset Quotes eld \end_inset +security symbols +\begin_inset Quotes erd +\end_inset + such as padlocks. \end_layout -\begin_layout Itemize -Software for automated, repeatable retrieval and analysis of large amounts - of websites was developed, and released as open source (see Appendix +\begin_layout Enumerate +The use of known, confirmed third-party trackers and other third-party services + for different classes of domains, has been analyzed. + Using public lists of potential tracker domains, we analyzed and compared + the widespread adoption of these services across domains within Sweden, + as well as internationally.
+ The use of external resources is high among all classes of domains (see + Section \begin_inset CommandInset ref -LatexCommand vref -reference "chap:Software" +LatexCommand ref +reference "sec:Results-Internal-and-external-requests" \end_inset ). - Datasets based on publicly available domain lists have been released for - scientific scrutinization. -\begin_inset Foot + Web sites using strictly internal resources are less than 7% of top sites, + even less in most categories of curated lists of Swedish websites, while + it is more common among random domains at 10-30%. + This means most web sites around the world have made an active choice to + install external resources from third-party services, which means that + users' traffic data and personal information is leaked (see Section +\begin_inset CommandInset ref +LatexCommand ref +reference "sec:Results-Trackers-and-tracker-organizations" + +\end_inset + +). + Most websites also have at least one known tracker present; 53-72% of random + domains have at least one tracker installed, while 88-98% of top websites + have trackers and 78-100% of websites in the Swedish curated lists. + The number of known tracker organizations is interesting to look at, as + a higher number means users have less control over where leaked data ends + up. + Around 55% of random Swedish domains have 1-3 trackers, and about 5% have + more than 3. + Nearly 50% of global top sites load resources from 3 or more tracker organizati +ons, while about 5% load from more than 20 organizations. 
+ Half of the Swedish media websites load more than 6 known trackers; a single + visit to the front page of each of the 27 investigated sites would leak + information in over +\begin_inset ERT status open \begin_layout Plain Layout -\begin_inset CommandInset href -LatexCommand href -target "http://joelpurra.com/projects/masters-thesis/" + + +\backslash +numprint{3800} +\end_layout \end_inset + external requests +\begin_inset CommandInset ref +LatexCommand eqref +reference "sec:Request-counts" -\end_layout +\end_inset + + to at least 57 organizations +\begin_inset CommandInset ref +LatexCommand eqref +reference "sub:Domain-and-organization-counts" \end_inset - The data allows analysis of websites' HTTP requests including the use of - resources internal versus external to the entry domain, which the most - common confirmed tracker organizations are, what spread they have and how - much the average internet user can expect to be tracked by visiting some - of the most important and popular sites in Sweden, Denmark and worldwide. +. + This means that any guesswork in what types of articles individuals + would read in a printed newspaper is gone -- and with that probably the + guesswork in exactly what kind of opinions these individuals hold.
\end_layout \begin_layout Chapter @@ -711,7 +766,7 @@ price steering \emph on price discrimination \emph default - on some e-commerce web sites + on some e-commerce websites \begin_inset CommandInset citation LatexCommand cite key "Hannak:2014:MPD:2663716.2663744,Mikians:2013:CSP:2535372.2535415" @@ -734,7 +789,7 @@ key "Dou:2007:LEA:1242572.1242651,Pariser:2011:filterbubble" \end_layout \begin_layout Standard -Social networks can use web site tracking data about their users' to increase +Social networks can use website tracking data about their users' to increase per-user advertising incomes by personalization, but they will try to keep most of the information to themselves \begin_inset CommandInset citation @@ -1475,7 +1530,7 @@ reference "fig:Domains-per-organization" \end_inset shows the number of organizations (out of the 980 organizations) that have - a certain number of tracker domains (x­axis). + a certain number of tracker domains (x axis). We see that 47% (459 of 980) have at least two domains listed by Disconnect.me. Google (rightmost point) alone has 271 domains and Yahoo has 71. Some organizations have their domains categorized in more than one category, @@ -2173,6 +2228,13 @@ key "Acar:2014:WNF:2660267.2660347" \begin_layout Chapter Results +\begin_inset CommandInset label +LatexCommand label +name "chap:Results" + +\end_inset + + \end_layout \begin_layout Standard @@ -2995,17 +3057,10 @@ reference "fig:Organizations-per-domain" \end_layout \begin_layout Section -Internal, external and organizations -\end_layout - -\begin_layout Standard -\begin_inset Note Greyedout -status open - -\begin_layout Plain Layout -Write about general characterization of internal/external resources per - category, and implications. 
-\end_layout +Internal and external requests +\begin_inset CommandInset label +LatexCommand label +name "sec:Results-Internal-and-external-requests" \end_inset @@ -3064,8 +3119,8 @@ reference "fig:Result-selection-internal-secure-organizations" a shows the cumulative distribution function (CDF) of the ratio of external resources used by each domain, with 0% and 99% internal resources marked. - In particular, we show the ratio of domains (y­axis) as a function of the - ratio of external resources seen by each domain (x­axis). + In particular, we show the ratio of domains (y axis) as a function of the + ratio of external resources seen by each domain (x axis). Similar to the HTTPS adaption, we observe significant differences between randomly selected domains and the most popular (top ranked) domains. See how dataset HTTP/HTTPS variation lines follow each other for most datasets, @@ -3078,8 +3133,33 @@ secure \end_inset HTTPS serving as many trackers as insecure HTTP. - For the HTTP variation of random .se domains 40% use strictly external resources -; this seems to be connected with the fact that many domains are + This means that a secure, encrypted connection protecting against eavesdropping + doesn't automatically lead to +\emph on +privacy +\emph default + -- something which users might be lead to believe when it is called a +\begin_inset Quotes eld +\end_inset + +secure connection +\begin_inset Quotes erd +\end_inset + + as well as through the use of +\begin_inset Quotes eld +\end_inset + +security symbols +\begin_inset Quotes erd +\end_inset + + such as padlocks. 
+\end_layout + +\begin_layout Standard +For the HTTP variation of random .se domains 40% use strictly external resources; + this seems to be connected with the fact that many domains are \begin_inset Quotes eld \end_inset @@ -3101,7 +3181,7 @@ A parked domain is one that has been purchased from a domain name retailer, and load all their resources from an external domain which serves the domain name retailer's resources for all parked domains. - The same domains do not have HTTPS enabled, as can be seen in + The same domains seem to not have HTTPS enabled, as can be seen in \begin_inset CommandInset ref LatexCommand ref reference "fig:Result-selection-http-codes-redirects-internal-external" @@ -3122,24 +3202,17 @@ reference "sub:Ignore-domains-without-content" . \end_layout -\begin_layout Standard -\begin_inset Note Greyedout -status open - -\begin_layout Plain Layout -Write about top Disconnect organizations, some Google results, in Sweden - versus internationally. -\end_layout +\begin_layout Section +HTTP, HTTPS and redirects +\begin_inset CommandInset label +LatexCommand label +name "sec:Results-HTTP,-HTTPS-and-redirects" \end_inset \end_layout -\begin_layout Section -HTTP, HTTPS and redirects -\end_layout - \begin_layout Standard Figure \begin_inset CommandInset ref @@ -3196,9 +3269,21 @@ reference "sec:HTTP,-HTTPS-and-redirects" \end_inset . - The average number of redirects for domains with redirects is 1.23, but - some domains have multiple, chained redirects to a mixture of HTTP and - HTTPS URLs. + The average number of redirects for domains with redirects is +\begin_inset ERT +status open + +\begin_layout Plain Layout + + +\backslash +numprint{1.23} +\end_layout + +\end_inset + +, but some domains have multiple, chained redirects to a mixture of HTTP + and HTTPS URLs. 
\end_layout \begin_layout Standard @@ -3237,7 +3322,14 @@ It seems that Swedish media shun secure connections -- none of them present \end_layout \begin_layout Section -Trackers +Trackers and tracker organizations +\begin_inset CommandInset label +LatexCommand label +name "sec:Results-Trackers-and-tracker-organizations" + +\end_inset + + \end_layout \begin_layout Standard @@ -3251,7 +3343,7 @@ While looking at the number of requests made to trackers can give a hint ons which resources are loaded from \begin_inset CommandInset ref LatexCommand eqref -reference "sub:Domain-and-organization" +reference "sub:Domain-and-organization-counts" \end_inset @@ -3279,15 +3371,17 @@ reference "sec:Failed-versus-non-failed" \end_inset - It can again be seen that the amount of tracking is the same in other HTTP-www - variations as in their respective HTTPS-www variation -- the figure shows - that the lines follow each other, with over 80% having some tracking installed. +; around 55% of random Swedish HTTP domains have 1-3 trackers, and about + 5% have more than 3. \end_layout \begin_layout Standard -Out of the Swedish media domains, 50% share information with more than seven - tracker organizations -- and one of them is sharing information with 38 - organizations. +It can again be seen that the amount of tracking is the same in other HTTP-www + variations as in their respective HTTPS-www variation -- the figure shows + that the lines follow each other. + Most websites also have at least one known tracker present; 53-72% of random + domains have at least one tracker installed, while 88-98% of top websites + have trackers and 78-100% of websites in the Swedish curated lists. In the larger Alexa global top \begin_inset ERT status open @@ -3306,6 +3400,90 @@ numprint{10000} only on the front page of the domain. 
\end_layout +\begin_layout Standard +Out of the Swedish media domains, 50% share information with more than seven + tracker organizations -- and one of them is sharing information with 38 + organizations. + Half of the Swedish media websites load more than 6 known trackers; a single + visit to the front page of each of the 27 investigated sites would leak + information in over +\begin_inset ERT +status open + +\begin_layout Plain Layout + + +\backslash +numprint{3800} +\end_layout + +\end_inset + + external requests +\begin_inset CommandInset ref +LatexCommand eqref +reference "sec:Request-counts" + +\end_inset + + to at least 57 organizations +\begin_inset CommandInset ref +LatexCommand eqref +reference "sub:Domain-and-organization-counts" + +\end_inset + +. + This means that any guesswork in what types of articles individuals + would read in a printed newspaper is gone -- and with that probably the + guesswork in exactly what kind of opinions these individuals hold. + While it is already known that commercial media outlets make their money + through advertising, this level of tracking might be surprising -- it seems + to indicate that what news users read online is very well known. + It could also mean that this type of intimate knowledge of what news is + popular, connected with the kind of click traffic advertisers are seeing, + gives advertisers a financial incentive to control exactly what the media + writes. + Does bad news bring more clicks than good? Write more about bad news. + Advertisers could also expand article categorization with fine-grained + details and connect article reading to visits on other sites to improve + click-through rates. +\end_layout + +\begin_layout Standard +The organization with the most spread, by far, is Google.
+ The runner ups with the broad domain class coverage are Facebook and Twitter, + but in terms of domain coverage they are still far behind -- see section + +\begin_inset CommandInset ref +LatexCommand eqref +reference "sub:Top-organizations" + +\end_inset + +. + Google is very popular with all top domains and most Swedish curated datasets + have a coverage above 80% -- and many closer to 90%. + Random domains have a lower reliance on Google at 47-62% -- still about + half of all domains. + Apart from the .SE Health Status list of Swedish media domains, Facebook + doesn't reach 40% in top or curated domains. + Facebook coverage on random zone domains is 6-10%, which is also much lower + than Google's numbers. + Twitter generally has even lower coverage, at about half of that of Facebook + on average. + As can be seen, Google alone oftentimes has a coverage higher than the + domains in the Disconnect category -- it shows that Google's content domains + are in use +\begin_inset CommandInset ref +LatexCommand eqref +reference "sub:Disconnect-Organizations-in-more-than-one-category" + +\end_inset + +. +\end_layout + \begin_layout Chapter Discussion \end_layout @@ -4734,7 +4912,7 @@ reference "sub:Software-considerations-cookies" \end_inset . - As domains/web sites are not reloaded, but rather requested four separate + As domains/websites are not reloaded, but rather requested four separate times \begin_inset CommandInset ref LatexCommand eqref @@ -6083,7 +6261,7 @@ reference "sec:Creating-an-information-web-site" \end_inset -, without having to recalculate all steps for the entire dataset results. +, without having to recalculate all steps for the entire dataset results. While most incremental updates are a matter of easy addition and subtraction, some late analysis steps introduce coverage calculations and other arithmetical divisions, which may cause some data/precision loss if reversed. 
@@ -8716,7 +8894,7 @@ name "sub:Software-considerations-cookies" \end_layout \begin_layout Standard -Cookies stored by a web site may affect content upon requesting subsequent +Cookies stored by a website may affect content upon requesting subsequent resources, and is one of the primary means of keeping track of a browser. Each browser instance has been started without any cookies, and while cookie usage has not been turned off, none have been stored after finalizing the @@ -9004,8 +9182,8 @@ status collapsed tag, each part should download separately and perform the same duties, including “calling home” to the usual addresses. - In order to confirm this, a query was run on one of the datasets, se.2014-07-10. -random.100000-http-www -- see Table + In order to confirm this, a query was run on one of the datasets, se.2014-07-10.r +andom.100000-http-www -- see Table \begin_inset CommandInset ref LatexCommand ref reference "tab:Google-Tag-Manager-versus-Google-Analytics-and-DoubleClick" @@ -14988,7 +15166,7 @@ addplot+[ \begin_layout Plain Layout - update limits=false, + update limits=false, \end_layout \begin_layout Plain Layout @@ -15895,7 +16073,7 @@ begin{wide} Domain and organization counts \begin_inset CommandInset label LatexCommand label -name "sub:Domain-and-organization" +name "sub:Domain-and-organization-counts" \end_inset @@ -16620,7 +16798,7 @@ gle.sorted.tsv}{Top Disconnect Google domain match coverage}{}{} \end_layout \begin_layout Subsection -Categories +Tracker categories \begin_inset CommandInset label LatexCommand label name "sub:Disconnect-categories-coverage" @@ -17044,7 +17222,7 @@ name "fig:top-categories" status open \begin_layout Plain Layout -Top categories +Tracker categories \end_layout \end_inset @@ -17176,7 +17354,7 @@ reference "sub:Disconnect-categories-coverage" \begin_layout Standard Google is very popular with all Alexa and most Swedish curated datasets - having a coverage above 80% -- and many closer to 90%. 
+ have a coverage above 80% -- and many closer to 90%. Random domains have a lower reliance on Google at 47-62% -- still about half of all domains. Apart from the .SE Health Status list of Swedish media domains, Facebook @@ -17429,7 +17607,7 @@ addplot+[ \begin_layout Plain Layout - update limits=false, + update limits=false, \end_layout \begin_layout Plain Layout