/
report.lyx
821 lines (629 loc) · 18.7 KB
/
report.lyx
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
#LyX 2.0 created this file. For more info see http://www.lyx.org/
\lyxformat 413
\begin_document
\begin_header
\textclass report
\begin_preamble
\usepackage{color}
\definecolor{updated}{rgb}{0.8,0.85,1}
\providecommand{\versionortoday}{(Unknown version) - \today}
\providecommand{\recentlyupdated}{\emph\textsc\textcolor{updated}{ ~Updated!~ }}
\end_preamble
\use_default_options true
\maintain_unincluded_children false
\language english
\language_package default
\inputencoding auto
\fontencoding global
\font_roman default
\font_sans default
\font_typewriter default
\font_default_family default
\use_non_tex_fonts false
\font_sc false
\font_osf false
\font_sf_scale 100
\font_tt_scale 100
\graphics default
\default_output_format default
\output_sync 0
\bibtex_command default
\index_command default
\paperfontsize default
\spacing single
\use_hyperref true
\pdf_title "Master’s thesis proposal"
\pdf_author "Joel Purra"
\pdf_bookmarks true
\pdf_bookmarksnumbered false
\pdf_bookmarksopen false
\pdf_bookmarksopenlevel 1
\pdf_breaklinks false
\pdf_pdfborder false
\pdf_colorlinks false
\pdf_backref false
\pdf_pdfusetitle true
\papersize default
\use_geometry false
\use_amsmath 1
\use_esint 1
\use_mhchem 1
\use_mathdots 1
\cite_engine basic
\use_bibtopic false
\use_indices false
\paperorientation portrait
\suppress_date false
\use_refstyle 1
\index Index
\shortcut idx
\color #008000
\end_index
\secnumdepth 3
\tocdepth 3
\paragraph_separation indent
\paragraph_indentation default
\quotes_language english
\papercolumns 1
\papersides 1
\paperpagestyle default
\tracking_changes false
\output_changes false
\html_math_output 0
\html_css_as_file 0
\html_be_strict false
\end_header
\begin_body
\begin_layout Title
Master’s thesis planning report
\end_layout
\begin_layout Author
Joel Purra, mig@joelpurra.se, joepu444, 070-3521212
\end_layout
\begin_layout Date
\begin_inset ERT
status open
\begin_layout Plain Layout
\backslash
versionortoday
\end_layout
\end_inset
\end_layout
\begin_layout Section*
About The Internet Infrastructure Foundation
\begin_inset Foot
status open
\begin_layout Plain Layout
\begin_inset CommandInset href
LatexCommand href
target "https://www.iis.se/"
\end_inset
\end_layout
\end_inset
(.SE)
\end_layout
\begin_layout Standard
.SE is also known as Stiftelsen för internetinfrastruktur (IIS).
\end_layout
\begin_layout Standard
The Internet Infrastructure Foundation is an independent organization, responsib
le for the Swedish top level domain, and working for the benefit of the
public that promotes the positive development of the internet in Sweden.
Their head office is in Stockholm.
In 2012 they had 61 employees and a turnover of almost 120 MSEK.
\begin_inset Foot
status open
\begin_layout Plain Layout
\begin_inset CommandInset href
LatexCommand href
target "https://www.iis.se/docs/SE-Arsredovisning-2012.pdf"
\end_inset
\end_layout
\end_inset
\end_layout
\begin_layout Section*
Background and context
\end_layout
\begin_layout Standard
Part of .SE’s research efforts includes continuously analyzing internet infrastruc
ture and usage in Sweden.
Yearly reports convey the status of, for example,
\emph on
Swedes and the internet
\emph default
and
\emph on
.se health status
\emph default
\begin_inset Foot
status open
\begin_layout Plain Layout
\begin_inset CommandInset href
LatexCommand href
target "https://www.iis.se/vad-vi-gor/halsolaget/"
\end_inset
\end_layout
\end_inset
to the public, both in Swedish
\begin_inset Foot
status open
\begin_layout Plain Layout
\begin_inset CommandInset href
LatexCommand href
target "https://www.iis.se/lar-dig-mer/rapporter/"
\end_inset
\end_layout
\end_inset
and English
\begin_inset Foot
status open
\begin_layout Plain Layout
\begin_inset CommandInset href
LatexCommand href
target "https://www.iis.se/english/reports/"
\end_inset
\end_layout
\end_inset
.
Information and statistics are also published on a separate portal, in
collaboration with other organizations.
\begin_inset Foot
status open
\begin_layout Plain Layout
\begin_inset CommandInset href
LatexCommand href
target "https://www.iis.se/vad-vi-gor/internetstatistik/"
\end_inset
\end_layout
\end_inset
\end_layout
\begin_layout Standard
The report
\emph on
.se health status
\emph default
is based on data collected from around 900 .se domain names deemed of importance
to the Swedish society as a whole, as well as a random selection of 1% of
the registered .se domain names.
The research is focused on statistics about usage and security in DNS,
IP, web and e-mail; the target audience is IT strategists, executives and
directors.
Data is analyzed and summarized by Anne-Marie Eklund Löwinder, a world-renowned
DNS and security expert
\begin_inset Foot
status open
\begin_layout Plain Layout
\begin_inset CommandInset href
LatexCommand href
target "https://www.iis.se/bloggare/anne-marie/"
\end_inset
\end_layout
\end_inset
, while the technical aspects and tools are under the supervision of Patrik
Wallström, a well-known DNSSEC expert and free and open source software
advocate
\begin_inset Foot
status open
\begin_layout Plain Layout
\begin_inset CommandInset href
LatexCommand href
target "https://www.iis.se/bloggare/pawal/"
\end_inset
\end_layout
\end_inset
.
\end_layout
\begin_layout Section*
Subject proposal
\end_layout
\begin_layout Standard
\begin_inset Note Greyedout
status open
\begin_layout Plain Layout
Rewrite after subject has been decided on.
\end_layout
\end_inset
\end_layout
\begin_layout Standard
Correctly adhering to standards, implementing security and data protection
measures, are important for the overall health of the internet, both when
looking at specific examples and even more so on a grander scale.
National results are already published by .SE, in the
\emph on
.se health status
\emph default
report, alongside helpful tips on how to avoid common problems.
In terms of getting an overview of the status in Sweden, the full potential
of the tools
\begin_inset Foot
status open
\begin_layout Plain Layout
\begin_inset CommandInset href
LatexCommand href
target "https://www.iis.se/lar-dig-mer/open-source/"
\end_inset
\end_layout
\end_inset
used
\begin_inset Foot
status open
\begin_layout Plain Layout
\begin_inset CommandInset href
LatexCommand href
target "https://github.com/dotse"
\end_inset
\end_layout
\end_inset
and data collected has not yet been utilized.
\end_layout
\begin_layout Standard
Within the scope of the thesis work, the tools can be improved and/or extended
to include new areas, which can then be analyzed as well.
Collected data has been, at least partially, stored since 2011 and can
potentially be used for retrospective analysis of trends.
There is also the possibility to compare Sweden's country code top-level
domain (ccTLD) (.se) with another domain name common in Sweden (.nu, also
controlled by .SE) and other ccTLDs.
\end_layout
\begin_layout Standard
The combined topics of DNS, IP, web and e-mail are too broad for this thesis,
and the scope is to be narrowed down during the first weeks.
Expected results include an overview of the national status in the selected
field, potentially comparing it to historical data, or another ccTLD.
\end_layout
\begin_layout Standard
\begin_inset Note Greyedout
status open
\begin_layout Plain Layout
Add preliminary title.
\end_layout
\end_inset
\end_layout
\begin_layout Section*
Problem description
\begin_inset ERT
status open
\begin_layout Plain Layout
\backslash
recentlyupdated
\end_layout
\end_inset
\end_layout
\begin_layout Quote
The problem description shall be detailed and include a background and a
motivation to why it is important.
Expected results shall also be described.
The problem description shall be grounded in the literature base and the
state‐of‐practice of the provider of the thesis (company, research group).
Plan for adjustment of the problem description along with the progress
of the literature studies and pre-study of the provider.
\end_layout
\begin_layout Subsection*
Background
\end_layout
\begin_layout Standard
\begin_inset Note Greyedout
status open
\begin_layout Plain Layout
Write a couple of definitions; external resources, site/service/domain,
tracker, external services (file hosting services, CDNs, advertising, analytics
, third party content providers, etcetera).
\end_layout
\end_inset
\end_layout
\begin_layout Standard
In everyday web browsing, browsers routinely access a lot of material from
other domains or services than the one visited.
These external resources vary from content that the user explicitly wants
to obtain, to implicitly loaded third party services, ads, and non-visible
resources with the sole purpose of collecting user data and statistical
material.
All are downloaded on behalf of the user with no or few limitations, and
oftentimes without the user's need, understanding and explicit consent.
These external resources can all be seen as browsing habit trackers, whose
knowledge and power increase with any additional visits to other domains
or services loading the same resources.
\end_layout
\begin_layout Standard
\begin_inset Note Greyedout
status open
\begin_layout Plain Layout
What kind of data can be collected by trackers, and how can they be aggregated
both per person and per group of people?
\end_layout
\end_inset
\end_layout
\begin_layout Standard
\begin_inset Note Greyedout
status open
\begin_layout Plain Layout
How much does the average user know about external resources being trackers?
\end_layout
\end_inset
\end_layout
\begin_layout Standard
While online privacy has been in the spotlight due to recently uncovered
mass surveillance operations, the focus has been on national government
intelligence agencies collecting information around the globe.
They have been able to intercept traffic data and metadata by, among several
techniques, covertly hooking into the internet infrastructure.
In contrast, external resources are approved by and actively installed
by site and service owners, and presented openly to users with basic technical
skills and tools.
Because these external resources are used on behalf of the service, they
are also loaded when end-to-end encryption with HTTPS is enabled for enhanced
privacy and security.
This encryption gives these private trackers more information than possible
with large-scale passive traffic interception, even when there is a security
nullifying mixture of encrypted and unencrypted connections.
\end_layout
\begin_layout Subsection*
Expected results
\end_layout
\begin_layout Standard
Previous research shows a variety of ad networks and analytics software used
on many of internet's most popular sites.
The
\emph on
.se health status
\emph default
report from 2012 showed that 62% of the top Swedish domains used Google
Analytics.
The assumption is that the number of external resources is at least as
big, as they include Google Analytics.
Technical reasons include cloud services hosting sites and services, content
delivery networks becoming commonplace for scalable speed improvements
and external service providers increasing their quality.
Non-technical reasons include the fact
\end_layout
\begin_layout Standard
Sites served over HTTPS are expected to use as many external resources as
HTTP, even though some of these external resources might not be served
over HTTPS as well.
\end_layout
\begin_layout Standard
Media sites are expected to allow more trackers than other categories, as
their income model is based on third party advertisements.
Other commercial sites are expected to have more trackers than government
sites.
\end_layout
\begin_layout Subsection*
Direction and scope
\end_layout
\begin_layout Standard
The thesis will primarily be written from a Swedish perspective.
This is in part because .SE has access to the full list of Swedish .se domains,
and part because of their previous work with the
\emph on
.se health status
\emph default
reports.
Focus is to analyze .se domains in the reports, as they have already been
deemed important and results can be incorporated in future reports.
The main non-technical grouping is also based on the same reports; government,
media, banks, larger websites, etcetera.
\end_layout
\begin_layout Standard
One assumption is that all external resources can act as trackers, collecting
data and tracking users across domains.
While there are lists of known trackers, these will optionally be used
to emphasize those external resources as
\emph on
confirmed
\emph default
trackers.
\end_layout
\begin_layout Subsection*
Questions
\end_layout
\begin_layout Standard
With domain and resource data in place, it will be aggregated to answer
the following questions.
\end_layout
\begin_layout Itemize
What kinds of resources are there?
\end_layout
\begin_layout Itemize
How many resources are internal versus external per domain?
\end_layout
\begin_layout Itemize
What is the distribution of different kinds of resources?
\end_layout
\begin_layout Itemize
How many external resources are there, considering different levels of uniquenes
s:
\end_layout
\begin_deeper
\begin_layout Itemize
Unique URLs?
\end_layout
\begin_layout Itemize
Unique per file URL?
\end_layout
\begin_layout Itemize
Unique per folder URL?
\end_layout
\begin_layout Itemize
Unique per subdomain?
\end_layout
\begin_layout Itemize
Unique per domain?
\end_layout
\begin_layout Itemize
Unique per TLD?
\end_layout
\end_deeper
\begin_layout Itemize
On how many domains is each external resource represented?
\end_layout
\begin_layout Itemize
How does usage of external resources differ between groups of domains?
\end_layout
\begin_layout Itemize
How to mark certain external resources as known trackers?
\end_layout
\begin_layout Itemize
What is the usage and distribution of known trackers?
\end_layout
\begin_layout Itemize
Are you as tracked using secure HTTPS as insecure HTTP?
\end_layout
\begin_layout Itemize
How do the results compare to
\end_layout
\begin_deeper
\begin_layout Itemize
Historical .se data, if readily available from earlier .SE status checks?
\end_layout
\begin_layout Itemize
Other ccTLDs?
\end_layout
\begin_layout Itemize
Commonly used gTLDs?
\end_layout
\begin_layout Itemize
Recently introduced newTLDs?
\end_layout
\end_deeper
\begin_layout Standard
Additional questions, which can be considered as bonuses
\end_layout
\begin_layout Itemize
Could any external resources actually be considered internal, despite being
loaded from external domains?
\end_layout
\begin_layout Itemize
How to determine if a resource
\end_layout
\begin_deeper
\begin_layout Itemize
Crosses Sweden's borders in transit?
\end_layout
\begin_layout Itemize
Is handled by an organisation with base or ownership outside of Sweden?
\end_layout
\end_deeper
\begin_layout Itemize
Which external resources are loaded from Sweden and abroad respectively?
\end_layout
\begin_layout Itemize
What user data could potentially be collected, and subsequently inferred?
\end_layout
\begin_layout Itemize
To what extent can the average Swedish internet user's browsing habits be
correlated across the most commonly visited webpages?
\end_layout
\begin_layout Section*
Approach
\begin_inset ERT
status open
\begin_layout Plain Layout
\backslash
recentlyupdated
\end_layout
\end_inset
\end_layout
\begin_layout Quote
The approach is a preliminary description of how the problem will be solved.
This section shall also include a description of a method to evaluate that
the problem is solved in a satisfactory way.
\end_layout
\begin_layout Standard
Based on a list of domains, external resources are listed by downloading
the front page of each domain, and analyzing its HTML content.
The URLs of external resources will be extracted, and associated with the
domain they were loaded from.
\end_layout
\begin_layout Standard
In order to facilitate repeatable and improvable analysis, tools will be
developed to perform the collection and aggregation steps automatically.
.SE already has a set of tools that run monthly; integration and interoperabilit
y will smooth the process and continuous usage.
\end_layout
\begin_layout Subsection*
Potential problems
\end_layout
\begin_layout Itemize
Due to the dynamic nature of modern web pages, a static HTML analysis might
not be enough.
How can pages with dynamic script loading be analyzed?
\end_layout
\begin_layout Itemize
Script aggregation and concatenation could give misleading numbers if only
analyzed per URL.
Is it possible to detect which known scripts are actually running?
\end_layout
\begin_layout Itemize
Can Google Tag Manager scripts, which is script aggregation with asynchronous
loading directed specifically to marketers, be analyzed to show each included
service?
\end_layout
\begin_layout Itemize
Can collected data served by different services differ depending on which
tool is used to fetch the data?
\end_layout
\begin_layout Itemize
Many of the external resources will be overlapping, and downloading them
multiple times can be avoided by caching the file the first time in a run.
Would keeping a local cache of recently requested URLs affect the results?
\end_layout
\begin_layout Itemize
Automated downloading of webpages, especially downloading several in short
succession, can be detected by site and service owners.
Will automated collection done for this report be detected and hindered?
\end_layout
\begin_layout Section*
Literature base
\end_layout
\begin_layout Quote
The literature base describes the planned literature study and gives examples
of different directions of a good theoretical grounding of the work.
\end_layout
\begin_layout Section*
Time plan
\end_layout
\begin_layout Quote
The time plan describes the activities and milestones of the work with the
resolution of a week.
Dates for planned final seminar are included.
For degree projects on advanced level (e.g.
Master level) dates for half‐time checkpoint are also included.
For these degree projects the expected results for the half‐time checkpoint
are also explicitly described.
This plan is updated in cooperation with the tutor.
\end_layout
\begin_layout Section*
Relevant courses taken at LiU
\end_layout
\begin_layout Itemize
TDDC90 Software Security
\end_layout
\begin_layout Itemize
TDDD05 Component Based Software
\end_layout
\begin_layout Itemize
TDDD09 Software Engineering Project
\end_layout
\begin_layout Itemize
TDDD12 Database Technology
\end_layout
\begin_layout Itemize
TDDD24 Web Programming and Interactivity
\end_layout
\begin_layout Itemize
TDDD25 Distributed Systems
\end_layout
\begin_layout Itemize
TDDD27 Advanced Web Programming
\end_layout
\begin_layout Itemize
TDDD43 Advanced Data Models and Databases
\end_layout
\begin_layout Itemize
TDST02 Computer Network Infrastructure
\end_layout
\begin_layout Itemize
TSIT02 Computer Security
\end_layout
\end_body
\end_document