From 95f1865f5fc0e3ff4c4b438c4ab63f6dea7d07ad Mon Sep 17 00:00:00 2001 From: Joel Purra Date: Mon, 22 Sep 2014 21:44:41 +0200 Subject: [PATCH] Write about HTTP response status, non-/failed origins, HTTPS, mime-types --- ...ification.domain-scope.coverage.sorted.tsv | 27 + ....classification.secure.coverage.sorted.tsv | 27 + ...-types.groups.coverage.external.sorted.tsv | 27 + ...-types.groups.coverage.internal.sorted.tsv | 27 + ....request-status.coverage.origin.sorted.tsv | 27 + report/datasets.retries.rates.tsv | 27 + report/report.lyx | 587 ++++++++++++++++-- 7 files changed, 694 insertions(+), 55 deletions(-) create mode 100755 report/datasets.non-failed.classification.domain-scope.coverage.sorted.tsv create mode 100755 report/datasets.non-failed.classification.secure.coverage.sorted.tsv create mode 100644 report/datasets.non-failed.mime-types.groups.coverage.external.sorted.tsv create mode 100644 report/datasets.non-failed.mime-types.groups.coverage.internal.sorted.tsv create mode 100644 report/datasets.request-status.coverage.origin.sorted.tsv create mode 100755 report/datasets.retries.rates.tsv diff --git a/report/datasets.non-failed.classification.domain-scope.coverage.sorted.tsv b/report/datasets.non-failed.classification.domain-scope.coverage.sorted.tsv new file mode 100755 index 0000000..d4a5772 --- /dev/null +++ b/report/datasets.non-failed.classification.domain-scope.coverage.sorted.tsv @@ -0,0 +1,27 @@ +Dataset Domains Same domain Subdomain Superdomain Internal domain External domain Mixed +alexa.2014-09-01.random.10000-http-www 8493 0.0565171317555634 0 0.006593665371482397 0.06487695749440715 0.06699634993524078 0.8681266925703521 +alexa.2014-09-01.random.10000-https 1135 0.027312775330396475 0.009691629955947136 0 0.04052863436123348 0.1277533039647577 0.8317180616740089 +alexa.2014-09-01.top.10000-http-www 8682 0.021769177608845888 0 0.002994701681640175 0.02626123013130615 0.12381939645243031 0.8499193734162636 +alexa.2014-09-01.top.10000-https 
2507 0.02473075388911049 0.012365376944555246 0 0.04228161148783406 0.038691663342640605 0.9190267251695253 +alexa.2014-09-01.top.dk.10000-http-www 2310 0.04199134199134199 0 0.004329004329004329 0.046753246753246755 0.05064935064935065 0.9025974025974026 +alexa.2014-09-01.top.dk.10000-https 339 0.032448377581120944 0.011799410029498525 0 0.05309734513274336 0.02654867256637168 0.9203539823008849 +alexa.2014-09-01.top.se.10000-http-www 2895 0.030397236614853194 0 0.0037996545768566492 0.03419689119170984 0.059412780656303975 0.9063903281519862 +alexa.2014-09-01.top.se.10000-https 438 0.0228310502283105 0.00684931506849315 0 0.03424657534246575 0.0228310502283105 0.9429223744292237 +com.2014-08-29.random.10000-http-www 7811 0.12648828575086415 0 0.008577646908206376 0.13775444885418 0.23735757265394955 0.6248879784918705 +com.2014-08-29.random.10000-https 50 0.1 0.02 0 0.12 0.04 0.84 +dk.2014-07-23.random.10000-http-www 7378 0.2618595825426945 0 0.0279208457576579 0.29289780428300355 0.29709948495527244 0.41000271076172407 +dk.2014-07-23.random.10000-https 23 0.2608695652173913 0.043478260869565216 0 0.30434782608695654 0.043478260869565216 0.6521739130434783 +net.2014-08-29.random.10000-http-www 7378 0.11832474925454052 0 0.00623475196530225 0.125914882081865 0.26389265383572785 0.6101924640824072 +net.2014-08-29.random.10000-https 26 0.23076923076923078 0.07692307692307693 0 0.38461538461538464 0 0.6153846153846154 +reach50.2014w35.se-http-www 42 0 0 0 0 0.11904761904761904 0.8809523809523809 +se.2014-07-10.random.100000-http 73605 0.17638747367705998 0.025867807893485497 0 0.20831465253719178 0.36680932001902045 0.42487602744378783 +se.2014-07-10.random.100000-http-www 77261 0.1778775837744787 0 0.026041599254475092 0.2056276776122494 0.3715199130220939 0.42285240936565666 +se.healthstatus.2013.counties-http-www 21 0 0 0 0 0.047619047619047616 0.9523809523809523 +se.healthstatus.2013.domain-registrars-http-www 134 0.1044776119402985 0 0.014925373134328358 
0.13432835820895522 0.16417910447761194 0.7014925373134329 +se.healthstatus.2013.financial-services-http-www 72 0.013888888888888888 0 0 0.013888888888888888 0.125 0.8611111111111112 +se.healthstatus.2013.gocs-http-www 57 0.03508771929824561 0 0 0.03508771929824561 0.12280701754385964 0.8421052631578947 +se.healthstatus.2013.higher-education-http-www 47 0.06382978723404255 0 0 0.06382978723404255 0.0425531914893617 0.8936170212765958 +se.healthstatus.2013.isps-http-www 19 0 0 0 0 0 1 +se.healthstatus.2013.media-http-www 28 0 0 0.03571428571428571 0.03571428571428571 0.17857142857142858 0.7857142857142857 +se.healthstatus.2013.municipalities-http-www 271 0.04797047970479705 0 0 0.04797047970479705 0.01845018450184502 0.933579335793358 +se.healthstatus.2013.public-authorities-http-www 203 0.07389162561576355 0 0 0.07389162561576355 0.10344827586206896 0.8226600985221675 diff --git a/report/datasets.non-failed.classification.secure.coverage.sorted.tsv b/report/datasets.non-failed.classification.secure.coverage.sorted.tsv new file mode 100755 index 0000000..91a1cfc --- /dev/null +++ b/report/datasets.non-failed.classification.secure.coverage.sorted.tsv @@ -0,0 +1,27 @@ +Dataset Domains All sec All insec Int sec Int insec Ext sec Ext insec Mix sec Mix int sec Mix ext sec +alexa.2014-09-01.random.10000-http-www 8493 0.023313316849169905 0.4590839514894619 0.024961733192040505 0.8847286000235488 0.027434357706346404 0.3998587071706111 0.5176027316613682 0.09030966678441066 0.5727069351230425 +alexa.2014-09-01.random.10000-https 1135 0.27929515418502204 0.11189427312775331 0.347136563876652 0.3145374449339207 0.29779735682819386 0.13480176211453745 0.6088105726872246 0.33832599118942736 0.5674008810572687 +alexa.2014-09-01.top.10000-http-www 8682 0.04630269523151347 0.3577516701220917 0.045381248560239576 0.7964754664823773 0.0497581202487906 0.33828610919143054 0.5959456346463949 0.15814328495738306 0.6119557705597789 +alexa.2014-09-01.top.10000-https 2507 
0.32149980055843635 0.10849621061029119 0.3530115676106901 0.41284403669724773 0.3211009174311927 0.1296370163542082 0.5700039888312725 0.2341443956920622 0.549262066214599 +alexa.2014-09-01.top.dk.10000-http-www 2310 0.0341991341991342 0.41818181818181815 0.03463203463203463 0.8965367965367965 0.03939393939393939 0.37662337662337664 0.5476190476190477 0.0688311688311688 0.5839826839826839 +alexa.2014-09-01.top.dk.10000-https 339 0.3215339233038348 0.10029498525073746 0.4306784660766962 0.336283185840708 0.3333333333333333 0.10619469026548672 0.5781710914454278 0.23303834808259577 0.5604719764011801 +alexa.2014-09-01.top.se.10000-http-www 2895 0.04179620034542314 0.3975820379965458 0.043523316062176166 0.8815198618307427 0.04455958549222798 0.3675302245250432 0.5606217616580311 0.0749568221070811 0.5879101899827288 +alexa.2014-09-01.top.se.10000-https 438 0.3242009132420091 0.12557077625570776 0.4315068493150685 0.3493150684931507 0.3607305936073059 0.13470319634703196 0.550228310502283 0.21917808219178087 0.504566210045662 +com.2014-08-29.random.10000-http-www 7811 0.00550505697093842 0.7408782486237357 0.0015362949686339778 0.6982460632441428 0.008833696069645371 0.6038919472538727 0.25361669440532586 0.3002176417872232 0.3872743566764819 +com.2014-08-29.random.10000-https 50 0.36 0.14 0.5 0.28 0.28 0.2 0.5 0.21999999999999997 0.52 +dk.2014-07-23.random.10000-http-www 7378 0.011656275413391162 0.701951748441312 0.0012198427758200055 0.6416373000813228 0.01924640824071564 0.4098671726755218 0.28639197614529677 0.3571428571428572 0.5708864190837626 +dk.2014-07-23.random.10000-https 23 0.5652173913043478 0.08695652173913043 0.5652173913043478 0.21739130434782608 0.4782608695652174 0.043478260869565216 0.3478260869565218 0.21739130434782614 0.4782608695652174 +net.2014-08-29.random.10000-http-www 7378 0.007725670913526701 0.7399024125779344 0.000813228517213337 0.6523448088912984 0.010165356465166711 0.614394144754676 0.25237191650853885 0.34684196259148825 
0.3754404987801573 +net.2014-08-29.random.10000-https 26 0.5769230769230769 0.038461538461538464 0.6538461538461539 0.15384615384615385 0.38461538461538464 0 0.3846153846153847 0.1923076923076923 0.6153846153846154 +reach50.2014w35.se-http-www 42 0.09523809523809523 0.4523809523809524 0.11904761904761904 0.7380952380952381 0.09523809523809523 0.4523809523809524 0.4523809523809524 0.1428571428571428 0.4523809523809524 +se.2014-07-10.random.100000-http 73605 0.08860811086203382 0.5750288703213097 0.001209156986617757 0.5854221859927994 0.09294205556687725 0.367148970857958 0.33636301881665653 0.4133686570205829 0.5399089735751648 +se.2014-07-10.random.100000-http-www 77261 0.0859036253737332 0.5817553487529283 0.0012684277966891445 0.580409262111544 0.08990305587553876 0.37647713594180765 0.33234102587333847 0.4183223100917669 0.5336198081826536 +se.healthstatus.2013.counties-http-www 21 0 0.6190476190476191 0 0.9523809523809523 0 0.6190476190476191 0.38095238095238093 0.04761904761904767 0.38095238095238093 +se.healthstatus.2013.domain-registrars-http-www 134 0.09701492537313433 0.5447761194029851 0.1044776119402985 0.6716417910447762 0.08955223880597014 0.44776119402985076 0.35820895522388063 0.22388059701492535 0.46268656716417905 +se.healthstatus.2013.financial-services-http-www 72 0.09722222222222222 0.5972222222222222 0.08333333333333333 0.75 0.1388888888888889 0.5972222222222222 0.3055555555555556 0.16666666666666663 0.26388888888888895 +se.healthstatus.2013.gocs-http-www 57 0.017543859649122806 0.5263157894736842 0.017543859649122806 0.8596491228070176 0.017543859649122806 0.49122807017543857 0.45614035087719296 0.12280701754385959 0.49122807017543857 +se.healthstatus.2013.higher-education-http-www 47 0 0.6382978723404256 0 0.8936170212765957 0 0.5957446808510638 0.36170212765957444 0.1063829787234043 0.4042553191489362 +se.healthstatus.2013.isps-http-www 19 0.05263157894736842 0.3157894736842105 0.10526315789473684 0.8421052631578947 0.05263157894736842 
0.3684210526315789 0.6315789473684211 0.052631578947368474 0.5789473684210527 +se.healthstatus.2013.media-http-www 28 0 0.17857142857142858 0 0.8214285714285714 0 0.14285714285714285 0.8214285714285714 0.1785714285714286 0.8571428571428572 +se.healthstatus.2013.municipalities-http-www 271 0.01107011070110701 0.5424354243542435 0.01107011070110701 0.9704797047970479 0.01107011070110701 0.4944649446494465 0.4464944649446495 0.0184501845018451 0.4944649446494465 +se.healthstatus.2013.public-authorities-http-www 203 0.019704433497536946 0.6748768472906403 0.019704433497536946 0.8719211822660099 0.034482758620689655 0.6009852216748769 0.30541871921182273 0.10837438423645318 0.3645320197044335 diff --git a/report/datasets.non-failed.mime-types.groups.coverage.external.sorted.tsv b/report/datasets.non-failed.mime-types.groups.coverage.external.sorted.tsv new file mode 100644 index 0000000..edd1b91 --- /dev/null +++ b/report/datasets.non-failed.mime-types.groups.coverage.external.sorted.tsv @@ -0,0 +1,27 @@ +Dataset Domains html script style image data text font object document (null) +alexa.2014-09-01.random.10000-http-www 8493 0.6158012480866596 0.8762510302602143 0.5591663723066055 0.8600023548804898 0.11326975156010832 0.08442246555987283 0.10361474155186624 0.0017661603673613563 0 0.5110090662898857 +alexa.2014-09-01.random.10000-https 1135 0.6299559471365639 0.9154185022026432 0.5982378854625551 0.8863436123348017 0.1515418502202643 0.11101321585903083 0.12863436123348018 0.001762114537444934 0 0.5779735682819384 +alexa.2014-09-01.top.10000-http-www 8682 0.7646855563234278 0.9395300621976503 0.6534208707671043 0.9391845196959226 0.24648698456576826 0.259963142133149 0.12301313061506565 0.005759041695461875 0 0.6606772633033863 +alexa.2014-09-01.top.10000-https 2507 0.6521739130434783 0.9222177901874751 0.5261268448344635 0.9002792181890706 0.20941364180295174 0.1747108097327483 0.11806940566414041 0.004786597526924611 0 0.6589549262066214 
+alexa.2014-09-01.top.dk.10000-http-www 2310 0.6151515151515151 0.912987012987013 0.5636363636363636 0.8887445887445887 0.129004329004329 0.11168831168831168 0.0761904761904762 0.0004329004329004329 0 0.5056277056277056 +alexa.2014-09-01.top.dk.10000-https 339 0.5693215339233039 0.9026548672566371 0.4896755162241888 0.8761061946902655 0.12684365781710916 0.12979351032448377 0.07964601769911504 0 0 0.5486725663716814 +alexa.2014-09-01.top.se.10000-http-www 2895 0.6283246977547495 0.9264248704663213 0.5813471502590674 0.9050086355785838 0.1547495682210708 0.12435233160621761 0.07392055267702936 0.000690846286701209 0 0.5412780656303973 +alexa.2014-09-01.top.se.10000-https 438 0.5639269406392694 0.9360730593607306 0.5 0.9041095890410958 0.10730593607305935 0.1141552511415525 0.0684931506849315 0 0 0.6301369863013698 +com.2014-08-29.random.10000-http-www 7811 0.583151965177314 0.7054154397644348 0.40763026501088206 0.7225707335808476 0.05338625016003073 0.03994366918448342 0.02842145691972859 0.0006401229035974907 0 0.22801177826142618 +com.2014-08-29.random.10000-https 50 0.38 0.68 0.42 0.62 0.1 0 0.06 0 0 0.42 +dk.2014-07-23.random.10000-http-www 7378 0.36785036595283277 0.5117918134995934 0.4186771482786663 0.524939007861209 0.04662510165356465 0.027514231499051234 0.02548116020601789 0.000813228517213337 0 0.20357820547573868 +dk.2014-07-23.random.10000-https 23 0.34782608695652173 0.6521739130434783 0.30434782608695654 0.6086956521739131 0 0.043478260869565216 0.043478260869565216 0 0 0.30434782608695654 +net.2014-08-29.random.10000-http-www 7378 0.6217132014095961 0.6905665492003253 0.3647329899701816 0.7263486039577122 0.04161019246408241 0.045405258877744645 0.023177012740580104 0.0010843046896177825 0.0001355380862022228 0.20534020059636757 +net.2014-08-29.random.10000-https 26 0.23076923076923078 0.5384615384615384 0.34615384615384615 0.5 0 0 0.038461538461538464 0 0 0.23076923076923078 +reach50.2014w35.se-http-www 42 0.7857142857142857 0.9523809523809523 
0.7142857142857143 0.9761904761904762 0.23809523809523808 0.47619047619047616 0.09523809523809523 0 0 0.6428571428571429 +se.2014-07-10.random.100000-http 73605 0.42350383805448 0.6327559269071394 0.5514027579648122 0.6333265403165546 0.05048570069968073 0.0287208749405611 0.026981862645200732 0.0004211670402825895 1.3586033557502887e-05 0.24367909788737177 +se.2014-07-10.random.100000-http-www 77261 0.41925421622810993 0.6251795860783578 0.5434177657550381 0.6416691474353167 0.05125483749886747 0.029458588421066256 0.027348856473511863 0.0004271236458238956 3.882942234762687e-05 0.24591967486830354 +se.healthstatus.2013.counties-http-www 21 0.3333333333333333 1 0.6190476190476191 0.9523809523809523 0 0 0 0 0 0.5714285714285714 +se.healthstatus.2013.domain-registrars-http-www 134 0.43283582089552236 0.7910447761194029 0.4701492537313433 0.8059701492537313 0.08208955223880597 0.05223880597014925 0.04477611940298507 0 0 0.2537313432835821 +se.healthstatus.2013.financial-services-http-www 72 0.4444444444444444 0.875 0.375 0.9444444444444444 0.125 0.1388888888888889 0.027777777777777776 0 0 0.375 +se.healthstatus.2013.gocs-http-www 57 0.49122807017543857 0.9473684210526315 0.42105263157894735 0.9122807017543859 0.12280701754385964 0.07017543859649122 0.05263157894736842 0 0 0.2807017543859649 +se.healthstatus.2013.higher-education-http-www 47 0.40425531914893614 0.8936170212765957 0.5106382978723404 0.851063829787234 0.02127659574468085 0 0.0425531914893617 0 0 0.3617021276595745 +se.healthstatus.2013.isps-http-www 19 0.631578947368421 1 0.42105263157894735 1 0.21052631578947367 0.15789473684210525 0 0 0 0.5263157894736842 +se.healthstatus.2013.media-http-www 28 0.9285714285714286 0.9642857142857143 0.7857142857142857 0.9642857142857143 0.42857142857142855 0.8214285714285714 0 0 0 0.8571428571428571 +se.healthstatus.2013.municipalities-http-www 271 0.45018450184501846 0.9298892988929889 0.6273062730627307 0.9040590405904059 0.06273062730627306 0.01107011070110701 
0.02214022140221402 0 0 0.34686346863468637 +se.healthstatus.2013.public-authorities-http-www 203 0.32019704433497537 0.9211822660098522 0.3891625615763547 0.8522167487684729 0.03940886699507389 0.024630541871921183 0.009852216748768473 0 0 0.29064039408866993 diff --git a/report/datasets.non-failed.mime-types.groups.coverage.internal.sorted.tsv b/report/datasets.non-failed.mime-types.groups.coverage.internal.sorted.tsv new file mode 100644 index 0000000..0822d03 --- /dev/null +++ b/report/datasets.non-failed.mime-types.groups.coverage.internal.sorted.tsv @@ -0,0 +1,27 @@ +Dataset Domains html script style image data text font object document (null) +alexa.2014-09-01.random.10000-http-www 8493 0.43153184975862474 0.7439067467326034 0.8033674791004356 0.8430472153538208 0.06169786883315672 0.06734958200871306 0.018839043918521134 0.0032968326857411986 0 0.34016248675379723 +alexa.2014-09-01.random.10000-https 1135 0.6123348017621145 0.7480176211453744 0.7409691629955947 0.7903083700440529 0.08458149779735682 0.07753303964757709 0.028193832599118944 0.001762114537444934 0 0.3682819383259912 +alexa.2014-09-01.top.10000-http-www 8682 0.5126698917300161 0.6256622897949781 0.5813176687399216 0.6559548491131075 0.14236351071181755 0.0801658604008293 0.028449665975581663 0.002073255010366275 0 0.2699838746832527 +alexa.2014-09-01.top.10000-https 2507 0.7901874750698046 0.7534902273633826 0.6741124850418827 0.8049461507778221 0.1970482648583965 0.1455923414439569 0.04467491025129637 0.001994415636218588 0 0.3625847626645393 +alexa.2014-09-01.top.dk.10000-http-www 2310 0.5012987012987012 0.8307359307359308 0.8636363636363636 0.8796536796536797 0.10995670995670996 0.07186147186147186 0.04329004329004329 0.0021645021645021645 0 0.33506493506493507 +alexa.2014-09-01.top.dk.10000-https 339 0.7433628318584071 0.8938053097345132 0.8761061946902655 0.9056047197640118 0.18289085545722714 0.11504424778761062 0.09144542772861357 0.0029498525073746312 0 0.39823008849557523 
+alexa.2014-09-01.top.se.10000-http-www 2895 0.4493955094991364 0.8145077720207254 0.8476683937823835 0.8590673575129534 0.13367875647668392 0.07322970639032815 0.036614853195164075 0.0017271157167530224 0 0.33713298791019 +alexa.2014-09-01.top.se.10000-https 438 0.7328767123287672 0.8949771689497716 0.8881278538812786 0.9269406392694064 0.2237442922374429 0.1004566210045662 0.0547945205479452 0 0 0.4429223744292237 +com.2014-08-29.random.10000-http-www 7811 0.3548841377544489 0.2743566764818845 0.3368326718729996 0.5738061707847907 0.02125208039943669 0.015875048009217768 0.0025604916143899628 0.0024324670336704646 0 0.12840865446165664 +com.2014-08-29.random.10000-https 50 0.52 0.78 0.86 0.86 0.06 0.08 0.06 0 0 0.4 +dk.2014-07-23.random.10000-http-www 7378 0.1774193548387097 0.33816752507454595 0.42016806722689076 0.5859311466522092 0.035646516671184604 0.01951748441312009 0.003659528327460016 0.0010843046896177825 0 0.09812957441040933 +dk.2014-07-23.random.10000-https 23 0.5217391304347826 0.8260869565217391 0.9130434782608695 0.8695652173913043 0.08695652173913043 0 0.21739130434782608 0 0 0.17391304347826086 +net.2014-08-29.random.10000-http-www 7378 0.3603957712117105 0.2149634047167254 0.2766332339387368 0.541339116291678 0.012740580103008945 0.010165356465166711 0.0013553808620222283 0.0009487666034155598 0 0.11330984006505829 +net.2014-08-29.random.10000-https 26 0.4230769230769231 0.6923076923076923 0.8846153846153846 0.8846153846153846 0.07692307692307693 0.038461538461538464 0.038461538461538464 0 0 0.38461538461538464 +reach50.2014w35.se-http-www 42 0.5238095238095238 0.5952380952380952 0.47619047619047616 0.6666666666666666 0.16666666666666666 0.047619047619047616 0.047619047619047616 0 0 0.2619047619047619 +se.2014-07-10.random.100000-http 73605 0.23399225596087223 0.3402757964812173 0.42029753413490933 0.5109843081312411 0.029060525779498674 0.02695469057808573 0.0033285782215882074 0.0005298553087426126 9.51022349025202e-05 0.10272399972827932 
+se.2014-07-10.random.100000-http-www 77261 0.19307283105318337 0.34052109084790516 0.4141287324782232 0.5028798488241156 0.028177217483594568 0.026002769832127463 0.0033522734626784535 0.000530668772084234 3.882942234762687e-05 0.1001669665160948 +se.healthstatus.2013.counties-http-www 21 0.23809523809523808 0.9523809523809523 0.9523809523809523 0.9523809523809523 0.09523809523809523 0 0.047619047619047616 0 0 0.2857142857142857 +se.healthstatus.2013.domain-registrars-http-www 134 0.3805970149253731 0.7313432835820896 0.7611940298507462 0.7910447761194029 0.05970149253731343 0.08955223880597014 0.007462686567164179 0 0 0.23880597014925373 +se.healthstatus.2013.financial-services-http-www 72 0.4305555555555556 0.8194444444444444 0.8333333333333334 0.8333333333333334 0.18055555555555555 0.027777777777777776 0.041666666666666664 0 0 0.2638888888888889 +se.healthstatus.2013.gocs-http-www 57 0.2631578947368421 0.8421052631578947 0.8421052631578947 0.8596491228070176 0.10526315789473684 0.07017543859649122 0.08771929824561403 0 0 0.2807017543859649 +se.healthstatus.2013.higher-education-http-www 47 0.2978723404255319 0.8723404255319149 0.8723404255319149 0.9574468085106383 0.1702127659574468 0.10638297872340426 0.0425531914893617 0 0 0.2553191489361702 +se.healthstatus.2013.isps-http-www 19 0.631578947368421 0.8947368421052632 0.9473684210526315 0.9473684210526315 0.21052631578947367 0.05263157894736842 0.05263157894736842 0 0 0.3684210526315789 +se.healthstatus.2013.media-http-www 28 0.5357142857142857 0.8214285714285714 0.75 0.7857142857142857 0.2857142857142857 0.07142857142857142 0.17857142857142858 0 0 0.4642857142857143 +se.healthstatus.2013.municipalities-http-www 271 0.1992619926199262 0.940959409594096 0.974169741697417 0.981549815498155 0.02952029520295203 0.01845018450184502 0.007380073800738007 0 0 0.3062730627306273 +se.healthstatus.2013.public-authorities-http-www 203 0.28078817733990147 0.8423645320197044 0.8768472906403941 0.896551724137931 
0.06403940886699508 0.03940886699507389 0.019704433497536946 0 0 0.2019704433497537 diff --git a/report/datasets.request-status.coverage.origin.sorted.tsv b/report/datasets.request-status.coverage.origin.sorted.tsv new file mode 100644 index 0000000..4a695d6 --- /dev/null +++ b/report/datasets.request-status.coverage.origin.sorted.tsv @@ -0,0 +1,27 @@ +Dataset Domains 1xx 2xx 3xx 301 302 303 307 4xx 5xx (null) +alexa.2014-09-01.random.10000-http-www 9779 0 0.6055833929849678 0.2629103180284283 0.19040801717967074 0.06902546272625013 0.0030677983433888946 0.00040903977911851925 0 0 0.13150628898660394 +alexa.2014-09-01.random.10000-https 9952 0 0.0420016077170418 0.07204581993569131 0.04391077170418006 0.028135048231511254 0 0 0 0 0.8859525723472669 +alexa.2014-09-01.top.10000-http-www 9759 0 0.6127677016087714 0.27687263039245824 0.18741674351880316 0.0879188441438672 0.0006148170919151552 0.0009222256378727329 0 0 0.11035966799877037 +alexa.2014-09-01.top.10000-https 9971 0 0.0651890482398957 0.1862400962792097 0.13880252732925483 0.04673553304583292 0.00020058168689198675 0.0003008725303379801 0 0 0.7485708554808946 +alexa.2014-09-01.top.dk.10000-http-www 2577 0 0.5731470702367093 0.323244082266201 0.23903764066744276 0.08149010477299184 0.002328288707799767 0.00038804811796662784 0 0 0.10360884749708964 +alexa.2014-09-01.top.dk.10000-https 2637 0 0.04323094425483504 0.08532423208191127 0.06029579067121729 0.02464922260144103 0.0003792188092529389 0 0 0 0.8714448236632537 +alexa.2014-09-01.top.se.10000-http-www 3281 0 0.6193233770192015 0.26302956415726914 0.18744285278878392 0.07131971959768363 0.0027430661383724473 0.0012191405059433099 0 0 0.11764705882352941 +alexa.2014-09-01.top.se.10000-https 3362 0 0.04640095181439619 0.08387864366448543 0.05859607376561571 0.024985127900059488 0 0.000297441998810232 0 0 0.8697204045211184 +com.2014-08-29.random.10000-http-www 9965 0 0.5863522328148519 0.197491219267436 0.10888108379327646 0.08780732563973909 
0.0007024586051179127 0.00010035122930255896 0 0 0.216156547917712 +com.2014-08-29.random.10000-https 10000 0 0.0026 0.0024 0.0014 0.001 0 0 0 0 0.995 +dk.2014-07-23.random.10000-http-www 9967 0 0.5370723387177686 0.20317046252633691 0.14076452292565467 0.06090097321159828 0.0014046352964783787 0.00010033109260559848 0 0 0.2597571987558944 +dk.2014-07-23.random.10000-https 10000 0 0.0013 0.001 0.0004 0.0006 0 0 0 0 0.9977 +net.2014-08-29.random.10000-http-www 9971 0 0.5520008023267475 0.1879450406177916 0.10390131381004915 0.08324140006017451 0.0006017450606759602 0.00020058168689198675 0 0 0.2600541570554608 +net.2014-08-29.random.10000-https 10000 0 0.0019 0.0007 0.0004 0.0003 0 0 0 0 0.9974 +reach50.2014w35.se-http-www 46 0 0.6521739130434783 0.2608695652173913 0.1956521739130435 0.06521739130434782 0 0 0 0 0.08695652173913043 +se.2014-07-10.random.100000-http 99497 0 0.5268902579977286 0.2128807903755892 0.15151210589263997 0.06002191020834799 0.0011357126345517955 0.00021106164004944873 0 0 0.2602289516266822 +se.2014-07-10.random.100000-http-www 99428 0 0.5883252202598865 0.18872953292835015 0.1295510319024822 0.057790562014724225 0.0011767309007523032 0.00021120811039143902 0 0 0.22294524681176328 +se.healthstatus.2013.counties-http-www 21 0 0.8571428571428571 0.14285714285714285 0.09523809523809523 0.047619047619047616 0 0 0 0 0 +se.healthstatus.2013.domain-registrars-http-www 146 0 0.541095890410959 0.3767123287671233 0.2534246575342466 0.1232876712328767 0 0 0 0 0.0821917808219178 +se.healthstatus.2013.financial-services-http-www 79 0 0.45569620253164556 0.45569620253164556 0.27848101265822783 0.1518987341772152 0.02531645569620253 0 0 0 0.08860759493670886 +se.healthstatus.2013.gocs-http-www 60 0 0.7166666666666667 0.23333333333333334 0.13333333333333333 0.1 0 0 0 0 0.05 +se.healthstatus.2013.higher-education-http-www 48 0 0.7708333333333334 0.20833333333333334 0.14583333333333334 0.0625 0 0 0 0 0.020833333333333332 +se.healthstatus.2013.isps-http-www 20 
0 0.6 0.35 0.15 0.2 0 0 0 0 0.05 +se.healthstatus.2013.media-http-www 32 0 0.5625 0.3125 0.125 0.15625 0 0.03125 0 0 0.125 +se.healthstatus.2013.municipalities-http-www 288 0 0.8506944444444444 0.09027777777777778 0.034722222222222224 0.05555555555555555 0 0 0 0 0.059027777777777776 +se.healthstatus.2013.public-authorities-http-www 214 0 0.6962616822429907 0.2523364485981308 0.1261682242990654 0.12149532710280374 0.004672897196261682 0 0 0 0.0514018691588785 diff --git a/report/datasets.retries.rates.tsv b/report/datasets.retries.rates.tsv new file mode 100755 index 0000000..1c72d65 --- /dev/null +++ b/report/datasets.retries.rates.tsv @@ -0,0 +1,27 @@ +Dataset Domains Successful Unsuccessful Non-failed Failed Success rate Unsuccess rate Non-failure rate Failure rate Rate of change +alexa.2014-09-01.random.10000-http-www 9779 5922 2571 8493 1286 0.6055833929849678 0.2629103180284283 0.868493711013396 0.13150628898660394 - +alexa.2014-09-01.random.10000-https 9952 418 717 1135 8817 0.0420016077170418 0.07204581993569131 0.11404742765273312 0.8859525723472669 5.736959723937732 +alexa.2014-09-01.top.10000-http-www 9759 5980 2702 8682 1077 0.6127677016087714 0.27687263039245824 0.8896403320012296 0.11035966799877037 -0.8754338872718881 +alexa.2014-09-01.top.10000-https 9971 651 1856 2507 7464 0.0652893390833417 0.18613980543576372 0.2514291445191054 0.7485708554808946 5.783011122226602 +alexa.2014-09-01.top.dk.10000-http-www 2577 1477 833 2310 267 0.5731470702367093 0.323244082266201 0.8963911525029103 0.10360884749708964 -0.8615911282966934 +alexa.2014-09-01.top.dk.10000-https 2637 114 225 339 2298 0.04323094425483504 0.08532423208191127 0.1285551763367463 0.8714448236632537 7.410911275581292 +alexa.2014-09-01.top.se.10000-http-www 3281 2032 863 2895 386 0.6193233770192015 0.26302956415726914 0.8823529411764706 0.11764705882352941 -0.8649976962064199 +alexa.2014-09-01.top.se.10000-https 3362 156 282 438 2924 0.04640095181439619 0.08387864366448543 0.13027959547888163 
0.8697204045211184 6.392623438429506 +com.2014-08-29.random.10000-http-www 9965 5843 1968 7811 2154 0.5863522328148519 0.197491219267436 0.783843452082288 0.216156547917712 -0.7514643248634242 +com.2014-08-29.random.10000-https 10000 26 24 50 9950 0.0026 0.0024 0.005 0.995 3.603145311049211 +dk.2014-07-23.random.10000-http-www 9967 5353 2025 7378 2589 0.5370723387177686 0.20317046252633691 0.7402428012441056 0.2597571987558944 -0.7389374886875433 +dk.2014-07-23.random.10000-https 10000 13 10 23 9977 0.0013 0.001 0.0023 0.9977 2.8408945152568563 +net.2014-08-29.random.10000-http-www 9971 5504 1874 7378 2593 0.5520008023267475 0.1879450406177916 0.7399458429445391 0.2600541570554608 -0.7393463395254477 +net.2014-08-29.random.10000-https 10000 19 7 26 9974 0.0019 0.0007 0.0026 0.9974 2.835354955649827 +reach50.2014w35.se-http-www 46 30 12 42 4 0.6521739130434783 0.2608695652173913 0.9130434782608695 0.08695652173913043 -0.912816801945929 +se.2014-07-10.random.100000-http 99497 52424 21181 73605 25892 0.5268902579977286 0.2128807903755892 0.7397710483733178 0.2602289516266822 1.9926329437068453 +se.2014-07-10.random.100000-http-www 99428 58496 18765 77261 22167 0.5883252202598865 0.18872953292835015 0.7770547531882367 0.22294524681176328 -0.1432727011420511 +se.healthstatus.2013.counties-http-www 21 18 3 21 0 0.8571428571428571 0.14285714285714285 1 0 -1 +se.healthstatus.2013.domain-registrars-http-www 146 79 55 134 12 0.541095890410959 0.3767123287671233 0.9178082191780822 0.0821917808219178 - +se.healthstatus.2013.financial-services-http-www 79 36 36 72 7 0.45569620253164556 0.45569620253164556 0.9113924050632911 0.08860759493670886 0.07805907172995787 +se.healthstatus.2013.gocs-http-www 60 43 14 57 3 0.7166666666666667 0.23333333333333334 0.95 0.05 -0.43571428571428567 +se.healthstatus.2013.higher-education-http-www 48 37 10 47 1 0.7708333333333334 0.20833333333333334 0.9791666666666666 0.020833333333333332 -0.5833333333333334 +se.healthstatus.2013.isps-http-www 20 
12 7 19 1 0.6 0.35 0.95 0.05 1.4000000000000004 +se.healthstatus.2013.media-http-www 32 18 10 28 4 0.5625 0.3125 0.875 0.125 1.4999999999999998 +se.healthstatus.2013.municipalities-http-www 288 245 26 271 17 0.8506944444444444 0.09027777777777778 0.9409722222222222 0.059027777777777776 -0.5277777777777778 +se.healthstatus.2013.public-authorities-http-www 214 149 54 203 11 0.6962616822429907 0.2523364485981308 0.9485981308411215 0.0514018691588785 -0.12919186366135238 diff --git a/report/report.lyx b/report/report.lyx index 51c874c..4f3d58e 100644 --- a/report/report.lyx +++ b/report/report.lyx @@ -21,16 +21,10 @@ \begin{landscape} % Use TiKZ? % http://tex.stackexchange.com/questions/40501/using-restoregeometry-in-environment-next-page-runs-off-the-page-bottom/40503#40503 -\clearpage -\vspace*{\fill} -\begin{minipage}{\textwidth} \setlength\LTcapwidth{\textwidth} % default: 4in (rather less than \textwidth...) \setlength\LTleft{0pt} % default: \parindent \setlength\LTright{0pt} % default: \fill }{ -\end{minipage} -\vfill % equivalent to \vspace{\fill} -\clearpage \end{landscape} \pagebreak \aftergroup\restoregeometry @@ -2261,7 +2255,7 @@ one-shot/preparations.sh \end_layout \begin_layout Standard -Downloads, prepares and analyses third party data sets, and puts them in +Downloads, prepares and analyses third party datasets, and puts them in the current folder for use by subsequent scripts. \end_layout @@ -2880,8 +2874,7 @@ target "http://www.alexa.com/topsites" \end_inset in the world. - It is used in many research papers, and can be seen as the standard data - set. + It is used in many research papers, and can be seen as the standard dataset. \begin_inset Note Greyedout status open @@ -3758,7 +3751,7 @@ status collapsed tag, each part should download separately and perform the same duties, including “calling home” to the usual addresses. - In order to confirm this, a query was run on one of the data sets. 
+ In order to confirm this, a query was run on one of the datasets. \end_layout \begin_layout Standard @@ -5039,6 +5032,35 @@ Looking at preliminary results, a large portion of domains yielded a redirect redirect to an internal or external domain, a specific question was written. \end_layout +\begin_layout Standard +\begin_inset Note Note +status open + +\begin_layout Plain Layout +Pages with wide tables start here. +\end_layout + +\end_inset + + +\end_layout + +\begin_layout Standard +\begin_inset ERT +status open + +\begin_layout Plain Layout + + +\backslash +begin{wide} +\end_layout + +\end_inset + + +\end_layout + \begin_layout Chapter Results \end_layout @@ -5057,24 +5079,79 @@ Answer questions posed earlier. \end_layout \begin_layout Section -Differences between data sets +Differences between datasets \end_layout \begin_layout Standard -\begin_inset Note Greyedout -status open +Domain lists chosen for this thesis come in three major categories - top + lists, curated lists and random selection from zone files. + While the top lists and curated lists are assumed to primarily contain + sites with staff or enthusiasts to take care of them and make sure they + are available and functioning, the domain lists randomly extracted from + TLD zones might not. + Results below seem to fall into groups of non-random and randomly selected + domains - and result discussions often group them as such. +\end_layout + +\begin_layout Section +Failed versus non-failed +\end_layout + +\begin_layout Standard +Choosing the term non-failed instead of successful when it comes to dividing + and focusing result discussions has its basis in the HTTP standard, which + defines a status code. + Successful requests are generally shown with an HTTP status code of 200 + (actually the entire 2xx group), or a 304, which means that a previously + cached (presumably successful) result is still valid.
+ Many sites respond with a 3xx status, which isn't exactly successful as + it doesn't contain actual content, but can not be considered a failure + as it will most likely lead to another resource that is successful. + While a status response of 4xx or 5xx shows there is a problem of some + kind, for the purpose of this thesis a response that contains any HTTP + status number is still considered a non-failure, as the remote system has + responded with a proper HTTP response as parsed by +\begin_inset Flex Code +status collapsed \begin_layout Plain Layout -Write about. +har-heedless \end_layout \end_inset + and +\begin_inset Flex Code +status collapsed +\begin_layout Plain Layout +phantomjs \end_layout -\begin_layout Section -Top content +\end_inset + +. + Overall, 4xx/5xx responses have been very rare; while this can be because + of software problems, they do exist and therefore the code seems to work + alright. +\end_layout + +\begin_layout Standard +\begin_inset ERT +status open + +\begin_layout Plain Layout + + +\backslash +tsvtable{datasets.request-status.coverage.origin.sorted.tsv}{Dataset origin HTTP + response code/group coverage}{}{fixed, display columns/0/.style={string + type, column type=l}, display columns/1/.style={string type, column type=i}} +\end_layout + +\end_inset + + \end_layout \begin_layout Standard @@ -5082,7 +5159,7 @@ Top content status open \begin_layout Plain Layout -Write about. +Update table with full result datasets. \end_layout \end_inset @@ -5090,25 +5167,78 @@ Write about. \end_layout -\begin_layout Section -Top public suffixes +\begin_layout Standard +HAR data that doesn't have a parseable HTTP status outcome number (shown + as +\begin_inset Flex Code +status collapsed + +\begin_layout Plain Layout +(null) +\end_layout + +\end_inset + + above) is considered a failed request. 
+ Looking at the resulting HAR files means that both local software errors + and remote errors are considered failures, and while it might be technically + possible to distinguish a software error from a remote error it has not + been done apart from during initial development and debugging. + In order to reduce temporary or intermittent problems, all domains that + failed were retried up to two times. \end_layout \begin_layout Standard +Non-random domains have a failure rate of below 15% for HTTP, and below + 90% for HTTPS. + Random zone domains have a failure rate of above 20% for HTTP and above + 99% for HTTPS. + The very low HTTPS adoption rates among random sites are both surprising + and not surprising - while larger sites have felt the pressure to implement + it, a non-professional site owner might see it as both an unnecessary + technical challenge and an unnecessary additional cost as most X.509 \begin_inset Note Greyedout status open \begin_layout Plain Layout -Write about. +Insert link to X.509. \end_layout \end_inset + public key infrastructure (PKI) certificates +\begin_inset Note Greyedout +status open +\begin_layout Plain Layout +Insert link to PKI. \end_layout -\begin_layout Section -Top recognized domains +\end_inset + + add to site hosting bills.
+\end_layout + +\begin_layout Standard +\begin_inset ERT +status open + +\begin_layout Plain Layout + + +\backslash +tsvtable{datasets.retries.rates.tsv}{Dataset HAR failure rates}{Dataset,Domains,Suc +cessful,Unsuccessful,Non-failed,Failed,Non-failure rate,Failure rate}{fixed, + display columns/0/.style={string type, column type=l}, display columns/1/.style={ +string type, column type=i}, display columns/2/.style={string type, column + type=i}, display columns/3/.style={string type, column type=i}, display + columns/4/.style={string type, column type=i}, display columns/5/.style={string + type, column type=i}} +\end_layout + +\end_inset + + \end_layout \begin_layout Standard @@ -5116,7 +5246,7 @@ Top recognized domains status open \begin_layout Plain Layout -Write about. +Update table with full result datasets. \end_layout \end_inset @@ -5124,8 +5254,53 @@ Write about. \end_layout +\begin_layout Standard +During analysis +\begin_inset Flex Code +status collapsed + +\begin_layout Plain Layout +har-dulcify +\end_layout + +\end_inset + + splits results into unfiltered and non-failed only origin domains. + Unless otherwise mentioned, further results are presented based only on + non-failed domains in each dataset. +\end_layout + \begin_layout Section -Top recognized organizations +Internal versus external resources +\end_layout + +\begin_layout Standard +The table shows non-failed origin domains having requests strictly to the + same domain, subdomains or superdomains - together known as internal domains + - or external (non-internal) domains. + Origin domains that are not exclusively loading from either internal or + external resources are loading from mixed domains. + Mixing resources from both internal and external domains is the most common + way to compose a web page for datasets not randomly chosen from zones, + although it is quite common for random domains as well. 
+\end_layout + +\begin_layout Standard +\begin_inset ERT +status open + +\begin_layout Plain Layout + + +\backslash +tsvtable{datasets.non-failed.classification.domain-scope.coverage.sorted.tsv}{Internal + versus external resources coverage}{}{fixed, display columns/0/.style={string + type, column type=l}, display columns/1/.style={string type, column type=i}} +\end_layout + +\end_inset + + \end_layout \begin_layout Standard @@ -5133,7 +5308,7 @@ Top recognized organizations status open \begin_layout Plain Layout -Write about top organizations. +Update table with full result datasets. \end_layout \end_inset @@ -5141,44 +5316,115 @@ Write about top organizations. \end_layout +\begin_layout Standard +During analysis +\begin_inset Flex Code +status collapsed + +\begin_layout Plain Layout +har-dulcify +\end_layout + +\end_inset + + splits results into unfiltered, internal and external resources. +\end_layout + \begin_layout Section -Top recognized domain categories +Insecure versus secure resources \end_layout \begin_layout Standard +Using HTTPS to secure the connection between site and site users is considered + an effective way to avoid prying eyes on the otherwise technically quite + open and insecure internet. + Sites which handle sensitive information, such as e-commerce shops, online + payment providers and of course banks often tout being secure to use - + and they have strong financial incentives to provide a service that is + (or at least comes across as) trustworthy. + \begin_inset Note Greyedout status open \begin_layout Plain Layout -Write about. +Reference for strong financial incentives for security? \end_layout \end_inset - + As browsers will warn users if a site secured with HTTPS loads resources + over non-HTTPS connections, site developers will have to make sure each + and every request is secure to avoid being labeled not trustworthy. 
+ This also applies to third-party services, which have to make sure to provide + HTTPS in order to be able to continue providing services to sites making + the switch to a fully secured experience. \end_layout -\begin_layout Section -Disconnect's blocking list matches +\begin_layout Standard +One of the concerns with mixing in HTTP on an HTTPS site is that an attacker + can use traffic sniffers to get a hold of sensitive information leaking + out through HTTP, or man in the middle attacks on several kinds of resources + to insert malicious code, even though the site is supposed to be protected. + The following table shows to what extent sites manage to take full advantage + of HTTPS, and to what extent they fail in requesting either internal or + external resources. \end_layout -\begin_layout Subsection -Top domains +\begin_layout Standard +While the technology has been around a long time, it doesn't seem as if + very many sites actually use HTTPS on their sites. + Even origin sites that respond to HTTPS requests seem to either redirect + to a HTTP site, or load at least some of their resources over non-HTTPS connectio +ns. + Typing in a HTTPS address into the browser's address bar will actually + only give full HTTPS security on 27-58% of the domains - a number where + the random domains surprisingly beat the non-random ones. \end_layout \begin_layout Standard -A selection of domains, and their coverage across different datasets. +Why is adoption lower for top sites? As high-traffic sites they might have + a high system load, and since HTTPS requires some extra processing and data + exchange, they might have deferred it until the security is +\emph on +really +\emph default + needed - like when passwords or financial information are entered. + Strict HTTPS performance concerns were dismissed by Google engineers in + 2xxx +\begin_inset Note Greyedout +status open + +\begin_layout Plain Layout +Look up year, insert link.
\end_layout -\begin_layout Standard -\begin_inset ERT +\end_inset + + - and Google has since implemented HTTPS as an alternative for most and + the default for some services. + +\begin_inset Note Greyedout status open \begin_layout Plain Layout +Insert link to Google's HTTPS defaults. +\end_layout + +\end_inset -\backslash -begin{wide} +\end_layout + +\begin_layout Standard +Another concern is that curated domain lists seem to exhibit an even lower + HTTPS adoption than both random and top domains - the domains have been + selected as they are deemed important to the public in some way. + +\begin_inset Note Greyedout +status open + +\begin_layout Plain Layout +Update paragraph after HTTPS for .SE Health Status is finished. \end_layout \end_inset @@ -5194,9 +5440,9 @@ status open \backslash -tsvtable{datasets.non-failed.disconnect.domains.sorted.tsv}{Top Disconnect domain - matches}{}{fixed, display columns/0/.style={string type, column type=l}, - display columns/1/.style={string type, column type=i}} +tsvtable{datasets.non-failed.classification.secure.coverage.sorted.tsv}{Secure + versus insecure resources coverage}{}{fixed, display columns/0/.style={string + type, column type=l}, display columns/1/.style={string type, column type=i}} \end_layout \end_inset @@ -5217,15 +5463,48 @@ Update table with full result datasets. \end_layout +\begin_layout Section +Content type group coverage +\end_layout + \begin_layout Standard -\begin_inset ERT +The difference between what can be achieved between different types of resources + makes the distribution interesting. + Images +\begin_inset Note Greyedout status open \begin_layout Plain Layout +Does SVG images allow loading external resources? +\end_layout +\end_inset -\backslash -end{wide} + and text (unless improperly labeled during transfer and parsed as another + format) loaded by a browser provide no additional way to load further resources +, while html, scripts and style do. 
+ While data resources can trigger downloading additional resources based + on the logic that consumes the data, it still requires another type of + resource present to do that. +\end_layout + +\begin_layout Standard +Objects and external documents can also access additional resources, but + the use of those types of resources has been very low in the extracted + data. + There might be several reasons, but the fact that the tests were run on + a headless browser without additional plugins installed is probably the + biggest in this case. + An additional reason might be adoption of HTML5 and client side javascript + instead of Flash for visual, dynamic material and animations. + This evolution has been fueled by Apple's resistance towards supporting + Flash on their handheld devices. + +\begin_inset Note Greyedout +status open + +\begin_layout Plain Layout +Reference Apple's Flash implementation resistance. \end_layout \end_inset @@ -5234,11 +5513,21 @@ end{wide} \end_layout \begin_layout Subsection -Top categories +Origin \end_layout \begin_layout Standard -Categories and their coverage across different datasets. +Practically all successful origin requests result in a html response. + The range is 84-100% html, with the difference being seemingly misconfigured + responses, some of which are redirects without actual content. +\end_layout + +\begin_layout Subsection +Internal +\end_layout + +\begin_layout Standard +Internal requests exclude the requests to the origin page. 
\end_layout \begin_layout Standard @@ -5249,7 +5538,22 @@ status open \backslash -begin{wide} +tsvtable{datasets.non-failed.mime-types.groups.coverage.internal.sorted.tsv}{Content + type group coverage}{}{fixed, display columns/0/.style={string type, column + type=l}, display columns/1/.style={string type, column type=i}} +\end_layout + +\end_inset + + +\end_layout + +\begin_layout Standard +\begin_inset Note Greyedout +status open + +\begin_layout Plain Layout +Update table with full result datasets. \end_layout \end_inset @@ -5257,6 +5561,19 @@ begin{wide} \end_layout +\begin_layout Subsection +External +\end_layout + +\begin_layout Standard +External resources from each group enjoy almost the same coverage as their + internal counterparts. + Among non-zone datasets scripts often reach above 90% coverage, showing + that active and popular web pages contain a lot of external dynamic material. + Images, while not dynamic, as well as styles and html are also popular + to load externally. +\end_layout + \begin_layout Standard \begin_inset ERT status open @@ -5265,9 +5582,9 @@ status open \backslash -tsvtable{datasets.non-failed.disconnect.categories.sorted.tsv}{Disconnect category - matches}{}{fixed, display columns/0/.style={string type, column type=l}, - display columns/1/.style={string type, column type=i}} +tsvtable{datasets.non-failed.mime-types.groups.coverage.external.sorted.tsv}{Content + type group coverage}{}{fixed, display columns/0/.style={string type, column + type=l}, display columns/1/.style={string type, column type=i}} \end_layout \end_inset @@ -5288,6 +5605,86 @@ Update table with full result datasets. \end_layout +\begin_layout Section +Top public suffixes +\end_layout + +\begin_layout Standard +\begin_inset Note Greyedout +status open + +\begin_layout Plain Layout +Write about. 
+\end_layout + +\end_inset + + +\end_layout + +\begin_layout Section +Top recognized domains +\end_layout + +\begin_layout Standard +\begin_inset Note Greyedout +status open + +\begin_layout Plain Layout +Write about. +\end_layout + +\end_inset + + +\end_layout + +\begin_layout Section +Top recognized organizations +\end_layout + +\begin_layout Standard +\begin_inset Note Greyedout +status open + +\begin_layout Plain Layout +Write about top organizations. +\end_layout + +\end_inset + + +\end_layout + +\begin_layout Section +Top recognized domain categories +\end_layout + +\begin_layout Standard +\begin_inset Note Greyedout +status open + +\begin_layout Plain Layout +Write about. +\end_layout + +\end_inset + + +\end_layout + +\begin_layout Section +Disconnect's blocking list matches +\end_layout + +\begin_layout Subsection +Top domains +\end_layout + +\begin_layout Standard +A selection of domains, and their coverage across different datasets. +\end_layout + \begin_layout Standard \begin_inset ERT status open @@ -5296,7 +5693,22 @@ status open \backslash -end{wide} +tsvtable{datasets.non-failed.disconnect.domains.sorted.tsv}{Top Disconnect domain + match coverage}{}{fixed, display columns/0/.style={string type, column type=l}, + display columns/1/.style={string type, column type=i}} +\end_layout + +\end_inset + + +\end_layout + +\begin_layout Standard +\begin_inset Note Greyedout +status open + +\begin_layout Plain Layout +Update table with full result datasets. \end_layout \end_inset @@ -5305,11 +5717,11 @@ end{wide} \end_layout \begin_layout Subsection -Top organizations +Top categories \end_layout \begin_layout Standard -A selection of organizations, and their coverage across different datasets. +Categories and their coverage across different datasets. 
\end_layout \begin_layout Standard @@ -5320,7 +5732,9 @@ status open \backslash -begin{wide} +tsvtable{datasets.non-failed.disconnect.categories.sorted.tsv}{Disconnect category + match coverage}{}{fixed, display columns/0/.style={string type, column type=l}, + display columns/1/.style={string type, column type=i}} \end_layout \end_inset @@ -5328,6 +5742,27 @@ begin{wide} \end_layout +\begin_layout Standard +\begin_inset Note Greyedout +status open + +\begin_layout Plain Layout +Update table with full result datasets. +\end_layout + +\end_inset + + +\end_layout + +\begin_layout Subsection +Top organizations +\end_layout + +\begin_layout Standard +A selection of organizations, and their coverage across different datasets. +\end_layout + \begin_layout Standard \begin_inset ERT status open @@ -5337,8 +5772,8 @@ status open \backslash tsvtable{datasets.non-failed.disconnect.organizations.sorted.tsv}{Top Disconnect - organization matches}{}{fixed, display columns/0/.style={string type, column - type=l}, display columns/1/.style={string type, column type=i}} + organization match coverage}{}{fixed, display columns/0/.style={string type, + column type=l}, display columns/1/.style={string type, column type=i}} \end_layout \end_inset @@ -5373,6 +5808,19 @@ end{wide} \end_inset +\end_layout + +\begin_layout Standard +\begin_inset Note Note +status open + +\begin_layout Plain Layout +Pages with wide tables end here. +\end_layout + +\end_inset + + \end_layout \begin_layout Chapter @@ -6069,6 +6517,35 @@ Mention the possibility to educate users with a webpage. \end_inset +\end_layout + +\begin_layout Standard +\begin_inset Note Greyedout +status open + +\begin_layout Plain Layout +Mention more rigorously testing har-heedless and phantomjs by setting up + test servers serving different response types (errors, objects/documents, + technically extreme web page content). 
+\end_layout + +\end_inset + + +\end_layout + +\begin_layout Standard +\begin_inset Note Greyedout +status open + +\begin_layout Plain Layout +Mention more rigorously testing har-dulcify and data manipulation scripts + by creating test datasets. +\end_layout + +\end_inset + + \end_layout \begin_layout Standard