-
Notifications
You must be signed in to change notification settings - Fork 17
/
5-web-scraping.html
842 lines (752 loc) · 46.3 KB
/
5-web-scraping.html
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8" />
<meta name="generator" content="pandoc" />
<meta http-equiv="X-UA-Compatible" content="IE=EDGE" />
<title>Web Scraping</title>
<script src="5-web-scraping_files/header-attrs-2.9/header-attrs.js"></script>
<script src="5-web-scraping_files/jquery-1.11.3/jquery.min.js"></script>
<meta name="viewport" content="width=device-width, initial-scale=1" />
<link href="5-web-scraping_files/bootstrap-3.3.5/css/lumen.min.css" rel="stylesheet" />
<script src="5-web-scraping_files/bootstrap-3.3.5/js/bootstrap.min.js"></script>
<script src="5-web-scraping_files/bootstrap-3.3.5/shim/html5shiv.min.js"></script>
<script src="5-web-scraping_files/bootstrap-3.3.5/shim/respond.min.js"></script>
<style>h1 {font-size: 34px;}
h1.title {font-size: 38px;}
h2 {font-size: 30px;}
h3 {font-size: 24px;}
h4 {font-size: 18px;}
h5 {font-size: 16px;}
h6 {font-size: 12px;}
code {color: inherit; background-color: rgba(0, 0, 0, 0.04);}
pre:not([class]) { background-color: white }</style>
<script src="5-web-scraping_files/jqueryui-1.11.4/jquery-ui.min.js"></script>
<link href="5-web-scraping_files/tocify-1.9.1/jquery.tocify.css" rel="stylesheet" />
<script src="5-web-scraping_files/tocify-1.9.1/jquery.tocify.js"></script>
<script src="5-web-scraping_files/navigation-1.1/tabsets.js"></script>
<link href="5-web-scraping_files/pagedtable-1.1/css/pagedtable.css" rel="stylesheet" />
<script src="5-web-scraping_files/pagedtable-1.1/js/pagedtable.js"></script>
<style type="text/css">
code{white-space: pre-wrap;}
span.smallcaps{font-variant: small-caps;}
span.underline{text-decoration: underline;}
div.column{display: inline-block; vertical-align: top; width: 50%;}
div.hanging-indent{margin-left: 1.5em; text-indent: -1.5em;}
ul.task-list{list-style: none;}
</style>
<style type="text/css">
code {
white-space: pre;
}
.sourceCode {
overflow: visible;
}
</style>
<style type="text/css" data-origin="pandoc">
pre > code.sourceCode { white-space: pre; position: relative; }
pre > code.sourceCode > span { display: inline-block; line-height: 1.25; }
pre > code.sourceCode > span:empty { height: 1.2em; }
.sourceCode { overflow: visible; }
code.sourceCode > span { color: inherit; text-decoration: inherit; }
div.sourceCode { margin: 1em 0; }
pre.sourceCode { margin: 0; }
@media screen {
div.sourceCode { overflow: auto; }
}
@media print {
pre > code.sourceCode { white-space: pre-wrap; }
pre > code.sourceCode > span { text-indent: -5em; padding-left: 5em; }
}
pre.numberSource code
{ counter-reset: source-line 0; }
pre.numberSource code > span
{ position: relative; left: -4em; counter-increment: source-line; }
pre.numberSource code > span > a:first-child::before
{ content: counter(source-line);
position: relative; left: -1em; text-align: right; vertical-align: baseline;
border: none; display: inline-block;
-webkit-touch-callout: none; -webkit-user-select: none;
-khtml-user-select: none; -moz-user-select: none;
-ms-user-select: none; user-select: none;
padding: 0 4px; width: 4em;
color: #aaaaaa;
}
pre.numberSource { margin-left: 3em; border-left: 1px solid #aaaaaa; padding-left: 4px; }
div.sourceCode
{ background-color: #f8f8f8; }
@media screen {
pre > code.sourceCode > span > a:first-child::before { text-decoration: underline; }
}
code span.al { color: #ef2929; } /* Alert */
code span.an { color: #8f5902; font-weight: bold; font-style: italic; } /* Annotation */
code span.at { color: #c4a000; } /* Attribute */
code span.bn { color: #0000cf; } /* BaseN */
code span.cf { color: #204a87; font-weight: bold; } /* ControlFlow */
code span.ch { color: #4e9a06; } /* Char */
code span.cn { color: #000000; } /* Constant */
code span.co { color: #8f5902; font-style: italic; } /* Comment */
code span.cv { color: #8f5902; font-weight: bold; font-style: italic; } /* CommentVar */
code span.do { color: #8f5902; font-weight: bold; font-style: italic; } /* Documentation */
code span.dt { color: #204a87; } /* DataType */
code span.dv { color: #0000cf; } /* DecVal */
code span.er { color: #a40000; font-weight: bold; } /* Error */
code span.ex { } /* Extension */
code span.fl { color: #0000cf; } /* Float */
code span.fu { color: #000000; } /* Function */
code span.im { } /* Import */
code span.in { color: #8f5902; font-weight: bold; font-style: italic; } /* Information */
code span.kw { color: #204a87; font-weight: bold; } /* Keyword */
code span.op { color: #ce5c00; font-weight: bold; } /* Operator */
code span.ot { color: #8f5902; } /* Other */
code span.pp { color: #8f5902; font-style: italic; } /* Preprocessor */
code span.sc { color: #000000; } /* SpecialChar */
code span.ss { color: #4e9a06; } /* SpecialString */
code span.st { color: #4e9a06; } /* String */
code span.va { color: #000000; } /* Variable */
code span.vs { color: #4e9a06; } /* VerbatimString */
code span.wa { color: #8f5902; font-weight: bold; font-style: italic; } /* Warning */
</style>
<script>
// apply pandoc div.sourceCode style to pre.sourceCode instead
(function() {
  var allSheets = document.styleSheets;
  for (var s = 0; s < allSheets.length; s++) {
    var sheet = allSheets[s];
    // only touch the stylesheet pandoc emitted (tagged via data-origin)
    if (sheet.ownerNode.dataset["origin"] !== "pandoc") continue;
    var ruleList;
    // cross-origin sheets throw on cssRules access; skip those
    try { ruleList = sheet.cssRules; } catch (err) { continue; }
    for (var r = 0; r < ruleList.length; r++) {
      var cssRule = ruleList[r];
      // look for a style rule whose selector is exactly div.sourceCode
      if (cssRule.type !== cssRule.STYLE_RULE || cssRule.selectorText !== "div.sourceCode") continue;
      var declarations = cssRule.style.cssText;
      // skip rules that set neither color nor background-color
      if (cssRule.style.color === '' && cssRule.style.backgroundColor === '') continue;
      // replace the div.sourceCode rule in place with an equivalent pre.sourceCode rule
      sheet.deleteRule(r);
      sheet.insertRule('pre.sourceCode{' + declarations + '}', r);
    }
  }
})();
</script>
<link rel="stylesheet" href="custom.css" type="text/css" />
<style type = "text/css">
.main-container {
max-width: 940px;
margin-left: auto;
margin-right: auto;
}
img {
max-width:100%;
}
.tabbed-pane {
padding-top: 12px;
}
.html-widget {
margin-bottom: 20px;
}
button.code-folding-btn:focus {
outline: none;
}
summary {
display: list-item;
}
pre code {
padding: 0;
}
</style>
<!-- tabsets -->
<style type="text/css">
.tabset-dropdown > .nav-tabs {
display: inline-table;
max-height: 500px;
min-height: 44px;
overflow-y: auto;
border: 1px solid #ddd;
border-radius: 4px;
}
.tabset-dropdown > .nav-tabs > li.active:before {
content: "";
font-family: 'Glyphicons Halflings';
display: inline-block;
padding: 10px;
border-right: 1px solid #ddd;
}
.tabset-dropdown > .nav-tabs.nav-tabs-open > li.active:before {
content: "";
border: none;
}
.tabset-dropdown > .nav-tabs.nav-tabs-open:before {
content: "";
font-family: 'Glyphicons Halflings';
display: inline-block;
padding: 10px;
border-right: 1px solid #ddd;
}
.tabset-dropdown > .nav-tabs > li.active {
display: block;
}
.tabset-dropdown > .nav-tabs > li > a,
.tabset-dropdown > .nav-tabs > li > a:focus,
.tabset-dropdown > .nav-tabs > li > a:hover {
border: none;
display: inline-block;
border-radius: 4px;
background-color: transparent;
}
.tabset-dropdown > .nav-tabs.nav-tabs-open > li {
display: block;
float: none;
}
.tabset-dropdown > .nav-tabs > li {
display: none;
}
</style>
<!-- code folding -->
<style type="text/css">
#TOC {
margin: 25px 0px 20px 0px;
}
@media (max-width: 768px) {
#TOC {
position: relative;
width: 100%;
}
}
@media print {
.toc-content {
/* see https://github.com/w3c/csswg-drafts/issues/4434 */
float: right;
}
}
.toc-content {
padding-left: 30px;
padding-right: 40px;
}
div.main-container {
max-width: 1200px;
}
div.tocify {
width: 20%;
max-width: 260px;
max-height: 85%;
}
@media (min-width: 768px) and (max-width: 991px) {
div.tocify {
width: 25%;
}
}
@media (max-width: 767px) {
div.tocify {
width: 100%;
max-width: none;
}
}
.tocify ul, .tocify li {
line-height: 20px;
}
.tocify-subheader .tocify-item {
font-size: 0.90em;
}
.tocify .list-group-item {
border-radius: 0px;
}
</style>
</head>
<body>
<div class="container-fluid main-container">
<!-- setup 3col/9col grid for toc_float and main content -->
<div class="row">
<div class="col-xs-12 col-sm-4 col-md-3">
<div id="TOC" class="tocify">
</div>
</div>
<div class="toc-content col-xs-12 col-sm-8 col-md-9">
<div id="header">
<h1 class="title toc-ignore">Web Scraping</h1>
<h3 class="subtitle">Collecting Data from the Web using R</h3>
</div>
<hr />
<p>After talking quite a bit about data formats and data processing in the past weeks, today’s session is dedicated to data collection - from the web!</p>
<p>What we will cover:</p>
<ul>
<li>basic web technologies (html, xpath)</li>
<li>scraping static webpages</li>
<li>scraping multiple static webpages</li>
<li>building up and maintaining your own original sets of web-based data</li>
</ul>
<p>What we will not cover (today):</p>
<ul>
<li>scraping dynamic webpages</li>
<li>APIs</li>
<li>extensive data cleaning using regular expressions</li>
</ul>
<div id="why-webscrape-with-r" class="section level1">
<h1>Why webscrape with R? 🌐</h1>
<p>Webscraping broadly includes a) getting (unstructured) data from the web and b) bringing it into shape (e.g. cleaning it, getting it into tabular format).</p>
<p>Why webscrape? While some influential people consider “Data Scientist” 👩💻 as the <a href="https://hbr.org/2012/10/data-scientist-the-sexiest-job-of-the-21st-century">sexiest job</a> of the 21st century (congratulations!), one of the sexiest just-emerging academic disciplines (my influential view) is Computational Social Science (CSS). Why so?</p>
<ul>
<li>data abundance online</li>
<li>social interaction online</li>
<li>services track social behavior</li>
</ul>
<p>BUT online data are usually meant for display, not (clean) download!</p>
<p>But getting access to online data would also be incredibly interesting when you think of very pragmatic things like financial resources, time resources, reproducibility and updateability…</p>
<p>Luckily, with <code>R</code> we can automate the whole pipeline of downloading, parsing and post-processing to make our projects easily reproducible.</p>
<p>In general, remember, the basic <strong>workflow for scraping static webpages</strong> is the following.</p>
<p><img src="pics/workflow.png" width="90%" style="display: block; margin: auto;" /></p>
</div>
<div id="scraping-static-websites-with-rvest" class="section level1">
<h1>Scraping static websites with <code>rvest</code> 🚜</h1>
<p>Who doesn’t love Wikipedia? Let’s use this as our first, straightforward test case.</p>
<p><strong>Step 1.</strong> Load the packages <code>rvest</code> and <code>stringr</code>.</p>
<div class="sourceCode" id="cb1"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb1-1"><a href="#cb1-1" aria-hidden="true" tabindex="-1"></a><span class="fu">library</span>(rvest)</span>
<span id="cb1-2"><a href="#cb1-2" aria-hidden="true" tabindex="-1"></a><span class="fu">library</span>(stringr)</span></code></pre></div>
<p><strong>Step 2.</strong> Parse the page source.</p>
<div class="sourceCode" id="cb2"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb2-1"><a href="#cb2-1" aria-hidden="true" tabindex="-1"></a>parsed_url <span class="ot"><-</span> <span class="fu">read_html</span>(<span class="st">"https://en.wikipedia.org/wiki/Cologne"</span>)</span></code></pre></div>
<p><strong>Step 3.</strong> Extract information.</p>
<div class="sourceCode" id="cb3"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb3-1"><a href="#cb3-1" aria-hidden="true" tabindex="-1"></a>parsed_nodes <span class="ot"><-</span> <span class="fu">html_nodes</span>(parsed_url, </span>
<span id="cb3-2"><a href="#cb3-2" aria-hidden="true" tabindex="-1"></a> <span class="at">xpath =</span> <span class="st">'//p[(((count(preceding-sibling::*) + 1) = 157) and parent::*)]'</span>)</span>
<span id="cb3-3"><a href="#cb3-3" aria-hidden="true" tabindex="-1"></a>carnival <span class="ot"><-</span> <span class="fu">html_text</span>(parsed_nodes)</span>
<span id="cb3-4"><a href="#cb3-4" aria-hidden="true" tabindex="-1"></a>carnival</span></code></pre></div>
<pre><code>## [1] "Cologne is also famous for Eau de Cologne (German: Kölnisch Wasser; lit: \"Water of Cologne\"), a perfume created by Italian expatriate Johann Maria Farina at the beginning of the 18th century. During the 18th century, this perfume became increasingly popular, was exported all over Europe by the Farina family and Farina became a household name for Eau de Cologne. In 1803 Wilhelm Mülhens entered into a contract with an unrelated person from Italy named Carlo Francesco Farina who granted him the right to use his family name and Mühlens opened a small factory at Cologne's Glockengasse. In later years, and after various court battles, his grandson Ferdinand Mülhens was forced to abandon the name Farina for the company and their product. He decided to use the house number given to the factory at Glockengasse during the French occupation in the early 19th century, 4711. Today, original Eau de Cologne is still produced in Cologne by both the Farina family, currently in the eighth generation, and by Mäurer & Wirtz who bought the 4711 brand in 2006.\n"</code></pre>
<p>How do I know THIS <code>xpath = '//p[(((count(preceding-sibling::*) + 1) = 157) and parent::*)]'</code> ?</p>
<p>There are two options:</p>
<p><strong>Option 1.</strong> On your page of interest, go to a table that you’d like to scrape. Our favorite browser for webscraping is Google Chrome but others work as well. On Chrome, you go to View > Developer > Inspect Elements. If you hover over the html code on the right, you should see boxes of different colors framing different elements of the page. Once the part of the page you would like to scrape is selected, right click on the html code and Copy > Copy Xpath. That’s it.</p>
<p><img src="pics/inspect.png" width="90%" style="display: block; margin: auto;" /></p>
<p><strong>Option 2.</strong> You download the <a href="https://chrome.google.com/webstore/detail/selectorgadget/mhjhnkcfbdhnjickkkdbjoemdmbfginb?hl=de">Chrome Extension</a> <code>SelectorGadget</code> and activate it while browsing the page you’d like to scrape from. You will see a selection box moving with your cursor. You select an element by clicking on it. It turns green - and so does all other content that would be selected with the current XPath.</p>
<p><img src="pics/selector.png" width="90%" style="display: block; margin: auto;" /></p>
<p>You can now de-select everything that is irrelevant to you by clicking it again (it then turns red). Final step, then just click the XPath button at the bottom of the browser window. Make sure to use single quotation marks with this XPath!</p>
<p><img src="pics/selector2.png" width="90%" style="display: block; margin: auto;" /></p>
<p>Let’s repeat step 2 and 3 with a more data-sciency example. 🕺</p>
<p><strong>Step 2.</strong> Parse the page source.</p>
<div class="sourceCode" id="cb5"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb5-1"><a href="#cb5-1" aria-hidden="true" tabindex="-1"></a>hot100page <span class="ot"><-</span> <span class="st">"https://www.billboard.com/charts/hot-100"</span></span>
<span id="cb5-2"><a href="#cb5-2" aria-hidden="true" tabindex="-1"></a>hot100 <span class="ot"><-</span> <span class="fu">read_html</span>(hot100page)</span></code></pre></div>
<p><strong>Step 3.</strong> Extract information. When going through different levels of html, you can also use tidyverse logic.</p>
<pre><code>body_nodes <- hot100 %>%
html_node("body") %>%
html_children()
body_nodes %>%
html_children()</code></pre>
<p>play with that yourself if you like…</p>
<p>Now let’s get more specific - <code>xml_find_all()</code> takes xpath syntax!</p>
<div class="sourceCode" id="cb7"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb7-1"><a href="#cb7-1" aria-hidden="true" tabindex="-1"></a>rank <span class="ot"><-</span> hot100 <span class="sc">%>%</span> </span>
<span id="cb7-2"><a href="#cb7-2" aria-hidden="true" tabindex="-1"></a> rvest<span class="sc">::</span><span class="fu">html_nodes</span>(<span class="st">'body'</span>) <span class="sc">%>%</span> </span>
<span id="cb7-3"><a href="#cb7-3" aria-hidden="true" tabindex="-1"></a> xml2<span class="sc">::</span><span class="fu">xml_find_all</span>(<span class="st">"//span[contains(@class, 'chart-element__rank__number')]"</span>) <span class="sc">%>%</span> </span>
<span id="cb7-4"><a href="#cb7-4" aria-hidden="true" tabindex="-1"></a> rvest<span class="sc">::</span><span class="fu">html_text</span>()</span>
<span id="cb7-5"><a href="#cb7-5" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb7-6"><a href="#cb7-6" aria-hidden="true" tabindex="-1"></a>artist <span class="ot"><-</span> hot100 <span class="sc">%>%</span> </span>
<span id="cb7-7"><a href="#cb7-7" aria-hidden="true" tabindex="-1"></a> rvest<span class="sc">::</span><span class="fu">html_nodes</span>(<span class="st">'body'</span>) <span class="sc">%>%</span> </span>
<span id="cb7-8"><a href="#cb7-8" aria-hidden="true" tabindex="-1"></a> xml2<span class="sc">::</span><span class="fu">xml_find_all</span>(<span class="st">"//span[contains(@class, 'chart-element__information__artist')]"</span>) <span class="sc">%>%</span> </span>
<span id="cb7-9"><a href="#cb7-9" aria-hidden="true" tabindex="-1"></a> rvest<span class="sc">::</span><span class="fu">html_text</span>()</span>
<span id="cb7-10"><a href="#cb7-10" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb7-11"><a href="#cb7-11" aria-hidden="true" tabindex="-1"></a>title <span class="ot"><-</span> hot100 <span class="sc">%>%</span> </span>
<span id="cb7-12"><a href="#cb7-12" aria-hidden="true" tabindex="-1"></a> rvest<span class="sc">::</span><span class="fu">html_nodes</span>(<span class="st">'body'</span>) <span class="sc">%>%</span> </span>
<span id="cb7-13"><a href="#cb7-13" aria-hidden="true" tabindex="-1"></a> xml2<span class="sc">::</span><span class="fu">xml_find_all</span>(<span class="st">"//span[contains(@class, 'chart-element__information__song')]"</span>) <span class="sc">%>%</span> </span>
<span id="cb7-14"><a href="#cb7-14" aria-hidden="true" tabindex="-1"></a> rvest<span class="sc">::</span><span class="fu">html_text</span>()</span></code></pre></div>
<p><strong>Step 4.</strong> Usually, step 4 is to clean extracted data. In this case, it actually is very clean already. However, if you need help with data cleaning using regular expressions (me, always.), visit the strings chapter of <a href="https://r4ds.had.co.nz/strings.html">R for Data Science</a> or simply this <a href="https://evoldyn.gitlab.io/evomics-2018/ref-sheets/R_strings.pdf">cheat sheet</a>.</p>
<p><strong>Step 5.</strong> Put everything into a data frame. 🎵</p>
<div class="sourceCode" id="cb8"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb8-1"><a href="#cb8-1" aria-hidden="true" tabindex="-1"></a>chart_df <span class="ot"><-</span> <span class="fu">data.frame</span>(rank, artist, title)</span>
<span id="cb8-2"><a href="#cb8-2" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb8-3"><a href="#cb8-3" aria-hidden="true" tabindex="-1"></a>knitr<span class="sc">::</span><span class="fu">kable</span>(chart_df <span class="sc">%>%</span> <span class="fu">head</span>(<span class="dv">10</span>))</span></code></pre></div>
<table>
<thead>
<tr class="header">
<th align="left">rank</th>
<th align="left">artist</th>
<th align="left">title</th>
</tr>
</thead>
<tbody>
<tr class="odd">
<td align="left">1</td>
<td align="left">Lil Nas X & Jack Harlow</td>
<td align="left">Industry Baby</td>
</tr>
<tr class="even">
<td align="left">2</td>
<td align="left">The Kid LAROI & Justin Bieber</td>
<td align="left">Stay</td>
</tr>
<tr class="odd">
<td align="left">3</td>
<td align="left">Walker Hayes</td>
<td align="left">Fancy Like</td>
</tr>
<tr class="even">
<td align="left">4</td>
<td align="left">Ed Sheeran</td>
<td align="left">Bad Habits</td>
</tr>
<tr class="odd">
<td align="left">5</td>
<td align="left">Drake Featuring Future & Young Thug</td>
<td align="left">Way 2 Sexy</td>
</tr>
<tr class="even">
<td align="left">6</td>
<td align="left">Olivia Rodrigo</td>
<td align="left">Good 4 U</td>
</tr>
<tr class="odd">
<td align="left">7</td>
<td align="left">Doja Cat Featuring SZA</td>
<td align="left">Kiss Me More</td>
</tr>
<tr class="even">
<td align="left">8</td>
<td align="left">Dua Lipa</td>
<td align="left">Levitating</td>
</tr>
<tr class="odd">
<td align="left">9</td>
<td align="left">Wizkid Featuring Justin Bieber & Tems</td>
<td align="left">Essence</td>
</tr>
<tr class="even">
<td align="left">10</td>
<td align="left">Ed Sheeran</td>
<td align="left">Shivers</td>
</tr>
</tbody>
</table>
<div id="scraping-html-tables" class="section level2">
<h2>Scraping HTML tables 🚀</h2>
<p>We have just been scraping an html table… but there is even an easier way to do this in <code>rvest</code>!</p>
<div class="sourceCode" id="cb9"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb9-1"><a href="#cb9-1" aria-hidden="true" tabindex="-1"></a>url_p <span class="ot"><-</span> <span class="fu">read_html</span>(<span class="st">"https://en.wikipedia.org/wiki/List_of_human_spaceflights"</span>)</span>
<span id="cb9-2"><a href="#cb9-2" aria-hidden="true" tabindex="-1"></a>tables <span class="ot"><-</span> <span class="fu">html_table</span>(url_p, <span class="at">header =</span> <span class="cn">TRUE</span>, <span class="at">fill =</span> <span class="cn">TRUE</span>)</span>
<span id="cb9-3"><a href="#cb9-3" aria-hidden="true" tabindex="-1"></a>spaceflights <span class="ot"><-</span> tables[[<span class="dv">1</span>]]</span>
<span id="cb9-4"><a href="#cb9-4" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb9-5"><a href="#cb9-5" aria-hidden="true" tabindex="-1"></a>knitr<span class="sc">::</span><span class="fu">kable</span>(spaceflights[,<span class="dv">1</span><span class="sc">:</span><span class="dv">7</span>] <span class="sc">%>%</span> <span class="fu">head</span>(<span class="dv">10</span>))</span></code></pre></div>
<table>
<colgroup>
<col width="8%" />
<col width="18%" />
<col width="18%" />
<col width="18%" />
<col width="12%" />
<col width="12%" />
<col width="12%" />
</colgroup>
<thead>
<tr class="header">
<th align="left">Entity</th>
<th align="left">Soviet Union</th>
<th align="left">Soviet Union</th>
<th align="left">Soviet Union</th>
<th align="left">United States</th>
<th align="left">United States</th>
<th align="left">United States</th>
</tr>
</thead>
<tbody>
<tr class="odd">
<td align="left">Agency</td>
<td align="left">Soviet space program</td>
<td align="left">Soviet space program</td>
<td align="left">Soviet space program</td>
<td align="left">NASA</td>
<td align="left">NASA</td>
<td align="left">NASA</td>
</tr>
<tr class="even">
<td align="left">Decades</td>
<td align="left">Program</td>
<td align="left">Dates[a]</td>
<td align="left">No.[b]</td>
<td align="left">Program[c]</td>
<td align="left">Dates</td>
<td align="left">No.[d]</td>
</tr>
<tr class="odd">
<td align="left">1961–1970</td>
<td align="left">Vostok</td>
<td align="left">1961–1963</td>
<td align="left">6</td>
<td align="left">Mercury</td>
<td align="left">1961–1963</td>
<td align="left">6</td>
</tr>
<tr class="even">
<td align="left">1961–1970</td>
<td align="left">Voskhod</td>
<td align="left">1964–1965</td>
<td align="left">2</td>
<td align="left">X-15</td>
<td align="left">1962–1968</td>
<td align="left">13</td>
</tr>
<tr class="odd">
<td align="left">1961–1970</td>
<td align="left">Soyuz</td>
<td align="left">1967–1991</td>
<td align="left">63</td>
<td align="left">Gemini</td>
<td align="left">1965–1966</td>
<td align="left">10</td>
</tr>
<tr class="even">
<td align="left">1961–1970</td>
<td align="left">Soyuz</td>
<td align="left">1967–1991</td>
<td align="left">63</td>
<td align="left">Apollo</td>
<td align="left">1968–1972</td>
<td align="left">11</td>
</tr>
<tr class="odd">
<td align="left">1971–1980</td>
<td align="left">Soyuz</td>
<td align="left">1967–1991</td>
<td align="left">63</td>
<td align="left">Apollo</td>
<td align="left">1968–1972</td>
<td align="left">11</td>
</tr>
<tr class="even">
<td align="left">1971–1980</td>
<td align="left">Soyuz</td>
<td align="left">1967–1991</td>
<td align="left">63</td>
<td align="left">Skylab</td>
<td align="left">1973–1974</td>
<td align="left">3</td>
</tr>
<tr class="odd">
<td align="left">1971–1980</td>
<td align="left">Soyuz</td>
<td align="left">1967–1991</td>
<td align="left">63</td>
<td align="left">Apollo–Soyuz</td>
<td align="left">1975</td>
<td align="left">1</td>
</tr>
<tr class="even">
<td align="left">1981–1990</td>
<td align="left">Soyuz</td>
<td align="left">1967–1991</td>
<td align="left">63</td>
<td align="left">Space Shuttle</td>
<td align="left">1981–2011</td>
<td align="left">135</td>
</tr>
</tbody>
</table>
<p>Another R workaround for more complex tables is the package <code>htmltab</code> that offers some more flexibility.</p>
</div>
<div id="its-your-turn" class="section level2">
<h2>It’s your turn! 🛠</h2>
<p>Think about web data hosted on a <strong>static webpage</strong> that you might be interested in scraping! Comparably easy to scrape webpages are Wikipedia pages or pages that contain data in table format (as we’ve seen already).</p>
<p>Before you start - it might be worth a thought to get Chrome + Selector Gadget. This will make your future as a web scraper a bit easier.</p>
<p><strong>Option A</strong></p>
<ol style="list-style-type: decimal">
<li><p>Inspect the page and try to find the elements you are interested in in the html code.</p></li>
<li><p>Parse the url.</p></li>
<li><p>Extract the XPath for the element you would like to scrape and parse the nodes.</p></li>
<li><p>Inspect the html output you get. What would be the next steps now? Can you already put your output into a dataframe?</p></li>
</ol>
<p><strong>Option B</strong></p>
<ol style="list-style-type: decimal">
<li><p>Find an html table that you’d like to scrape.</p></li>
<li><p>Scrape it.</p></li>
</ol>
</div>
</div>
<div id="scraping-multiple-pages" class="section level1">
<h1>Scraping multiple pages 🤖</h1>
<p>Whenever you want to really understand what’s going on within the functions of a new R package, it is very likely that there is a relevant article published in the <a href="https://www.jstatsoft.org/index">Journal of Statistical Software</a>. Let’s say you are interested in how the journal was doing over the past years.</p>
<p><strong>Step 1.</strong> Inspect the source. Basically, follow steps to extract the Xpath information.</p>
<div class="sourceCode" id="cb10"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb10-1"><a href="#cb10-1" aria-hidden="true" tabindex="-1"></a><span class="fu">browseURL</span>(<span class="st">"http://www.jstatsoft.org/issue/archive"</span>)</span></code></pre></div>
<p><strong>Step 2</strong> Develop a scraping strategy. We need a set of URLs leading to all sources. Inspect the URLs of different sources and find the pattern. Then, construct the list of URLs from scratch.</p>
<div class="sourceCode" id="cb11"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb11-1"><a href="#cb11-1" aria-hidden="true" tabindex="-1"></a>baseurl <span class="ot"><-</span> <span class="st">"http://www.jstatsoft.org/article/view/v"</span></span>
<span id="cb11-2"><a href="#cb11-2" aria-hidden="true" tabindex="-1"></a>volurl <span class="ot"><-</span> <span class="fu">paste0</span>(<span class="st">"0"</span>, <span class="fu">seq</span>(<span class="dv">1</span>, <span class="dv">99</span>, <span class="dv">1</span>))</span>
<span id="cb11-3"><a href="#cb11-3" aria-hidden="true" tabindex="-1"></a>volurl[<span class="dv">1</span><span class="sc">:</span><span class="dv">9</span>] <span class="ot"><-</span> <span class="fu">paste0</span>(<span class="st">"00"</span>, <span class="fu">seq</span>(<span class="dv">1</span>, <span class="dv">9</span>, <span class="dv">1</span>))</span>
<span id="cb11-4"><a href="#cb11-4" aria-hidden="true" tabindex="-1"></a>brurl <span class="ot"><-</span> <span class="fu">paste0</span>(<span class="st">"0"</span>, <span class="fu">seq</span>(<span class="dv">1</span>, <span class="dv">9</span>, <span class="dv">1</span>))</span>
<span id="cb11-5"><a href="#cb11-5" aria-hidden="true" tabindex="-1"></a>urls_list <span class="ot"><-</span> <span class="fu">paste0</span>(baseurl, volurl)</span>
<span id="cb11-6"><a href="#cb11-6" aria-hidden="true" tabindex="-1"></a>urls_list <span class="ot"><-</span> <span class="fu">paste0</span>(<span class="fu">rep</span>(urls_list, <span class="at">each =</span> <span class="dv">9</span>), <span class="st">"i"</span>, brurl)</span>
<span id="cb11-7"><a href="#cb11-7" aria-hidden="true" tabindex="-1"></a>urls_list[<span class="dv">1</span><span class="sc">:</span><span class="dv">5</span>]</span></code></pre></div>
<pre><code>## [1] "http://www.jstatsoft.org/article/view/v001i01"
## [2] "http://www.jstatsoft.org/article/view/v001i02"
## [3] "http://www.jstatsoft.org/article/view/v001i03"
## [4] "http://www.jstatsoft.org/article/view/v001i04"
## [5] "http://www.jstatsoft.org/article/view/v001i05"</code></pre>
<p><strong>Step 3</strong> Think about where you want your scraped material to be stored and create a directory.</p>
<div class="sourceCode" id="cb13"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb13-1"><a href="#cb13-1" aria-hidden="true" tabindex="-1"></a>tempwd <span class="ot"><-</span> (<span class="st">"data/jstatsoftStats"</span>)</span>
<span id="cb13-2"><a href="#cb13-2" aria-hidden="true" tabindex="-1"></a><span class="fu">dir.create</span>(tempwd)</span>
<span id="cb13-3"><a href="#cb13-3" aria-hidden="true" tabindex="-1"></a><span class="fu">setwd</span>(tempwd)</span></code></pre></div>
<p><strong>Step 4</strong> Download the pages. Note that we did not do this step last time, when we were only scraping one page.</p>
<div class="sourceCode" id="cb14"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb14-1"><a href="#cb14-1" aria-hidden="true" tabindex="-1"></a>folder <span class="ot"><-</span> <span class="st">"html_articles/"</span></span>
<span id="cb14-2"><a href="#cb14-2" aria-hidden="true" tabindex="-1"></a><span class="fu">dir.create</span>(folder)</span>
<span id="cb14-3"><a href="#cb14-3" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb14-4"><a href="#cb14-4" aria-hidden="true" tabindex="-1"></a><span class="cf">for</span> (i <span class="cf">in</span> <span class="dv">1</span><span class="sc">:</span><span class="fu">length</span>(urls_list)) {</span>
<span id="cb14-5"><a href="#cb14-5" aria-hidden="true" tabindex="-1"></a> <span class="co"># only update, don't replace</span></span>
<span id="cb14-6"><a href="#cb14-6" aria-hidden="true" tabindex="-1"></a> <span class="cf">if</span> (<span class="sc">!</span><span class="fu">file.exists</span>(<span class="fu">paste0</span>(folder, names[i]))) { </span>
<span id="cb14-7"><a href="#cb14-7" aria-hidden="true" tabindex="-1"></a> <span class="co"># skip article when we run into an error </span></span>
<span id="cb14-8"><a href="#cb14-8" aria-hidden="true" tabindex="-1"></a> <span class="fu">tryCatch</span>( </span>
<span id="cb14-9"><a href="#cb14-9" aria-hidden="true" tabindex="-1"></a> <span class="fu">download.file</span>(urls_list[i], <span class="at">destfile =</span> <span class="fu">paste0</span>(folder, names[i])),</span>
<span id="cb14-10"><a href="#cb14-10" aria-hidden="true" tabindex="-1"></a> <span class="at">error =</span> <span class="cf">function</span>(e) e</span>
<span id="cb14-11"><a href="#cb14-11" aria-hidden="true" tabindex="-1"></a> )</span>
<span id="cb14-12"><a href="#cb14-12" aria-hidden="true" tabindex="-1"></a> <span class="co"># don't kill their server --> be polite! </span></span>
<span id="cb14-13"><a href="#cb14-13" aria-hidden="true" tabindex="-1"></a> <span class="fu">Sys.sleep</span>(<span class="fu">runif</span>(<span class="dv">1</span>, <span class="dv">0</span>, <span class="dv">1</span>)) </span>
<span id="cb14-14"><a href="#cb14-14" aria-hidden="true" tabindex="-1"></a> </span>
<span id="cb14-15"><a href="#cb14-15" aria-hidden="true" tabindex="-1"></a>} }</span></code></pre></div>
<p>While R is downloading the pages for you, you can watch it directly in the directory you defined…</p>
<p><img src="pics/html_files.png" width="70%" style="display: block; margin: auto;" /></p>
<p>Check whether it worked.</p>
<div class="sourceCode" id="cb15"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb15-1"><a href="#cb15-1" aria-hidden="true" tabindex="-1"></a>list_files <span class="ot"><-</span> <span class="fu">list.files</span>(folder, <span class="at">pattern =</span> <span class="st">"0.*"</span>)</span>
<span id="cb15-2"><a href="#cb15-2" aria-hidden="true" tabindex="-1"></a>list_files_path <span class="ot"><-</span> <span class="fu">list.files</span>(folder, <span class="at">pattern =</span> <span class="st">"0.*"</span>, <span class="at">full.names =</span> <span class="cn">TRUE</span>)</span>
<span id="cb15-3"><a href="#cb15-3" aria-hidden="true" tabindex="-1"></a><span class="fu">length</span>(list_files)</span></code></pre></div>
<p>Yay! Apparently, we scraped the html pages of 802 articles.</p>
<div id="gitignoring-files" class="section level2">
<h2>(Git)ignoring files 🙅</h2>
<p>In case your scraping project is linked to GitHub (as it will be in your assignment!), it can be useful to <strong>.gitignore</strong> the folder of downloaded files. This means that the folder can be stored in your local directory of the project but will not be synced with the remote (main) repository. Here is information on how to do this using <a href="https://carpentries-incubator.github.io/git-Rstudio-course/02-ignore/index.html">RStudio</a>. In GitHub Desktop it is very simple: you do your scraping work, the folder is created in your local repository, and before you commit and push these changes, you go to <code>Repository</code> > <code>Repository Settings</code> > <code>Ignored Files</code> and edit the .gitignore file (add the name of the new folder / files you don’t want to sync). More generally, it makes sense to exclude .Rproj files, .RData files (and other binary or large data files), draft folders and sensitive information from version control. Remember, git is built to track changes in code, not in large data files.</p>
<p><strong>Step 5</strong> Import files and parse out information. A loop is helpful here!</p>
<div class="sourceCode" id="cb16"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb16-1"><a href="#cb16-1" aria-hidden="true" tabindex="-1"></a><span class="co"># define output first</span></span>
<span id="cb16-2"><a href="#cb16-2" aria-hidden="true" tabindex="-1"></a>authors <span class="ot"><-</span> <span class="fu">character</span>()</span>
<span id="cb16-3"><a href="#cb16-3" aria-hidden="true" tabindex="-1"></a>title <span class="ot"><-</span> <span class="fu">character</span>()</span>
<span id="cb16-4"><a href="#cb16-4" aria-hidden="true" tabindex="-1"></a>datePublish <span class="ot"><-</span> <span class="fu">character</span>()</span>
<span id="cb16-5"><a href="#cb16-5" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb16-6"><a href="#cb16-6" aria-hidden="true" tabindex="-1"></a><span class="co"># then run the loop</span></span>
<span id="cb16-7"><a href="#cb16-7" aria-hidden="true" tabindex="-1"></a><span class="cf">for</span> (i <span class="cf">in</span> <span class="dv">1</span><span class="sc">:</span><span class="fu">length</span>(list_files_path)) {</span>
<span id="cb16-8"><a href="#cb16-8" aria-hidden="true" tabindex="-1"></a> html_out <span class="ot"><-</span> <span class="fu">read_html</span>(list_files_path[i])</span>
<span id="cb16-9"><a href="#cb16-9" aria-hidden="true" tabindex="-1"></a> </span>
<span id="cb16-10"><a href="#cb16-10" aria-hidden="true" tabindex="-1"></a> authors[i] <span class="ot"><-</span> <span class="fu">html_text</span>(<span class="fu">html_nodes</span>(html_out , <span class="at">xpath =</span> <span class="st">'//*[contains(concat( " ", @class, " " ), concat( " ", "authors_long", " " ))]//strong'</span>))</span>
<span id="cb16-11"><a href="#cb16-11" aria-hidden="true" tabindex="-1"></a> </span>
<span id="cb16-12"><a href="#cb16-12" aria-hidden="true" tabindex="-1"></a> title[i] <span class="ot"><-</span> <span class="fu">html_text</span>(<span class="fu">html_nodes</span>(html_out , <span class="at">xpath =</span> <span class="st">'//*[contains(concat( " ", @class, " " ), concat( " ", "page-header", " " ))]'</span>))</span>
<span id="cb16-13"><a href="#cb16-13" aria-hidden="true" tabindex="-1"></a> </span>
<span id="cb16-14"><a href="#cb16-14" aria-hidden="true" tabindex="-1"></a> datePublish[i] <span class="ot"><-</span> <span class="fu">html_text</span>(<span class="fu">html_nodes</span>(html_out , <span class="at">xpath =</span> <span class="st">'//*[contains(concat( " ", @class, " " ), concat( " ", "article-meta", " " ))]//*[contains(concat( " ", @class, " " ), concat( " ", "row", " " )) and (((count(preceding-sibling::*) + 1) = 2) and parent::*)]//*[contains(concat( " ", @class, " " ), concat( " ", "col-sm-8", " " ))]'</span>))</span>
<span id="cb16-15"><a href="#cb16-15" aria-hidden="true" tabindex="-1"></a>}</span>
<span id="cb16-16"><a href="#cb16-16" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb16-17"><a href="#cb16-17" aria-hidden="true" tabindex="-1"></a><span class="co"># inspect data</span></span>
<span id="cb16-18"><a href="#cb16-18" aria-hidden="true" tabindex="-1"></a>authors[<span class="dv">1</span><span class="sc">:</span><span class="dv">3</span>]</span>
<span id="cb16-19"><a href="#cb16-19" aria-hidden="true" tabindex="-1"></a>title[<span class="dv">1</span><span class="sc">:</span><span class="dv">2</span>]</span>
<span id="cb16-20"><a href="#cb16-20" aria-hidden="true" tabindex="-1"></a>datePublish[<span class="dv">1</span><span class="sc">:</span><span class="dv">3</span>]</span>
<span id="cb16-21"><a href="#cb16-21" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb16-22"><a href="#cb16-22" aria-hidden="true" tabindex="-1"></a><span class="co"># create a data frame</span></span>
<span id="cb16-23"><a href="#cb16-23" aria-hidden="true" tabindex="-1"></a>dat <span class="ot"><-</span> <span class="fu">data.frame</span>(<span class="at">authors =</span> authors, <span class="at">title =</span> title, <span class="at">datePublish =</span> datePublish)</span>
<span id="cb16-24"><a href="#cb16-24" aria-hidden="true" tabindex="-1"></a><span class="fu">dim</span>(dat)</span></code></pre></div>
<p><strong>Step 6</strong> Clean data…</p>
<p>You see, scraping data from multiple pages is no problem in R. Most of the brain work often goes into developing a scraping strategy and tidying the data, not into the actual downloading/scraping part.</p>
<p>Scraping is also possible in much more complex scenarios! Watch out for workshop presentations on</p>
<ul>
<li>Dynamic webscraping with RSelenium</li>
<li>Web APIs</li>
<li>Regular expressions with stringr</li>
<li>Data cleaning with janitor</li>
</ul>
<p>and many more 🤩</p>
</div>
</div>
<div id="good-scraping-practice" class="section level1">
<h1>Good scraping practice</h1>
<p>There is a set of general rules to the game:</p>
<ol style="list-style-type: decimal">
<li>You take all the responsibility for your web scraping work.</li>
<li>Think about the nature of the data. Does it entail sensitive information? Do not collect personal data without explicit permission.</li>
<li>Take all copyrights of a country’s jurisdiction into account. If you publish data, do not commit copyright fraud.</li>
<li>If possible, stay identifiable. Stay polite. Stay friendly. Obey the scraping etiquette.</li>
<li>If in doubt, ask the author/creator/provider of data for permission—if your interest is entirely scientific, chances aren’t bad that you get data.</li>
</ol>
<div id="how-do-i-know-the-scraping-etiquette-of-a-site" class="section level2">
<h2>How do I know the scraping etiquette of a site? 🤝</h2>
<p>Robot exclusion standards (<code>robots.txt</code>) are informal protocols to prohibit web robots from crawling content. They list which documents robots are allowed to crawl and which they are not. It is not a technical barrier but an ask for compliance. They are located in the root directory of a website (e.g. <code>https://de.wikipedia.org/robots.txt</code>).</p>
<p>For example, let’s have a look at Wikipedia’s <a href="https://de.wikipedia.org/robots.txt">robots.txt</a> file, which is very human readable.</p>
<p>General rules are listed under <code>User-agent: *</code>, which is most interesting for R-based crawlers. A universal ban for a directory looks like this: <code>Disallow: /</code>. Sometimes crawl delays are suggested (in seconds): <code>Crawl-delay: 2</code>.</p>
</div>
<div id="what-is-polite-scraping" class="section level2">
<h2>What is “polite” scraping? 🐌</h2>
<p>First thing would be not to scrape at a speed that causes trouble for their server. Therefore, whenever you loop over a list of URLs, add a <code>Sys.sleep(runif(1, 1, 2))</code> at the end of the loop.</p>
<p>And generally, it is better practice to store data on your local drive first (<code>download.file()</code>), then parse (<code>read_html()</code>).</p>
<p><strong>A footnote on sustainability.</strong> In the digital context, we often forget that our actions do have physical consequences. For example, training AI, using blockchain and just streaming videos do cause considerable amounts of CO2 emissions. So does bombarding a server with requests - certainly to a much lesser extent than the examples before - but please consider whether you have to re-run a large scraping project 100 times in order to debug things.</p>
<p>Furthermore, downloading massive amounts of data may arouse attention from server administrators. Assuming that you’ve got nothing to hide, you should stay identifiable beyond your IP address.</p>
</div>
<div id="how-can-i-stay-identifyable" class="section level2">
<h2>How can I stay identifiable? 👤</h2>
<p>Option 1: Get in touch with website administrators / data owners.</p>
<p>Option 2: Use HTTP header fields From and User-Agent to provide information about yourself.</p>
<pre><code>url <- "http://a-totally-random-website.com"
rvest_session <- session(url,
add_headers(`From` = "my@email.com",
`User-Agent` = R.Version()$version.string))
scraped_text <- rvest_session %>%
html_elements(xpath = "p//a") %>%
html_text()
</code></pre>
<p>rvest’s <code>session()</code> creates a session object that responds to HTTP and HTML methods. Here, we provide our email address and the current R version as User-Agent information. This will pop up in the server logs: The webpage administrator has the chance to easily get in touch with you.</p>
</div>
</div>
<div id="sources" class="section level1">
<h1>Sources</h1>
<p>This tutorial drew heavily on Simon Munzert’s book <a href="http://r-datacollection.com/">Automated Data Collection with R</a> and related <a href="https://github.com/simonmunzert/web-scraping-with-r-extended-edition">course materials</a>. We also used an <a href="https://towardsdatascience.com/tidy-web-scraping-in-r-tutorial-and-resources-ac9f72b4fe47">example</a> from Keith McNulty’s blog post on tidy web scraping in R.</p>
</div>
<hr />
<p style="text-align: center;">A work by Lisa Oswald &amp; Tom Arend</p>
<p style="text-align: center;"><span style="color: #808080;"><em>Prepared for Intro to Data Science, taught by Simon Munzert</em></span></p>
<p style="text-align: center;"><span style="color: #808080;"><em><a href="https://www.hertie-school.org/en/">Hertie School, Berlin</a></em></span></p>
<!-- Add icon library -->
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/5.15.4/css/all.min.css" >
<!-- Add font awesome icons -->
<p style="text-align: center;">
<a href="https://github.com/intro-to-data-science-21" aria-label="Course GitHub organization"><i class="fab fa-github" aria-hidden="true"></i></a>
</p>
</div>
</div>
</div>
<script>
// add bootstrap table styles to pandoc tables
function bootstrapStylePandocTables() {
$('tr.odd').parent('tbody').parent('table').addClass('table table-condensed');
}
$(document).ready(function () {
bootstrapStylePandocTables();
});
</script>
<!-- tabsets -->
<script>
$(document).ready(function () {
window.buildTabsets("TOC");
});
$(document).ready(function () {
$('.tabset-dropdown > .nav-tabs > li').click(function () {
$(this).parent().toggleClass('nav-tabs-open');
});
});
</script>
<!-- code folding -->
<script>
// Build the floating table of contents with the jQuery "tocify" plugin.
$(document).ready(function () {
  // temporarily add toc-ignore selector to headers for the consistency with Pandoc
  $('.unlisted.unnumbered').addClass('toc-ignore')
  // move toc-ignore selectors from section div to header
  $('div.section.toc-ignore')
    .removeClass('toc-ignore')
    .children('h1,h2,h3,h4,h5').addClass('toc-ignore');
  // establish options
  var options = {
    selectors: "h1,h2,h3",        // heading levels that appear in the TOC
    theme: "bootstrap3",
    context: '.toc-content',      // only scan headings inside the content pane
    // Derive anchor ids from heading text: strip punctuation that is invalid
    // or awkward in URL fragments, then replace whitespace with underscores.
    hashGenerator: function (text) {
      return text.replace(/[.\\/?&!#<>]/g, '').replace(/\s/g, '_');
    },
    ignoreSelector: ".toc-ignore", // headings flagged above are excluded
    scrollTo: 0
  };
  options.showAndHide = true;
  options.smoothScroll = true;
  // tocify
  var toc = $("#TOC").tocify(options).data("toc-tocify");
});
</script>
<!-- dynamically load mathjax for compatibility with self-contained -->
<script>
(function () {
var script = document.createElement("script");
script.type = "text/javascript";
script.src = "https://mathjax.rstudio.com/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML";
document.getElementsByTagName("head")[0].appendChild(script);
})();
</script>
</body>
</html>