% (removed: non-LaTeX web-page residue — GitHub repository banner and bare line numbers — which preceded \documentclass and would break compilation)
%% LyX 2.3.6 created this file. For more info, see http://www.lyx.org/.
%% Do not edit unless you really know what you are doing.
\documentclass[10pt,oneside,american,svgnames]{scrbook}
\usepackage{tgheros}
\renewcommand{\ttdefault}{cmtl}
\renewcommand{\familydefault}{\sfdefault}
\usepackage[T1]{fontenc}
\usepackage[utf8]{inputenc}
\usepackage[letterpaper]{geometry}
\geometry{verbose,tmargin=2cm,bmargin=2cm,lmargin=2cm,rmargin=3cm}
\setcounter{secnumdepth}{3}
\setcounter{tocdepth}{3}
\setlength{\parskip}{\medskipamount}
\setlength{\parindent}{0pt}
\synctex=1
\usepackage{babel}
\usepackage{varioref}
\usepackage{multicol}
\usepackage{setspace}
\usepackage[numbers]{natbib}
\onehalfspacing
\usepackage[unicode=true,
bookmarks=true,bookmarksnumbered=true,bookmarksopen=true,bookmarksopenlevel=2,
breaklinks=true,pdfborder={0 0 0},pdfborderstyle={},backref=page,colorlinks=false]
{hyperref}
\hypersetup{pdftitle={Data Management Plan - CovViz}}
\makeatletter
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% User specified LaTeX commands.
% some macros for easy typing often used proper names
\newcommand\covviz{\textsc{CovViz}\xspace}
% Setup von Links
\usepackage{xcolor}
\definecolor{darkblue}{rgb}{0,0,.5}
\newcommand\allcolors{DarkGreen}
\hypersetup{pdftex=true, colorlinks=true, breaklinks=true, allcolors=\allcolors, urlcolor=darkblue, citecolor=gray}
\sloppy{} %%% NOTE: use '\sloppy{}' to ease correct URL line breaking
% flexible spaces at macro ends
\usepackage{xspace}
% for scrbook (KOMA-Buch) complaining about \sl command
\DeclareOldFontCommand{\sl}{\normalfont\slshape}{\@nomath\sl}
% recommendation to use for scrbook with hyperref
\usepackage{bookmark}
% toc indents
\usepackage{tocloft}
%%\cftsetindents{<entry>}{<indent>}{<numwidth>}
\cftsetindents{section}{0.5em}{2em}
\cftsetindents{subsection}{1.5em}{2.5em}
\cftsetindents{subsubsection}{3em}{3.5em}
\cftsetindents{paragraph}{4em}{4.5em}
% somehow following manual titleformat settings mutated to a hard requirement to let pdflatex generate our document
\usepackage{titlesec}
\titleclass{\chapter}{straight}
\titleformat{\chapter}{\LARGE\bfseries}{\thechapter}{1em}{}
\titlespacing*{\chapter}{0pt}{2em}{1em}
\titleformat{\section}{\Large\bfseries}{\thesection}{1em}{}
\titleformat{\subsection}{\large\bfseries}{\thesubsection}{1em}{}
\titleformat{\subsubsection}{\large\bfseries}{\thesubsubsection}{1em}{}
\titleformat{\paragraph}{\bfseries}{\theparagraph}{0.5em}{}
% twocolumn formating
\usepackage{multicol}
\setlength{\columnsep}{0.5cm}
\setlength{\columnseprule}{0.2pt}
% Kopf- und Fusszeilen
\usepackage{lastpage} % 'Seite x von y' ermoeglichen
\usepackage{fancyhdr} % freier konfigurierbare Kopf- und Fusszeilen
%\thispagestyle{empty} % Seitenstil (weitgehend) leeren
\pagestyle{fancy} % Seitenstil 'fancy'
\renewcommand{\headrulewidth}{0.0pt}
% "Seite n vom m" zur weiteren Verwendung definieren
\newcommand{\pagenm}{\thepage\ / \pageref{LastPage}}
\fancyfoot[C]{\pagenm}
% Fusszeile fuer Seiten auf welchen ein Kapitel startet ueberschreiben
\fancypagestyle{plain}{
\fancyfoot[C]{\pagenm}
}
\fancypagestyle{fancy}{
\fancyhead[R]{\pagenm}
}
\makeatother
\begin{document}
\def\authorOne{»jalsti«}\def\authorOneMail{57985357+jalsti@users.noreply.github.com}\def\pdfsubject{}% set PDF author, subject
\hypersetup{pdfauthor=\authorOne, pdfsubject=\pdfsubject{}}
Version 1.4, \today{}\hfill{}Author: \authorOne{} (\href{mailto:\authorOneMail}{\authorOneMail})
\chapter*{Data Management Plan for »\covviz{}«}
\renewcommand\contentsname{\vspace*{-70pt}}
\begin{multicols}{2}
~\vspace{-4.5em}
\tableofcontents{}
\end{multicols}
\chapter{General}
This data management plan is intended for own reference and as a help for potential further development
by other parties, this DMP has no external requirements.
The DMP describes the data management of \covviz{} (and each of its input, respective output, datasets),
which is a project residing in the field of life sciences / medicine, and data science. The goal of \covviz{}
is the visualization of COVID-19 cases and incidences for german districts, with more current and reliable
case numbers than provided by the »Robert Koch-Institut« (RKI)\footnote{\href{https://www.rki.de/}{https://www.rki.de/}},
including better collection of late registered cases, through usage of the data provided by the »Risklayer«\footnote{\href{http://www.risklayer.com/}{http://www.risklayer.com/}}
volunteers. (Please see ``\href{http://risklayer.com/blog/KIT\%20Press\%20release\%20and\%20additional\%20coverage\%20(T-Online\%20&\%20SPIEGEL)\%20on\%20Risklayer\%20Corona\%20tracking/}{KIT press release and additional coverage (T-online \& Spiegel) on risklayer corona tracking}''\footnote{\href{http://risklayer.com/blog/KIT\%20Press\%20release\%20and\%20additional\%20coverage\%20(T-Online\%20&\%20SPIEGEL)\%20on\%20Risklayer\%20Corona\%20tracking/}{http://risklayer.com/blog/KIT Press release and additional coverage (T-Online \& SPIEGEL) on Risklayer Corona tracking/}},
for more reasoning by »Risklayer« about the enhanced collection of case numbers.)
The project had been started on 04/26/2020 (first code commit) by Dr.\,Andreas Krüger, gained increasing
attention after an article about it was published in Telepolis on 05/09/2020 (``\href{https://www.heise.de/tp/features/CoronaVirus-Landkreise-brauchen-nun-Aufmerksamkeit-4717709.html}{CoronaVirus: Landkreise brauchen nun Aufmerksamkeit}''\footnote{\href{https://www.heise.de/tp/features/CoronaVirus-Landkreise-brauchen-nun-Aufmerksamkeit-4717709.html}{https://www.heise.de/tp/features/CoronaVirus-Landkreise-brauchen-nun-Aufmerksamkeit-4717709.html}}),
and got forked by \authorOne{} on 09/20/2020, to further enhance the visualization and amount of data
output. Still the data gathering stayed the same (with only enhancing the management of some data files),
so this DMP is in general applicable for both paths of the project, whereas the original one did not
receive many updates anymore, so \authorOne{} might be regarded as the project coordinator and contact
person for data management questions, at least regarding the forked version with the enhanced output
data.
The project runtime had been planned to last as long as the »Risklayer« volunteers provide the data,
which ended at 03/14/2022 because of the high work load, and the makers thinking it was not relevant
for the pandemic situation any more (``\href{https://www.zdf.de/nachrichten/panorama/corona-fallzahlen-risklayer-omikron-100.html}{Risklayer hört mit Erfassung auf}''\footnote{\href{https://www.zdf.de/nachrichten/panorama/corona-fallzahlen-risklayer-omikron-100.html}{https://www.zdf.de/nachrichten/panorama/corona-fallzahlen-risklayer-omikron-100.html}}).
Nevertheless the project might be developed further for another database, keeping the visualization
granularity for all of the 401 German districts, so this DMP stays relevant.
»Github«\footnote{\href{https://github.com/}{https://github.com/}} is used as free hosting platform
for the two separate version control system (VCS) repositories, for program code (\href{https://github.com/jalsti/covviz/}{https://github.com/jalsti/covviz/})
and generated output (files at \href{https://github.com/jalsti/cov19de/}{https://github.com/jalsti/cov19de/},
output visible at \href{https://jalsti.github.io/cov19de/}{https://jalsti.github.io/cov19de/}), so »Github«
in some sense could be regarded as sponsor of the project, but except of that no funding is happening.
Also the backup plan, long time storage, and data security relies on the actions done by »Github« (which
has no special certification for these tasks), all relevant data files are (if their content changes)
committed by a wrapper script to the output repository, immediately during every successful visualization
generation run of the project, which in general (if no external problems occurred) would be daily.
The usage of »Github« already implies the usage of Git\footnote{\href{https://git-scm.com/}{https://git-scm.com/}}
as VCS, and that the code and all its data is under an open source license, providing free access and
usage, in this case the license is the GNU General Public License v3.0\footnote{\href{https://www.gnu.org/licenses/gpl-3.0.en.html}{https://www.gnu.org/licenses/gpl-3.0.en.html}}.
(Note: Additionally to the part of the project the \covviz{} fork focuses on, there still exist some
Jupyter Notebooks\footnote{\href{https://jupyter.org}{https://jupyter.org}} from the original project
(sub-folder `\texttt{notebooks}' in the code VCS), with more code, generating different and more specific
output data, which were disregarded from the beginning on, as they got not developed further with the
project, these can in general be thought of as by now dysfunctional, so their output data is not regarded
in this DMP.)
\chapter{Basic data information}
All base input data, which is directly used to generate the visualization, is in CSV Format (only the
``\nameref{subsec:Wikipedia-district-list}'', see \vref{subsec:Wikipedia-district-list}, gets once
converted from HTML to CSV, one initial time), the generated output data (COVID-19 cases tables and timeline
plots) is stored in HTML files and PNG images, where for all these data formats plenty free software
programs exist for viewing, no special ones are required. All these formats are chosen because they follow
public standards, are easily interpretable, and will be supported for a long time by many programs and
programming languages, providing a broad interoperability. The generation process is done with Python,
the used version is 3.8, newer ones (current newest version: 3.10) should also work flawlessly. The wrapper
script uses the Linux shell Bash and (besides Git) only standard Linux tools, no special versions are
required.
Some datasets are only generated once (mentioned in the according dataset subsections starting \vpageref{sec:Base-data}),
after the project's code has been downloaded the first time, during a separate first initialization run
of \covviz{}, with the according included Bash script (see project's README for more details).
The \hypertarget{overall data amount}{overall data amount} of storage space needed in a potential future
is unclear, because daily new time series data would get added for the daily refreshed data. The numbers
below reflect the state up to the 03/14/2022, where the »Risklayer« data collection stopped, currently
no further generation takes place.
No exceptional computer resources are needed for the data generation process, an average desktop computer
will be sufficient, although a multi-core system will speed up the process. Only the storage size of
the complete history of the generated plots may be notable, with 64\,GB (the history of all data files
takes up 300\,MB, the generated output of one run 180\,MB and the code part repository 6.5\,MB).
No real quality assurance of the data is provided, if the input data has errors, the run will just
fail or generate bad output, which would have to get checked manually through visual inspection of the
overall plausibility of the generated output data tables and plots.
No project guidelines for consistent organization of the data exist, even similar data files may vary
in their format (like column order of similar data).
No particular standards are used to describe the data, only the re-used dataset ``\nameref{subsec:Opendatasoft-geographical-positi}''
(partly) uses DCAT. Besides that, to understand the data, no particular knowledge is needed, only tables
and plots must be read/interpreted by their headings, in general no separate documentation of the data
exists, besides this DMP and where mentioned in the dataset sections below.
\section{Costs, legal issues}
No personal costs are regularly planned for data management and storage, only some code adjustments must
be made from time to time, to be further able to create the output data, because of changes/errors on
the input data side, which can be taken as costs of 1 person-hour/month.
As no further documentation of the data exists in the project itself, no costs get generated by this.
Although intellectual property rights exist in the project (see according notes in the input data
sections), no costs are associated with it.
No additional meta-data gets collected, and no Persistent Identifiers (PIDs) will be used, no personal
or otherwise sensitive data is used, and no data protection or other legal or ethical issues need to
be considered.
\chapter{Dataset description}
\section{Base input data\label{sec:Base-data}}
All used base input data for the code is stored in the code's data sub-folder named `\texttt{data}',
in files named like described in the sections below. After each successful run of \covviz{} the new
data files get added to the according equally named sub-folder of the VCS repository for the generated
output. Data files only generated once, will also only get added once. The complete history of input
data can be downloaded from the VCS for the generated output at \href{https://github.com/jalsti/cov19de/tree/master/data}{https://github.com/jalsti/cov19de/tree/master/data}.
\subsection{Daily Risklayer COVID-19 data\label{subsec:Daily-Risklayer-COVID-19}}
This re-used dataset created by the »Risklayer« volunteers has its origin at \href{http://risklayer-explorer.com/media/data/events/GermanyValues.csv}{http://risklayer-explorer.com/media/data/events/GermanyValues.csv},
and contains all the daily COVID-19 cases of all 401 German districts for each day since 4th of March
2020, daily hand-collected by »Risklayer« volunteers from the different districts' COVID-19 publications,
including late registered cases, of the previous few days in relation to the collection date. The according
single data sources for all numbers are documented in the ``\nameref{subsec:Daily-Risklayer-COVID-19-sources}''
data (see \vref{subsec:Daily-Risklayer-COVID-19-sources}). It is the main data source for the project
and gets downloaded at the beginning of each \covviz{} run, stored in a file named `\texttt{GermanyValues\_Risklayer-YYYYMMDD.csv}',
where \texttt{YYYY} represents the year, \texttt{MM} the month, and \texttt{DD} the day of the run.
A single data file currently has a size of about 35\,KB.
Re-creation of the original »Risklayer« dataset from scratch would be impossible, because the sources
for the numbers changed over time, and the places where former numbers were gathered from often do not
exist anymore.
»Risklayer« holds the copyright of the original data, as work of literature, scholarship or the arts,
usage is allowed with naming their copyright.
As the data in \covviz{} gets stored to the VCS on every regular data generation run, it may be additionally
interesting for anyone striving to get the history of the daily changing late registered cases, how the
numbers for a day changed iteratively over the following days.
\subsection{Opendatasoft geographical positions of districts\label{subsec:Opendatasoft-geographical-positi}}
This re-used dataset created by the »Robert Koch-Institut«, the »Bundesamt für Kartographie und Geodäsie«\footnote{\href{http://www.bkg.bund.de/}{http://www.bkg.bund.de/}},
and »Opendatasoft«\footnote{\href{https://www.opendatasoft.com/}{https://www.opendatasoft.com/}} has
its origin at \href{https://public.opendatasoft.com/explore/dataset/covid-19-germany-landkreise/download/?format=csv&lang=en&use_labels_for_header=true&csv_separator=\%3B}{https://public.opendatasoft.com/explore/dataset/covid-19-germany-landkreise/download/?format=csv\&lang}
\href{https://public.opendatasoft.com/explore/dataset/covid-19-germany-landkreise/download/?format=csv&lang=en&use_labels_for_header=true&csv_separator=\%3B}{=en\&use\_labels\_for\_header=true\&csv\_separator=\%3B},
and contains the geographical positions of all 401 german districts. It gets downloaded during the initialization
phase of the project. In \covviz{} it is used to generate the dataset ``\nameref{subsec:Distances-between-districts}''
(see \vref{subsec:Distances-between-districts}), and gets stored in a file named `\texttt{covid-19-germany-landkreise.csv}'.
The single data file has a size of about 21.7\,MB.
The data is documented at \href{https://public.Opendatasoft.com/explore/dataset/covid-19-germany-landkreise/}{https://public.Opendatasoft.com/explore/dataset/covid-19-germany-landkreise/},
re-creation of all the geographical data would be potentially difficult, if not pulling the data again
from another source.
The »Robert Koch-Institut« holds the copyright of the data, usage is allowed when naming the copyright
holder and the source URL.
Re-using this \covviz{} dataset would probably not be of interest to anyone, because it usually does
not change and can be received from the mentioned »Opendatasoft« data origin.
\subsection{Daily Risklayer COVID-19 data sources\label{subsec:Daily-Risklayer-COVID-19-sources}}
This re-used dataset created by the »Risklayer« volunteers has its origin at \href{https://docs.google.com/spreadsheets/d/1EZNqMVK6hCccMw4pq-4T30ADJoayulgx9f98Vui7Jmc/gviz/tq?tqx=out:csv&sheet=Haupt&range=A5:AU406}{https://docs.google.com/}\linebreak{}
\href{https://docs.google.com/spreadsheets/d/1EZNqMVK6hCccMw4pq-4T30ADJoayulgx9f98Vui7Jmc/gviz/tq?tqx=out:csv&sheet=Haupt&range=A5:AU406}{spreadsheets/d/1EZNqMVK6hCccMw4pq-4T30ADJoayulgx9f98Vui7Jmc/gviz/tq?tqx=out:csv\&sheet=Haupt}\linebreak{}
\href{https://docs.google.com/spreadsheets/d/1EZNqMVK6hCccMw4pq-4T30ADJoayulgx9f98Vui7Jmc/gviz/tq?tqx=out:csv&sheet=Haupt&range=A5:AU406}{\&range=A5:AU406},
and contains the web links to the sources of the ``\nameref{subsec:Daily-Risklayer-COVID-19}'' dataset
numbers, the origins from where the COVID-19 numbers get collected by the volunteers for each district.
It gets shown in the generated output at the according district's sections, is downloaded at the beginning
of each \covviz{} run, and gets stored in a file named `\texttt{GermanyKreisebene\_Risklayer\_haupt-YYYYMMDD.csv}',
where \texttt{YYYY} represents the year, \texttt{MM} the month, and \texttt{DD} the day of the run.
A single data file currently has a size of about 227\,KB.
The data is documented at \href{https://docs.google.com/spreadsheets/d/1EZNqMVK6hCccMw4pq-4T30ADJoayulgx9f98Vui7Jmc}{https://docs.google.com/spreadsheets/d/1EZNqMVK6hCccMw4pq-4T30ADJoayulgx9f98Vui7Jmc},
re-creation of the original »Risklayer« dataset from scratch would be impossible, because the sources
for the numbers changed over time, and it would not be reproducible where former numbers were gathered
from.
»Risklayer« holds the copyright of the original data, as work of literature, scholarship or the arts,
usage is allowed with naming their copyright.
The CSV data may be re-used by everyone interested in daily COVID-19 data sources, as the data gets stored
to the VCS on every regular data generation run, it may be additionally interesting for anyone striving
to get the history of the data sources (although not all might exist anymore).
\subsection{Distances between districts\label{subsec:Distances-between-districts}}
This dataset gets created based on the dataset ``\nameref{subsec:Opendatasoft-geographical-positi}'',
and contains the geographical distances in kilometers between all 401 german districts, which are calculated
once during the initialization phase of the project, the result gets stored in a file named `\texttt{distances.csv}'.
It is used to generate a list of districts which are in a 50\,km range around another district, to be
able to show the according links to their data in the final ``\nameref{subsec:Daily-COVID-19-numbers}''
output, on each district's output section.
The created single data file has a size of 4.66\,MB.
Re-creation of this dataset would be quite easy, so re-usage also would probably be not of much interest
to other parties (although the GPL v3.0 would make it possible), maybe similar datasets also exist in
other places, still it was not big effort to generate it from the source to the wanted CSV format with
the district identifiers in the header, so it made sense to just do it inside \covviz{}.
\subsection{Wikipedia district list\label{subsec:Wikipedia-district-list}}
This dataset gets created based on the »Wikipedia«\footnote{\href{https://www.wikipedia.org/}{https://www.wikipedia.org/}}
list of districts in Germany and has its origin at \href{https://de.wikipedia.org/wiki/Liste_der_Landkreise_in_Deutschland}{https://de.wikipedia.org/wiki/Liste\_der\_Landkreise\_in\_Deutschland}.
It is downloaded once during the initialization phase of the project, the main table out of the article
page gets converted to CSV, and the data is used for showing links to the »Wikipedia« pages of the districts.
It gets stored in a file named `\texttt{wikipedia\_kreise\_most.csv}'.
Other, unused, fields in the final converted CSV are the district identifier, district seat, link to
the district seat on »Wikipedia«, number of inhabitants, district area, and the link to the image of
a map on »Wikimedia Commons«\footnote{\href{https://commons.wikimedia.org/}{https://commons.wikimedia.org/}},
showing the location of the district inside its federal state.
The created single data file has a size of 57\,KB.
No additional documentation is needed, to understand the dataset, and re-creation based on the same data
source should be simple.
The base data is copyrighted by the »Wikipedia« respective »Wikimedia Commons« contributors, re-usage
of the CSV is probably not of too much interest, because of the ease of generating it.
\subsection{BNN Risklayer data\label{subsec:BNN-Risklayer-data}}
This re-used dataset is used to get the population number of a district, which is also needed to calculate
the prevalence. As this data could also be gathered from the dataset ``\nameref{subsec:Wikipedia-district-list}'',
it stays unclear why the original author uses this separate file, maybe (as the numbers differ for many
districts between both sources) to show the same prevalence values like other projects using the »Risklayer«
data. It gets downloaded during the initialization phase of the project, and is stored in a file named
`\texttt{GermanyKreisebene\_Risklayer\_bnn-20200425.csv}'.
The single data file has a size of about 34.6\,KB.
Unfortunately the origin of this dataset is not completely reliably determinable, as it was not documented
by the original project author. Data gets downloaded from a »Risklayer« data sheet copy at \href{https://docs.google.com/spreadsheets/d/1EZNqMVK6hCccMw4pq-4T30ADJoayulgx9f98Vui7Jmc/edit?tqx=out:csv&sheet=bnn\#gid=83083193}{https://docs.google.com/spreadsheets/d/1EZNqMVK6hCccMw4pq-4T30ADJoayulgx9f98Vui7Jmc/edit?tqx=out:csv\&sheet=bnn\#gid=83083193},
which itself probably originated \linebreak{}
from \href{https://docs.google.com/spreadsheets/d/1wg-s4_Lz2Stil6spQEYFdZaBEp8nWW26gVyfHqvcl8s/edit\#gid=83083193}{https://docs.google.com/spreadsheets/d/1wg-s4\_Lz2Stil6spQEYFdZaBEp8nWW26gVyfHqvcl8s/edit}
\href{https://docs.google.com/spreadsheets/d/1wg-s4_Lz2Stil6spQEYFdZaBEp8nWW26gVyfHqvcl8s/edit\#gid=83083193}{\#gid=83083193},
where no further documentation is available.
So probably »Risklayer« holds the copyright (respective Dr.\,Andreas Krüger, whose data sheet copy is
used) of the original data, as work of literature, scholarship or the arts, usage should therefore be
allowed with naming their copyright, the original project is published also under GPL v3.0, so re-usage
of the CSV should again be no problem, but would probably not be of much interest, as the data origin
is not finally clear and the data could be gathered from the guessed original data source mentioned,
and re-creation of a data set with similar content would be possible from other sources, like from the
dataset ``\nameref{subsec:Wikipedia-district-list}''.
\section{Output data\label{sec:Output-data}}
All the directly COVID-19--situation related generated content of \covviz{}, like plots, data tables
and accompanying textual information, is understood as output data, so it gets fully described, including
its output format details where which kind of data is used.
The complete history of generated output data can be downloaded from \href{https://jalsti.github.io/cov19de/}{https://jalsti.github.io/cov19de/},
in the folders described next.
These datasets get generated on each run of \covviz{}, and after that also get stored to the VCS for
the generated output, in the same folder hierarchy like on the code's VCS part. They contain all the
PNG plots (sub-folder `\texttt{pics}') and tables (in the HTML files named after the district identifiers
-- respective federal state names or country -- in the sub-folder `\texttt{pages}') showing the COVID-19
data, which are altogether the intended core output \covviz{} has been created for.
The data is self-explanatory and may be re-used by everyone (under GPL v3.0 restrictions) interested
in daily COVID-19 data. As the data gets stored to the VCS on every regular data generation run, it may
be additionally interesting for anyone striving to get the history of the daily changing late registered
cases, in a slightly more readable way than with the plain base data sets used to generate the output.
In general there are a number of other similar data sources, but already usually without the late cases
data, and less informative plots, additionally I do not know of any with the regular storing to a public
VCS, including that one of the pregenerated plots, with the possibility that these could also be re-generated
without too much effort, thanks to the versioned base datasets.
As already \hyperlink{overall data amount}{mentioned above}, the storage size of the complete history
of the generated plots may be notable, with 64\,GB, the history of all data files takes up 300\,MB,
and the generated output of one run 180\,MB.
\subsection{COVID-19 numbers for Germany\label{subsec:Daily-COVID-19-numbers-germany}}
For the overview of the COVID-19 numbers of the complete country of Germany, the following numbers get
calculated and shown based on the input datasets:
\begin{itemize}
\item total cases
\item prevalence / 100,000 people
\item 7 day new cases sum
\item 7 day incidence / 100,000 people
\item 14 days new cases moving average (centered)
\item concrete numbers for different incidence border numbers (for 100,000 population) which appeared over
time: 10, 35, 50, 100, 150, 165, 200, 350
\item expectation day
\end{itemize}
\subsubsection{Output format details}
A single (large) HTML page is generated (and published at \href{https://jalsti.github.io/cov19de/pages/Deutschland.html}{https://jalsti.github.io/cov19de/pages/Deutschland.html}),
containing:
\begin{itemize}
\item a timeline plot for the whole data range, with country name, population and the concrete numbers of
the incidences and incidence borders, daily cases, 14 day cases average, total cases, and a marker for
the expectation day
\item a text showing the population and concrete numbers for the prevalence of the country, as well as a list
of the total cases numbers for the previous 30 data days
\item an overview table for all its federal states, containing rows with columns for the federal state name
(including a web link to its \covviz{}-generated data page), including its flag as visual indicator,
and latest COVID-19 numbers from the ``\nameref{subsec:Daily-COVID-19-numbers}'' dataset for:
\begin{itemize}
\item 7 day new cases sum
\item prevalence / 100,000 people
\item 7 day incidence / 100,000 people
\item population
\item expectation day
\item effective reproduction number
\end{itemize}
\item additionally each table row contains the number of total cases for each data day, colored in a heatmap-like
manner to easily spot periods of fast changing values
\item the plots generated by \covviz{} (like described in paragraph \emph{\nameref{par:Federal-state}}) of
all German federal states, with web links to their \covviz{}-generated data page, and an additional
possibility to open all of these plots together in a new page
\item a table with an overview of the districts' COVID-19 numbers, similar to the one for the federal states,
but with additional columns with the district name and a second time the latest number of the total cases
\item (at the bottom the web link to, and a screenshot of, an outdated external data sheet with death cases)
\end{itemize}
\subsection{COVID-19 numbers for german districts and federal states\label{subsec:Daily-COVID-19-numbers}}
For each district, and each federal state of Germany, the following COVID-19 numbers get calculated from
the input datasets:
\begin{itemize}
\item daily cases
\item total cases
\item prevalence / 100,000 people
\item 7 day new cases sum
\item 7 day incidence / 100,000 people
\item 14 days new cases moving average (centered)
\item expectation day
\item effective reproduction number
\item concrete numbers for different incidence border numbers (for 100,000 population) which appeared over
time: 10, 35, 50, 100, 150, 165, 200, 350
\end{itemize}
Only for federal states the following additional data gets calculated:
\begin{itemize}
\item 7 days cases mean
\end{itemize}
\subsubsection{Output format details}
For each federal state a (large) single HTML page is generated, containing a starting section for the
federal state itself, and after that a section for each of its districts.
\paragraph{Federal state section\label{par:Federal-state}}
The shown data is:
\begin{itemize}
\item the federal state name, number of its districts and date of the generation run
\item the same timeline plot as for the country, but now for the federal state's numbers, and with an additional
color gradient indicating how large its prevalence value is, relative to all other federal states.
\item a text showing the population and concrete numbers for the prevalence of the federal state
\item a table showing the concrete number for all plotted data days of:
\begin{itemize}
\item total cases
\item daily cases
\item 7 day mean
\end{itemize}
\item an overview table for all its districts, like the one on the page for the country's COVID-19 data, but
now only for the districts of the according federal state
\end{itemize}
\paragraph{District sections}
The section for a district contains:
\begin{itemize}
\item the district name, description, identifier and population
\item a list with links to the \covviz{}-generated data pages of all districts within 50\,km, as well as
a link to only show all the according \covviz{} plots on one page
\item the same timeline plot as for the federal state, but now for the district's numbers, and the color gradient
as indicator relative to all other districts, and an additional possibility to open all of these plots
together in a new page
\item a text showing the district's population, concrete numbers for prevalence, incidence borders, and links
to: the COVID-19 numbers' data sources, a »TU Dortmund«\footnote{\href{https://www-ai.cs.tu-dortmund.de/}{https://www-ai.cs.tu-dortmund.de/}}
site with other COVID-19 information to the district, the »Wikipedia« pages of the district and the district
seat, and two search engines with COVID results of the last week for the district
\item a table showing the concrete number for all plotted data days of:
\begin{itemize}
\item total cases
\item daily cases
\item 7 day new cases sum
\item 7 day incidence / 100,000 people
\end{itemize}
\end{itemize}
\section{Dataset dependencies}
The dataset ``\nameref{subsec:Opendatasoft-geographical-positi}'' is used to once generate the dataset
``\nameref{subsec:Distances-between-districts}''.
All other datasets have no further dependencies besides that they are needed to generate the output data.
\end{document}