In [48]:
from pyspark.sql import DataFrameReader
from pyspark.sql import SparkSession
from pyspark.ml.feature import IndexToString, Normalizer, StringIndexer, VectorAssembler, VectorIndexer
from pyspark.ml.classification import DecisionTreeClassifier
from helpers.helper_functions import translate_to_file_string


def printDf(sprkDF, limit=None):
    """Render a Spark DataFrame as an HTML table for pretty notebook display.

    Parameters
    ----------
    sprkDF : DataFrame-like
        Object exposing ``toPandas()`` (and ``limit(n)`` when ``limit`` is
        given), e.g. a ``pyspark.sql.DataFrame``.
    limit : int or None, optional
        When given, only the first ``limit`` rows are converted. The default
        (``None``) converts the whole frame, which matches the previous
        behaviour.

    Returns
    -------
    IPython.display.HTML
        HTML object wrapping the pandas ``to_html()`` rendering.
    """
    # Imported locally, as in the original helper; only HTML is actually used.
    from IPython.display import HTML

    if limit is not None:
        sprkDF = sprkDF.limit(limit)
    # toPandas() collects the rows onto the driver, so pass `limit` for
    # large frames.
    return HTML(sprkDF.toPandas().to_html())

In [49]:
# Location of the RKI COVID-19 CSV snapshot (data status 2021-05-12).
# translate_to_file_string is a project helper; presumably it turns the
# relative path into a form Spark can read -- TODO confirm.
inputFile = translate_to_file_string(
    "./data/RKI_COVID19_20210512.csv"
)

## Create Spark Session

In [50]:
# Create (or reuse) the SparkSession for this notebook
spark = (SparkSession
       .builder
       .appName("RKICOVID19UNDERSTANDING")
       .getOrCreate())
# Create a DataFrame from the CSV using an inferred schema
df = (spark.read
      .option("header", "true")
      .option("inferSchema", "true")
      .option("delimiter", ",")
      .csv(inputFile))
# printSchema() prints the schema itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the cell output.
df.printSchema()

root
 |-- ObjectId: integer (nullable = true)
 |-- IdBundesland: integer (nullable = true)
 |-- Bundesland: string (nullable = true)
 |-- Landkreis: string (nullable = true)
 |-- Altersgruppe: string (nullable = true)
 |-- Geschlecht: string (nullable = true)
 |-- AnzahlFall: integer (nullable = true)
 |-- AnzahlTodesfall: integer (nullable = true)
 |-- Meldedatum: string (nullable = true)
 |-- IdLandkreis: integer (nullable = true)
 |-- Datenstand: string (nullable = true)
 |-- NeuerFall: integer (nullable = true)
 |-- NeuerTodesfall: integer (nullable = true)
 |-- Refdatum: string (nullable = true)
 |-- NeuGenesen: integer (nullable = true)
 |-- AnzahlGenesen: integer (nullable = true)
 |-- IstErkrankungsbeginn: integer (nullable = true)
 |-- Altersgruppe2: string (nullable = true)

None


In [51]:
### Show First Row

In [52]:
# Display only the first record of the DataFrame
df.show(n=1)

+--------+------------+------------------+------------+------------+----------+----------+---------------+--------------------+-----------+--------------------+---------+--------------+--------------------+----------+-------------+--------------------+-----------------+
|ObjectId|IdBundesland|        Bundesland|   Landkreis|Altersgruppe|Geschlecht|AnzahlFall|AnzahlTodesfall|          Meldedatum|IdLandkreis|          Datenstand|NeuerFall|NeuerTodesfall|            Refdatum|NeuGenesen|AnzahlGenesen|IstErkrankungsbeginn|    Altersgruppe2|
+--------+------------+------------------+------------+------------+----------+----------+---------------+--------------------+-----------+--------------------+---------+--------------+--------------------+----------+-------------+--------------------+-----------------+
|       1|           1|Schleswig-Holstein|SK Flensburg|     A15-A34|         M|         1|              0|2021/03/21 00:00:...|       1001|12.05.2021, 00:00...|        0|            -9|20

In [53]:
### Number of records
record_count = df.count()
print(record_count)

1909100


In [57]:
### All affected federal states, with the number of records per state
(df
 .groupBy("Bundesland")
 .count()
 .show())

+--------------------+------+
|          Bundesland| count|
+--------------------+------+
|      Sachsen-Anhalt| 52526|
|       Niedersachsen|149948|
|         Brandenburg| 68754|
|              Berlin| 93527|
|              Bayern|345711|
|             Sachsen|112548|
|             Hamburg| 25261|
|              Bremen| 11563|
| Nordrhein-Westfalen|375788|
|           Thüringen| 78950|
|              Hessen|144697|
|   Baden-Württemberg|266140|
|Mecklenburg-Vorpo...| 27318|
|            Saarland| 21100|
|  Schleswig-Holstein| 44964|
|     Rheinland-Pfalz| 90305|
+--------------------+------+



In [55]:
### Summary statistics for every column (see the "summary" column in the output)
(df
 .describe()
 .show())

+-------+-----------------+------------------+-----------------+------------------+------------+----------+------------------+-------------------+--------------------+------------------+--------------------+--------------------+------------------+--------------------+-------------------+-----------------+--------------------+-----------------+
|summary|         ObjectId|      IdBundesland|       Bundesland|         Landkreis|Altersgruppe|Geschlecht|        AnzahlFall|    AnzahlTodesfall|          Meldedatum|       IdLandkreis|          Datenstand|           NeuerFall|    NeuerTodesfall|            Refdatum|         NeuGenesen|    AnzahlGenesen|IstErkrankungsbeginn|    Altersgruppe2|
+-------+-----------------+------------------+-----------------+------------------+------------+----------+------------------+-------------------+--------------------+------------------+--------------------+--------------------+------------------+--------------------+-------------------+-----------------+--