# Übung 2.9 Todesursachen
Wie bereits in anderen Übungen besprochen, müssen die Daten zuerst in das HDFS geladen werden. Dazu habe ich die Datei `death2016.csv` in das Volume des `namenode`-Containers kopiert, bin mit `docker exec -it namenode bash` in den Container und dort mit `cd /hadoop-data` in das zugehörige Verzeichnis gewechselt und habe die Datei schließlich mit `hadoop fs -copyFromLocal death2016.csv workspace/pyspark` in das HDFS kopiert.

In [3]:
import pandas as pd
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, FloatType, BooleanType
from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.sql import functions
from pyspark.sql import dataframe
from pyspark.sql.functions import to_timestamp, to_date, year, dayofweek

## Einlesen der Datei

In [14]:
# Create (or reuse) the Spark session for this exercise and expose its context.
spark = (
    SparkSession.builder
    .master('spark://spark-master:7077')  # master URL of the cluster to connect to
    .appName("uebung_29")
    # NOTE(review): looks like a placeholder key/value - confirm whether it is needed.
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)
# Shortcut to the SparkContext that belongs to the session.
sc = spark.sparkContext

In [31]:
# Define schema of death file:
# six descriptive columns, followed by three float columns
# (deaths_<year>, low_<year>, up_<year>) for every year from 2000 to 2016 inclusive.
death_cols = [
    StructField('country', StringType()),
    StructField('cause_no', IntegerType()),
    StructField('cause_name', StringType()),
    StructField('sex', StringType()),
    StructField('age', IntegerType()),
    StructField('age_group', StringType()),
]

# Built with a comprehension so that no loop variable leaks into the notebook
# namespace. In particular the variable must not be called `year`: that name is
# imported from pyspark.sql.functions, and a module-level loop variable with the
# same name would replace the function with the integer 2016.
death_cols += [
    StructField(f'{prefix}_{death_year}', FloatType())
    for death_year in range(2000, 2017)
    for prefix in ('deaths', 'low', 'up')
]

death_schema = StructType(death_cols)

In [34]:
# Read in death file from HDFS using the explicit schema defined above.
file_path = 'hdfs://namenode:8020/user/root/workspace/pyspark/death2016.csv'
# NOTE(review): no `header` option is passed, so the first line of the file is
# treated as data - confirm that the CSV has no header row.
deaths = spark.read.csv(file_path, schema=death_schema)

# printSchema() and show() write their output themselves and return None;
# wrapping them in print() would only add a stray "None" line after each.
deaths.printSchema()
deaths.show(1)

root
 |-- country: string (nullable = true)
 |-- cause_no: integer (nullable = true)
 |-- cause_name: string (nullable = true)
 |-- sex: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- age_group: string (nullable = true)
 |-- deaths_2000: float (nullable = true)
 |-- low_2000: float (nullable = true)
 |-- up_2000: float (nullable = true)
 |-- deaths_2001: float (nullable = true)
 |-- low_2001: float (nullable = true)
 |-- up_2001: float (nullable = true)
 |-- deaths_2002: float (nullable = true)
 |-- low_2002: float (nullable = true)
 |-- up_2002: float (nullable = true)
 |-- deaths_2003: float (nullable = true)
 |-- low_2003: float (nullable = true)
 |-- up_2003: float (nullable = true)
 |-- deaths_2004: float (nullable = true)
 |-- low_2004: float (nullable = true)
 |-- up_2004: float (nullable = true)
 |-- deaths_2005: float (nullable = true)
 |-- low_2005: float (nullable = true)
 |-- up_2005: float (nullable = true)
 |-- deaths_2006: float (nullable = true)
 |-- 

In [15]:
# Exploratory lookup of the options accepted by DataFrameReader.csv
# (e.g. schema, sep, header); not required for the analysis itself.
help(spark.read.csv)

Help on method csv in module pyspark.sql.readwriter:

csv(path, schema=None, sep=None, encoding=None, quote=None, escape=None, comment=None, header=None, inferSchema=None, ignoreLeadingWhiteSpace=None, ignoreTrailingWhiteSpace=None, nullValue=None, nanValue=None, positiveInf=None, negativeInf=None, dateFormat=None, timestampFormat=None, maxColumns=None, maxCharsPerColumn=None, maxMalformedLogPerPartition=None, mode=None, columnNameOfCorruptRecord=None, multiLine=None, charToEscapeQuoteEscaping=None, samplingRatio=None, enforceSchema=None, emptyValue=None, locale=None, lineSep=None, pathGlobFilter=None, recursiveFileLookup=None, modifiedBefore=None, modifiedAfter=None, unescapedQuoteHandling=None) method of pyspark.sql.readwriter.DataFrameReader instance
    Loads a CSV file and returns the result as a  :class:`DataFrame`.
    
    This function will go through the input once to determine the input schema if
    ``inferSchema`` is enabled. To avoid going through the entire data once, di

In [4]:
# Always stop the session once the work is finished.
# Keep this as the last cell: `spark` cannot be used for further work after this call.
spark.stop()