# Data Preparation

In [1]:
from pyspark.sql.session import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
from helpers.helper_functions import translate_to_file_string

# Path of the CSV read further down (judging by its name, the output of the data-preparation step).
# NOTE(review): translate_to_file_string is a project helper whose behavior is not visible here;
# presumably it turns the relative path into a form Spark can load — confirm in helpers/helper_functions.
inputFile = translate_to_file_string("./data/Data_Preparation_Result.csv")

## Create Spark Session

In [3]:
# Create (or reuse) the SparkSession used by this notebook.
spark = (
    SparkSession
    .builder
    .appName("DataModelling")
    .getOrCreate()
)

# Load the semicolon-delimited CSV (with header row) into a DataFrame,
# letting Spark infer the column types.
df = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .option("delimiter", ";")
    .csv(inputFile)
)

# printSchema() writes the schema to stdout itself and returns None, so it must
# not be wrapped in print() — that would emit a stray "None" line after the schema.
df.printSchema()

root
 |-- Bundesland: string (nullable = true)
 |-- BundeslandIndex: integer (nullable = true)
 |-- Landkreis: string (nullable = true)
 |-- LandkreisIndex: integer (nullable = true)
 |-- Altersgruppe: string (nullable = true)
 |-- AltersgruppeIndex: double (nullable = true)
 |-- Geschlecht: string (nullable = true)
 |-- GeschlechtIndex: double (nullable = true)
 |-- FallStatus: string (nullable = true)
 |-- FallStatusIndex: double (nullable = true)

None


## Vorbereitung der Daten

In [5]:
# Build the feature vector: combine the three index columns into one "features" column.
assembler = VectorAssembler(
    inputCols=["GeschlechtIndex", "AltersgruppeIndex", "LandkreisIndex"],
    outputCol="features",
)
featureVector = assembler.transform(df)

In [6]:
# Randomly split the assembled rows 80/20 into a training and a test set,
# passing a fixed seed to randomSplit.
splits = featureVector.randomSplit(weights=[0.8, 0.2], seed=345678)
trainingVector, testVector = splits

## Linear-Regression-Training

In [8]:
# Configure a linear regression that reads the "features" column and uses
# FallStatusIndex as the label.
lr = LinearRegression(
    featuresCol="features",
    labelCol="FallStatusIndex",
    maxIter=10,
    regParam=0.3,
    elasticNetParam=0.8,
    loss='squaredError',
    solver="normal",
)

# Fit the linear regression model on the training split.
lrModel = lr.fit(trainingVector)

# Print the coefficients and intercept.
# NOTE(review): the recorded output shows all-zero coefficients, i.e. every row gets the
# intercept as its prediction. Possibly the penalty (regParam=0.3, elasticNetParam=0.8)
# is too strong for this data — TODO confirm and revisit the hyperparameters.
print("Coefficients: %s" % str(lrModel.coefficients))
print("Intercept: %s" % str(lrModel.intercept))

Coefficients: [0.0,0.0,0.0]
Intercept: 0.07950121975060988


## Test the Model

In [9]:
# Score the test split; the result carries an additional "prediction" column.
predictionsLR = lrModel.transform(testVector)

# Preview the scored rows (first 20 rows, long cell values truncated — show()'s defaults).
predictionsLR.show(n=20, truncate=True)

+-----------------+---------------+----------------+--------------+------------+-----------------+----------+---------------+----------+---------------+----------------+-------------------+
|       Bundesland|BundeslandIndex|       Landkreis|LandkreisIndex|Altersgruppe|AltersgruppeIndex|Geschlecht|GeschlechtIndex|FallStatus|FallStatusIndex|        features|         prediction|
+-----------------+---------------+----------------+--------------+------------+-----------------+----------+---------------+----------+---------------+----------------+-------------------+
|Baden-Württemberg|              8|     LK Biberach|          8426|     A60-A79|              2.0|         M|            1.0| GESTORBEN|            2.0|[1.0,2.0,8426.0]|0.07950121975060988|
|Baden-Württemberg|              8|     LK Biberach|          8426|     A60-A79|              2.0|         M|            1.0| GESTORBEN|            2.0|[1.0,2.0,8426.0]|0.07950121975060988|
|Baden-Württemberg|              8|     LK Biberac

In [10]:
# Evaluator that scores the "prediction" column against FallStatusIndex using RMSE.
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="FallStatusIndex",
    predictionCol="prediction",
)

In [11]:
# Evaluate the test-set predictions with the RMSE evaluator and print the result.
print(
    "root mean square error = ",
    evaluator.evaluate(predictionsLR),
)

# Stop the Spark session now that the notebook is finished.
spark.stop()

root mean square error =  0.3471212842076568
