In [1]:
import findspark
findspark.init()
import pyspark # only run after findspark.init()
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()

## Read in Data

In [4]:
bme_file_location = "sofia_small/*bme280sof.csv"
sds_file_location = "sofia_small/*sds011sof.csv"

file_type = "csv"
infer_schema = "true"
first_row_is_header = "true"
delimiter = ","

df_bme = spark.read.format(file_type) \
    .option("inferSchema", infer_schema) \
    .option("header", first_row_is_header) \
    .option("sep", delimiter) \
    .load(bme_file_location)

df_sds = spark.read.format(file_type) \
    .option("inferSchema", infer_schema) \
    .option("header", first_row_is_header) \
    .option("sep", delimiter) \
    .load(sds_file_location)


In [5]:
from pyspark.sql.functions import year, month
from pyspark.sql.functions import to_date
from pyspark.sql.functions import to_timestamp,date_format
from pyspark.sql import functions as F
from pyspark.sql.functions import count, avg
from pyspark.sql.functions import col


df_sds_transformed = df_sds.withColumn('year',year(df_sds.timestamp))\
    .withColumn('month', month(df_sds.timestamp))\
    .withColumn("day", date_format(col("timestamp"), "d"))\
    .withColumn("ts", to_date(col("timestamp")).cast("date"))

df_sds_transformed = df_sds_transformed.groupBy("ts").agg(avg("P1"), avg("P2")).orderBy(["ts"], ascending=True)

df_sds_transformed.show()

+----------+------------------+------------------+
|        ts|           avg(P1)|           avg(P2)|
+----------+------------------+------------------+
|2017-07-01|17.764459663706905| 8.341274009698298|
|2017-07-02| 9.846284524930946| 6.325375406399083|
|2017-07-03| 20.35557791635181|17.195223293020813|
|2017-07-04| 8.984114511906204| 6.868896334621589|
|2017-07-05|10.412705222705208| 7.964031059031043|
|2017-07-06| 10.85810864999049| 8.447780535930221|
|2017-07-07|  9.61407907302477| 7.430200547526546|
|2017-07-08| 12.10184730986929| 9.885236809576535|
|2017-07-09|12.441132935466914|10.319859653725135|
|2017-07-10|14.278580865387667|12.425794746989531|
|2017-07-11|16.458481004748865|13.907630592351836|
|2017-07-12|14.077904752827623|10.800456856017325|
|2017-07-13| 11.50965046888325| 8.878007956805918|
|2017-07-14| 5.461827450735828| 3.109895859316535|
|2017-07-15|10.245437171815821| 7.799760959824183|
|2017-07-16| 11.48468567866602| 9.461745052205105|
|2017-07-17| 8.730244358596998|

In [6]:
df_bme_transformed = df_bme.withColumn('year',year(df_bme.timestamp))\
    .withColumn('month', month(df_bme.timestamp))\
    .withColumn("day", date_format(col("timestamp"), "d"))\
    .withColumn("ts", to_date(col("timestamp")).cast("date"))

df_bme_transformed = df_bme_transformed.groupBy("ts").agg(avg("pressure"), avg("temperature"), avg("humidity"))\
    .orderBy(["ts"], ascending=True)

df_bme_transformed.show()

+----------+-----------------+------------------+------------------+
|        ts|    avg(pressure)|  avg(temperature)|     avg(humidity)|
+----------+-----------------+------------------+------------------+
|2017-07-01|94572.18985080464| 33.33327613327619|32.792403355736745|
|2017-07-02|94441.42854684066|28.197254514672572| 44.52180304740427|
|2017-07-03| 94668.7624325228| 18.25461707200771| 78.17694325226508|
|2017-07-04|95313.96683276288| 22.32803235375923|  50.4074079911003|
|2017-07-05|95440.82530922632|23.534423652694652|44.841247660928104|
|2017-07-06|95312.02019876736|25.778363851992495|  42.4970118595822|
|2017-07-07|95248.96706425186|27.469182004089852|40.482749797878675|
|2017-07-08|95059.96317789043| 25.71446886446881|51.478899690053524|
|2017-07-09|95089.78527820377|27.075451422027033| 49.46747614048477|
|2017-07-10| 95128.1010232264|28.758966410703227|44.910974230932034|
|2017-07-11|95059.89666140104|30.580405242122936| 41.59478715494001|
|2017-07-12|94791.26359009359| 30.

In [18]:
combined_df = df_bme_transformed.join(df_sds_transformed, on=['ts'], how='left').orderBy(["ts"], ascending=True)
combined_df.show()
x_train = combined_df.filter(F.col('ts').between("2017-07-01", "2017-07-21"))
x_test = combined_df.filter(F.col('ts') > "2017-07-21")
x_train.show()
x_test.show()


+----------+-----------------+------------------+------------------+------------------+------------------+
|        ts|    avg(pressure)|  avg(temperature)|     avg(humidity)|           avg(P1)|           avg(P2)|
+----------+-----------------+------------------+------------------+------------------+------------------+
|2017-07-01|94572.18985080464| 33.33327613327619|32.792403355736745|17.764459663706905| 8.341274009698298|
|2017-07-02|94441.42854684066|28.197254514672572| 44.52180304740427| 9.846284524930946| 6.325375406399083|
|2017-07-03| 94668.7624325228| 18.25461707200771| 78.17694325226508| 20.35557791635181|17.195223293020813|
|2017-07-04|95313.96683276288| 22.32803235375923|  50.4074079911003| 8.984114511906204| 6.868896334621589|
|2017-07-05|95440.82530922632|23.534423652694652|44.841247660928104|10.412705222705208| 7.964031059031043|
|2017-07-06|95312.02019876736|25.778363851992495|  42.4970118595822| 10.85810864999049| 8.447780535930221|
|2017-07-07|95248.96706425186|27.4691

## Test Linear Regression

In [36]:
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.regression import GBTRegressor

vectorAssembler = VectorAssembler(inputCols = ['avg(pressure)', 'avg(temperature)', 'avg(humidity)'], outputCol = 'features')
features_df = vectorAssembler.transform(combined_df)
features_df = features_df.select(['features', 'avg(P1)'])
test_features_df = vectorAssembler.transform(x_test)
train_features_df = vectorAssembler.transform(x_train)
test_features_df = test_features_df.withColumnRenamed("avg(P1)","label")
train_features_df = train_features_df.withColumnRenamed("avg(P1)","label")

lr = LinearRegression(featuresCol = 'features', labelCol='label', maxIter=50)
param_grid = ParamGridBuilder() \
            .addGrid(lr.regParam, [0.1, 0.3, 0.5]) \
            .addGrid(lr.elasticNetParam, [.5, .7, .9]) \
            .build()
cv = CrossValidator(estimator=lr, estimatorParamMaps=param_grid, evaluator=RegressionEvaluator(), numFolds=5).setParallelism(6)
train_features_df.show()

cvModel = cv.fit(train_features_df)
besty = cvModel.bestModel
print("  ElasticNetParam:", besty._java_obj.parent().getElasticNetParam())
print("  RegParam:", besty._java_obj.parent().getRegParam())
test_predictions = besty.transform(test_features_df)
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="label",
    predictionCol="prediction")
RMSE = evaluator.evaluate(test_predictions)
print(RMSE)
evaluator = RegressionEvaluator(
    metricName="r2",
    labelCol="label",
    predictionCol="prediction")
r2 = evaluator.evaluate(test_predictions)
print(r2)

+----------+-----------------+------------------+------------------+------------------+------------------+--------------------+
|        ts|    avg(pressure)|  avg(temperature)|     avg(humidity)|             label|           avg(P2)|            features|
+----------+-----------------+------------------+------------------+------------------+------------------+--------------------+
|2017-07-01|94572.18985080464| 33.33327613327619|32.792403355736745|17.764459663706905| 8.341274009698298|[94572.1898508046...|
|2017-07-02|94441.42854684066|28.197254514672572| 44.52180304740427| 9.846284524930946| 6.325375406399083|[94441.4285468406...|
|2017-07-03| 94668.7624325228| 18.25461707200771| 78.17694325226508| 20.35557791635181|17.195223293020813|[94668.7624325228...|
|2017-07-04|95313.96683276288| 22.32803235375923|  50.4074079911003| 8.984114511906204| 6.868896334621589|[95313.9668327628...|
|2017-07-05|95440.82530922632|23.534423652694652|44.841247660928104|10.412705222705208| 7.96403105903104

In [43]:
gbt = GBTRegressor(featuresCol = 'features', labelCol='label', maxIter=50)
param_grid = ParamGridBuilder() \
            .addGrid(gbt.maxDepth, [5, 10, 20, 30]) \
            .addGrid(gbt.maxBins, [8, 16, 32]) \
            .build()
cv = CrossValidator(estimator=gbt, estimatorParamMaps=param_grid, evaluator=RegressionEvaluator(), numFolds=5).setParallelism(6)
train_features_df.show()

cvModel = cv.fit(train_features_df)
besty = cvModel.bestModel
print("  max depth:", besty._java_obj.parent().getMaxDepth())
print("  max bins:", besty._java_obj.parent().getMaxBins())
test_predictions = besty.transform(test_features_df)
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="label",
    predictionCol="prediction")
RMSE = evaluator.evaluate(test_predictions)
print(RMSE)

+----------+-----------------+------------------+------------------+------------------+------------------+--------------------+
|        ts|    avg(pressure)|  avg(temperature)|     avg(humidity)|             label|           avg(P2)|            features|
+----------+-----------------+------------------+------------------+------------------+------------------+--------------------+
|2017-07-01|94572.18985080464| 33.33327613327619|32.792403355736745|17.764459663706905| 8.341274009698298|[94572.1898508046...|
|2017-07-02|94441.42854684066|28.197254514672572| 44.52180304740427| 9.846284524930946| 6.325375406399083|[94441.4285468406...|
|2017-07-03| 94668.7624325228| 18.25461707200771| 78.17694325226508| 20.35557791635181|17.195223293020813|[94668.7624325228...|
|2017-07-04|95313.96683276288| 22.32803235375923|  50.4074079911003| 8.984114511906204| 6.868896334621589|[95313.9668327628...|
|2017-07-05|95440.82530922632|23.534423652694652|44.841247660928104|10.412705222705208| 7.96403105903104

## Window Function Work

In [32]:
from pyspark.sql.window import Window
w = Window.orderBy("ts")
df1_train = x_train.withColumn("prev_avg(P1)", F.lag("avg(P1)").over(w))
df1_train.drop("avg(P2)").show()
df1_test = x_test.withColumn("prev_avg(P1)", F.lag("avg(P1)").over(w))
df1_test.drop("avg(P2)").show()

+----------+-----------------+------------------+------------------+------------------+------------------+
|        ts|    avg(pressure)|  avg(temperature)|     avg(humidity)|           avg(P1)|      prev_avg(P1)|
+----------+-----------------+------------------+------------------+------------------+------------------+
|2017-07-01|94572.18985080464| 33.33327613327619|32.792403355736745|17.764459663706905|              null|
|2017-07-02|94441.42854684066|28.197254514672572| 44.52180304740427| 9.846284524930946|17.764459663706905|
|2017-07-03| 94668.7624325228| 18.25461707200771| 78.17694325226508| 20.35557791635181| 9.846284524930946|
|2017-07-04|95313.96683276288| 22.32803235375923|  50.4074079911003| 8.984114511906204| 20.35557791635181|
|2017-07-05|95440.82530922632|23.534423652694652|44.841247660928104|10.412705222705208| 8.984114511906204|
|2017-07-06|95312.02019876736|25.778363851992495|  42.4970118595822| 10.85810864999049|10.412705222705208|
|2017-07-07|95248.96706425186|27.4691

In [33]:
df_lags_train = x_train.select("ts", "avg(P1)")
df_lags_test = x_test.select("ts", "avg(P1)")

for i in range(1, 8): 
    df_lags_train = df_lags_train.withColumn(f"P1_lag_{i}", F.lag(F.col('avg(P1)'), i).over(w))
    df_lags_test = df_lags_test.withColumn(f"P1_lag_{i}", F.lag(F.col('avg(P1)'), i).over(w))

In [34]:
lag_feature_df_train = df_lags_train.select("P1_lag_1", "P1_lag_2", "P1_lag_3", "P1_lag_4", "P1_lag_5", "P1_lag_6", "P1_lag_7", "avg(P1)")
lag_feature_df_train = lag_feature_df_train.where(col("P1_lag_7").isNotNull())
lag_feature_df_test = df_lags_test.select("P1_lag_1", "P1_lag_2", "P1_lag_3", "P1_lag_4", "P1_lag_5", "P1_lag_6", "P1_lag_7", "avg(P1)")
lag_feature_df_test = lag_feature_df_test.where(col("P1_lag_7").isNotNull())

## Lagging P1 values from the past 7 days

In [35]:
vectorAssembler = VectorAssembler(inputCols = ["P1_lag_1", "P1_lag_2", "P1_lag_3", "P1_lag_4", "P1_lag_5", "P1_lag_6", "P1_lag_7"], outputCol = 'features')
lag_df_train = vectorAssembler.transform(lag_feature_df_train)
lag_df_train = lag_df_train.select(['features', 'avg(P1)'])
lag_df_test = vectorAssembler.transform(lag_feature_df_test)
lag_df_test = lag_df_test.select(['features', 'avg(P1)'])

lag_df_test = lag_df_test.withColumnRenamed("avg(P1)","label")
lag_df_train = lag_df_train.withColumnRenamed("avg(P1)","label")

lr = LinearRegression(featuresCol = 'features', labelCol='label', maxIter=50)
param_grid = ParamGridBuilder() \
            .addGrid(lr.regParam, [0.1, 0.3, 0.5]) \
            .addGrid(lr.elasticNetParam, [.5, .7, .9]) \
            .build()
cv = CrossValidator(estimator=lr, estimatorParamMaps=param_grid, evaluator=RegressionEvaluator(), numFolds=5).setParallelism(6)

cvModel = cv.fit(lag_df_train)
besty = cvModel.bestModel
print("  ElasticNetParam:", besty._java_obj.parent().getElasticNetParam())
print("  RegParam:", besty._java_obj.parent().getRegParam())
test_predictions = besty.transform(lag_df_test)
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="label",
    predictionCol="prediction")
RMSE = evaluator.evaluate(test_predictions)
print(RMSE)
evaluator = RegressionEvaluator(
    metricName="r2",
    labelCol="label",
    predictionCol="prediction")
r2 = evaluator.evaluate(test_predictions)
print(r2)

  ElasticNetParam: 0.9
  RegParam: 0.5
1.9186891735786973
-0.38509498528318686


In [44]:
gbt = GBTRegressor(featuresCol = 'features', labelCol='label', maxIter=50)
param_grid = ParamGridBuilder() \
            .addGrid(gbt.maxDepth, [5, 10, 20, 30]) \
            .addGrid(gbt.maxBins, [8, 16, 32]) \
            .build()
cv = CrossValidator(estimator=gbt, estimatorParamMaps=param_grid, evaluator=RegressionEvaluator(), numFolds=5).setParallelism(6)

cvModel = cv.fit(lag_df_train)
besty = cvModel.bestModel
print("  max depth:", besty._java_obj.parent().getMaxDepth())
print("  max bins:", besty._java_obj.parent().getMaxBins())
test_predictions = besty.transform(lag_df_test)
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="label",
    predictionCol="prediction")
RMSE = evaluator.evaluate(test_predictions)
print(RMSE)

  max depth: 5
  max bins: 8
4.72753754333242


## Lagging P1 values from past  7 days AND lagging temps / pressures / humidities

In [3]:
zoo

NameError: name 'zoo' is not defined

In [None]:
# convert them into long rows with the lag information next...
         # groupBy sensor?
         # get averages for each day for each sensor
         # take the past 7 days lag information for variables
         # https://www.slideshare.net/SparkSummit/time-series-analytics-with-spark-spark-summit-east-talk-by-simon-ouellette
        

In [12]:
# https://medium.com/@sergey.ivanchuk/practical-pyspark-window-function-examples-cb5c7e1a3c41