In [1]:
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.ml import Pipeline
from pyspark.ml.regression import GBTRegressor
from pyspark.ml.feature import VectorIndexer,StringIndexer, VectorAssembler
from pyspark.ml.evaluation import RegressionEvaluator

In [2]:
bart_df = spark.read.csv("s3a://raw-data-2016-bart-weather/bart-data2016/date-hour-soo-dest-2016.csv")

In [3]:
bart_trans_df = bart_df.select(col("_c0").alias('date'),col("_c1").alias("hour"),col("_c2").alias("origin_station"),
                         col("_c3").alias("destination_station"),col("_c4").alias("people"),
                         dayofmonth(col("_c0")).alias("day_of_month"),month(col("_c0")).alias("month_n") )

In [4]:
bart_fix_types_df =bart_trans_df.withColumn("people",bart_trans_df['people'].cast(IntegerType()))

In [5]:
#grouped_bart_df = 
bart_grouped = bart_fix_types_df.groupby("month_n",'day_of_month','destination_station').sum('people')
bart_grouped_rename = bart_grouped.select("month_n","day_of_month","destination_station",
                                          col("sum(people)").alias("total_exits"))

In [6]:
# ready for our model
bart_grouped_rename.show()

+-------+------------+-------------------+-----------+
|month_n|day_of_month|destination_station|total_exits|
+-------+------------+-------------------+-----------+
|      1|           4|               CIVC|      19792|
|      1|          13|               19TH|      13751|
|      1|          19|               POWL|      26823|
|      1|          25|               DALY|       8902|
|      1|          25|               RICH|       4443|
|      1|          26|               24TH|      14010|
|      1|          27|               ASHB|       5870|
|      1|          31|               16TH|       6650|
|      2|          18|               WDUB|       3675|
|      2|          20|               12TH|       5378|
|      2|          22|               DELN|       9488|
|      2|          22|               ORIN|       3097|
|      2|          29|               SHAY|       3196|
|      3|          29|               ORIN|       3196|
|      4|           5|               EMBR|      47860|
|      4| 

In [7]:
## Next bring in the weather data

In [8]:
weather_df = spark.read.csv("s3a://raw-data-2016-bart-weather/weather-data-2016/weather-historical-2016-sf",header=True)

In [9]:
weather_select = weather_df.select("temp-avg",'temp-low','temp-high',
                                   'humidity-avg','seapress-avg','wind-avg','precip-inches',
                                   dayofmonth(col('Month')).alias("day_of_month_weather"),
                                  month(col("Month")).alias("month_n_weather"))

In [10]:
grouped_data = weather_select.join(bart_grouped_rename, on =
                                  [weather_select["month_n_weather"]==bart_grouped_rename["month_n"], \
                                   weather_select["day_of_month_weather"]==bart_grouped_rename["day_of_month"]])

In [11]:
ml_df = grouped_data.select("temp-avg","temp-low","temp-high","humidity-avg","seapress-avg","wind-avg",
                    "precip-inches","month_n","day_of_month","destination_station","total_exits")

In [13]:
## Index the string station name to a numeric name
ml_df =StringIndexer(inputCol="destination_station", outputCol="indexedStation").fit(ml_df ).transform(ml_df)

In [14]:
ml_df.show(2)

+--------+--------+---------+------------+------------+--------+-------------+-------+------------+-------------------+-----------+--------------+
|temp-avg|temp-low|temp-high|humidity-avg|seapress-avg|wind-avg|precip-inches|month_n|day_of_month|destination_station|total_exits|indexedStation|
+--------+--------+---------+------------+------------+--------+-------------+-------+------------+-------------------+-----------+--------------+
|      52|      50|       54|          85|        30.2|       7|            0|      1|          10|               CAST|        881|          24.0|
|      54|      48|       59|          77|       30.13|      15|         0.33|      1|          13|               19TH|      13751|          22.0|
+--------+--------+---------+------------+------------+--------+-------------+-------+------------+-------------------+-----------+--------------+
only showing top 2 rows



In [15]:
# confirm that our groupby worked
# ml_df.where((col("month_n")==1) & (col("destination_station")=='CIVC')) .show(50)

In [16]:
bart_trans_df.withColumn("people",bart_trans_df['people'].cast(IntegerType()))

DataFrame[date: string, hour: string, origin_station: string, destination_station: string, people: int, day_of_month: int, month_n: int]

In [17]:
# set columns as integers
ml_df = ml_df.withColumn("temp-avg",ml_df['temp-avg'].cast(IntegerType()))
ml_df = ml_df.withColumn("temp-low",ml_df['temp-low'].cast(IntegerType()))
ml_df = ml_df.withColumn("temp-high",ml_df['temp-high'].cast(IntegerType()))
ml_df = ml_df.withColumn("humidity-avg",ml_df['humidity-avg'].cast(IntegerType()))
ml_df = ml_df.withColumn("seapress-avg",ml_df['seapress-avg'].cast(IntegerType()))
ml_df = ml_df.withColumn("wind-avg",ml_df['wind-avg'].cast(IntegerType()))
ml_df = ml_df.withColumn("precip-inches",ml_df['precip-inches'].cast(IntegerType()))
ml_df.cache()

DataFrame[temp-avg: int, temp-low: int, temp-high: int, humidity-avg: int, seapress-avg: int, wind-avg: int, precip-inches: int, month_n: int, day_of_month: int, destination_station: string, total_exits: bigint, indexedStation: double]

In [18]:
trainingData, testData = ml_df.randomSplit([0.7, 0.3])
train_rows = trainingData.count()
test_rows = testData.count()
print("There are {} rows of training and {} rows of testing data".format(train_rows,test_rows))

There are 5702 rows of training and 2487 rows of testing data


In [32]:
gb_assembler = VectorAssembler(inputCols=['temp-avg',
 'temp-low',
 'temp-high',
 'humidity-avg',
 'seapress-avg',
 'wind-avg',
 'precip-inches',
 'month_n',
 'day_of_month',
 'indexedStation',
 'total_exits'], outputCol="features")
training = gb_assembler.transform(trainingData).select(col("features"),col("total_exits").alias("label-exits"))
training.show()
## cache the model to test hyperparameters
training.cache()

+--------------------+-----------+
|            features|label-exits|
+--------------------+-----------+
|[51.0,45.0,56.0,5...|       4301|
|[51.0,48.0,54.0,5...|       6650|
|[52.0,50.0,54.0,8...|        881|
|[53.0,45.0,60.0,7...|       5378|
|[54.0,49.0,58.0,8...|       8902|
|[54.0,49.0,58.0,8...|       4443|
|[55.0,49.0,60.0,7...|       8070|
|[56.0,47.0,64.0,6...|       3196|
|[56.0,49.0,62.0,7...|       7982|
|[56.0,50.0,61.0,7...|       3675|
|[58.0,48.0,68.0,7...|       9488|
|[58.0,52.0,63.0,8...|       7252|
|[58.0,53.0,62.0,8...|       5130|
|[58.0,53.0,62.0,8...|      26823|
|[59.0,53.0,64.0,6...|       6202|
|[59.0,53.0,65.0,7...|       5321|
|[60.0,52.0,67.0,8...|       3196|
|[60.0,52.0,68.0,8...|       4675|
|[60.0,53.0,67.0,8...|       1566|
|[61.0,54.0,67.0,8...|       9282|
+--------------------+-----------+
only showing top 20 rows



DataFrame[features: vector, label-exits: bigint]

In [40]:
gb_model = GBTRegressor(labelCol="label-exits",featuresCol="features",maxIter=200,maxDepth=3,maxBins=50)
gb_model_trained = gb_model.fit(training)

In [41]:
testing = gb_assembler.transform(testData).select(col("features"),col("total_exits").alias("label-exits-true"))
testing.show()
## cache the model to test hyperparameters
testing.cache()

+--------------------+----------------+
|            features|label-exits-true|
+--------------------+----------------+
|[52.0,46.0,57.0,8...|           14010|
|[53.0,47.0,58.0,8...|           19792|
|[54.0,48.0,59.0,7...|           13751|
|[55.0,49.0,60.0,7...|            4055|
|[55.0,49.0,61.0,8...|            5870|
|[58.0,48.0,68.0,7...|            3097|
|[61.0,56.0,65.0,7...|            6519|
|[65.0,50.0,79.0,5...|           47860|
|[67.0,55.0,78.0,6...|           14987|
|[71.0,57.0,84.0,5...|            3357|
|[54.0,45.0,63.0,7...|            4644|
|[56.0,47.0,64.0,6...|            6931|
|[58.0,53.0,62.0,8...|            5674|
|[59.0,48.0,70.0,6...|            5529|
|[60.0,48.0,71.0,7...|            7056|
|[61.0,52.0,70.0,7...|            5581|
|[61.0,56.0,65.0,7...|           28433|
|[61.0,56.0,66.0,8...|            1331|
|[63.0,55.0,70.0,7...|            9299|
|[67.0,54.0,80.0,9...|            8399|
+--------------------+----------------+
only showing top 20 rows



DataFrame[features: vector, label-exits-true: bigint]

In [42]:
prediction = gb_model_trained.transform(testing)
predicted = prediction.select(col("features"),"prediction","label-exits-true")
predicted.show()

+--------------------+------------------+----------------+
|            features|        prediction|label-exits-true|
+--------------------+------------------+----------------+
|[52.0,46.0,57.0,8...|14151.195758889318|           14010|
|[53.0,47.0,58.0,8...|22589.490141147173|           19792|
|[54.0,48.0,59.0,7...|13655.782118741836|           13751|
|[55.0,49.0,60.0,7...|4073.5359256222882|            4055|
|[55.0,49.0,61.0,8...| 5828.050678583003|            5870|
|[58.0,48.0,68.0,7...|3040.6595956371975|            3097|
|[61.0,56.0,65.0,7...| 6496.733250098102|            6519|
|[65.0,50.0,79.0,5...| 50845.08057795934|           47860|
|[67.0,55.0,78.0,6...|14350.477385933322|           14987|
|[71.0,57.0,84.0,5...|3350.8958005100376|            3357|
|[54.0,45.0,63.0,7...| 4827.678057528416|            4644|
|[56.0,47.0,64.0,6...| 6881.440379702534|            6931|
|[58.0,53.0,62.0,8...|5722.0800705642505|            5674|
|[59.0,48.0,70.0,6...| 5461.830648176785|            552

In [43]:
evaluator = RegressionEvaluator(
    labelCol="label-exits-true", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predicted)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

Root Mean Squared Error (RMSE) on test data = 981.354


In [50]:
# save our model
gb_model_trained.save("s3a://predicting-bart-ridership-model/")