In [1]:
import os
from pyspark.sql import SparkSession, SQLContext
from pyspark import SparkContext, SparkConf

from pyspark.sql.functions import explode
from pyspark.sql.functions import countDistinct, avg
from pyspark.sql.functions import dayofmonth,dayofyear,year,month,hour,weekofyear,date_format
from pyspark.sql.functions import col as func_col
from pyspark.sql.functions import lit
from pyspark.sql.functions import *
from pyspark.ml import Pipeline

In [2]:
# Lab account identifier; it is embedded in the Spark application name below.
user_id = 'Edureka_749763'
# Human-readable application name, "<user_id> : Spark SQL".
app_name = '{user} : Spark SQL'.format(user=user_id)

In [3]:
# Spark session configuration.
# The same MySQL connector jar is placed on both the driver and the executor
# class paths (presumably so the JDBC driver class can be loaded on either
# side -- confirm against the cluster setup).
mysql_connector_jar = "/usr/share/cmf/common_jars/mysql-connector-java-5.1.15.jar"
# Alternative connector location, kept for reference:
#   /usr/share/java/mysql-connector-java-5.1.42-bin.jar
#   (previously also tried through os.environ['SPARK_CLASSPATH'])
conf = SparkConf().setAll([
    ('spark.driver.extraClassPath', mysql_connector_jar),
    ('spark.executor.extraClassPath', mysql_connector_jar),
])

# Create the Spark session (or reuse an existing one) with this configuration
# and the application name.
spark = (
    SparkSession.builder
    .config(conf=conf)
    .appName(app_name)
    .getOrCreate()
)

In [4]:
# Read the `bike_predictions` table from MySQL over JDBC.
#
# Connection settings can be supplied through environment variables so that
# credentials do not have to live in the notebook. The fallbacks are the
# values this notebook has always used, so the cell still runs unchanged when
# nothing is exported.
# NOTE(review): remove the fallback password once the variables are provisioned.
jdbc_url = os.environ.get('BIKE_DB_URL', 'jdbc:mysql://dbserver.edu.cloudlab.com/labuser_database')
jdbc_user = os.environ.get('BIKE_DB_USER', 'edu_labuser')
jdbc_password = os.environ.get('BIKE_DB_PASSWORD', "edureka")

df_mysql = (spark.read.format("jdbc")
    .option("url", jdbc_url)
    .option("driver", "com.mysql.jdbc.Driver")
    .option("dbtable", "bike_predictions")
    .option("user", jdbc_user)
    .option("password", jdbc_password)
    .load()
)
# Preview the loaded table (show() with no argument prints up to 20 rows).
df_mysql.show()

+------+-------+----------+-------+-----+------+--------+---------+------+----------+-----+---+-----+----+----+
|season|holiday|workingday|weather| temp| atemp|humidity|windspeed|casual|registered|count|day|month|year|hour|
+------+-------+----------+-------+-----+------+--------+---------+------+----------+-----+---+-----+----+----+
|     1|      0|         0|      1| 9.84|14.395|      81|      0.0|     3|        13|   16|  1|    1|2011|   0|
|     1|      0|         0|      1| 9.02|13.635|      80|      0.0|     8|        32|   40|  1|    1|2011|   1|
|     1|      0|         0|      1| 9.02|13.635|      80|      0.0|     5|        27|   32|  1|    1|2011|   2|
|     1|      0|         0|      1| 9.84|14.395|      75|      0.0|     3|        10|   13|  1|    1|2011|   3|
|     1|      0|         0|      1| 9.84|14.395|      75|      0.0|     0|         1|    1|  1|    1|2011|   4|
|     1|      0|         0|      2| 9.84| 12.88|      75|   6.0032|     0|         1|    1|  1|    1|201

In [5]:
# Load the test data set from CSV: the first line is treated as the header
# and column types are inferred by Spark.
test_csv_file = '/user/edureka_749763/bike-sharing-demand/test.csv'
test_df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv(test_csv_file)
)
# Preview the first five rows.
test_df.show(5)

+--------------------+------+-------+----------+-------+-----+------+--------+---------+
|            datetime|season|holiday|workingday|weather| temp| atemp|humidity|windspeed|
+--------------------+------+-------+----------+-------+-----+------+--------+---------+
|2011-01-20 00:00:...|     1|      0|         1|      1|10.66|11.365|      56|  26.0027|
|2011-01-20 01:00:...|     1|      0|         1|      1|10.66|13.635|      56|      0.0|
|2011-01-20 02:00:...|     1|      0|         1|      1|10.66|13.635|      56|      0.0|
|2011-01-20 03:00:...|     1|      0|         1|      1|10.66| 12.88|      56|  11.0014|
|2011-01-20 04:00:...|     1|      0|         1|      1|10.66| 12.88|      56|  11.0014|
+--------------------+------+-------+----------+-------+-----+------+--------+---------+
only showing top 5 rows



In [6]:
import pandas as pd
from pylab import *
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml import Pipeline, PipelineModel

# Build a `features` vector column on the test data: `season` is string-indexed
# and one-hot encoded, then assembled together with the four numeric weather
# columns. The original input columns are kept alongside `features`.
#
# NOTE(review): the StringIndexer is fitted on the test data here. Its index
# assignment depends on the label frequencies of the data it is fitted on, so
# the encoding may differ from the one used when the model was trained --
# confirm, or reuse the fitted feature pipeline from training.

# Columns generated by this cell. Dropping them first (a no-op when they are
# absent) makes the cell safe to re-run: without it a second execution fails
# because `features` already exists in test_df.
generatedCols = ['season' + 'Index', 'season_cat', 'features']
raw_test_df = test_df.drop(*generatedCols)
cols = raw_test_df.columns

stringIndexer = StringIndexer(inputCol="season", outputCol='season' + 'Index')
OHencoder = OneHotEncoder(inputCol=stringIndexer.getOutputCol(), outputCol="season_cat")
assemblerInputs = ["season_cat"] + ['temp', 'atemp', 'humidity', 'windspeed']
Vectassembler = VectorAssembler(inputCols=assemblerInputs, outputCol='features')
stages = [stringIndexer, OHencoder, Vectassembler]

pipeline = Pipeline(stages=stages)
pipelineModel = pipeline.fit(raw_test_df)

# Keep only the feature vector plus the original columns (the intermediate
# index / one-hot columns are not selected).
selectedCols = ['features'] + cols
test_df = pipelineModel.transform(raw_test_df).select(selectedCols)
test_df.show(5)

+--------------------+--------------------+------+-------+----------+-------+-----+------+--------+---------+
|            features|            datetime|season|holiday|workingday|weather| temp| atemp|humidity|windspeed|
+--------------------+--------------------+------+-------+----------+-------+-----+------+--------+---------+
|[0.0,0.0,1.0,10.6...|2011-01-20 00:00:...|     1|      0|         1|      1|10.66|11.365|      56|  26.0027|
|[0.0,0.0,1.0,10.6...|2011-01-20 01:00:...|     1|      0|         1|      1|10.66|13.635|      56|      0.0|
|[0.0,0.0,1.0,10.6...|2011-01-20 02:00:...|     1|      0|         1|      1|10.66|13.635|      56|      0.0|
|[0.0,0.0,1.0,10.6...|2011-01-20 03:00:...|     1|      0|         1|      1|10.66| 12.88|      56|  11.0014|
|[0.0,0.0,1.0,10.6...|2011-01-20 04:00:...|     1|      0|         1|      1|10.66| 12.88|      56|  11.0014|
+--------------------+--------------------+------+-------+----------+-------+-----+------+--------+---------+
only showi

In [None]:
# Score the prepared test data with the previously saved pipeline model.
reloaded_model = PipelineModel.load("/user/edureka_749763/models_gradient_boost_regressor")
scored_df = reloaded_model.transform(test_df)

# Preview each prediction next to the feature vector it was computed from.
scored_df.select("prediction", "features").show(5)

# Remove the vector columns, keeping the input columns plus `prediction`.
predictions = scored_df.drop("features", "indexedFeatures")
predictions.show(5)


+-----------------+--------------------+
|       prediction|            features|
+-----------------+--------------------+
| 95.0677949329888|[0.0,0.0,1.0,10.6...|
|43.58663693854156|[0.0,0.0,1.0,10.6...|
|43.58663693854156|[0.0,0.0,1.0,10.6...|
|97.83113259769293|[0.0,0.0,1.0,10.6...|
|97.83113259769293|[0.0,0.0,1.0,10.6...|
+-----------------+--------------------+
only showing top 5 rows

+--------------------+------+-------+----------+-------+-----+------+--------+---------+-----------------+
|            datetime|season|holiday|workingday|weather| temp| atemp|humidity|windspeed|       prediction|
+--------------------+------+-------+----------+-------+-----+------+--------+---------+-----------------+
|2011-01-20 00:00:...|     1|      0|         1|      1|10.66|11.365|      56|  26.0027| 95.0677949329888|
|2011-01-20 01:00:...|     1|      0|         1|      1|10.66|13.635|      56|      0.0|43.58663693854156|
|2011-01-20 02:00:...|     1|      0|         1|      1|10.66|13.635|  

In [None]:
# Save the predictions to MySQL over JDBC.
#
# Connection settings can be supplied through environment variables so that
# credentials do not have to live in the notebook. The fallbacks are the
# values this notebook has always used, so the cell still runs unchanged when
# nothing is exported.
# NOTE(review): remove the fallback password once the variables are provisioned.
jdbc_url = os.environ.get('BIKE_DB_URL', 'jdbc:mysql://dbserver.edu.cloudlab.com/labuser_database')
jdbc_user = os.environ.get('BIKE_DB_USER', 'edu_labuser')
jdbc_password = os.environ.get('BIKE_DB_PASSWORD', 'edureka')

# NOTE: mode('append') adds rows on every execution, so re-running this cell
# inserts the same predictions again.
# NOTE(review): the table name is spelled 'bike_prediciton_results'
# ("prediciton"). It is left unchanged because it may match an existing table
# name -- confirm before correcting.
predictions.write.mode('append').format('jdbc').options(
    url=jdbc_url,
    driver='com.mysql.jdbc.Driver',
    dbtable='bike_prediciton_results',
    user=jdbc_user,
    password=jdbc_password
).save()