In [1]:
### Application Development for Model Generation
### For the above steps wrote write an application to:
## Step 1. Clean and Transform the data
## Step 2. Develop the model and persist it

In [2]:
import os
from pyspark.sql import SparkSession, SQLContext
from pyspark import SparkContext, SparkConf

from pyspark.sql.functions import explode
from pyspark.sql.functions import countDistinct, avg
from pyspark.sql.functions import dayofmonth,dayofyear,year,month,hour,weekofyear,date_format
from pyspark.sql.functions import col as func_col
from pyspark.sql.functions import lit
from pyspark.sql.functions import *
from pyspark.ml import Pipeline

In [3]:
user_id = 'Edureka_749763'
app_name = '{0} : Spark SQL'.format(user_id)

In [4]:
#Configuration of the Spark Session
conf = SparkConf()  # create the configuration
conf.set('spark.driver.extraClassPath', "/usr/share/cmf/common_jars/mysql-connector-java-5.1.15.jar")  
conf.set('spark.executor.extraClassPath', "/usr/share/cmf/common_jars/mysql-connector-java-5.1.15.jar")
#conf.set('spark.driver.extraClassPath', "/usr/share/java/mysql-connector-java-5.1.42-bin.jar")  
#conf.set('spark.executor.extraClassPath', "/usr/share/java/mysql-connector-java-5.1.42-bin.jar")
#os.environ['SPARK_CLASSPATH'] = "/usr/share/java/mysql-connector-java-5.1.42-bin.jar" 
#Spark Session object
spark = SparkSession.builder.config(conf=conf).appName(app_name).getOrCreate()

In [5]:
## load training Data and then analyze it
training_csv_file = '/user/edureka_749763/bike-sharing-demand/train.csv'
train_df = spark.read.csv(training_csv_file, inferSchema=True, header=True)
train_df.show(5)

+--------------------+------+-------+----------+-------+----+------+--------+---------+------+----------+-----+
|            datetime|season|holiday|workingday|weather|temp| atemp|humidity|windspeed|casual|registered|count|
+--------------------+------+-------+----------+-------+----+------+--------+---------+------+----------+-----+
|2011-01-01 00:00:...|     1|      0|         0|      1|9.84|14.395|      81|      0.0|     3|        13|   16|
|2011-01-01 01:00:...|     1|      0|         0|      1|9.02|13.635|      80|      0.0|     8|        32|   40|
|2011-01-01 02:00:...|     1|      0|         0|      1|9.02|13.635|      80|      0.0|     5|        27|   32|
|2011-01-01 03:00:...|     1|      0|         0|      1|9.84|14.395|      75|      0.0|     3|        10|   13|
|2011-01-01 04:00:...|     1|      0|         0|      1|9.84|14.395|      75|      0.0|     0|         1|    1|
+--------------------+------+-------+----------+-------+----+------+--------+---------+------+----------

In [6]:
train_df.printSchema()

root
 |-- datetime: timestamp (nullable = true)
 |-- season: integer (nullable = true)
 |-- holiday: integer (nullable = true)
 |-- workingday: integer (nullable = true)
 |-- weather: integer (nullable = true)
 |-- temp: double (nullable = true)
 |-- atemp: double (nullable = true)
 |-- humidity: integer (nullable = true)
 |-- windspeed: double (nullable = true)
 |-- casual: integer (nullable = true)
 |-- registered: integer (nullable = true)
 |-- count: integer (nullable = true)



In [7]:
train_df.describe()

DataFrame[summary: string, season: string, holiday: string, workingday: string, weather: string, temp: string, atemp: string, humidity: string, windspeed: string, casual: string, registered: string, count: string]

In [8]:
train_df.describe().show()

+-------+------------------+-------------------+------------------+------------------+------------------+-----------------+------------------+------------------+-----------------+------------------+------------------+
|summary|            season|            holiday|        workingday|           weather|              temp|            atemp|          humidity|         windspeed|           casual|        registered|             count|
+-------+------------------+-------------------+------------------+------------------+------------------+-----------------+------------------+------------------+-----------------+------------------+------------------+
|  count|             10886|              10886|             10886|             10886|             10886|            10886|             10886|             10886|            10886|             10886|             10886|
|   mean|2.5066139996325556|0.02856880396839978|0.6808745177291935| 1.418427337865148|20.230859819952173|23.65508405291192| 61.8

In [9]:
# Convert the datetime into meaningful columns such as hour, day, month, year, etc
train_df=train_df.withColumn('day',dayofmonth(train_df["datetime"]))
train_df=train_df.withColumn('month',month(train_df["datetime"]))
train_df=train_df.withColumn('year',year(train_df["datetime"]))
train_df=train_df.withColumn('hour',hour(train_df["datetime"]))
train_df=train_df.drop("datetime")

In [10]:
train_df.show(5)

+------+-------+----------+-------+----+------+--------+---------+------+----------+-----+---+-----+----+----+
|season|holiday|workingday|weather|temp| atemp|humidity|windspeed|casual|registered|count|day|month|year|hour|
+------+-------+----------+-------+----+------+--------+---------+------+----------+-----+---+-----+----+----+
|     1|      0|         0|      1|9.84|14.395|      81|      0.0|     3|        13|   16|  1|    1|2011|   0|
|     1|      0|         0|      1|9.02|13.635|      80|      0.0|     8|        32|   40|  1|    1|2011|   1|
|     1|      0|         0|      1|9.02|13.635|      80|      0.0|     5|        27|   32|  1|    1|2011|   2|
|     1|      0|         0|      1|9.84|14.395|      75|      0.0|     3|        10|   13|  1|    1|2011|   3|
|     1|      0|         0|      1|9.84|14.395|      75|      0.0|     0|         1|    1|  1|    1|2011|   4|
+------+-------+----------+-------+----+------+--------+---------+------+----------+-----+---+-----+----+----+
o

In [11]:
# Explore how count varies with different features such as hour, month, etc
train_df.groupby("hour").count().orderBy("hour").show()
train_df.groupby("month").count().orderBy("month").show()
train_df.groupby("year").count().orderBy("year").show()
train_df.groupby("day").count().orderBy("day").show()
train_df.count()

+----+-----+
|hour|count|
+----+-----+
|   0|  455|
|   1|  454|
|   2|  448|
|   3|  433|
|   4|  442|
|   5|  452|
|   6|  455|
|   7|  455|
|   8|  455|
|   9|  455|
|  10|  455|
|  11|  455|
|  12|  456|
|  13|  456|
|  14|  456|
|  15|  456|
|  16|  456|
|  17|  456|
|  18|  456|
|  19|  456|
+----+-----+
only showing top 20 rows

+-----+-----+
|month|count|
+-----+-----+
|    1|  884|
|    2|  901|
|    3|  901|
|    4|  909|
|    5|  912|
|    6|  912|
|    7|  912|
|    8|  912|
|    9|  909|
|   10|  911|
|   11|  911|
|   12|  912|
+-----+-----+

+----+-----+
|year|count|
+----+-----+
|2011| 5422|
|2012| 5464|
+----+-----+

+---+-----+
|day|count|
+---+-----+
|  1|  575|
|  2|  573|
|  3|  573|
|  4|  574|
|  5|  575|
|  6|  572|
|  7|  574|
|  8|  574|
|  9|  575|
| 10|  572|
| 11|  568|
| 12|  573|
| 13|  574|
| 14|  574|
| 15|  574|
| 16|  574|
| 17|  575|
| 18|  563|
| 19|  574|
+---+-----+



10886

In [12]:
# Convert the season column into the string Data type for using the string indexer
train_df = train_df.withColumn("season", train_df["season"].cast(StringType()))
train_df.printSchema()
train_df.show()

train_df.toPandas().transpose().describe()

root
 |-- season: string (nullable = true)
 |-- holiday: integer (nullable = true)
 |-- workingday: integer (nullable = true)
 |-- weather: integer (nullable = true)
 |-- temp: double (nullable = true)
 |-- atemp: double (nullable = true)
 |-- humidity: integer (nullable = true)
 |-- windspeed: double (nullable = true)
 |-- casual: integer (nullable = true)
 |-- registered: integer (nullable = true)
 |-- count: integer (nullable = true)
 |-- day: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- year: integer (nullable = true)
 |-- hour: integer (nullable = true)

+------+-------+----------+-------+-----+------+--------+---------+------+----------+-----+---+-----+----+----+
|season|holiday|workingday|weather| temp| atemp|humidity|windspeed|casual|registered|count|day|month|year|hour|
+------+-------+----------+-------+-----+------+--------+---------+------+----------+-----+---+-----+----+----+
|     1|      0|         0|      1| 9.84|14.395|      81|      0.0|     3|

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,10876,10877,10878,10879,10880,10881,10882,10883,10884,10885
count,15,15,15,15,15,15,15,15,15,15,...,15,15,15,15,15,15,15,15,15,15
unique,10,10,11,10,8,10,9,10,9,11,...,14,14,14,14,14,13,14,14,13,14
top,0,1,1,1,1,1,0,1,1,1,...,1,1,1,1,1,19,1,1,12,1
freq,4,4,3,3,5,4,4,4,4,3,...,2,2,2,2,2,2,2,2,2,2


In [13]:
#### Handle NULL values
#train_df.fillna( { 'season':'0', 'temp':0.00, 'atemp':0.00, 'humidity':0, 'windspeed':0.0} )
train_df = train_df.dropna() 
train_df.show(5)
train_df.count()

+------+-------+----------+-------+----+------+--------+---------+------+----------+-----+---+-----+----+----+
|season|holiday|workingday|weather|temp| atemp|humidity|windspeed|casual|registered|count|day|month|year|hour|
+------+-------+----------+-------+----+------+--------+---------+------+----------+-----+---+-----+----+----+
|     1|      0|         0|      1|9.84|14.395|      81|      0.0|     3|        13|   16|  1|    1|2011|   0|
|     1|      0|         0|      1|9.02|13.635|      80|      0.0|     8|        32|   40|  1|    1|2011|   1|
|     1|      0|         0|      1|9.02|13.635|      80|      0.0|     5|        27|   32|  1|    1|2011|   2|
|     1|      0|         0|      1|9.84|14.395|      75|      0.0|     3|        10|   13|  1|    1|2011|   3|
|     1|      0|         0|      1|9.84|14.395|      75|      0.0|     0|         1|    1|  1|    1|2011|   4|
+------+-------+----------+-------+----+------+--------+---------+------+----------+-----+---+-----+----+----+
o

10886

In [14]:
#### ML Model training with Data 

import pandas as pd
from pylab import *
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml import Pipeline
#from pyspark.ml.feature import OneHotEncoderEstimator
 
stringIndexer = [StringIndexer(inputCol = "season", outputCol = 'seasonIndex')]
OHencoder = [OneHotEncoder(inputCol='seasonIndex', outputCol="season_cat")]
#OHencoder = OneHotEncoderEstimator(inputCols=['seasonIndex'], outputCols=["season_cat"])
assemblerInputs = ["season_cat"] + ['temp','atemp','humidity','windspeed']
Vectassembler = [VectorAssembler(inputCols=assemblerInputs, outputCol='features')]
stages = stringIndexer + OHencoder + Vectassembler

cols = train_df.columns
pipeline = Pipeline(stages = stringIndexer + OHencoder + Vectassembler)
pipelineModel = pipeline.fit(train_df)

train_df = pipelineModel.transform(train_df)
selectedCols = ['features']+cols
train_df = train_df.select(selectedCols)
train_df.printSchema()
train_df.show(5)




root
 |-- features: vector (nullable = true)
 |-- season: string (nullable = true)
 |-- holiday: integer (nullable = true)
 |-- workingday: integer (nullable = true)
 |-- weather: integer (nullable = true)
 |-- temp: double (nullable = true)
 |-- atemp: double (nullable = true)
 |-- humidity: integer (nullable = true)
 |-- windspeed: double (nullable = true)
 |-- casual: integer (nullable = true)
 |-- registered: integer (nullable = true)
 |-- count: integer (nullable = true)
 |-- day: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- year: integer (nullable = true)
 |-- hour: integer (nullable = true)

+--------------------+------+-------+----------+-------+----+------+--------+---------+------+----------+-----+---+-----+----+----+
|            features|season|holiday|workingday|weather|temp| atemp|humidity|windspeed|casual|registered|count|day|month|year|hour|
+--------------------+------+-------+----------+-------+----+------+--------+---------+------+----------+-

In [15]:
pd.DataFrame(train_df.take(100), columns=train_df.columns)

Unnamed: 0,features,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count,day,month,year,hour
0,"(0.0, 0.0, 0.0, 9.84, 14.395, 81.0, 0.0)",1,0,0,1,9.84,14.395,81,0.0000,3,13,16,1,1,2011,0
1,"(0.0, 0.0, 0.0, 9.02, 13.635, 80.0, 0.0)",1,0,0,1,9.02,13.635,80,0.0000,8,32,40,1,1,2011,1
2,"(0.0, 0.0, 0.0, 9.02, 13.635, 80.0, 0.0)",1,0,0,1,9.02,13.635,80,0.0000,5,27,32,1,1,2011,2
3,"(0.0, 0.0, 0.0, 9.84, 14.395, 75.0, 0.0)",1,0,0,1,9.84,14.395,75,0.0000,3,10,13,1,1,2011,3
4,"(0.0, 0.0, 0.0, 9.84, 14.395, 75.0, 0.0)",1,0,0,1,9.84,14.395,75,0.0000,0,1,1,1,1,2011,4
5,"[0.0, 0.0, 0.0, 9.84, 12.88, 75.0, 6.0032]",1,0,0,2,9.84,12.880,75,6.0032,0,1,1,1,1,2011,5
6,"(0.0, 0.0, 0.0, 9.02, 13.635, 80.0, 0.0)",1,0,0,1,9.02,13.635,80,0.0000,2,0,2,1,1,2011,6
7,"(0.0, 0.0, 0.0, 8.2, 12.88, 86.0, 0.0)",1,0,0,1,8.20,12.880,86,0.0000,1,2,3,1,1,2011,7
8,"(0.0, 0.0, 0.0, 9.84, 14.395, 75.0, 0.0)",1,0,0,1,9.84,14.395,75,0.0000,1,7,8,1,1,2011,8
9,"(0.0, 0.0, 0.0, 13.12, 17.425, 76.0, 0.0)",1,0,0,1,13.12,17.425,76,0.0000,8,6,14,1,1,2011,9


In [16]:
# Split between training and testing datasets
vtrain_df = train_df.select('features', 'count')
splits = vtrain_df.randomSplit([0.7, 0.3])
f_train_df = splits[0]
f_test_df = splits[1]

In [17]:
# LinearRegression
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

lr = LinearRegression(featuresCol = 'features', labelCol='count')
lr_model = lr.fit(f_train_df)
pred = lr_model.evaluate(f_test_df)
pred.predictions.show(1000)
trainingSummary = lr_model.summary
lr_predictions = lr_model.transform(f_test_df)
lr_predictions.select("prediction","count","features").show(5)
lr_evaluator = RegressionEvaluator(predictionCol="prediction", labelCol="count",metricName="r2")
print("lr_evaluator output = %g" % lr_evaluator.evaluate(lr_predictions))


+--------------------+-----+-------------------+
|            features|count|         prediction|
+--------------------+-----+-------------------+
|(7,[3,4,5],[2.46,...|   43|  34.40162228633605|
|(7,[3,4,5],[4.1,6...|  199|  56.26511963058158|
|(7,[3,4,5],[4.92,...|    3|  47.45380199151839|
|(7,[3,4,5],[4.92,...|   10| -36.61103190185037|
|(7,[3,4,5],[5.74,...|   59|   92.6405530055977|
|(7,[3,4,5],[5.74,...|   15|-27.596764555330367|
|(7,[3,4,5],[5.74,...|   24| -44.40973133400411|
|(7,[3,4,5],[5.74,...|    3| -64.02485924245678|
|(7,[3,4,5],[5.74,...|   22| -64.02485924245678|
|(7,[3,4,5],[6.56,...|    8|  40.27297817380635|
|(7,[3,4,5],[6.56,...|  219|  40.27297817380635|
|(7,[3,4,5],[6.56,...|    4|  26.26217252491155|
|(7,[3,4,5],[6.56,...|    2| 12.251366876016746|
|(7,[3,4,5],[6.56,...|   36| 12.251366876016746|
|(7,[3,4,5],[7.38,...|  122|  71.70453455855802|
|(7,[3,4,5],[8.2,1...|   53| 111.55266598990512|
|(7,[3,4,5],[8.2,1...|   96| 111.55266598990512|
|(7,[3,4,5],[8.2,1..

+------------------+-----+--------------------+
|        prediction|count|            features|
+------------------+-----+--------------------+
| 34.40162228633605|   43|(7,[3,4,5],[2.46,...|
| 56.26511963058158|  199|(7,[3,4,5],[4.1,6...|
| 47.45380199151839|    3|(7,[3,4,5],[4.92,...|
|-36.61103190185037|   10|(7,[3,4,5],[4.92,...|
|  92.6405530055977|   59|(7,[3,4,5],[5.74,...|
+------------------+-----+--------------------+
only showing top 5 rows

lr_evaluator output = 0.272643


In [18]:
# RandomForestRegressor
from pyspark.ml import Pipeline
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.feature import VectorIndexer
from pyspark.ml.evaluation import RegressionEvaluator

featureIndexer = VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(train_df)
(trainingData, testData) = train_df.randomSplit([0.7, 0.3])
rf = RandomForestRegressor(labelCol="count",featuresCol="indexedFeatures")
pipeline = Pipeline(stages=[featureIndexer, rf])
model = pipeline.fit(trainingData)
predictions = model.transform(testData)
predictions.select("prediction", "count","indexedFeatures").show(5)
evaluator = RegressionEvaluator(labelCol="count", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print("rmse = %g" % rmse)
rfModel = model.stages[0]
print(rfModel)

+------------------+-----+--------------------+
|        prediction|count|     indexedFeatures|
+------------------+-----+--------------------+
| 84.45425652474088|   51|(7,[3,4,5],[4.92,...|
| 80.21387487132554|   17|(7,[3,4,5],[5.74,...|
|  69.3230970841756|   10|(7,[3,4,5],[5.74,...|
|62.423359841066386|   24|(7,[3,4,5],[5.74,...|
|60.065983953690896|   22|(7,[3,4,5],[5.74,...|
+------------------+-----+--------------------+
only showing top 5 rows

rmse = 150.338
VectorIndexer_4e0bb5560f4eafd601f2


In [19]:
# DecisionTreeRegressor
from pyspark.ml import Pipeline
from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.ml.feature import VectorIndexer
from pyspark.ml.evaluation import RegressionEvaluator

featureIndexer =VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(train_df)
(trainingData, testData) = train_df.randomSplit([0.7, 0.3])
dt = DecisionTreeRegressor(labelCol="count",featuresCol="indexedFeatures")
pipeline = Pipeline(stages=[featureIndexer, dt])
model = pipeline.fit(trainingData)
predictions = model.transform(testData)
predictions.select("prediction", "count" , "features").show(100)
evaluator = RegressionEvaluator(labelCol="count", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print("rmse = %g" % rmse)
treeModel = model.stages[1]
print(treeModel)


+------------------+-----+--------------------+
|        prediction|count|            features|
+------------------+-----+--------------------+
|54.841911764705884|   64|(7,[3,4,5],[2.46,...|
|54.841911764705884|   43|(7,[3,4,5],[2.46,...|
|54.841911764705884|  199|(7,[3,4,5],[4.1,6...|
|54.841911764705884|   24|(7,[3,4,5],[4.1,9...|
|54.841911764705884|  158|(7,[3,4,5],[4.92,...|
|54.841911764705884|   51|(7,[3,4,5],[4.92,...|
|54.841911764705884|  210|(7,[3,4,5],[4.92,...|
|54.841911764705884|   17|(7,[3,4,5],[5.74,...|
|54.841911764705884|   22|(7,[3,4,5],[5.74,...|
|54.841911764705884|  149|(7,[3,4,5],[6.56,...|
|54.841911764705884|    5|(7,[3,4,5],[6.56,...|
|54.841911764705884|    8|(7,[3,4,5],[6.56,...|
|54.841911764705884|  219|(7,[3,4,5],[6.56,...|
|54.841911764705884|    4|(7,[3,4,5],[6.56,...|
|54.841911764705884|   52|(7,[3,4,5],[6.56,...|
| 99.38947368421053|   53|(7,[3,4,5],[8.2,1...|
|  73.0984251968504|   90|(7,[3,4,5],[8.2,1...|
|  73.0984251968504|  110|(7,[3,4,5],[8.

In [20]:
# GBTRegressor
from pyspark.ml import Pipeline
from pyspark.ml.regression import GBTRegressor
from pyspark.ml.feature import VectorIndexer
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml import PipelineModel

featureIndexer =VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(train_df)
(trainingData, testData) = train_df.randomSplit([0.7, 0.3])
gbt = GBTRegressor(labelCol="count",featuresCol="indexedFeatures", maxIter=10)
pipeline = Pipeline(stages=[featureIndexer, gbt])
model = pipeline.fit(trainingData)
model.write().overwrite().save("/user/edureka_749763/models_gradient_boost_regressor")

predictions = model.transform(testData)
predictions.select("prediction", "count" , "features").show(5)
evaluator = RegressionEvaluator(labelCol="count" , predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print("rmse = %g" % rmse)
gbtModel = model.stages[1]
print(gbtModel)


+------------------+-----+--------------------+
|        prediction|count|            features|
+------------------+-----+--------------------+
|54.350469933321996|   24|(7,[3,4,5],[4.1,9...|
| 49.44951711137391|   10|(7,[3,4,5],[4.92,...|
| 56.15039449071708|   72|(7,[3,4,5],[5.74,...|
|52.668817210105495|   10|(7,[3,4,5],[5.74,...|
| 49.44951711137391|   15|(7,[3,4,5],[5.74,...|
+------------------+-----+--------------------+
only showing top 5 rows

rmse = 151.203
GBTRegressionModel (uid=GBTRegressor_4b01960d7f7b31eb290d) with 10 trees


In [21]:
## GBTRegressionModel  gives better results

In [22]:
# GBTRegressionModel  gives better results

# Evaluate using Saved Model
reloaded_model = PipelineModel.load("/user/edureka_749763/models_gradient_boost_regressor")
predictions = reloaded_model.transform(testData)
predictions.select("prediction", "count" , "features").show(5)
evaluator = RegressionEvaluator(labelCol="count" , predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print("rmse = %g" % rmse)
gbtModel = reloaded_model.stages[1]
print(gbtModel)



+------------------+-----+--------------------+
|        prediction|count|            features|
+------------------+-----+--------------------+
|54.350469933321996|   24|(7,[3,4,5],[4.1,9...|
| 49.44951711137391|   10|(7,[3,4,5],[4.92,...|
| 56.15039449071708|   72|(7,[3,4,5],[5.74,...|
|52.668817210105495|   10|(7,[3,4,5],[5.74,...|
| 49.44951711137391|   15|(7,[3,4,5],[5.74,...|
+------------------+-----+--------------------+
only showing top 5 rows

rmse = 151.203
GBTRegressionModel (uid=GBTRegressor_4b01960d7f7b31eb290d) with 10 trees


In [32]:
df_mysql = (spark.read.format("jdbc")
    .option("url", 'jdbc:mysql://dbserver.edu.cloudlab.com/labuser_database')
    .option("driver", "com.mysql.jdbc.Driver")
    .option("dbtable", "bike_predictions")
    .option("user", 'edu_labuser')
    .option("password", "edureka")
    .load()
)
df_mysql.show()

+------+-------+----------+-------+-----+------+--------+---------+------+----------+-----+---+-----+----+----+
|season|holiday|workingday|weather| temp| atemp|humidity|windspeed|casual|registered|count|day|month|year|hour|
+------+-------+----------+-------+-----+------+--------+---------+------+----------+-----+---+-----+----+----+
|     1|      0|         0|      1| 9.84|14.395|      81|      0.0|     3|        13|   16|  1|    1|2011|   0|
|     1|      0|         0|      1| 9.02|13.635|      80|      0.0|     8|        32|   40|  1|    1|2011|   1|
|     1|      0|         0|      1| 9.02|13.635|      80|      0.0|     5|        27|   32|  1|    1|2011|   2|
|     1|      0|         0|      1| 9.84|14.395|      75|      0.0|     3|        10|   13|  1|    1|2011|   3|
|     1|      0|         0|      1| 9.84|14.395|      75|      0.0|     0|         1|    1|  1|    1|2011|   4|
|     1|      0|         0|      2| 9.84| 12.88|      75|   6.0032|     0|         1|    1|  1|    1|201

In [34]:
sqlContext = SQLContext(spark.sparkContext)
dataframe_mysql = (sqlContext.read.format("jdbc")
                   .option("url", "jdbc:mysql://dbserver.edu.cloudlab.com/labuser_database")
                   .option("driver", "com.mysql.jdbc.Driver")
                   .option("dbtable", "bike_predictions")
                   .option("user", "edu_labuser")
                   .option("password", "edureka")
                   .load()
                  )
dataframe_mysql.show()

+------+-------+----------+-------+-----+------+--------+---------+------+----------+-----+---+-----+----+----+
|season|holiday|workingday|weather| temp| atemp|humidity|windspeed|casual|registered|count|day|month|year|hour|
+------+-------+----------+-------+-----+------+--------+---------+------+----------+-----+---+-----+----+----+
|     1|      0|         0|      1| 9.84|14.395|      81|      0.0|     3|        13|   16|  1|    1|2011|   0|
|     1|      0|         0|      1| 9.02|13.635|      80|      0.0|     8|        32|   40|  1|    1|2011|   1|
|     1|      0|         0|      1| 9.02|13.635|      80|      0.0|     5|        27|   32|  1|    1|2011|   2|
|     1|      0|         0|      1| 9.84|14.395|      75|      0.0|     3|        10|   13|  1|    1|2011|   3|
|     1|      0|         0|      1| 9.84|14.395|      75|      0.0|     0|         1|    1|  1|    1|2011|   4|
|     1|      0|         0|      2| 9.84| 12.88|      75|   6.0032|     0|         1|    1|  1|    1|201