In [1]:
!pip install pyspark
!pip install findspark

# Import SparkSession
from pyspark.sql import SparkSession

# Create a Spark Session
spark = SparkSession.builder.master("local[*]").getOrCreate()

# Check Spark Session Information
spark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.3.2.tar.gz (281.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m281.4/281.4 MB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting py4j==0.10.9.5
  Downloading py4j-0.10.9.5-py2.py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.7/199.7 KB[0m [31m13.5 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.3.2-py2.py3-none-any.whl size=281824025 sha256=4608ac91bd4b5fb56ff6458ff818ea64491b2cc57b1db9167140f39017b574fd
  Stored in directory: /root/.cache/pip/wheels/6c/e3/9b/0525ce8a69478916513509d43693511463c6468db0de237c86
Successfully built pyspark
Installing collected packages: py4j, pyspa

In [2]:
# (1) Import the required Python dependencies
# NOTE(review): matplotlib (plt) and pandas (pd) are imported but not used in
# the cells visible here — confirm whether they are still needed.
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import findspark
# NOTE(review): the SparkSession `spark` is already created in the first cell,
# i.e. before this call runs — presumably findspark.init() is redundant with a
# pip-installed pyspark; confirm before removing.
findspark.init()
# NOTE(review): SparkContext and SparkConf are not referenced in the cells visible here.
from pyspark import SparkContext, SparkConf
# SQLContext is used to build `sqlContext`, the entry point for reading the CSV.
from pyspark.sql import SQLContext
# VectorAssembler packs the selected predictor columns into one 'features' vector column.
from pyspark.ml.feature import VectorAssembler
# LinearRegression is the estimator fitted on the training DataFrame.
from pyspark.ml.regression import LinearRegression
# NOTE(review): RegressionEvaluator is not referenced in the cells visible here;
# the test metrics come from linear_regression_model.evaluate() instead.
from pyspark.ml.evaluation import RegressionEvaluator

In [3]:
# (2) Wrap the existing SparkSession in a SQLContext
# SQLContext's first positional parameter is a SparkContext, not a SparkSession,
# so pass the session's underlying context explicitly and hand the session
# itself over as `sparkSession` so the SQLContext is bound to it.
sqlContext = SQLContext(spark.sparkContext, sparkSession=spark)



In [4]:
# (3) Load the Bike Sharing dataset into a Spark DataFrame
# Location of the daily Bike Sharing CSV; adjust when the file lives elsewhere.
BIKE_SHARING_CSV_PATH = '/content/day.csv'
# Read with Spark's built-in CSV source ('com.databricks.spark.csv' is the
# legacy external-package name for it). The first row is treated as the header
# and column types are inferred from the data.
bike_sharing_df = (
    sqlContext.read
    .format('csv')
    .options(header='true', inferSchema='true')
    .load(BIKE_SHARING_CSV_PATH)
)
# Preview the first five rows
bike_sharing_df.show(5)

+-------+-------------------+------+---+----+-------+-------+----------+----------+--------+--------+--------+---------+------+----------+----+
|instant|             dteday|season| yr|mnth|holiday|weekday|workingday|weathersit|    temp|   atemp|     hum|windspeed|casual|registered| cnt|
+-------+-------------------+------+---+----+-------+-------+----------+----------+--------+--------+--------+---------+------+----------+----+
|      1|2011-01-01 00:00:00|     1|  0|   1|      0|      6|         0|         2|0.344167|0.363625|0.805833| 0.160446|   331|       654| 985|
|      2|2011-01-02 00:00:00|     1|  0|   1|      0|      0|         0|         2|0.363478|0.353739|0.696087| 0.248539|   131|       670| 801|
|      3|2011-01-03 00:00:00|     1|  0|   1|      0|      1|         1|         1|0.196364|0.189405|0.437273| 0.248309|   120|      1229|1349|
|      4|2011-01-04 00:00:00|     1|  0|   1|      0|      2|         1|         1|     0.2|0.212122|0.590435| 0.160296|   108|      145

In [5]:
# (4) Measure how strongly each retained column correlates with the dependent variable 'cnt'
independent_variables = [
    'season', 'yr', 'mnth', 'holiday', 'weekday', 'workingday',
    'weathersit', 'temp', 'atemp', 'hum', 'windspeed',
]
dependent_variable = ['cnt']
# Keep only the candidate predictors plus the label column
selected_columns = independent_variables + dependent_variable
bike_sharing_df = bike_sharing_df.select(selected_columns)
# Every retained column is compared against 'cnt' (including 'cnt' itself)
for column_name in bike_sharing_df.columns:
    correlation = bike_sharing_df.stat.corr('cnt', column_name)
    print("Correlation to CNT for ", column_name, correlation)

Correlation to CNT for  season 0.40610037079863526
Correlation to CNT for  yr 0.5667097078680867
Correlation to CNT for  mnth 0.2799771122192702
Correlation to CNT for  holiday -0.06834771589248398
Correlation to CNT for  weekday 0.06744341241063072
Correlation to CNT for  workingday 0.06115606306052115
Correlation to CNT for  weathersit -0.29739123883466345
Correlation to CNT for  temp 0.6274940090334915
Correlation to CNT for  atemp 0.6310656998491827
Correlation to CNT for  hum -0.1006585621371548
Correlation to CNT for  windspeed -0.2345449974216706
Correlation to CNT for  cnt 1.0


In [6]:
# (5) Assemble the chosen predictor columns into a single 'features' vector column
multivariate_feature_columns = ['season', 'yr', 'mnth', 'temp', 'atemp']
multivariate_label_column = 'cnt'
vector_assembler = VectorAssembler(
    inputCols=multivariate_feature_columns,
    outputCol='features',
)
assembled_df = vector_assembler.transform(bike_sharing_df)
# Retain only the feature vector and the label
bike_sharing_features_df = assembled_df.select(['features', multivariate_label_column])
# Show the first ten assembled rows
bike_sharing_features_df.head(10)

[Row(features=DenseVector([1.0, 0.0, 1.0, 0.3442, 0.3636]), cnt=985),
 Row(features=DenseVector([1.0, 0.0, 1.0, 0.3635, 0.3537]), cnt=801),
 Row(features=DenseVector([1.0, 0.0, 1.0, 0.1964, 0.1894]), cnt=1349),
 Row(features=DenseVector([1.0, 0.0, 1.0, 0.2, 0.2121]), cnt=1562),
 Row(features=DenseVector([1.0, 0.0, 1.0, 0.227, 0.2293]), cnt=1600),
 Row(features=DenseVector([1.0, 0.0, 1.0, 0.2043, 0.2332]), cnt=1606),
 Row(features=DenseVector([1.0, 0.0, 1.0, 0.1965, 0.2088]), cnt=1510),
 Row(features=DenseVector([1.0, 0.0, 1.0, 0.165, 0.1623]), cnt=959),
 Row(features=DenseVector([1.0, 0.0, 1.0, 0.1383, 0.1162]), cnt=822),
 Row(features=DenseVector([1.0, 0.0, 1.0, 0.1508, 0.1509]), cnt=1321)]

In [7]:
# (6) Randomly partition the feature DataFrame into training and test DataFrames
split_weights = [0.75, 0.25]
# A fixed seed is supplied to the random split
train_df, test_df = bike_sharing_features_df.randomSplit(split_weights, seed=12345)
# Row counts of the two partitions, displayed as the cell output
(train_df.count(), test_df.count())

(557, 174)

In [8]:
# (7) Fit a multivariate linear regression on the training DataFrame
linear_regression = LinearRegression(
    labelCol=multivariate_label_column,
    featuresCol='features',
)
linear_regression_model = linear_regression.fit(train_df)

In [9]:
# (8) Report the fitted model's parameters and its summary statistics on the training data
coefficients_text = str(linear_regression_model.coefficients)
print("Model Coefficients: " + coefficients_text)
intercept_text = str(linear_regression_model.intercept)
print("Intercept: " + intercept_text)

# Goodness-of-fit metrics taken from the model's training summary
training_summary = linear_regression_model.summary
print("RMSE: %f" % training_summary.rootMeanSquaredError)
print("R-SQUARED: %f" % training_summary.r2)

# Descriptive statistics of the training DataFrame
print("TRAINING DATASET DESCRIPTIVE SUMMARY: ")
training_description_df = train_df.describe()
training_description_df.show()

# Residuals from the training summary
print("TRAINING DATASET RESIDUALS: ")
residuals_df = training_summary.residuals
residuals_df.show()

Model Coefficients: [534.0177726438907,2084.9239629668486,-55.03776131002569,1970.2427278852049,4075.8038004166997]
Intercept: -429.568050037851
RMSE: 1020.791883
R-SQUARED: 0.741306
TRAINING DATASET DESCRIPTIVE SUMMARY: 
+-------+------------------+
|summary|               cnt|
+-------+------------------+
|  count|               557|
|   mean| 4452.924596050269|
| stddev|2008.7877956551522|
|    min|                22|
|    max|              8714|
+-------+------------------+

TRAINING DATASET RESIDUALS: 
+-------------------+
|          residuals|
+-------------------+
|  492.8146219296144|
|  343.5680817861023|
|  694.0439959365583|
| 26.531944914031783|
|  359.4205334916021|
| 286.70066962118153|
| -76.81748122988392|
| 100.06802676016605|
| 118.21745978028912|
|-116.32956447171364|
| 500.57361327319313|
| 336.18467766471053|
| -898.0633881851779|
| 140.72567686761067|
| 222.20520745930662|
|-203.19195963266293|
|  253.9718393749538|
|-193.50928837829565|
| 203.45874925472208|
| -

In [None]:
# (9) Score the Test DataFrame with the trained model to obtain predictions
test_linear_regression_predictions_df = linear_regression_model.transform(test_df)
print("TEST DATASET PREDICTIONS AGAINST ACTUAL LABEL: ")
# Show the prediction next to the actual label and the input features
preview_columns = ["prediction", multivariate_label_column, "features"]
test_linear_regression_predictions_df.select(*preview_columns).show(10)

TEST DATASET PREDICTIONS AGAINST ACTUAL LABEL: 
+------------------+----+--------------------+
|        prediction| cnt|            features|
+------------------+----+--------------------+
| 976.0746662395563|1321|[1.0,0.0,1.0,0.15...|
|  1050.01248129665| 959|[1.0,0.0,1.0,0.16...|
| 1162.187088514198|1263|[1.0,0.0,1.0,0.16...|
|1289.1224676804804|1510|[1.0,0.0,1.0,0.19...|
| 1300.636622198192|1098|[1.0,0.0,1.0,0.19...|
|1308.9999163839534|1562|[1.0,0.0,1.0,0.2,...|
|1235.6848532836393|1746|[1.0,0.0,2.0,0.18...|
|1384.2702586148866|1472|[1.0,0.0,2.0,0.22...|
|1549.7784033067871|1526|[1.0,0.0,2.0,0.26...|
| 1906.531084586451|2115|[1.0,0.0,2.0,0.31...|
+------------------+----+--------------------+
only showing top 10 rows



In [None]:
# (10) Evaluate the performance of our Linear Regression Model on the Test DataFrame
test_summary = linear_regression_model.evaluate(test_df)
# Use the same fixed-point format (%f) as the training report so that the
# training and test metrics are printed at the same precision and can be
# compared directly.
print("RMSE on Test Data = %f" % test_summary.rootMeanSquaredError)
print("R-SQUARED on Test Data = %f" % test_summary.r2)

RMSE on Test Data = 964.597
R-SQUARED on Test Data = 0.739356


In [None]:
# (11) Stop the Spark session (`spark` is the SparkSession created in the first cell)
spark.stop()