In [1]:
import pyspark
from pyspark import *
from pyspark.sql import *
from pyspark.sql.types import *
from pyspark.sql.functions import *
# from pyspark.sql.functions import isnan, when, count, col  # To count Null values
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.stat import Statistics

In [2]:
df = sqlContext.read.format("com.databricks.spark.csv").option("header", "true").option("inferSchema", "true").option("delimiter", ";").load("household_power_consumption.txt")

# OR all null values drop at begining

# df1=spark.read.format("csv").option("header", "true").option("inferSchema", "true").option("nullvalue",'NA').option("delimiter", ";").load("household_power_consumption.txt")

In [3]:
# Changing Dtypes Manually of dataframe
df=df.withColumn("Date",df.Date.cast("timestamp"))
df=df.withColumn("Time",df.Time.cast("timestamp"))
df=df.withColumn("Global_active_power",df.Global_active_power.cast("Double"))
df=df.withColumn("Global_reactive_power",df.Global_reactive_power.cast("Double"))
df=df.withColumn("Voltage",df.Voltage.cast("Double"))
df=df.withColumn("Global_intensity",df.Global_intensity.cast("Double"))
df=df.withColumn("Sub_metering_1",df.Sub_metering_1.cast("Double"))
df=df.withColumn("Sub_metering_2",df.Sub_metering_2.cast("Double"))

In [4]:
# Dropping a Column full of null values
df=df.drop("Date")

In [5]:
# Splitting Date and time from Time column 
split_col = pyspark.sql.functions.split(df['Time'], ' ')
df = df.withColumn('Date', split_col.getItem(0))
df = df.withColumn('Time', split_col.getItem(1))

In [6]:
# Checking Null values in our Dataframe
a=df.select([count(when(col(c).isNull(), c)).alias(c) for c in df.columns])

In [7]:
# printing number of missing values
a.show()

+----+-------------------+---------------------+-------+----------------+--------------+--------------+--------------+----+
|Time|Global_active_power|Global_reactive_power|Voltage|Global_intensity|Sub_metering_1|Sub_metering_2|Sub_metering_3|Date|
+----+-------------------+---------------------+-------+----------------+--------------+--------------+--------------+----+
|   0|              25979|                25979|  25979|           25979|         25979|         25979|         25979|   0|
+----+-------------------+---------------------+-------+----------------+--------------+--------------+--------------+----+



In [8]:
# filtering out not null values respect to column ( Removing null values by removing rows contain null)
df=df.filter(df.Global_active_power.isNotNull())
df=df.filter(df.Sub_metering_3.isNotNull())

In [9]:
df.show()

+--------+-------------------+---------------------+-------+----------------+--------------+--------------+--------------+----------+
|    Time|Global_active_power|Global_reactive_power|Voltage|Global_intensity|Sub_metering_1|Sub_metering_2|Sub_metering_3|      Date|
+--------+-------------------+---------------------+-------+----------------+--------------+--------------+--------------+----------+
|17:24:00|              4.216|                0.418| 234.84|            18.4|           0.0|           1.0|          17.0|2019-02-22|
|17:25:00|               5.36|                0.436| 233.63|            23.0|           0.0|           1.0|          16.0|2019-02-22|
|17:26:00|              5.374|                0.498| 233.29|            23.0|           0.0|           2.0|          17.0|2019-02-22|
|17:27:00|              5.388|                0.502| 233.74|            23.0|           0.0|           1.0|          17.0|2019-02-22|
|17:28:00|              3.666|                0.528| 235.68|  

In [10]:
df.columns

['Time',
 'Global_active_power',
 'Global_reactive_power',
 'Voltage',
 'Global_intensity',
 'Sub_metering_1',
 'Sub_metering_2',
 'Sub_metering_3',
 'Date']

In [11]:
# Importing library for machine learning
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StandardScaler
from pyspark.ml import Pipeline
from pyspark.sql.functions import *

In [19]:
columns=['Global_reactive_power','Voltage','Global_intensity','Sub_metering_1','Sub_metering_2','Sub_metering_3']

In [13]:
# finding Correlation with global_active_power
for i in columns:
    e=df.corr("Global_active_power",i)
    print(i +" is correlated : %s" %e)

Global_reactive_power is correlated : 0.24701705264406495
Voltage is correlated : -0.39976160962884133
Global_intensity is correlated : 0.9988886002095948
Sub_metering_1 is correlated : 0.48440127517742076
Sub_metering_2 is correlated : 0.4345687175985004
Sub_metering_3 is correlated : 0.6385554235386937


In [14]:
df.describe().toPandas().transpose()

Unnamed: 0,0,1,2,3,4
summary,count,mean,stddev,min,max
Time,2049280,,,00:00:00,23:59:00
Global_active_power,2049280,1.0916150365005453,1.0572941610939872,0.076,11.122
Global_reactive_power,2049280,0.12371447630388106,0.11272197955071599,0.0,1.39
Voltage,2049280,240.8398579745112,3.239986679010419,223.2,254.15
Global_intensity,2049280,4.627759310588589,4.44439625978614,0.2,48.4
Sub_metering_1,2049280,1.1219233096502186,6.153031089701279,0.0,88.0
Sub_metering_2,2049280,1.2985199679887571,5.822026473177491,0.0,80.0
Sub_metering_3,2049280,6.45844735712055,8.437153908665486,0.0,31.0
Date,2049280,,,2019-02-22,2019-02-22


In [15]:
from pyspark.ml.feature import VectorAssembler

In [16]:
vectorAssembler = VectorAssembler(inputCols = columns, outputCol = 'features')

In [17]:
v_df = vectorAssembler.transform(df)

In [18]:
v_df = v_df.select(['features', 'Global_active_power'])
v_df.show(3)

+--------------------+-------------------+
|            features|Global_active_power|
+--------------------+-------------------+
|[0.418,234.84,18....|              4.216|
|[0.436,233.63,23....|               5.36|
|[0.498,233.29,23....|              5.374|
+--------------------+-------------------+
only showing top 3 rows



In [20]:
splits = v_df.randomSplit([0.7, 0.3])
train_df = splits[0]
test_df = splits[1]

In [21]:
from pyspark.ml.regression import LinearRegression

In [22]:
lr = LinearRegression(featuresCol = 'features', labelCol='Global_active_power', maxIter=10, regParam=0.3, elasticNetParam=0.8)

In [23]:
lr_model = lr.fit(train_df)

In [24]:
print("Coefficients: " + str(lr_model.coefficients))
print("Intercept: " + str(lr_model.intercept))

Coefficients: [0.0,0.0,0.1737352215546131,0.0,0.0,0.0]
Intercept: 0.2875228948191185


In [25]:
trainingSummary = lr_model.summary
print("RMSE: %f" % trainingSummary.rootMeanSquaredError)
print("r2: %f" % trainingSummary.r2)

RMSE: 0.288304
r2: 0.925656


In [26]:
train_df.describe().show()

+-------+-------------------+
|summary|Global_active_power|
+-------+-------------------+
|  count|            1434426|
|   mean| 1.0912569752639423|
| stddev|  1.057373723094851|
|    min|              0.076|
|    max|             11.122|
+-------+-------------------+



In [27]:
lr_predictions = lr_model.transform(test_df)
lr_predictions.select("prediction","Global_active_power","features").show()

+------------------+-------------------+--------------------+
|        prediction|Global_active_power|            features|
+------------------+-------------------+--------------------+
| 0.426511072062809|              0.162|(6,[1,2],[225.69,...|
|0.3917640277518864|              0.116|(6,[1,2],[226.14,...|
|0.3917640277518864|              0.116|(6,[1,2],[226.2,0...|
|0.3917640277518864|              0.116|(6,[1,2],[226.21,...|
|0.3917640277518864|              0.116|(6,[1,2],[226.27,...|
|0.3917640277518864|              0.116|(6,[1,2],[226.36,...|
|0.3917640277518864|              0.116|(6,[1,2],[226.41,...|
|0.3917640277518864|              0.116|(6,[1,2],[226.48,...|
| 0.426511072062809|              0.154|(6,[1,2],[226.5,0...|
|0.3917640277518864|              0.116|(6,[1,2],[226.62,...|
|0.3917640277518864|              0.116|(6,[1,2],[226.69,...|
|0.3917640277518864|              0.116|(6,[1,2],[226.72,...|
|0.3917640277518864|              0.116|(6,[1,2],[226.83,...|
|0.39176

In [28]:
from pyspark.ml.evaluation import RegressionEvaluator

In [29]:
lr_evaluator = RegressionEvaluator(predictionCol="prediction", \
                 labelCol="Global_active_power",metricName="r2")
print("R Squared (R2) on test data = %g" % lr_evaluator.evaluate(lr_predictions))

R Squared (R2) on test data = 0.925607


In [30]:
test_result = lr_model.evaluate(test_df)
print("Root Mean Squared Error (RMSE) on test data = %g" % test_result.rootMeanSquaredError)

Root Mean Squared Error (RMSE) on test data = 0.288326


In [31]:
predictions = lr_model.transform(test_df)
predictions.select("prediction","Global_active_power","features").show()

+------------------+-------------------+--------------------+
|        prediction|Global_active_power|            features|
+------------------+-------------------+--------------------+
| 0.426511072062809|              0.162|(6,[1,2],[225.69,...|
|0.3917640277518864|              0.116|(6,[1,2],[226.14,...|
|0.3917640277518864|              0.116|(6,[1,2],[226.2,0...|
|0.3917640277518864|              0.116|(6,[1,2],[226.21,...|
|0.3917640277518864|              0.116|(6,[1,2],[226.27,...|
|0.3917640277518864|              0.116|(6,[1,2],[226.36,...|
|0.3917640277518864|              0.116|(6,[1,2],[226.41,...|
|0.3917640277518864|              0.116|(6,[1,2],[226.48,...|
| 0.426511072062809|              0.154|(6,[1,2],[226.5,0...|
|0.3917640277518864|              0.116|(6,[1,2],[226.62,...|
|0.3917640277518864|              0.116|(6,[1,2],[226.69,...|
|0.3917640277518864|              0.116|(6,[1,2],[226.72,...|
|0.3917640277518864|              0.116|(6,[1,2],[226.83,...|
|0.39176