In [1]:
import pandas as pd
import os
import warnings
# Silence every Python warning for the rest of the session. This keeps cell
# output short, but it also hides library warnings raised by later cells.
warnings.filterwarnings('ignore')

from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler

# Fixed seed handed to the `random_state` arguments used further down.
random_state = 772_659

# Data Description

In [11]:
raw_df_path = "household_power_consumption.txt"
# The file is ';'-separated. Declaring "?" as a missing-value marker lets pandas
# parse the measurement columns as floats (with NaN for gaps) instead of leaving
# them as strings, so the later dropna() sees every incomplete row.
# NOTE(review): the UCI household power consumption file marks missing readings
# with "?" -- confirm this holds for the local copy.
raw_df = pd.read_csv(raw_df_path, sep=";", na_values=["?"])

In [12]:
# Show the first ten records with the columns laid out as rows.
raw_df.head(10).T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
Date,16/12/2006,16/12/2006,16/12/2006,16/12/2006,16/12/2006,16/12/2006,16/12/2006,16/12/2006,16/12/2006,16/12/2006
Time,17:24:00,17:25:00,17:26:00,17:27:00,17:28:00,17:29:00,17:30:00,17:31:00,17:32:00,17:33:00
Global_active_power,4.216,5.360,5.374,5.388,3.666,3.520,3.702,3.700,3.668,3.662
Global_reactive_power,0.418,0.436,0.498,0.502,0.528,0.522,0.520,0.520,0.510,0.510
Voltage,234.840,233.630,233.290,233.740,235.680,235.020,235.090,235.220,233.990,233.860
Global_intensity,18.400,23.000,23.000,23.000,15.800,15.000,15.800,15.800,15.800,15.800
Sub_metering_1,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000
Sub_metering_2,1.000,1.000,2.000,1.000,1.000,2.000,1.000,1.000,1.000,2.000
Sub_metering_3,17.0,16.0,17.0,17.0,17.0,17.0,17.0,17.0,17.0,16.0


# Data Preprocessing

In [13]:
# Keep the seven measurement columns at positions 2-8 (Date and Time are left
# out). Any non-numeric placeholder is coerced to NaN first, so dropna() removes
# every incomplete row instead of astype(float) failing on a stray string.
feature_df = (
    raw_df.iloc[:, 2:9]
    .apply(pd.to_numeric, errors="coerce")
    .dropna()
    .astype(float)
)

In [14]:
# Preview the first five rows of the numeric feature table.
feature_df.head(n=5)

Unnamed: 0,Global_active_power,Global_reactive_power,Voltage,Global_intensity,Sub_metering_1,Sub_metering_2,Sub_metering_3
0,4.216,0.418,234.84,18.4,0.0,1.0,17.0
1,5.36,0.436,233.63,23.0,0.0,1.0,16.0
2,5.374,0.498,233.29,23.0,0.0,2.0,17.0
3,5.388,0.502,233.74,23.0,0.0,1.0,17.0
4,3.666,0.528,235.68,15.8,0.0,1.0,17.0


In [15]:
# Standardize column-wise: subtract each column's mean, then divide by its
# standard deviation (pandas' default, sample std).
feature_df = feature_df.sub(feature_df.mean()).div(feature_df.std())

In [16]:
# Preview the first five rows again, now on the standardized values.
feature_df.head(n=5)

Unnamed: 0,Global_active_power,Global_reactive_power,Voltage,Global_intensity,Sub_metering_1,Sub_metering_2,Sub_metering_3
0,2.955076,2.61072,-1.851816,3.098788,-0.182337,-0.051274,1.24942
1,4.037084,2.770405,-2.225274,4.133799,-0.182337,-0.051274,1.130897
2,4.050325,3.320431,-2.330213,4.133799,-0.182337,0.120487,1.24942
3,4.063566,3.355916,-2.191323,4.133799,-0.182337,-0.051274,1.24942
4,2.434881,3.586572,-1.592555,2.513781,-0.182337,-0.051274,1.24942


# Performance measuring

# Scikit-learn - KMeans

## Time measure

In [17]:
from sklearn.cluster import KMeans as SKKmeans

In [20]:
# 3-cluster KMeans capped at 100 iterations. The notebook-wide seed is passed so
# centroid initialization (and therefore the result) is reproducible.
sk_model = SKKmeans(n_clusters=3, max_iter=100, random_state=random_state)

In [21]:
%%timeit -r 5 
sk_model = SKKmeans(n_clusters=3, max_iter=100)
sk_model.fit(feature_df)

10.3 s ± 292 ms per loop (mean ± std. dev. of 5 runs, 1 loop each)


In [25]:
# Fit the model that the following cells evaluate. The seed keeps the reported
# inertia and silhouette reproducible across kernel restarts.
sk_model = SKKmeans(n_clusters=3, max_iter=100, random_state=random_state)
sk_model.fit(feature_df)

In [26]:
# Inspect the inertia_ attribute of the scikit-learn model fitted in the previous cell.
sk_model.inertia_

3571969.304298058

In [27]:
from sklearn.metrics import silhouette_score

In [36]:
# Silhouette of the scikit-learn clustering, scored on a random subsample of
# 100,000 rows; random_state fixes which rows are drawn.
silhouette_score(
    feature_df,
    sk_model.labels_,
    sample_size=100000,
    random_state=random_state,
)

0.30672153537156116

# Spark - KMeans

In [37]:

# Connect to the Spark master at spark://spark:7077, reusing an existing
# session if one is already running.
spark = (
    SparkSession.builder
    .master("spark://spark:7077")
    .appName("Ass3")
    .getOrCreate()
)
# The pandas -> Spark DataFrame conversion itself is done in a later cell.


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/11/15 21:52:38 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [40]:
# Every column of the feature table is packed into one vector column
# named "features".
input_cols = list(feature_df.columns)
transformer = VectorAssembler(outputCol="features", inputCols=input_cols)

def pandas_to_spark(pd_df):
    """Convert a pandas frame into a Spark DataFrame with one "features" column.

    The frame is loaded through the notebook-level `spark` session, run through
    the notebook-level `transformer`, and reduced to the "features" column.
    """
    return transformer.transform(spark.createDataFrame(pd_df)).select("features")

In [None]:
# Build the Spark training frame (a single "features" vector column) from the
# standardized pandas feature table.
spark_train_df = pandas_to_spark(feature_df)

## Time measure

In [None]:
from pyspark.ml.clustering import KMeans as SparkKMeans

In [20]:
%%timeit -r 5 
lsvc = SparkKMeans(k=3, maxIter=100)
lsvc.fit(spark_train_df)

23/11/15 16:43:36 WARN TaskSetManager: Stage 0 contains a task of very large size (2497 KiB). The maximum recommended task size is 1000 KiB.
23/11/15 16:43:42 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
23/11/15 16:43:42 WARN TaskSetManager: Stage 1 contains a task of very large size (2497 KiB). The maximum recommended task size is 1000 KiB.
23/11/15 16:43:42 WARN TaskSetManager: Stage 2 contains a task of very large size (2497 KiB). The maximum recommended task size is 1000 KiB.
23/11/15 16:43:42 WARN TaskSetManager: Stage 3 contains a task of very large size (2497 KiB). The maximum recommended task size is 1000 KiB.
23/11/15 16:43:42 WARN TaskSetManager: Stage 4 contains a task of very large size (2497 KiB). The maximum recommended task size is 1000 KiB.
23/11/15 16:43:43 WARN TaskSetManager: Stage 5 contains a task of very large size (2497 KiB). The maximum recommended task size is 1000 KiB.
23/11/15 16:43:43 WARN TaskSetManager: Stage 6 

4.36 s ± 261 ms per loop (mean ± std. dev. of 5 runs, 1 loop each)


In [21]:
# Fit a Spark KMeans model outside the timing cell. `lsvc` first holds the
# estimator and is then rebound to the fitted model; the seed makes the
# resulting clustering reproducible.
lsvc = SparkKMeans(k=3, maxIter=100, seed=random_state)
lsvc = lsvc.fit(spark_train_df)


23/11/15 16:44:10 WARN TaskSetManager: Stage 558 contains a task of very large size (2497 KiB). The maximum recommended task size is 1000 KiB.
23/11/15 16:44:10 WARN TaskSetManager: Stage 559 contains a task of very large size (2497 KiB). The maximum recommended task size is 1000 KiB.
23/11/15 16:44:10 WARN TaskSetManager: Stage 560 contains a task of very large size (2497 KiB). The maximum recommended task size is 1000 KiB.
23/11/15 16:44:10 WARN TaskSetManager: Stage 561 contains a task of very large size (2497 KiB). The maximum recommended task size is 1000 KiB.
23/11/15 16:44:10 WARN TaskSetManager: Stage 562 contains a task of very large size (2497 KiB). The maximum recommended task size is 1000 KiB.
23/11/15 16:44:10 WARN TaskSetManager: Stage 563 contains a task of very large size (2497 KiB). The maximum recommended task size is 1000 KiB.
23/11/15 16:44:10 WARN TaskSetManager: Stage 564 contains a task of very large size (2497 KiB). The maximum recommended task size is 1000 KiB.

## Prediction evaluation

In [24]:
# Fit Spark KMeans and score its cluster assignments with the same sampled
# silhouette metric used for the scikit-learn model.
lsvc = SparkKMeans(k=3, maxIter=100, seed=random_state)
lsvc = lsvc.fit(spark_train_df)
# The notebook never builds a separate test frame, so assign clusters to the
# training frame; its rows come from feature_df, which the metric is scored on.
pred = lsvc.transform(spark_train_df)

# collect() yields Row objects; unwrap them into a flat list of cluster ids.
# NOTE(review): assumes Spark returns rows in the same order as feature_df -- confirm.
y_pred = [row["prediction"] for row in pred.select("prediction").collect()]

# `acc` keeps its original name but holds a silhouette score, not an accuracy.
acc = silhouette_score(feature_df, y_pred, sample_size=100000, random_state=random_state)
print("silhouette_score: ", acc)

23/11/15 16:47:18 WARN TaskSetManager: Stage 744 contains a task of very large size (2497 KiB). The maximum recommended task size is 1000 KiB.
23/11/15 16:47:19 WARN TaskSetManager: Stage 745 contains a task of very large size (2497 KiB). The maximum recommended task size is 1000 KiB.
23/11/15 16:47:19 WARN TaskSetManager: Stage 746 contains a task of very large size (2497 KiB). The maximum recommended task size is 1000 KiB.
23/11/15 16:47:19 WARN TaskSetManager: Stage 747 contains a task of very large size (2497 KiB). The maximum recommended task size is 1000 KiB.
23/11/15 16:47:19 WARN TaskSetManager: Stage 748 contains a task of very large size (2497 KiB). The maximum recommended task size is 1000 KiB.
23/11/15 16:47:19 WARN TaskSetManager: Stage 749 contains a task of very large size (2497 KiB). The maximum recommended task size is 1000 KiB.
23/11/15 16:47:19 WARN TaskSetManager: Stage 750 contains a task of very large size (2497 KiB). The maximum recommended task size is 1000 KiB.

Prediction Accuracy:  0.8108108108108109
F1 score:


0.8864306784660767

In [25]:
# Shut down the Spark session created earlier in the notebook.
spark.stop()