In [63]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, split, explode,substring, length, udf
from pyspark.sql.types import *
from pyspark.sql import Row
from itertools import cycle
from pyspark.ml.regression import LinearRegression
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline

from pyspark.sql import functions as F
from pyspark.sql import types as T

In [64]:
# Get the active SparkSession, or build one with this app name and config option.
spark = (
    SparkSession.builder
    .appName("Python Spark SQL basic example")
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)

In [65]:
# Select Slope, Intercept and R2 per CATEGORY from the `summaryDF` view, with
# Prediction cast to Decimal(12).
# NOTE(review): `summaryDF` is not registered in any cell visible here; presumably
# an earlier step creates the view. Confirm before a fresh Restart & Run All.
# An ordering clause was tried earlier and left disabled: ORDER BY Prediction DESC
SQLQUERY = """
            SELECT CATEGORY, Slope
            , Intercept
            , R2
            , Cast(Prediction as Decimal(12)) as Prediction
            FROM summaryDF 
            """

regressionDF = spark.sql(SQLQUERY)

# Preview five rows without truncating long category names.
regressionDF.show(n=5, truncate=False)

+----------------------------------+--------------------+----------------------+------------------+-----------+
|CATEGORY                          |Slope               |Intercept             |R2                |Prediction |
+----------------------------------+--------------------+----------------------+------------------+-----------+
|Personal Finance                  |3.095318445488712E8 |-6.208465795394869E11 |0.5257188832329653|4407746449 |
|Reviews and Recommendations       |9.384350510535592E7 |-1.8807660100807297E11|0.6748252913534014|1487279305 |
|Health Care                       |1.0898642181609554E9|-2.1792508864081628E12|0.8747611056595744|22274834277|
|Application Performance Monitoring|1.4638639037091857E8|-2.9369013932035425E11|0.5668998218851391|2010369229 |
|Credit Cards                      |6.334192286089984E7 |-1.2700968121996451E11|0.5027780036421108|941002959  |
+----------------------------------+--------------------+----------------------+------------------+-----

In [66]:
from pyspark.ml.feature import VectorAssembler

# Columns that are packed into the single `features` vector column.
FEATURE_COLUMNS = ['Slope', 'R2']
vectorAssembler = VectorAssembler(inputCols=FEATURE_COLUMNS, outputCol='features')

# Append the vector column, then keep only the columns used downstream.
assembledDF = vectorAssembler.transform(regressionDF)
featureDF = assembledDF.select('CATEGORY', 'Slope', 'R2', 'features')

featureDF.show(5)

+--------------------+--------------------+------------------+--------------------+
|            CATEGORY|               Slope|                R2|            features|
+--------------------+--------------------+------------------+--------------------+
|    Personal Finance| 3.095318445488712E8|0.5257188832329653|[3.09531844548871...|
|Reviews and Recom...| 9.384350510535592E7|0.6748252913534014|[9.38435051053559...|
|         Health Care|1.0898642181609554E9|0.8747611056595744|[1.08986421816095...|
|Application Perfo...|1.4638639037091857E8|0.5668998218851391|[1.46386390370918...|
|        Credit Cards| 6.334192286089984E7|0.5027780036421108|[6.33419228608998...|
+--------------------+--------------------+------------------+--------------------+
only showing top 5 rows



In [67]:
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator

# Fit k-means with 3 clusters and a fixed seed on the assembled features.
# NOTE(review): the printed centers show Slope around 1e8-1e9 while R2 stays below 1,
# so the distance is presumably dominated by Slope. Consider scaling the features
# first; confirm whether clustering on Slope alone is intended.
kmeans = KMeans(k=3, seed=1)
model = kmeans.fit(featureDF)

# Add the `prediction` column (cluster index) to every row.
predictions = model.transform(featureDF)

# Score the clustering with a default-configured ClusteringEvaluator.
evaluator = ClusteringEvaluator()
silhouette = evaluator.evaluate(predictions)
print("Silhouette with squared euclidean distance = ", silhouette, sep="")

# Print the fitted cluster centers, one per line.
centers = model.clusterCenters()
print("Cluster Centers: ")
for center in centers:
    print(center)

Silhouette with squared euclidean distance = 0.888995261361734
Cluster Centers: 
[1.18933606e+08 5.94876400e-01]
[1.87848508e+09 6.79532094e-01]
[6.81651489e+08 6.60692117e-01]


In [68]:
# Preview the cluster assignments: first 20 rows, long cell values truncated.
predictions.show(n=20, truncate=True)

+--------------------+--------------------+------------------+--------------------+----------+
|            CATEGORY|               Slope|                R2|            features|prediction|
+--------------------+--------------------+------------------+--------------------+----------+
|    Personal Finance| 3.095318445488712E8|0.5257188832329653|[3.09531844548871...|         0|
|Reviews and Recom...| 9.384350510535592E7|0.6748252913534014|[9.38435051053559...|         0|
|         Health Care|1.0898642181609554E9|0.8747611056595744|[1.08986421816095...|         2|
|Application Perfo...|1.4638639037091857E8|0.5668998218851391|[1.46386390370918...|         0|
|        Credit Cards| 6.334192286089984E7|0.5027780036421108|[6.33419228608998...|         0|
|       Collaboration| 2.887291813020192E8|0.5577629546359806|[2.88729181302019...|         0|
|               Video|2.9897157098892593E8|0.6270858327162601|[2.98971570988925...|         0|
|   Retail Technology| 5.094359176492772E7|0.50658

In [69]:
# Collect the plotted columns to the driver exactly once. Calling collect() once per
# list would run the Spark job three times, and the lists would only stay aligned
# if every run returned rows in the same order.
cluster_rows = predictions.select('Slope', 'R2', 'prediction').collect()

slope = [row.Slope for row in cluster_rows]        # x-axis values
projected = [row.R2 for row in cluster_rows]       # y-axis values (R2, despite the name)
color = [row.prediction for row in cluster_rows]   # cluster index, used as point colour

In [None]:
# Scatter each category by Slope (x) against R2 (y), coloured by its cluster index.
fig, ax = plt.subplots(figsize=(15, 10))
ax.scatter(slope, projected, c=color)  # `projected` holds the R2 values
ax.set_xlabel('Slope')
ax.set_ylabel('R2')
ax.set_title('K-means clusters by Slope and R2')

# `dpi` (previously misspelled `pi`) sets the output resolution. The `quality`
# option only ever applied to JPEG output and is no longer accepted by savefig,
# so it is dropped for this PNG file.
fig.savefig('KMeansCluster.png', dpi=300)
plt.show()