In [102]:
from pyspark.sql import SQLContext
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StandardScaler

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

%matplotlib inline
print('Spark',pyspark.__version__)

Spark 2.3.1


In [103]:
from pyspark.sql import SparkSession

# Build the session used by every later cell; getOrCreate() hands back an
# already-running session when there is one instead of starting a second.
spark = (
    SparkSession.builder
    .appName("Python Spark SQL basic example")
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)

In [104]:
# Read the clustering CSV, using its first row as the column names.
# No schema is supplied here; the numeric casts happen in the next cell.
data = (
    spark.read
    .format("csv")
    .option("header", "true")
    .load("./w3_clustering.csv")
)

In [105]:
# Keep only the two clustering inputs, each cast to float.
data = data.select(
    data["totalAdClicks"].cast("float"),
    data["revenue"].cast("float"),
)

In [106]:
# Show the column names and types of `data` after the float casts.
data.printSchema()

root
 |-- totalAdClicks: float (nullable = true)
 |-- revenue: float (nullable = true)



In [107]:
# Number of rows in `data`.
data.count()

709

In [89]:
# Bare expression: the cell output is the DataFrame's repr (column names and types).
data

DataFrame[totalAdClicks: float, revenue: float]

In [90]:
# Summary statistics, pulled into pandas and flipped so that each input
# column becomes a row.
data.describe().toPandas().T

Unnamed: 0,0,1,2,3,4
summary,count,mean,stddev,min,max
totalAdClicks,709,32.208744710860366,16.384120817042554,1.0,73.0
revenue,709,44.64880112834979,44.944528765285284,1.0,278.0


In [91]:
# Discard rows that contain nulls (dropna() is the DataFrame-level alias of na.drop()).
data = data.dropna()

In [92]:
# List the column names of `data`; these are the inputs passed to VectorAssembler below.
data.columns

['totalAdClicks', 'revenue']

<br><br>

 
<p style="font-family: Arial; font-size:1.5em;color:#2462C0; font-weight:bold">

Use VectorAssembler to gather all the features:

</p>

 
<br><br>

In [115]:
# Pack the two numeric columns into a single vector column, which is the
# input format the scaler expects.
vecAssembler = VectorAssembler(
    inputCols=['totalAdClicks', 'revenue'],
    outputCol="features_unscaled",
)
# Keep just the assembled vector column for the downstream stages.
df_kmeans = vecAssembler.transform(data).select(vecAssembler.getOutputCol())
df_kmeans.show()

+-----------------+
|features_unscaled|
+-----------------+
|      [22.0,12.0]|
|      [20.0,11.0]|
|      [53.0,51.0]|
|      [48.0,61.0]|
|       [23.0,3.0]|
|      [14.0,25.0]|
|      [34.0,90.0]|
|       [3.0,13.0]|
|      [24.0,12.0]|
|        [5.0,4.0]|
|      [34.0,75.0]|
|      [43.0,35.0]|
|      [53.0,49.0]|
|       [20.0,2.0]|
|       [14.0,6.0]|
|       [10.0,8.0]|
|      [36.0,27.0]|
|       [16.0,6.0]|
|      [16.0,70.0]|
|       [21.0,8.0]|
+-----------------+
only showing top 20 rows



<br><br>

 
<p style="font-family: Arial; font-size:1.5em;color:#2462C0; font-weight:bold">

Scale the features using StandardScaler:

</p>

 
<br><br>

In [122]:
# Standardize each feature (withMean=True centers it, withStd=True scales it
# to unit standard deviation) so neither input dominates the distance metric.
scaler = StandardScaler(inputCol="features_unscaled", outputCol="features", withStd=True, withMean=True)
# Fit and transform the assembled frame produced by the VectorAssembler cell
# (`df_kmeans`). The previously used name `assembled` is not defined anywhere
# in the visible notebook and raises NameError on a fresh kernel.
scalerModel = scaler.fit(df_kmeans)
scaledData = scalerModel.transform(df_kmeans)

<br><br>

 
<p style="font-family: Arial; font-size:1.5em;color:#2462C0; font-weight:bold">

Select the features column and persist the data:

</p>

 
<br><br>

In [123]:
# Keep only the scaled vectors and mark the frame for caching; persist()
# returns the same DataFrame, so it can be chained onto the select.
scaledData = scaledData.select("features").persist()
scaledData.show()

+--------------------+
|            features|
+--------------------+
|[-0.6230877338405...|
|[-0.7451571461900...|
|[1.26898815757710...|
|[0.96381462670329...|
|[-0.5620530276657...|
|[-1.1113653832386...|
|[0.10932874025662...|
|[-1.7827471511610...|
|[-0.5010183214909...|
|[-1.6606777388114...|
|[0.10932874025662...|
|[0.65864109582948...|
|[1.26898815757710...|
|[-0.7451571461900...|
|[-1.1113653832386...|
|[-1.3555042079376...|
|[0.23139815260614...|
|[-0.9892959708890...|
|[-0.9892959708890...|
|[-0.6841224400152...|
+--------------------+
only showing top 20 rows



<br><br>

 
<p style="font-family: Arial; font-size:1.5em;color:#2462C0; font-weight:bold">

We can now perform KMeans clustering to generate 2 clusters:
</p>

 
<br><br>

In [124]:
# Two clusters with a fixed seed so the fit is repeatable; the estimator
# reads its input from the default "features" column.
kmeans = KMeans().setK(2).setSeed(1)
model = kmeans.fit(scaledData)
# Run the fitted model over the same scaled rows it was trained on.
transformed = model.transform(scaledData)

<br><br>

 
<p style="font-family: Arial; font-size:1.5em;color:#2462C0; font-weight:bold">

Print the centers of these two clusters:

</p>

 
<br><br>

In [125]:
# Fetch the fitted cluster centers and turn each one into a plain list so
# the cell displays a nested list (one inner list per cluster).
centers = model.clusterCenters()
centers2D = list(map(list, centers))
centers2D

[[-0.79780796415399258, -0.49176392408229957],
 [0.84174521435377381, 0.5188465749737885]]

<br><br>

 
<p style="font-family: Arial; font-size:1.5em;color:#2462C0; font-weight:bold">

Analyze the centers of these two clusters:

</p>

 
<br><br>


<div style="color:black;font-family: Arial; font-size:1.1em;line-height:65%">

<br><br>

<p style="line-height:31px;"> The first number (field1) in each array is the scaled version of the number of ad clicks, and the second number (field2) is the scaled version of the revenue per user.

Compare the 1st number of each cluster to see how differently users in each cluster behave when it comes to clicking ads.

Compare the 2nd number of each cluster to see how differently users in each cluster behave when it comes to buying stuff. 

</p><br><br>

<p style="line-height:31px;">In one cluster, in general, players click on ads much more often and spend more money on in-app purchases. Assuming that Eglence Inc. gets paid for showing ads and for hosting in-app purchase items, we can use this information to increase the game's revenue by increasing the prices for ads we show to the frequent clickers, and by charging higher fees for hosting the in-app purchase items shown to the higher-revenue-generating buyers.</p>

<br><br>
<p style="line-height:31px;"> <b> Note: </b>  This analysis requires you to compare the cluster centers and find any ‘significant’ differences in the corresponding feature values of the  centers. The answer to this question will depend on the features you have chosen. <br><br> Some features help distinguish the clusters remarkably while others may not tell you much. At this point, if you don’t find clear distinguishing patterns, perhaps re-running the clustering model with different numbers of clusters and revising the features you picked would be a good idea. </p>

</div>
