In [12]:
from pyspark.sql import SQLContext
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StandardScaler

In [14]:
from pyspark.sql import SparkSession

# Create the SparkSession used by every cell below, or re-use the active one.
spark = (
    SparkSession.builder
    .appName("Python Spark SQL basic example")
    # NOTE(review): this key/value pair looks like a placeholder setting —
    # confirm whether it is actually needed.
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)

In [15]:
# Load the clustering input; the first line of the CSV holds the column names.
data = spark.read.csv("./w3_clustering.csv", header=True)

In [26]:
# Keep only the two columns used for clustering, each cast to float.
float_columns = [data[name].cast("float") for name in ("totalAdClicks", "revenue")]
data = data.select(*float_columns)

In [37]:
# Print the column names and types to confirm both columns are now float.
data.printSchema()

root
 |-- totalAdClicks: float (nullable = true)
 |-- revenue: float (nullable = true)



<br><br>

 
<p style="font-family: Arial; font-size:1.5em;color:#2462C0; font-weight:bold">

Let us display the number of lines in the data:

</p>

<br><br>

In [27]:
# Number of rows in the DataFrame.
data.count()

709

<br><br>

 
<p style="font-family: Arial; font-size:1.5em;color:#2462C0; font-weight:bold">

What does the data contain?
</p>

 
<br><br>

In [28]:
# A bare DataFrame expression displays only the column names and types,
# not the rows themselves.
data

DataFrame[totalAdClicks: float, revenue: float]

<br><br>

 
<p style="font-family: Arial; font-size:1.5em;color:#2462C0; font-weight:bold">

Statistics about the data:
</p>

 
<br><br>

In [29]:
# Summary statistics (count, mean, stddev, min, max) for every column,
# flipped so that each feature becomes one row of the displayed table.
data.describe().toPandas().T

Unnamed: 0,0,1,2,3,4
summary,count,mean,stddev,min,max
totalAdClicks,709,32.208744710860366,16.384120817042554,1.0,73.0
revenue,709,44.64880112834979,44.944528765285284,1.0,278.0


<br><br>

 
<p style="font-family: Arial; font-size:1.5em;color:#2462C0; font-weight:bold">

Drop all rows with NULL or NaN values

</p>

 
<br><br>

In [30]:
# Discard every row that has a missing value in any column.
data = data.dropna()

<br><br>

 
<p style="font-family: Arial; font-size:1.5em;color:#2462C0; font-weight:bold">

Let us look at the column names:

</p>

 
 
<br><br>

In [31]:
# Column names of the DataFrame, as a Python list.
data.columns

['totalAdClicks', 'revenue']

<br><br>

 
<p style="font-family: Arial; font-size:1.5em;color:#2462C0; font-weight:bold">

Use VectorAssembler to gather all the features:

</p>

 
<br><br>

In [36]:
# Combine the two numeric columns into one vector column, "features_unscaled",
# which is the input the scaler in the next step reads.
featuresUsed = ["totalAdClicks", "revenue"]
assembler = VectorAssembler(
    outputCol="features_unscaled",
    inputCols=featuresUsed,
)
assembled = assembler.transform(data)

<br><br>

 
<p style="font-family: Arial; font-size:1.5em;color:#2462C0; font-weight:bold">

Scale the features using StandardScaler:

</p>

 
<br><br>

In [11]:
# Standardize the assembled vectors (centering with withMean, scaling with
# withStd) and write the result to the "features" column.
scaler = StandardScaler(
    withMean=True,
    withStd=True,
    inputCol="features_unscaled",
    outputCol="features",
)
# The scaler is fitted on, and then applied to, the same assembled DataFrame.
scalerModel = scaler.fit(assembled)
scaledData = scalerModel.transform(assembled)

<br><br>

 
<p style="font-family: Arial; font-size:1.5em;color:#2462C0; font-weight:bold">

Select the features column and persist the data:

</p>

 
<br><br>

In [12]:
# Keep only the scaled feature vectors and mark the result for persistence,
# since the clustering step below both fits on it and transforms it.
scaledData = scaledData.select("features").persist()
scaledData

DataFrame[features: vector]

<br><br>

 
<p style="font-family: Arial; font-size:1.5em;color:#2462C0; font-weight:bold">

We can now perform KMeans clustering to generate 2 clusters:
</p>

 
<br><br>

In [13]:
# Two-cluster KMeans over the "features" column; the seed is fixed at 1.
kmeans = KMeans(featuresCol="features", k=2, seed=1)
model = kmeans.fit(scaledData)
# Apply the fitted model back to the same rows it was trained on.
transformed = model.transform(scaledData)

<br><br>

 
<p style="font-family: Arial; font-size:1.5em;color:#2462C0; font-weight:bold">

Print the centers of these two clusters:

</p>

 
<br><br>

In [14]:
# One array per cluster. The values are in the scaled feature space, in the
# order the features were assembled: totalAdClicks first, then revenue.
centers = model.clusterCenters()
centers

[array([ 0.84174521,  0.51884657]), array([-0.79780796, -0.49176392])]

<br><br>

 
<p style="font-family: Arial; font-size:1.5em;color:#2462C0; font-weight:bold">

Analyze the centers of these two clusters:

</p>

 
<br><br>


<div style="color:black;font-family: Arial; font-size:1.1em;line-height:65%">

<p style="line-height:31px;">Each array denotes the center for a cluster:<br><br>
One Cluster is centered at   ... array([ 0.84174521,  0.51884657])<br>
Other Cluster is centered at   ... array([-0.79780796, -0.49176392])</p>

<br><br>

<p style="line-height:31px;"> The first number (field1) in each array refers to the scaled version of the number of ad-clicks, and the second number (field2) is the scaled version of the revenue per user.

Compare the 1st number of each cluster to see how differently users in each cluster behave when it comes to clicking ads.

Compare the 2nd number of each cluster to see how differently users in each cluster behave when it comes to buying stuff. 

</p><br><br>

<p style="line-height:31px;">In one cluster, in general, players click on ads much more often and spend more money on in-app purchases. Assuming that Eglence Inc. gets paid for showing ads and for hosting in-app purchase items, we can use this information to increase the game's revenue by increasing the prices for ads we show to the frequent-clickers, and by charging higher fees for hosting the in-app purchase items shown to the higher revenue generating buyers.</p>

<br><br>
<p style="line-height:31px;"> <b> Note: </b>  This analysis requires you to compare the cluster centers and find any ‘significant’ differences in the corresponding feature values of the  centers. The answer to this question will depend on the features you have chosen. <br><br> Some features help distinguish the clusters remarkably while others may not tell you much. At this point, if you don’t find clear distinguishing patterns, perhaps re-running the clustering model with different numbers of clusters and revising the features you picked would be a good idea. </p>

</div>
