In [1]:
from pyspark.sql import SQLContext
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StandardScaler
from notebooks import utils
%matplotlib inline

ImportError: No module named 'notebooks'

In [None]:
# Read the minute-level weather CSV into a Spark DataFrame through the
# spark-csv data source; the first line is used as the header and column
# types are inferred from the data.
sqlContext = SQLContext(sc)
weather_csv = 'file:///home/cloudera/Downloads/big-data-4/minute_weather.csv'
df = sqlContext.read.load(
    weather_csv,
    format='com.databricks.spark.csv',
    header='true',
    inferSchema='true'
)

In [None]:
# Display the total number of rows loaded from the CSV file.
df.count()

Reduce the data to 1/10 so the process is faster.

In [None]:
# Keep only the rows whose rowID is a multiple of 10, then display how many
# rows remain in the reduced data set.
filteredDF = df.where(df['rowID'] % 10 == 0)
filteredDF.count()

In [None]:
# Summary statistics for every column, pulled into pandas and transposed so
# that each column of the data set becomes one row of the displayed table.
filteredDF.describe().toPandas().T

We see that some columns contain rows with a value of 0. Let's count these values and remove them.

In [None]:
# Number of rows in which rain_accumulation is exactly zero.
filteredDF.where(filteredDF['rain_accumulation'] == 0.0).count()

In [None]:
# Number of rows in which rain_duration is exactly zero.
filteredDF.where(filteredDF['rain_duration'] == 0.0).count()

In [None]:
# Remove the two rain columns and the timestamp column, one drop per
# statement, leaving the remaining columns in workingDF.
workingDF = filteredDF.drop('rain_accumulation')
workingDF = workingDF.drop('rain_duration')
workingDF = workingDF.drop('hpwren_timestamp')

We dropped the columns that had mostly zeroes and also the timestamp column as it is not interesting for us. Now we will drop rows with NA values.

In [None]:
# Count the rows, discard every row that contains a missing value, count
# again, and display how many rows were removed.
# NOTE: workingDF is reassigned to its cleaned version, so re-running this
# cell without re-running the previous one will report 0 removed rows.
before = workingDF.count()
workingDF = workingDF.dropna()
after = workingDF.count()
before - after

Now it's time to normalize the data. We won't include columns such as rowID, obviously. Other variables that are highly correlated will be omitted too, such as the minimum wind measures and the average wind measures. As usual, we will create an array of the columns we want to combine and use VectorAssembler to create the vector column:

In [None]:
# Columns that are combined into the feature vector used for clustering.
featuresUsed = [
    'air_pressure',
    'air_temp',
    'avg_wind_direction',
    'avg_wind_speed',
    'max_wind_direction',
    'max_wind_speed',
    'relative_humidity',
]

# Pack the selected columns into a single vector column named
# "features_unscaled" and append it to the working data.
assembler = VectorAssembler(
    inputCols=featuresUsed,
    outputCol="features_unscaled"
)
assembled = assembler.transform(workingDF)

Now we will use StandardScaler to normalize the data

In [None]:
# Standardize the assembled feature vector: withMean=True centers each
# feature and withStd=True scales it to unit standard deviation. The result
# is written to the "features" column.
# The keyword for the result column is `outputCol`; StandardScaler does not
# accept an `output` keyword and raises a TypeError if one is passed.
scaler = StandardScaler(
    inputCol="features_unscaled",
    outputCol="features",
    withStd=True,
    withMean=True
)
# Compute the per-feature statistics, then apply them to the same data.
scalerModel = scaler.fit(assembled)
scaledData = scalerModel.transform(assembled)

Now, in order to estimate the number of clusters, let's proceed with the elbow plot.

In [None]:
# Keep only the scaled feature vector and the row identifier.
scaledData = scaledData.select("features", "rowID")

# Subset for the elbow analysis: rows whose rowID is divisible by 3, reduced
# to the feature column only, and marked for caching.
elbowset = scaledData.where(scaledData["rowID"] % 3 == 0).select("features")
elbowset.persist()

With elbowset.persist() we tell Spark to keep the data in memory in order to speed up the computations. Now we will compute the k-means clusters for k = 2 to 30 in order to create the plot.

In [None]:
# Candidate numbers of clusters: k = 2, 3, ..., 30 (the upper bound 31 is
# exclusive).
clusters = range(2,31)
# utils is a project-local helper module that is not shown in this notebook.
# Presumably elbow() fits k-means once per k on elbowset and returns the
# within-cluster error for each k — TODO confirm against utils.elbow.
wsseList = utils.elbow(elbowset, clusters)

In [None]:
# Draw the elbow plot from the values computed above (helper from the
# project-local utils module; presumably error versus k — TODO confirm).
utils.elbow_plot(wsseList, clusters)

It looks a bit difficult to judge, but as we can see, the elbow could be between 10 and 15, so let's set k = 12.

In [None]:
# Reduce the scaled data to the feature column only and mark it for caching,
# since it is used for both fitting and transforming in the next cell.
scaledDataFeat = scaledData.select("features")
scaledDataFeat.persist()

In [None]:
# Configure k-means with 12 clusters and a fixed seed so the result is
# repeatable between runs.
kmeans = KMeans(k=12, seed=1)
# Fit the model on the scaled features. The variable name must match the
# DataFrame defined earlier (scaledDataFeat); Python names are case-sensitive.
model = kmeans.fit(scaledDataFeat)
# Apply the fitted model to the same data to obtain the cluster assignments.
transformed = model.transform(scaledDataFeat)

Here we are defining the parameters for the KMeans in the first line.
The second line fits the data to the model.
The third line applies the model to the data.

We can check the centers, but it's difficult to compare them numerically.

In [None]:
# Retrieve the cluster centers from the fitted k-means model and display
# them. The centers come from `model`; `centers` does not exist before this
# assignment, so it cannot appear on the right-hand side.
centers = model.clusterCenters()
centers

So we will have to use plots for this. We will create plots with matplotlib using a pandas DataFrame. Each row contains the cluster center coordinates and a cluster label. Let's show the clusters for "dry days".

In [None]:
# Build the table of cluster centers from the feature names and the centers
# (helper from the project-local utils module, not shown in this notebook).
P = utils.pd_centers(featuresUsed, centers)
# Plot only the centers whose scaled relative humidity is below -0.5; the full
# table P is passed as the second argument (presumably for axis scaling or
# colouring — TODO confirm against utils.parallel_plot).
utils.parallel_plot(P[P['relative_humidity'] < -0.5], P)

Now, let's show the clusters with a high air temperature:

In [None]:
# Plot only the centers whose scaled air temperature is above 0.5.
utils.parallel_plot(P[P['air_temp'] > 0.5], P)

And now, show clusters for "cool days", i.e. weather samples with high relative humidity and low air temperature:

In [None]:
# Plot the centers with scaled relative humidity above 0.5 and scaled air
# temperature below 0.5. Each comparison is wrapped in parentheses because &
# binds more tightly than < and >, and the combined boolean mask is placed
# inside P[...] so that it selects rows of P.
utils.parallel_plot(P[(P['relative_humidity'] > 0.5) & (P['air_temp'] < 0.5)], P)

The last two clusters, which capture days with mild weather:

In [None]:
# Plot the single center at positional index 2; the double brackets pass a
# list to iloc so the selection stays a table rather than a single row object.
# NOTE(review): the accompanying text mentions two clusters, but only one row
# is selected here — confirm the intended indices.
utils.parallel_plot(P.iloc[[2]], P)