In [1]:
# Importing python modules

from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.sql import HiveContext
from pyspark.context import SparkContext

In [2]:
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType,FloatType,DoubleType

In [3]:
from pyspark.sql import functions as F

In [3]:
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier,GBTClassifier, RandomForestClassificationModel
from pyspark.ml.feature import StringIndexer, VectorIndexer, VectorAssembler
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [5]:
# Build (or reuse) a Hive-enabled SparkSession. Constructing SparkContext
# directly raises when a context is already running in this kernel, so
# re-running the cell used to fail; getOrCreate() makes the cell idempotent.
spark = (
    SparkSession.builder
    .master("local")
    .appName("Sensor1")
    .enableHiveSupport()
    .getOrCreate()
)
sc = spark.sparkContext
# Kept under the original name: the cells below only call .table() and
# .sql(), both of which SparkSession provides (HiveContext is deprecated).
sqlcontext = spark

#### Import data

In [6]:
# Load the source table through the Hive-enabled context as a DataFrame.
source_table = "default.agg_labeled_data"
table = sqlcontext.table(source_table)

In [7]:
# Expose the loaded table to Spark SQL under the name 'agg_table'.
# createOrReplaceTempView is the non-deprecated replacement for
# registerTempTable and is safe to re-run.
table.createOrReplaceTempView('agg_table')

In [8]:
# Pull every column of the registered 'agg_table' view into a DataFrame.
select_all_query = 'Select * from agg_table'
df = sqlcontext.sql(select_all_query)

In [9]:
# Preview the first five rows.
df.show(n=5)

+----------+----------+------------------+------------------+------------+--------+-----+
|machine_id|session_id|         avg_sound|   avg_temperature|max_cum_dist|distance|label|
+----------+----------+------------------+------------------+------------+--------+-----+
|      MID7|         1|2.2437212989300783|29.574007220216608|       7.667|     7.5|    0|
|      MID7|         2| 2.253173230523249| 29.36462093862816|       7.667|     7.5|    0|
|      MID7|         3|   2.2502084065054| 29.67870036101083|       7.667|     7.5|    0|
|      MID7|         4|2.2516293220431245|29.314079422382672|       7.667|     7.5|    0|
|      MID7|         5|2.2666558868109385|29.732851985559567|       7.667|     7.5|    0|
+----------+----------+------------------+------------------+------------+--------+-----+
only showing top 5 rows



#### Feature vector

In [10]:
# Keep only the three model inputs plus the target column.
model_columns = ["avg_sound", "avg_temperature", "max_cum_dist", "label"]
df_train = df.select(model_columns)

In [11]:
# Shorten the aggregate column names before they become feature names.
df_train = (
    df_train
    .withColumnRenamed("avg_sound", "sound")
    .withColumnRenamed("avg_temperature", "temperature")
    .withColumnRenamed("max_cum_dist", "cum_dist")
)

In [12]:
# Every column except the target becomes one slot of the 'features' vector.
feature_cols = [name for name in df_train.columns if name != "label"]
assembler = VectorAssembler(inputCols=feature_cols, outputCol='features')

In [13]:
# Append the assembled 'features' vector column to the training frame.
output = assembler.transform(dataset=df_train)

In [14]:
# The estimators only need the feature vector and the target.
final_data = output.select(['features', 'label'])

In [15]:
# 70/30 train/test split. The fixed seed makes the split — and every metric
# computed from it — repeatable across kernel restarts for the same input
# data; without it each run sampled a different partition.
train_data, test_data = final_data.randomSplit([0.7, 0.3], seed=42)

In [16]:
# Class balance of the held-out split: non-null label count per class.
label_counts = test_data.groupBy('label').agg({'label': 'count'})
label_counts.show()

+-----+------------+
|label|count(label)|
+-----+------------+
|    1|          78|
|    0|        9976|
+-----+------------+



## RandomForestClassifier

In [17]:
# Random forest configured only with its input/target column names;
# all other hyper-parameters stay at the library defaults.
rfc = RandomForestClassifier(
    labelCol='label',
    featuresCol='features',
)

In [18]:
# Train the forest on the training split.
rfc_model = rfc.fit(dataset=train_data)

#### Save model to disk

In [19]:
# Persist the fitted forest. overwrite() replaces any model already stored
# at this path; a plain save() raises when the target exists, which made
# the notebook fail on every run after the first.
rfc_model.write().overwrite().save("../model/rfc_model")

In [20]:
# Per-feature importance of the fitted forest. Indices follow the
# assembler's input order, i.e. df_train's non-label columns
# (0: sound, 1: temperature, 2: cum_dist).
rfc_model.featureImportances

SparseVector(3, {0: 0.5196, 1: 0.3571, 2: 0.1233})

In [21]:
# Show the column names and types of the renamed training frame.
df_train.printSchema()

root
 |-- sound: double (nullable = true)
 |-- temperature: double (nullable = true)
 |-- cum_dist: double (nullable = true)
 |-- label: integer (nullable = true)



In [22]:
# Score the held-out rows with the fitted forest.
predictions = rfc_model.transform(dataset=test_data)
# Peek at a handful of scored rows.
preview_cols = ["prediction", "label", "features"]
predictions.select(preview_cols).show(n=5)

+----------+-----+--------------------+
|prediction|label|            features|
+----------+-----+--------------------+
|       0.0|    0|[2.22932725547613...|
|       0.0|    0|[2.23142357701782...|
|       0.0|    0|[2.23334367746092...|
|       0.0|    0|[2.23429052156013...|
|       0.0|    0|[2.23527842907437...|
+----------+-----+--------------------+
only showing top 5 rows



#### Evaluate model accuracy

In [23]:
# Compare predicted class against the true label and report accuracy
# together with its complement (test error).
# NOTE(review): the recorded test split holds 78 positives vs 9976
# negatives, so a model that always predicts 0 would already score ~99%;
# consider recall/F1 on the positive class alongside accuracy.
evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    predictionCol="prediction",
    labelCol="label",
)
accuracy = evaluator.evaluate(dataset=predictions)
print("accuracy  ",accuracy)
test_error = 1.0 - accuracy
print("Test Error = %g" % test_error)

accuracy   0.9996021483986473
Test Error = 0.000397852


## Logistic Regression

In [333]:
# Logistic regression on the same feature vector / target pair;
# 'features' is the estimator's default input column, spelled out here.
lr = LogisticRegression(featuresCol='features', labelCol='label')

In [334]:
# Train the logistic-regression model on the training split.
fitted_model = lr.fit(dataset=train_data)

In [335]:
# Training summary attached to the fitted model; its `.predictions`
# frame is inspected in the next cell.
training_sum = fitted_model.summary

In [336]:
# Summary statistics of the model's predictions on the training data.
training_predictions = training_sum.predictions
training_predictions.describe().show()

+-------+--------------------+--------------------+
|summary|               label|          prediction|
+-------+--------------------+--------------------+
|  count|               23429|               23429|
|   mean|0.006701096931153...|0.006701096931153...|
| stddev| 0.08158723149959506| 0.08158723149959506|
|    min|                 0.0|                 0.0|
|    max|                 1.0|                 1.0|
+-------+--------------------+--------------------+



#### Evaluate results

Evaluate the fitted logistic-regression model on the held-out test split (`test_data`).

In [337]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [338]:
# Score the held-out split; the returned summary exposes the scored rows
# through its `.predictions` attribute.
pred_and_labels = fitted_model.evaluate(dataset=test_data)

In [339]:
# Display the scored test rows (features, label, raw score, probability, class).
test_predictions = pred_and_labels.predictions
test_predictions.show()

+--------------------+-----+--------------------+--------------------+----------+
|            features|label|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+----------+
|[2.23273518132170...|    0|[135.029246633567...|[1.0,2.2779453014...|       0.0|
|[2.23579057243616...|    0|[114.548235895304...|[1.0,1.7878588924...|       0.0|
|[2.23599199029571...|    0|[133.642491383725...|[1.0,9.1159816908...|       0.0|
|[2.23848673832883...|    0|[124.165979764015...|[1.0,1.1895977891...|       0.0|
|[2.24068171782910...|    0|[127.808341201022...|[1.0,3.1156070306...|       0.0|
|[2.24338548076863...|    0|[127.815092278556...|[1.0,3.0946441665...|       0.0|
|[2.24418111675155...|    0|[114.403819864303...|[1.0,2.0656290100...|       0.0|
|[2.24428569134445...|    0|[122.614430213846...|[1.0,5.6134460933...|       0.0|
|[2.24449494376909...|    0|[129.962772181546...|[1.0,3.6131433195...|       0.0|
|[2.244554999396

In [340]:
# AUC has to be computed from the continuous scores in 'rawPrediction'
# (a column of the model's prediction frame). Feeding the hard 0/1
# 'prediction' column collapses the ROC curve to a single operating point
# and misreports the area.
evaluator = BinaryClassificationEvaluator(rawPredictionCol='rawPrediction', labelCol='label')

In [341]:
# Evaluate on the scored test rows. The evaluator is left at its default
# metric (areaUnderROC per the PySpark docs).
auc = evaluator.evaluate(dataset=pred_and_labels.predictions)

In [342]:
# Display the metric computed in the previous cell.
auc

1.0