In [1]:
# Importing python modules

from pyspark.sql import SparkSession
from pyspark.sql import Row
from os.path import abspath

In [2]:
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType,FloatType,DoubleType

In [3]:
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier,GBTClassifier, RandomForestClassificationModel
from pyspark.ml.feature import StringIndexer, VectorIndexer, VectorAssembler
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [4]:
# Absolute path of the 'spark-warehouse' directory (resolved against the current
# working directory); passed to Spark as its SQL warehouse location.
warehouse_location = abspath("spark-warehouse")

In [5]:
# Build (or reuse) a Hive-enabled SparkSession whose SQL warehouse directory is
# warehouse_location. Despite its name, `sqlcontext` is a SparkSession.
sqlcontext = (
    SparkSession.builder
    .appName("Python Spark SQL Hive integration example")
    .config("spark.sql.warehouse.dir", warehouse_location)
    .enableHiveSupport()
    .getOrCreate()
)

###### Import data

In [6]:
# Load the table default.agg_data from the session catalog as a DataFrame.
table = sqlcontext.table("default.agg_data")

In [7]:
# Expose the loaded DataFrame to Spark SQL under the name 'agg_table'.
# createOrReplaceTempView is the supported replacement for registerTempTable,
# which has been deprecated since Spark 2.0.
table.createOrReplaceTempView('agg_table')

In [8]:
# Read every column back from the 'agg_table' temp view via Spark SQL.
df = sqlcontext.sql('Select * from agg_table')

In [9]:
# Preview the first five rows of the loaded data.
df.show(5)

+----------+----------+------------------+------------------+------------+--------+-----+
|machine_id|session_id|         avg_sound|   avg_temperature|max_cum_dist|distance|label|
+----------+----------+------------------+------------------+------------+--------+-----+
|      MID7|         1|2.2437212989300783|29.574007220216608|       7.667|     7.5|    0|
|      MID7|         2| 2.253173230523249| 29.36462093862816|       7.667|     7.5|    0|
|      MID7|         3|   2.2502084065054| 29.67870036101083|       7.667|     7.5|    0|
|      MID7|         4|2.2516293220431245|29.314079422382672|       7.667|     7.5|    0|
|      MID7|         5|2.2666558868109385|29.732851985559567|       7.667|     7.5|    0|
+----------+----------+------------------+------------------+------------+--------+-----+
only showing top 5 rows



#### Feature vector

In [10]:
# Keep only the three measurement columns used as model inputs, plus the label.
df_train = df.select(
    "avg_sound",
    "avg_temperature",
    "max_cum_dist",
    "label",
)

In [11]:
# Shorten the aggregate column names; the label column is left untouched.
df_train = (
    df_train
    .withColumnRenamed("avg_sound", "sound")
    .withColumnRenamed("avg_temperature", "temperature")
    .withColumnRenamed("max_cum_dist", "cum_dist")
)

In [12]:
# Pack every column except the label into a single 'features' vector column.
assembler = VectorAssembler(
    inputCols=[name for name in df_train.columns if name != "label"],
    outputCol="features",
)

In [13]:
# Apply the assembler: the result is df_train plus the assembled 'features' column.
output = assembler.transform(df_train)

In [14]:
# Only the feature vector and the label are carried forward to the classifiers.
final_data = output.select("features", "label")

In [15]:
# 70/30 train/test split. A fixed seed keeps the split, and therefore every
# metric computed from it, stable from one run of the notebook to the next.
# NOTE(review): reproducibility also assumes final_data is read with the same
# partition layout on each run -- confirm if the upstream table is rewritten.
train_data, test_data = final_data.randomSplit([0.7, 0.3], seed=42)

In [16]:
# Class balance of the test split: number of rows per label value.
(
    test_data
    .groupBy("label")
    .agg({"label": "count"})
    .show()
)

+-----+------------+
|label|count(label)|
+-----+------------+
|    1|          52|
|    0|        9910|
+-----+------------+



## RandomForestClassifier

In [17]:
# Random forest estimator wired to the assembled feature vector and the label
# column; no other hyperparameters are overridden.
rfc = RandomForestClassifier(
    labelCol="label",
    featuresCol="features",
)

In [18]:
# Fit the random forest on the training split.
rfc_model = rfc.fit(train_data)

#### save model to disk

In [21]:
# Persist the fitted forest. Going through write().overwrite() lets this cell be
# re-run (e.g. after Restart & Run All): a plain save() raises when the target
# path already exists from a previous run.
rfc_model.write().overwrite().save("../model/rfc_model")

In [22]:
# Per-feature importance of the fitted forest, indexed by position in the
# 'features' vector (assembled from sound, temperature, cum_dist in that order).
rfc_model.featureImportances

SparseVector(3, {0: 0.418, 1: 0.4854, 2: 0.0965})

In [24]:
# Print the column names and types of the pre-assembly training frame.
df_train.printSchema()

root
 |-- sound: double (nullable = true)
 |-- temperature: double (nullable = true)
 |-- cum_dist: double (nullable = true)
 |-- label: integer (nullable = true)



In [25]:
# Score the test split with the fitted random forest.
predictions = rfc_model.transform(test_data)

# Peek at a few scored rows: predicted class, true label and the input vector.
(
    predictions
    .select("prediction", "label", "features")
    .show(5)
)

+----------+-----+--------------------+
|prediction|label|            features|
+----------+-----+--------------------+
|       0.0|    0|[2.21518629594353...|
|       0.0|    0|[2.21849089814836...|
|       0.0|    0|[2.22059746681858...|
|       0.0|    0|[2.22066911588854...|
|       0.0|    0|[2.22079482712968...|
+----------+-----+--------------------+
only showing top 5 rows



###### Evaluate model for accuracy

In [26]:
# Compare predicted class against true label on the test split and report the
# accuracy together with its complement (the test error).
# NOTE(review): the recorded test split holds 9910 zeros vs 52 ones, so a model
# that always answers 0 would already score about 0.995 here -- consider
# reporting a per-class metric as well.
evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    labelCol="label",
    predictionCol="prediction",
)
accuracy = evaluator.evaluate(predictions)
print("accuracy  ",accuracy)
print("Test Error = %g" % (1.0 - accuracy))

accuracy   0.9992973298534431
Test Error = 0.00070267


## Logistic Regression

In [27]:
# Logistic regression estimator on the same label column. featuresCol is not
# passed, so the estimator's default feature column name applies.
lr = LogisticRegression(labelCol='label')

In [28]:
# Fit the logistic regression on the training split.
fitted_model = lr.fit(train_data)

In [29]:
# Training summary attached to the fitted model; its .predictions frame is
# inspected in the next cell.
training_sum = fitted_model.summary

In [30]:
# Summary statistics of the training-set predictions frame.
# NOTE(review): in the recorded output, label and prediction have identical
# count, mean and stddev, i.e. the model predicts exactly as many positives as
# there are positive labels on the training data -- worth a sanity check.
training_sum.predictions.describe().show()

+-------+--------------------+--------------------+
|summary|               label|          prediction|
+-------+--------------------+--------------------+
|  count|               23373|               23373|
|   mean|0.007572840456937492|0.007572840456937492|
| stddev| 0.08669379507218826| 0.08669379507218826|
|    min|                 0.0|                 0.0|
|    max|                 1.0|                 1.0|
+-------+--------------------+--------------------+



#### Evaluate results

Evaluate the fitted logistic-regression model on the held-out test split.

In [31]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [32]:
# Evaluate the fitted logistic regression on the test split; the returned
# summary exposes the scored rows through its .predictions attribute.
pred_and_labels = fitted_model.evaluate(test_data)

In [33]:
# Display the scored test rows (features, label, raw scores, probabilities,
# predicted class).
pred_and_labels.predictions.show()

+--------------------+-----+--------------------+--------------------+----------+
|            features|label|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+----------+
|[2.21518629594353...|    0|[62.4157831148655...|[1.0,7.8193348153...|       0.0|
|[2.21849089814836...|    0|[121.414883675117...|[1.0,1.8628847992...|       0.0|
|[2.22059746681858...|    0|[53.5958313150042...|[1.0,5.2920776373...|       0.0|
|[2.22066911588854...|    0|[89.6657786843840...|[1.0,1.1445825007...|       0.0|
|[2.22079482712968...|    0|[58.3086579722969...|[1.0,4.7519470795...|       0.0|
|[2.22126921802648...|    0|[93.3817436835513...|[1.0,2.7849923574...|       0.0|
|[2.22197921059307...|    0|[92.1310073014743...|[1.0,9.7277391574...|       0.0|
|[2.22222115338152...|    0|[122.180074477662...|[1.0,8.6669812261...|       0.0|
|[2.22236483826866...|    0|[105.537380851722...|[1.0,1.4645286147...|       0.0|
|[2.222646996164

In [34]:
# Evaluator for the area under the ROC curve of the logistic regression.
# rawPredictionCol must point at the model's continuous scores ('rawPrediction'),
# not at the thresholded 0/1 'prediction' column: scoring hard labels collapses
# the ROC curve to a single operating point and misstates the AUC.
# metricName is spelled out for readability; areaUnderROC is also the default.
evaluator = BinaryClassificationEvaluator(
    rawPredictionCol='rawPrediction',
    labelCol='label',
    metricName='areaUnderROC',
)

In [35]:
# Score the logistic-regression test predictions with the binary evaluator
# defined in the previous cell.
auc = evaluator.evaluate(pred_and_labels.predictions)

In [36]:
# Display the evaluator's score for the test split.
auc

1.0