# Medication Compliance Monitoring System using Machine Learning and Distributed Computing
By Katja Wittfoth, Donya Fozoonmayeh, Hai Vu Le, Chong Geng


## 1.1 Reading the Data from MongoDB & Creating DataFrame 

In [16]:
from pyspark import SparkContext      # for connecting MongoDB
from pyspark.sql import SparkSession  # for connecting MongoDB
import os                             # for connecting MongoDB

from pyspark import SparkConf
from pyspark.sql.types import *
from pyspark.sql import Row
from pyspark.sql.functions import *
from pyspark.ml.feature import VectorAssembler

from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.classification import OneVsRest
from pyspark.ml.classification import LinearSVC

from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.mllib.evaluation import MulticlassMetrics

from pyspark.sql.functions import udf
from pyspark.ml.feature import StringIndexer

from pyspark.ml.tuning import CrossValidator
from pyspark.ml.tuning import ParamGridBuilder

In [2]:
pyspark_submit_args = '--packages org.mongodb.spark:mongo-spark-connector_2.11:2.4.0 pyspark-shell'
os.environ["PYSPARK_SUBMIT_ARGS"] = pyspark_submit_args

In [3]:
ss = SparkSession \
    .builder \
    .appName("myApp") \
    .config("spark.mongodb.input.uri", "mongodb://18.217.205.7/sensors.five")\
    .getOrCreate()

In [4]:
window_5_df_raw = ss.read.format("com.mongodb.spark.sql.DefaultSource").load()

## 1.2 data cleaning

## 1.2.1 rename the 'profile_activity' column as 'label', persist the df in memory.

In [5]:
window_5_df_raw = window_5_df_raw.withColumnRenamed('profile_activity', 'label').cache()

In [6]:
window_5_df_raw.printSchema()

root
 |-- _id: struct (nullable = true)
 |    |-- oid: string (nullable = true)
 |-- bin1_25_percentile_x_val_accelerometer: double (nullable = true)
 |-- bin1_25_percentile_x_val_gyroscope: double (nullable = true)
 |-- bin1_25_percentile_y_val_accelerometer: double (nullable = true)
 |-- bin1_25_percentile_y_val_gyroscope: double (nullable = true)
 |-- bin1_25_percentile_z_val_accelerometer: double (nullable = true)
 |-- bin1_25_percentile_z_val_gyroscope: double (nullable = true)
 |-- bin1_5_percentile_x_val_accelerometer: double (nullable = true)
 |-- bin1_5_percentile_x_val_gyroscope: double (nullable = true)
 |-- bin1_5_percentile_y_val_accelerometer: double (nullable = true)
 |-- bin1_5_percentile_y_val_gyroscope: double (nullable = true)
 |-- bin1_5_percentile_z_val_accelerometer: double (nullable = true)
 |-- bin1_5_percentile_z_val_gyroscope: double (nullable = true)
 |-- bin1_75_percentile_x_val_accelerometer: double (nullable = true)
 |-- bin1_75_percentile_x_val_gyroscope:

### 1.2.2 Converting string labels to integer

In [7]:
def label_converter(label):
    if label == 'non_dominant_pill_med':
        return int(1)
    elif label == 'dominant_pill_med':
        return int(1)
    elif label == 'dominant_liquid_med':
        return int(1)
    elif label == 'non_dominant_liquid_med':
        return int(1)
    else: 
        return 0

label_udf = udf(label_converter, IntegerType())

In [8]:
window_5_df = window_5_df_raw.withColumn('numeric_label', label_udf(window_5_df_raw['label']))\
                             .drop('label')\
                             .drop('_id')\
                             .withColumnRenamed('numeric_label', 'label')


### 1.2.3 Splitting the data into train and test

In [9]:
splits = window_5_df.randomSplit([0.8, 0.2], seed=30)

window_5_train = splits[0].cache()
window_5_test = splits[1].cache()

In [10]:
va_5_train = VectorAssembler(outputCol="features", inputCols=window_5_train.columns[0:-1])
training_set_5 = va_5_train.transform(window_5_train).select("features", "label").cache()

va_5_test = VectorAssembler(outputCol="features", inputCols=window_5_test.columns[0:-1])
testing_set_5 = va_5_test.transform(window_5_test).select("features", "label").cache()

## 1.3 Model 1: Random Forest Classifier

### 1.3.1 model training and evaluation

In [11]:
from time import time
start_rf_5 = time()
rf_5 = RandomForestClassifier(maxDepth=30, seed=42)
rf_model_5 = rf_5.fit(training_set_5)


rf_5_predict = rf_model_5.transform(testing_set_5)

evaluator_f1_5 = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="f1")
f1_window_5_rf = evaluator_f1_5.evaluate(rf_5_predict)

evaluator_accuracy_5 = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy_window_5_rf = evaluator_accuracy_5.evaluate(rf_5_predict)

evaluator_precision_5 = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="weightedPrecision")
precision_window_5_rf = evaluator_precision_5.evaluate(rf_5_predict)

evaluator_recall_5 = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="weightedRecall")
recall_window_5_rf = evaluator_recall_5.evaluate(rf_5_predict)
end_rf_5 = time()

### 1.3.2 confusion matrix

In [13]:
rf_5_predict.select('label','prediction').groupBy('label','prediction').count().show()

+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|    1|       0.0|   14|
|    0|       0.0|   81|
|    1|       1.0|   52|
|    0|       1.0|   17|
+-----+----------+-----+



### 1.3.3 classification metrics

In [14]:
print("F1 score = %g" % (f1_window_5_rf))
print("Accuracy = %g" % (accuracy_window_5_rf))
print("Test Error = %g" % (1.0 - accuracy_window_5_rf))
print("Precision = %g" % (precision_window_5_rf))
print("Recall = %g" % (recall_window_5_rf))
print("TimeInSeconds = %g" % (end_rf_5-start_rf_5))

F1 score = 0.811607
Accuracy = 0.810976
Test Error = 0.189024
Precision = 0.812787
Recall = 0.810976
TimeInSeconds = 22.6219


## Cross-validation and Hyperparameter tuning

In [19]:
# rf_cv = RandomForestClassifier()
# evaluator_cv = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", 
#                                               metricName="accuracy")

# cv = CrossValidator().setEstimator(rf_cv).setEvaluator(evaluator_cv).setNumFolds(5)

# paramGrid = ParamGridBuilder().addGrid(rf_cv.numTrees, [30, 50, 70])\
#                               .addGrid(rf_cv.maxDepth, [20,25,30])\
#                               .addGrid(rf_cv.featureSubsetStrategy, ['auto', 'all', 'onethird'])\
#                               .build()

# cv.setEstimatorParamMaps(paramGrid)

# cvmodel = cv.fit(training_set_5)

# print(cvmodel.bestModel._java_obj.getNumTrees())
# print(cvmodel.bestModel._java_obj.getFeatureSubsetStrategy())
# print(cvmodel.bestModel._java_obj.getImpurity())
# print("Accuracy : " +  str(MulticlassClassificationEvaluator()\
#                            .evaluate(cvmodel.bestModel.transform(testing_set_5))))

## Feature Importance & Feature Selection

Some features have very low importance that the importance score is returned as 0.
We use features with non-zero importance in other algorithms to see if using a more selective set of features helps increases their performance. 

In [20]:
features_dict = dict()
for ft_index, ft_name in zip(range(0, 324), window_5_df.columns):
    features_dict[ft_index] = ft_name

# select the features that 
n_features = len(rf_model_5.featureImportances.indices) 

important_features = []
for impt, feature_index in sorted(zip(rf_model_5.featureImportances.values, 
                                      rf_model_5.featureImportances.indices), reverse=True)[:n_features]:
    important_features.append(features_dict[feature_index])
    print(f"{features_dict[feature_index]} : {impt}")

global_max_z_val_accelerometer : 0.027321605358731408
global_95_percentile_z_val_accelerometer : 0.024176559441324966
global_avg_z_val_accelerometer : 0.020659353947089495
global_min_y_val_accelerometer : 0.019484144215838693
bin2_95_percentile_z_val_gyroscope : 0.016685499473687065
bin2_std_y_val_accelerometer : 0.015407014249715679
bin3_avg_z_val_accelerometer : 0.015002890224686309
global_std_x_val_accelerometer : 0.013946045873114401
bin2_75_percentile_z_val_gyroscope : 0.011690661552343754
bin2_std_z_val_gyroscope : 0.011550934152936416
global_std_y_val_accelerometer : 0.010509143403371703
bin2_max_z_val_gyroscope : 0.010026933369531556
bin3_25_percentile_z_val_accelerometer : 0.010008345770625052
global_max_z_val_gyroscope : 0.009866967852499964
global_95_percentile_y_val_gyroscope : 0.009718336886372319
bin3_75_percentile_z_val_accelerometer : 0.009494672399274496
global_5_percentile_y_val_accelerometer : 0.009356038221645852
bin2_avg_y_val_gyroscope : 0.008996379888690292
bin4_

In [21]:
# unimportant features
unimportant_features = [feature for feature in range(324) 
                        if feature not in rf_model_5.featureImportances.indices] # indices

unimportant_features = [features_dict[index] for index in unimportant_features]

__Features vector and Label vector for the selected set of features__

We will use the features and label vectors built from this set of `important_features` as inputs for an alternative model of the following algorithms and compare the performance before and after feature selection:

In [22]:
va_5_train_top = VectorAssembler(outputCol="features", inputCols=important_features)
training_set_5_top = va_5_train_top.transform(window_5_train).select("features", "label")

va_5_test_top = VectorAssembler(outputCol="features", inputCols=important_features)
testing_set_5_top = va_5_test_top.transform(window_5_test).select("features", "label")
training_set_5_top.cache()
testing_set_5_top.cache()

DataFrame[features: vector, label: int]

## 1.4 Model 2: Logistic Regression 

### 1.4.1 model training and evaluation

In [23]:
start_lr_5 = time()

lr_5 = LogisticRegression(regParam=0.01, maxIter=1000, fitIntercept=True)
lr_model_5 = lr_5.fit(training_set_5)

lr_5_predict = lr_model_5.transform(testing_set_5)
#lr_5_predict.show()

bc_eval = BinaryClassificationEvaluator()

evaluator_f1_5_lr = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="f1")
f1_window_5_lr = evaluator_f1_5.evaluate(lr_5_predict)

evaluator_accuracy_5_lr = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy_window_5_lr = evaluator_accuracy_5.evaluate(lr_5_predict)

evaluator_precision_5_lr = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="weightedPrecision")
precision_window_5_lr = evaluator_precision_5_lr.evaluate(lr_5_predict)

evaluator_recall_5_lr = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="weightedRecall")
recall_window_5_lr = evaluator_recall_5_lr.evaluate(lr_5_predict)

end_lr_5 = time()

### 1.4.2 confusion matrix

In [24]:
lr_5_predict.select('label','prediction').groupBy('label','prediction').count().show()

+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|    1|       0.0|   18|
|    0|       0.0|   79|
|    1|       1.0|   48|
|    0|       1.0|   19|
+-----+----------+-----+



### 1.4.3 classification metrics

In [25]:
print("F1 score = %g" % (f1_window_5_lr))
print("Accuracy = %g" % (accuracy_window_5_lr))
print("Test Error = %g" % (1.0 - accuracy_window_5_lr))
print("Precision = %g" % (precision_window_5_lr))
print("Recall = %g" % (recall_window_5_lr))
print("TimeInSeconds = %g" % (end_lr_5-start_lr_5))

F1 score = 0.77466
Accuracy = 0.77439
Test Error = 0.22561
Precision = 0.774988
Recall = 0.77439
TimeInSeconds = 14.3331


## Logistic Regression on selected features

In [27]:
start_lr_5_top = time()
lr_5_top = LogisticRegression(regParam=0.01, maxIter=1000, fitIntercept=True)
lr_model_5_top = lr_5_top.fit(training_set_5_top)

lr_5_predict_top = lr_model_5_top.transform(testing_set_5_top)
# lr_5_predict_top.show()

bc_eval = BinaryClassificationEvaluator()
# print (bc_eval.getMetricName() +":" + str(bc_eval.evaluate(lr_5_predict_top)))

evaluator_f1_5_lr_top = MulticlassClassificationEvaluator(labelCol="label", 
                                                          predictionCol="prediction", 
                                                          metricName="f1")
f1_window_5_lr_top = evaluator_f1_5_lr_top.evaluate(lr_5_predict_top)

evaluator_accuracy_5_lr_top = MulticlassClassificationEvaluator(labelCol="label", 
                                                                predictionCol="prediction", 
                                                                metricName="accuracy")
accuracy_window_5_lr_top = evaluator_accuracy_5.evaluate(lr_5_predict_top)

evaluator_precision_5_lr_top = MulticlassClassificationEvaluator(labelCol="label", 
                                                                 predictionCol="prediction", 
                                                                 metricName="weightedPrecision")
precision_window_5_lr_top = evaluator_precision_5_lr_top.evaluate(lr_5_predict_top)

evaluator_recall_5_lr_top = MulticlassClassificationEvaluator(labelCol="label", 
                                                              predictionCol="prediction", 
                                                              metricName="weightedRecall")
recall_window_5_lr_top = evaluator_recall_5_lr_top.evaluate(lr_5_predict_top)
end_lr_5_top = time()

### Confusion matrix

In [28]:
lr_5_predict_top.select('label','prediction').groupBy('label','prediction').count().show()

+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|    1|       0.0|   17|
|    0|       0.0|   79|
|    1|       1.0|   49|
|    0|       1.0|   19|
+-----+----------+-----+



### Classification metrics

In [29]:
print("F1 score = %g" % (f1_window_5_lr_top))
print("Accuracy = %g" % (accuracy_window_5_lr_top))
print("Test Error = %g" % (1.0 - accuracy_window_5_lr_top))
print("Precision = %g" % (precision_window_5_lr_top))
print("Recall = %g" % (recall_window_5_lr_top))

F1 score = 0.780994
Accuracy = 0.780488
Test Error = 0.219512
Precision = 0.781736
Recall = 0.780488


## 1.5 Model 3: Gradient-boosted tree classifier

### 1.5.1 model training and evaluation

In [30]:
start_gbt_5 = time()
gbt_5 = GBTClassifier(maxIter=50, seed=42)
gbt_model_5 = gbt_5.fit(training_set_5)

gbt_5_predict = gbt_model_5.transform(testing_set_5)

gbt_evaluator_accuracy_5 = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
gbt_accuracy_5 = gbt_evaluator_accuracy_5.evaluate(gbt_5_predict)

gbt_evaluator_precision_5 = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="weightedPrecision")
gbt_precision_5 = gbt_evaluator_precision_5.evaluate(gbt_5_predict)

gbt_evaluator_f1_5 = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="f1")
gbt_f1_5 = gbt_evaluator_f1_5.evaluate(gbt_5_predict)

gbt_evaluator_recall_5 = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="weightedRecall")
gbt_recall_5 = gbt_evaluator_recall_5.evaluate(gbt_5_predict)

end_gbt_5 = time()

### 1.5.2 confusion matrix

In [31]:
gbt_5_predict.select('label','prediction').groupBy('label','prediction').count().show()

+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|    1|       0.0|   11|
|    0|       0.0|   81|
|    1|       1.0|   55|
|    0|       1.0|   17|
+-----+----------+-----+



### 1.5.3 classification metrics

In [32]:
print("F1 score = %g" % (gbt_f1_5))
print("Accuracy = %g" % (gbt_accuracy_5))
print("Test Error = %g" % (1.0 - gbt_accuracy_5))
print("Precision = %g" % (gbt_precision_5))
print("Recall = %g" % (gbt_recall_5))
print("TimeInSeconds = %g" % (end_gbt_5-start_gbt_5))

F1 score = 0.830284
Accuracy = 0.829268
Test Error = 0.170732
Precision = 0.833532
Recall = 0.829268
TimeInSeconds = 54.9102


### Gradient-boosted tree on selected features

In [33]:
start_gbt_5_top = time()
gbt_5_top = GBTClassifier(maxIter=50, seed=42)
gbt_model_5_top = gbt_5_top.fit(training_set_5_top)

gbt_5_predict_top = gbt_model_5_top.transform(testing_set_5_top)

gbt_evaluator_accuracy_5_top = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
gbt_accuracy_5_top = gbt_evaluator_accuracy_5_top.evaluate(gbt_5_predict_top)

gbt_evaluator_precision_5_top = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="weightedPrecision")
gbt_precision_5_top = gbt_evaluator_precision_5_top.evaluate(gbt_5_predict_top)

gbt_evaluator_f1_5_top = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="f1")
gbt_f1_5_top = gbt_evaluator_f1_5_top.evaluate(gbt_5_predict_top)

gbt_evaluator_recall_5_top = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="weightedRecall")
gbt_recall_5_top = gbt_evaluator_recall_5_top.evaluate(gbt_5_predict_top)

end_gbt_5_top = time()

### Confusion matrix

In [34]:
gbt_5_predict_top.select('label','prediction').groupBy('label','prediction').count().show()

+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|    1|       0.0|    8|
|    0|       0.0|   79|
|    1|       1.0|   58|
|    0|       1.0|   19|
+-----+----------+-----+



### Performance metrics

In [35]:
print("F1 score = %g" % (gbt_f1_5_top))
print("Accuracy = %g" % (gbt_accuracy_5_top))
print("Test Error = %g" % (1.0 - gbt_accuracy_5_top))
print("Precision = %g" % (gbt_precision_5_top))
print("Recall = %g" % (gbt_recall_5_top))

F1 score = 0.836803
Accuracy = 0.835366
Test Error = 0.164634
Precision = 0.845749
Recall = 0.835366


## 1.6 Model 4: Linear Support Vector Machine

### 1.6.1 model training and evaluation

In [36]:
start_svm_5 = time()
svm_5 = LinearSVC(maxIter=100, regParam=0.1)
svm_model_5 = svm_5.fit(training_set_5)

svm_5_predict = svm_model_5.transform(testing_set_5)

svm_evaluator_accuracy_5 = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
svm_accuracy_5 = svm_evaluator_accuracy_5.evaluate(svm_5_predict)

svm_evaluator_precision_5 = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="weightedPrecision")
svm_precision_5 = svm_evaluator_precision_5.evaluate(svm_5_predict)

svm_evaluator_f1_5 = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="f1")
svm_f1_5 = svm_evaluator_f1_5.evaluate(svm_5_predict)

svm_evaluator_recall_5 = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="weightedRecall")
svm_recall_5 = svm_evaluator_recall_5.evaluate(svm_5_predict)
end_svm_5 = time()

### 1.6.2 confusion matrix

In [37]:
svm_5_predict.select('label','prediction').groupBy('label','prediction').count().show()

+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|    1|       0.0|   16|
|    0|       0.0|   79|
|    1|       1.0|   50|
|    0|       1.0|   19|
+-----+----------+-----+



### 1.6.3 classification metrics

In [38]:
print("F1 score = %g" % (svm_f1_5))
print("Accuracy = %g" % (svm_accuracy_5))
print("Test Error = %g" % (1.0 - svm_accuracy_5))
print("Precision = %g" % (svm_precision_5))
print("Recall = %g" % (svm_recall_5))
print("TimeInSeconds = %g" % (end_svm_5-start_svm_5))

F1 score = 0.787298
Accuracy = 0.786585
Test Error = 0.213415
Precision = 0.788542
Recall = 0.786585
TimeInSeconds = 13.9244


## Linear Support Vector Machine on selected features

In [39]:
start_svm_5_top = time()

svm_5_top = LinearSVC(maxIter=100, regParam=0.1)
svm_model_5_top = svm_5_top.fit(training_set_5_top)

svm_5_predict_top = svm_model_5_top.transform(testing_set_5_top)

svm_evaluator_accuracy_5_top = MulticlassClassificationEvaluator(labelCol="label", 
                                                             predictionCol="prediction", 
                                                             metricName="accuracy")
svm_accuracy_5_top = svm_evaluator_accuracy_5_top.evaluate(svm_5_predict_top)

svm_evaluator_precision_5_top = MulticlassClassificationEvaluator(labelCol="label", 
                                                              predictionCol="prediction", 
                                                              metricName="weightedPrecision")
svm_precision_5_top = svm_evaluator_precision_5_top.evaluate(svm_5_predict_top)

svm_evaluator_f1_5_top = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="f1")
svm_f1_5_top = svm_evaluator_f1_5_top.evaluate(svm_5_predict_top)

svm_evaluator_recall_5_top = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="weightedRecall")
svm_recall_5_top = svm_evaluator_recall_5_top.evaluate(svm_5_predict_top)

end_svm_5_top = time()

### Confusion Matrix

In [40]:
svm_5_predict_top.select('label','prediction').groupBy('label','prediction').count().show()

+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|    1|       0.0|   16|
|    0|       0.0|   79|
|    1|       1.0|   50|
|    0|       1.0|   19|
+-----+----------+-----+



### Performance metrics

In [41]:
print("F1 score = %g" % (svm_f1_5_top))
print("Accuracy = %g" % (svm_accuracy_5_top))
print("Test Error = %g" % (1.0 - svm_accuracy_5_top))
print("Precision = %g" % (svm_precision_5_top))
print("Recall = %g" % (svm_recall_5_top))

F1 score = 0.787298
Accuracy = 0.786585
Test Error = 0.213415
Precision = 0.788542
Recall = 0.786585


## 1.7 Model 5: OneVsRest 

### 1.7.1 model training and evaluation

In [42]:
start_ovr_5 = time()
ovr = OneVsRest(classifier=lr_5)
evaluator_f1 = MulticlassClassificationEvaluator(labelCol="label",
                                                 predictionCol="prediction",
                                                 metricName="f1")
evaluator_accuracy = MulticlassClassificationEvaluator(labelCol="label",
                                                       predictionCol="prediction",
                                                       metricName="accuracy")
evaluator_precision = MulticlassClassificationEvaluator(labelCol="label",
                                                        predictionCol="prediction",
                                                        metricName="weightedPrecision")
evaluator_recall = MulticlassClassificationEvaluator(labelCol="label",
                                                        predictionCol="prediction",
                                                        metricName="weightedRecall")
ovr_model_5 = ovr.fit(training_set_5)
ovr_5_predict = ovr_model_5.transform(testing_set_5)

f1_window_5_ovr = evaluator_f1.evaluate(ovr_5_predict)
accuracy_window_5_ovr = evaluator_accuracy.evaluate(ovr_5_predict)
precision_window_5_ovr = evaluator_precision.evaluate(ovr_5_predict)
recall_window_5_ovr = evaluator_recall.evaluate(ovr_5_predict)
end_ovr_5 = time()

### 1.7.2 confusion matrix

In [43]:
ovr_5_predict.select('label','prediction').groupBy('label','prediction').count().show()

+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|    1|       0.0|   18|
|    0|       0.0|   79|
|    1|       1.0|   48|
|    0|       1.0|   19|
+-----+----------+-----+



### 1.7.3 classification metrics

In [44]:
print("F1 score = %g" % (f1_window_5_ovr))
print("Accuracy = %g" % (accuracy_window_5_ovr))
print("Test Error = %g" % (1.0 - accuracy_window_5_ovr))
print("Precision = %g" % (precision_window_5_ovr))
print("Recall = %g" % (recall_window_5_ovr))
print("TimeInSeconds = %g" % (end_ovr_5-start_ovr_5))

F1 score = 0.77466
Accuracy = 0.77439
Test Error = 0.22561
Precision = 0.774988
Recall = 0.77439
TimeInSeconds = 25.7417


## OneVsRest on selected features

In [45]:
start_ovr_5_top = time()

ovr_top = OneVsRest(classifier=lr_5) 
evaluator_f1_top = MulticlassClassificationEvaluator(labelCol="label",
                                                 predictionCol="prediction",
                                                 metricName="f1")
evaluator_accuracy_top = MulticlassClassificationEvaluator(labelCol="label",
                                                       predictionCol="prediction",
                                                       metricName="accuracy")
evaluator_precision_top = MulticlassClassificationEvaluator(labelCol="label",
                                                        predictionCol="prediction",
                                                        metricName="weightedPrecision")
evaluator_recall_top = MulticlassClassificationEvaluator(labelCol="label",
                                                        predictionCol="prediction",
                                                        metricName="weightedRecall")
ovr_model_5_top = ovr_top.fit(training_set_5_top)
ovr_5_predict_top = ovr_model_5_top.transform(testing_set_5_top)

f1_window_5_ovr_top = evaluator_f1_top.evaluate(ovr_5_predict_top)
accuracy_window_5_ovr_top = evaluator_accuracy_top.evaluate(ovr_5_predict_top)
precision_window_5_ovr_top = evaluator_precision_top.evaluate(ovr_5_predict_top)
recall_window_5_ovr_top = evaluator_recall_top.evaluate(ovr_5_predict_top)

end_ovr_5_top = time()

### Confustion matrix

In [46]:
ovr_5_predict_top.select('label','prediction').groupBy('label','prediction').count().show()

+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|    1|       0.0|   17|
|    0|       0.0|   79|
|    1|       1.0|   49|
|    0|       1.0|   19|
+-----+----------+-----+



### Performance metrics

In [47]:
print("F1 score = %g" % (f1_window_5_ovr_top))
print("Accuracy = %g" % (accuracy_window_5_ovr_top))
print("Test Error = %g" % (1.0 - accuracy_window_5_ovr_top))
print("Precision = %g" % (precision_window_5_ovr_top))
print("Recall = %g" % (recall_window_5_ovr_top))
print("TimeInSeconds = %g" % (end_ovr_5-start_ovr_5_top))

F1 score = 0.780994
Accuracy = 0.780488
Test Error = 0.219512
Precision = 0.781736
Recall = 0.780488
TimeInSeconds = -2.59083


## 1.6 Model comparison

In [28]:
# import pandas as pd
# headers=["model","F1 score", "Accuracy", "Test Error", "Precision",
#         "Recall","TimeInSeconds"]
# metrics=[("random forest",f1_window_5_rf, accuracy_window_5_rf, 1.0 - accuracy_window_5_rf,
#             precision_window_5_rf, recall_window_5_rf, end_rf_5-start_rf_5),
#          ("logistic regression", f1_window_5_lr, accuracy_window_5_lr, 1.0 - accuracy_window_5_lr,
#            precision_window_5_lr, recall_window_5_lr, end_lr_5-start_lr_5),
#          ("Gradient-boosted tree classifier", gbt_f1_5, gbt_accuracy_5, 1.0 - gbt_accuracy_5,
#             gbt_precision_5, gbt_recall_5, end_gbt_5-start_gbt_5),
#         ("Support Vector Machine", svm_f1_5, svm_accuracy_5, 1.0 - svm_accuracy_5,
#            svm_precision_5, svm_recall_5, end_svm_5-start_svm_5),
#         ("OneVsRest", f1_window_5_ovr, accuracy_window_5_ovr, 1.0 - accuracy_window_5_ovr,
#            precision_window_5_ovr, recall_window_5_ovr, end_ovr_5-start_ovr_5)]
# df_metrics = pd.DataFrame.from_records(metrics, columns=headers).set_index('model')

In [48]:
import pandas as pd
headers=["model","F1 score", "Accuracy", "Test Error", "Precision",
        "Recall","TimeInSeconds"]
metrics=[("random forest",f1_window_5_rf, accuracy_window_5_rf, 1.0 - accuracy_window_5_rf,
            precision_window_5_rf, recall_window_5_rf, end_rf_5-start_rf_5),
         ("logistic regression", f1_window_5_lr, accuracy_window_5_lr, 1.0 - accuracy_window_5_lr,
           precision_window_5_lr, recall_window_5_lr, end_lr_5-start_lr_5),
         ("logistic regression - selected features", f1_window_5_lr_top, accuracy_window_5_lr_top, 
          1.0 - accuracy_window_5_lr_top, precision_window_5_lr_top, recall_window_5_lr_top, 
          end_lr_5_top-start_lr_5_top),
         ("Gradient-boosted tree classifier", gbt_f1_5, gbt_accuracy_5, 1.0 - gbt_accuracy_5,
            gbt_precision_5, gbt_recall_5, end_gbt_5-start_gbt_5),
         ("Gradient-boosted tree classifier - selected features", gbt_f1_5_top, gbt_accuracy_5_top, 
          1.0 - gbt_accuracy_5_top, gbt_precision_5_top, gbt_recall_5_top, end_gbt_5_top-start_gbt_5_top),
         ("Support Vector Machine", svm_f1_5, svm_accuracy_5, 1.0 - svm_accuracy_5,
           svm_precision_5, svm_recall_5, end_svm_5-start_svm_5),
         ("Support Vector Machine - selected features", svm_f1_5_top, svm_accuracy_5_top, 
          1.0 - svm_accuracy_5_top, svm_precision_5_top, svm_recall_5_top, end_svm_5_top-start_svm_5_top),
         ("OneVsRest", f1_window_5_ovr, accuracy_window_5_ovr, 1.0 - accuracy_window_5_ovr,
           precision_window_5_ovr, recall_window_5_ovr, end_ovr_5-start_ovr_5),
        ("OneVsRest - selected features", f1_window_5_ovr_top, accuracy_window_5_ovr_top, 
         1.0 - accuracy_window_5_ovr_top, precision_window_5_ovr_top, recall_window_5_ovr_top, 
         end_ovr_5_top-start_ovr_5_top)]
df_metrics = pd.DataFrame.from_records(metrics, columns=headers).set_index('model')

In [49]:
df_metrics

Unnamed: 0_level_0,F1 score,Accuracy,Test Error,Precision,Recall,TimeInSeconds
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
random forest,0.811607,0.810976,0.189024,0.812787,0.810976,22.621946
logistic regression,0.77466,0.77439,0.22561,0.774988,0.77439,14.333141
logistic regression - selected features,0.780994,0.780488,0.219512,0.781736,0.780488,14.140279
Gradient-boosted tree classifier,0.830284,0.829268,0.170732,0.833532,0.829268,54.910195
Gradient-boosted tree classifier - selected features,0.836803,0.835366,0.164634,0.845749,0.835366,51.853088
Support Vector Machine,0.787298,0.786585,0.213415,0.788542,0.786585,13.924436
Support Vector Machine - selected features,0.787298,0.786585,0.213415,0.788542,0.786585,12.476128
OneVsRest,0.77466,0.77439,0.22561,0.774988,0.77439,25.741653
OneVsRest - selected features,0.780994,0.780488,0.219512,0.781736,0.780488,27.152684
