In [1]:
from pyspark.sql import SparkSession  
import os                             

from pyspark.sql.types import *

from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.classification import LinearSVC
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.classification import OneVsRest
from pyspark.ml.classification import RandomForestClassifier


In [2]:
def load_from_mongodb(publicIP:str, db:str, collection:str):
    pyspark_submit_args = '--packages org.mongodb.spark:mongo-spark-connector_2.11:2.4.0 pyspark-shell'
    os.environ["PYSPARK_SUBMIT_ARGS"] = pyspark_submit_args
    
    uri = "mongodb://{}/{}.{}".format(publicIP, db, collection)
    
    ss = SparkSession.builder \
                     .appName("myApp") \
                     .config("spark.mongodb.input.uri", uri)\
                     .getOrCreate()
    loaded_data = ss.read.format("com.mongodb.spark.sql.DefaultSource").load()
    return loaded_data

In [3]:
def labelize(df, labelCol:str):
    """
    Rename the specified column as `label` in the input DataFrame object df.
    """
    if labelCol != 'label':
        return df.withColumnRenamed(existing=labelCol, new='label')

In [4]:
def binarize_label(df, targetValues:list):
    """
    Label the `targetValues` in the column `label` as 1 in the DataFrame df.
    """
    from pyspark.sql.functions import udf
    
    def label_converter(label):
        if label in targetValues:
            return int(1)
        else: 
            return 0

    label_udf = udf(f=label_converter, returnType=IntegerType())
    
    df_binarized = df.withColumn('numeric_label', label_udf(df['label']))\
                     .drop('label')\
                     .withColumnRenamed('numeric_label', 'label')
    return df_binarized

In [5]:
def create_train_test_sets(labelizedDF,
                           featureCols:list, splitRatio:list, labelCol="label", seed=None):
    """
    Transform a clean DataFrame with numeric label columns 
    into features and label vectors for modeling.
    Split into training and test set and cache them in the memory.
    """
    # Turn raw columns into features and label vectors
    from pyspark.ml.feature import VectorAssembler
    
    # if label column is not 'label' then change it into 'label'
    labelizedDF = labelizedDF.withColumnRenamed(existing=labelCol, new='label')
    
    va = VectorAssembler(inputCols=featureCols, outputCol='features')
    labeledPoints = va.transform(labelizedDF).select('features', labelCol)
    
    
    
    # Then split into train and test set
    train, test = labeledPoints.randomSplit(splitRatio, seed=seed)
    
    return train.cache(), test.cache()

In [6]:
def create_evaluators(evalType:str, metrics:list, 
                      labelCol='label', predictionCol='prediction'):
    """
    Initiate either 'bina' (binary) or 'multi' (multiclass) classification evaluators.
    """
    
    if evalType=='multi':
        from pyspark.ml.evaluation import MulticlassClassificationEvaluator
        evaluators = [MulticlassClassificationEvaluator(metricName=metric) for metric in metrics]
        return evaluators
    
    elif evalType=='bina':
        from pyspark.ml.evaluation import BinaryClassificationEvaluator
        evaluators = [BinaryClassificationEvaluator(metricName=metric) for metric in metrics]
        return evaluators
    
    else:
        print('Invalid input.')
        return None

In [7]:
def modeling(training_data, clf):
    """
    Take a training DataFrame, an estimator, and a performance metric.
    Return the trained model and training time. 
    """
    from time import time
    start_train = time()
    clf_trained = clf.fit(training_data)
    training_time = time() - start_train
    print("Training time in seconds is {}.\n".format(training_time))
    
    return clf_trained, training_time

In [8]:
def model_evaluate(testing_data, trained_clf, evaluators:list):
    """
    Take a test set, a trained model, and performance metrics for evaluation.
    Return performance metrics and prediction time.
    """
    from time import time
    start_pred = time()
    pred = trained_clf.transform(testing_data)
    pred_time = time() - start_pred
    
    pred.select('label','prediction').groupBy('label','prediction').count().show()
    
    scores = []
    for evaluator in evaluators:
        score = evaluator.evaluate(pred)
        scores.append(score)
        print(f"{evaluator.getMetricName().capitalize()} score = {score}")
        if evaluator.getMetricName() == 'accuracy':
            print(f"Test Error = {1 - score}")
#             scores.append(1-score)
    print(f"\nPredition time in seconds is: {pred_time}")
    return scores, pred_time

In [9]:
def run_model(clf, training_data, testing_data, evaluators):
    """
    Combine all model fitting and evaluation functions together.
    """
    trained_model, training_time = modeling(training_data=training_set, clf=clf)
    scores, pred_time = model_evaluate(testing_data=testing_set, trained_clf=trained_model, evaluators=evaluators)
    return trained_model, training_time, pred_time, scores

# Main program

In [10]:
# main program

# load data from MongoDB
df_raw = load_from_mongodb(publicIP='18.217.205.7', db='sensors', collection='forty')

# rename label column as 'label'
df = labelize(df_raw, 'profile_activity')

# binarize target labels into 1 and 0
targetActions = ['non_dominant_pill_med', 'dominant_pill_med', 'dominant_liquid_med', 'non_dominant_liquid_med']
df = binarize_label(df, targetValues=targetActions)

# create train, test sets
feature_cols = df.drop('_id', 'label').columns
training_set, testing_set = create_train_test_sets(df, featureCols=feature_cols, labelCol='label', 
                                                   seed=1, splitRatio=[0.8, 0.2])


# initiate performance evaluators to compare all models
metrics = ['f1', 'accuracy', 'weightedPrecision', 'weightedRecall']
evaluators = create_evaluators(evalType='multi', metrics=metrics, predictionCol='prediction')

## Random Forest with default parameter settings

In [11]:
rf = RandomForestClassifier(seed = 42)
rf_model, rf_training_time, rf_pred_time, rf_scores = run_model(clf=rf, evaluators=evaluators, 
                                                                training_data=training_set, 
                                                                testing_data=testing_set)

Training time in seconds is 45.502580642700195.

+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|    1|       0.0|    5|
|    0|       0.0|   75|
|    1|       1.0|   85|
|    0|       1.0|   10|
+-----+----------+-----+

F1 score = 0.9141453141453142
Accuracy score = 0.9142857142857143
Test Error = 0.08571428571428574
Weightedprecision score = 0.9155075187969925
Weightedrecall score = 0.9142857142857143

Predition time in seconds is: 0.16902375221252441


## Random Forest with tuned hyperparameters

In [12]:
rf_tune = RandomForestClassifier(maxDepth=30, numTrees=50, seed = 42)

rf_tune_model, rf_tune_training_time, rf_tune_pred_time, rf_tune_scores = run_model(clf=rf_tune, 
                                                                                    evaluators=evaluators, 
                                                                                    training_data=training_set, 
                                                                                    testing_data=testing_set)

Training time in seconds is 14.069912672042847.

+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|    1|       0.0|    2|
|    0|       0.0|   83|
|    1|       1.0|   88|
|    0|       1.0|    2|
+-----+----------+-----+

F1 score = 0.9771428571428572
Accuracy score = 0.9771428571428571
Test Error = 0.02285714285714291
Weightedprecision score = 0.9771428571428572
Weightedrecall score = 0.9771428571428572

Predition time in seconds is: 0.12610292434692383


## Random Forest with tuned hyperparameters 2

In [13]:
rf_tune2 = RandomForestClassifier(maxDepth=30, numTrees=50, featureSubsetStrategy='sqrt', seed = 42)
rf_tune2_model, rf_tune2_training_time, rf_tune2_pred_time, rf_tune2_scores = run_model(clf=rf_tune2, 
                                                                                        evaluators=evaluators, 
                                                                                        training_data=training_set,
                                                                                        testing_data=testing_set)

Training time in seconds is 13.312894105911255.

+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|    1|       0.0|    2|
|    0|       0.0|   83|
|    1|       1.0|   88|
|    0|       1.0|    2|
+-----+----------+-----+

F1 score = 0.9771428571428572
Accuracy score = 0.9771428571428571
Test Error = 0.02285714285714291
Weightedprecision score = 0.9771428571428572
Weightedrecall score = 0.9771428571428572

Predition time in seconds is: 0.13925385475158691


## Logistic Regression 

In [14]:
lr = LogisticRegression(regParam=0.01, maxIter=1000, fitIntercept=True)
lr_model, lr_training_time, lr_pred_time, lr_scores = run_model(clf=lr, evaluators=evaluators, 
                                                                training_data=training_set, 
                                                                testing_data=testing_set)

Training time in seconds is 60.9147207736969.

+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|    1|       0.0|    7|
|    0|       0.0|   77|
|    1|       1.0|   83|
|    0|       1.0|    8|
+-----+----------+-----+

F1 score = 0.9142689015192202
Accuracy score = 0.9142857142857143
Test Error = 0.08571428571428574
Weightedprecision score = 0.9143118785975928
Weightedrecall score = 0.9142857142857144

Predition time in seconds is: 0.12568426132202148


## Gradient-Boosted Tree Classifier

In [15]:
gbt = GBTClassifier(maxIter=50, seed=42)
gbt_model, gbt_training_time, gbt_pred_time, gbt_scores = run_model(clf=gbt, evaluators=evaluators, 
                                                                    training_data=training_set, 
                                                                    testing_data=testing_set)

Training time in seconds is 208.78445649147034.

+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|    1|       0.0|    2|
|    0|       0.0|   84|
|    1|       1.0|   88|
|    0|       1.0|    1|
+-----+----------+-----+

F1 score = 0.982859383094608
Accuracy score = 0.9828571428571429
Test Error = 0.017142857142857126
Weightedprecision score = 0.9829258277651274
Weightedrecall score = 0.9828571428571429

Predition time in seconds is: 0.1262812614440918


## Linear Support Vector Machine

In [16]:
svm = LinearSVC(maxIter=100, regParam=0.1)
svm_model, svm_training_time, svm_pred_time, svm_scores = run_model(clf=svm, evaluators=evaluators, 
                                                                    training_data=training_set, 
                                                                    testing_data=testing_set)

Training time in seconds is 20.61867332458496.

+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|    1|       0.0|    8|
|    0|       0.0|   73|
|    1|       1.0|   82|
|    0|       1.0|   12|
+-----+----------+-----+

F1 score = 0.8855795854224351
Accuracy score = 0.8857142857142857
Test Error = 0.11428571428571432
Weightedprecision score = 0.8863747232541559
Weightedrecall score = 0.8857142857142857

Predition time in seconds is: 0.06303858757019043


## OneVsRest

In [17]:
ovr = OneVsRest(classifier=lr)
ovr_model, ovr_training_time, ovr_pred_time, ovr_scores = run_model(clf=ovr, evaluators=evaluators, 
                                                                    training_data=training_set, 
                                                                    testing_data=testing_set)

Training time in seconds is 117.82324767112732.

+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|    1|       0.0|    7|
|    0|       0.0|   77|
|    1|       1.0|   83|
|    0|       1.0|    8|
+-----+----------+-----+

F1 score = 0.9142689015192202
Accuracy score = 0.9142857142857143
Test Error = 0.08571428571428574
Weightedprecision score = 0.9143118785975928
Weightedrecall score = 0.9142857142857144

Predition time in seconds is: 0.5393030643463135


# Random Forest with selected features

After running random forest models, we noticed that not all features help the model. Some features have very low importance that the importance score is returned as 0. We use features with non-zero importance in some algorithms to see if using a more selective set of features helps increases their performance.

## Extract important features

In [18]:
# allFeatures = df.drop('_id','label').columns

features_dict = dict()
for ft_index, ft_name in enumerate(df.columns):
    features_dict[ft_index] = ft_name
# colIndex_to_delete = [colIndex for colIndex, colName in features_dict.items() 
#                       if colName in ['_id', 'label']]
# for colIndex in colIndex_to_delete:
#     del features_dict[colIndex]

# select the features that have non-zero significance
n_features = len(rf_tune_model.featureImportances.indices) 

important_features = []
for impt, feature_index in sorted(zip(rf_tune_model.featureImportances.values, 
                                      rf_tune_model.featureImportances.indices), reverse=True)[:n_features]:
    important_features.append(features_dict[feature_index+1])
    print(f"{features_dict[feature_index+1]} : {impt}")

global_max_z_val_accelerometer : 0.018102714742531036
global_std_y_val_accelerometer : 0.008470467932173794
global_95_percentile_z_val_accelerometer : 0.008216018076404718
global_std_x_val_accelerometer : 0.007398996175978697
bin19_std_y_val_accelerometer : 0.0069988666407970355
global_75_percentile_z_val_accelerometer : 0.006887370583319257
bin8_std_z_val_gyroscope : 0.006645707003685561
bin18_std_y_val_gyroscope : 0.006339215466729085
global_min_z_val_accelerometer : 0.006266699455065079
bin6_max_z_val_accelerometer : 0.006077281220544812
bin18_std_z_val_gyroscope : 0.005999144359622649
bin16_std_y_val_accelerometer : 0.005415515487585808
bin7_75_percentile_z_val_accelerometer : 0.005318877469731636
global_max_x_val_gyroscope : 0.004717285879776724
bin17_std_y_val_gyroscope : 0.004627222012141572
global_std_y_val_gyroscope : 0.0044772667362868454
global_5_percentile_x_val_gyroscope : 0.004277094525262298
bin9_95_percentile_z_val_accelerometer : 0.004162615750419789
bin8_max_z_val_gyr

bin39_max_x_val_accelerometer : 0.0004338348318547925
bin27_5_percentile_z_val_accelerometer : 0.0004338137410600568
bin34_avg_x_val_accelerometer : 0.00043341647152240326
bin11_min_z_val_gyroscope : 0.0004333887911989299
bin29_95_percentile_y_val_gyroscope : 0.00043217037971136316
bin27_75_percentile_x_val_accelerometer : 0.0004306497081811824
bin37_75_percentile_z_val_gyroscope : 0.0004304747811171099
bin18_min_y_val_accelerometer : 0.0004300281643258863
bin1_med_x_val_accelerometer : 0.00042958240062058783
bin33_avg_y_val_gyroscope : 0.00042952092798036
bin37_avg_x_val_gyroscope : 0.0004294242203732321
bin4_25_percentile_y_val_accelerometer : 0.00042778002052782385
bin39_max_y_val_accelerometer : 0.0004275314575524841
bin12_25_percentile_x_val_gyroscope : 0.0004263112251892045
bin35_5_percentile_z_val_accelerometer : 0.00042502673966094334
bin21_5_percentile_z_val_gyroscope : 0.0004230699350492309
bin21_75_percentile_y_val_gyroscope : 0.0004211345084543041
bin4_75_percentile_z_val_a

bin6_25_percentile_y_val_gyroscope : 0.00010763475731628587
bin32_75_percentile_z_val_accelerometer : 0.00010756631758796898
bin32_max_y_val_gyroscope : 0.00010713960180736245
bin22_25_percentile_x_val_gyroscope : 0.00010711087811762791
bin33_min_x_val_accelerometer : 0.00010674385142708205
bin31_25_percentile_z_val_gyroscope : 0.00010660559876811281
bin26_75_percentile_y_val_accelerometer : 0.0001063798394711991
bin38_min_z_val_gyroscope : 0.00010634671251556322
bin12_min_x_val_gyroscope : 0.0001061822894141684
bin17_med_y_val_gyroscope : 0.00010613940755121544
bin18_max_x_val_gyroscope : 0.00010613508049405478
bin37_min_y_val_gyroscope : 0.00010593211904050976
bin15_75_percentile_y_val_gyroscope : 0.00010575553415509871
bin22_min_x_val_accelerometer : 0.00010570871083785545
bin30_avg_z_val_accelerometer : 0.00010570778469916822
bin26_75_percentile_x_val_gyroscope : 0.0001055189047315032
bin15_min_y_val_accelerometer : 0.00010535830028626192
bin3_min_y_val_gyroscope : 0.00010532478547

In [19]:
unimportant_features = [featureIndex for featureIndex, featureName in enumerate(df.columns)
                        if featureIndex not in rf_tune_model.featureImportances.indices] # indices

unimportant_features = [features_dict[index] for index in unimportant_features]

__Features vector and Label vector for the selected set of features__

We will use the features and label vectors built from this set of `important_features` as inputs for an alternative model of the following algorithms and compare the performance before and after feature selection

In [20]:
# recreate training and testing set

feature_cols = important_features
training_set_top, testing_set_top = create_train_test_sets(df, featureCols=important_features, labelCol='label', 
                                                   seed=1, splitRatio=[0.8, 0.2])

## RandomForest - selected features 

In [21]:
rf_top = RandomForestClassifier(maxDepth=30, numTrees=50, seed = 42)
rf_top_model, rf_top_training_time, rf_top_pred_time, rf_top_scores = run_model(clf=rf_top, evaluators=evaluators, 
                                             training_data=training_set_top, testing_data=testing_set_top)

Training time in seconds is 11.6944420337677.

+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|    1|       0.0|    2|
|    0|       0.0|   83|
|    1|       1.0|   88|
|    0|       1.0|    2|
+-----+----------+-----+

F1 score = 0.9771428571428572
Accuracy score = 0.9771428571428571
Test Error = 0.02285714285714291
Weightedprecision score = 0.9771428571428572
Weightedrecall score = 0.9771428571428572

Predition time in seconds is: 0.11738920211791992


## Logistic Regression - selected features

In [22]:
lr_top = LogisticRegression(regParam=0.01, maxIter=1000, fitIntercept=True)
lr_top_model, lr_top_training_time, lr_top_pred_time, lr_top_scores = run_model(clf=lr_top, evaluators=evaluators, 
                                                                                training_data=training_set_top, 
                                                                                testing_data=testing_set_top)

Training time in seconds is 58.973448753356934.

+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|    1|       0.0|    7|
|    0|       0.0|   77|
|    1|       1.0|   83|
|    0|       1.0|    8|
+-----+----------+-----+

F1 score = 0.9142689015192202
Accuracy score = 0.9142857142857143
Test Error = 0.08571428571428574
Weightedprecision score = 0.9143118785975928
Weightedrecall score = 0.9142857142857144

Predition time in seconds is: 0.06215310096740723


## Gradient-Boosted Tree - selected features

In [23]:
gbt_top = GBTClassifier(maxIter=50, seed=42)
gbt_top_model, gbt_top_training_time, gbt_top_pred_time, gbt_top_scores = run_model(clf=gbt_top, 
                                                                                    evaluators=evaluators, 
                                                                                    training_data=training_set_top,
                                                                                    testing_data=testing_set_top)

Training time in seconds is 210.01020503044128.

+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|    1|       0.0|    2|
|    0|       0.0|   84|
|    1|       1.0|   88|
|    0|       1.0|    1|
+-----+----------+-----+

F1 score = 0.982859383094608
Accuracy score = 0.9828571428571429
Test Error = 0.017142857142857126
Weightedprecision score = 0.9829258277651274
Weightedrecall score = 0.9828571428571429

Predition time in seconds is: 0.1607198715209961


## Linear SVM - selected features

In [24]:
svm_top = LinearSVC(maxIter=100, regParam=0.1)
svm_top_model, svm_top_training_time, svm_top_pred_time, svm_top_scores = run_model(clf=svm_top, 
                                                                                    evaluators=evaluators, 
                                                                                    training_data=training_set_top, 
                                                                                    testing_data=testing_set_top)

Training time in seconds is 24.847747802734375.

+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|    1|       0.0|    8|
|    0|       0.0|   73|
|    1|       1.0|   82|
|    0|       1.0|   12|
+-----+----------+-----+

F1 score = 0.8855795854224351
Accuracy score = 0.8857142857142857
Test Error = 0.11428571428571432
Weightedprecision score = 0.8863747232541559
Weightedrecall score = 0.8857142857142857

Predition time in seconds is: 0.15775322914123535


## OneVsRest - selected features

In [25]:
ovr_top = OneVsRest(classifier=lr)
ovr_top_model, ovr_top_training_time, ovr_top_pred_time, ovr_top_scores = run_model(clf=ovr_top, evaluators=evaluators, 
                                             training_data=training_set_top, testing_data=testing_set_top)

Training time in seconds is 113.96643733978271.

+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|    1|       0.0|    7|
|    0|       0.0|   77|
|    1|       1.0|   83|
|    0|       1.0|    8|
+-----+----------+-----+

F1 score = 0.9142689015192202
Accuracy score = 0.9142857142857143
Test Error = 0.08571428571428574
Weightedprecision score = 0.9143118785975928
Weightedrecall score = 0.9142857142857144

Predition time in seconds is: 0.5405926704406738


# Model comparison

In [26]:
import pandas as pd

headers=["Model","F1 score", "Accuracy", "Precision",
        "Recall","Training Time (s)", "Prediction Time (s)"]

metrics=[("RandomForest", *rf_scores, rf_training_time, rf_pred_time),
         ("RandomForest - Hyperparameters Tuned", *rf_tune_scores, rf_tune_training_time, rf_pred_time),
         ("RandomForest - selected features", *rf_tune2_scores, rf_tune2_training_time, rf_tune2_pred_time),
         ("Logistic Regression", *lr_scores, lr_training_time, lr_pred_time), 
         ("Logistic Regression - selected features", *lr_top_scores, lr_top_training_time, lr_top_pred_time),
         ("Gradient-boosted tree classifier", *gbt_scores, gbt_training_time, gbt_pred_time),
         ("Gradient-boosted tree classifier - selected features",*gbt_top_scores, gbt_training_time, gbt_pred_time),
         ("Support Vector Machine", *svm_scores, svm_training_time, svm_pred_time),
         ("Support Vector Machine - selected features", *svm_top_scores, svm_top_training_time, svm_top_pred_time),
         ("OneVsRest", *ovr_scores, ovr_training_time, ovr_pred_time),
         ("OneVsRest - selected features", *ovr_top_scores, ovr_top_training_time, ovr_top_pred_time)]

df_metrics = pd.DataFrame.from_records(metrics, columns=headers).set_index('Model')

In [27]:
df_metrics.sort_values(by='F1 score', ascending=False)

Unnamed: 0_level_0,F1 score,Accuracy,Precision,Recall,Training Time (s),Prediction Time (s)
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Gradient-boosted tree classifier,0.982859,0.982857,0.982926,0.982857,208.784456,0.126281
Gradient-boosted tree classifier - selected features,0.982859,0.982857,0.982926,0.982857,208.784456,0.126281
RandomForest - Hyperparameters Tuned,0.977143,0.977143,0.977143,0.977143,14.069913,0.169024
RandomForest - selected features,0.977143,0.977143,0.977143,0.977143,13.312894,0.139254
Logistic Regression,0.914269,0.914286,0.914312,0.914286,60.914721,0.125684
Logistic Regression - selected features,0.914269,0.914286,0.914312,0.914286,58.973449,0.062153
OneVsRest,0.914269,0.914286,0.914312,0.914286,117.823248,0.539303
OneVsRest - selected features,0.914269,0.914286,0.914312,0.914286,113.966437,0.540593
RandomForest,0.914145,0.914286,0.915508,0.914286,45.502581,0.169024
Support Vector Machine,0.88558,0.885714,0.886375,0.885714,20.618673,0.063039


# Extra - Cross validation

We commented out this section as we have implemented the best model based on this code already above.

In [28]:
# rf_cv = RandomForestClassifier()
# evaluator_cv = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", 
#                                               metricName="accuracy")

# cv = CrossValidator().setEstimator(rf_cv).setEvaluator(evaluator_cv).setNumFolds(5)

# paramGrid = ParamGridBuilder().addGrid(rf_cv.numTrees, [30, 50, 70])\
#                               .addGrid(rf_cv.maxDepth, [10, 20, 30])\
#                               .addGrid(rf_cv.featureSubsetStrategy, ['auto', 'all', 'onethird'])\
#                               .build()

# cv.setEstimatorParamMaps(paramGrid)

# cvmodel = cv.fit(training_set_5)

# print(cvmodel.bestModel._java_obj.getNumTrees())
# print(cvmodel.bestModel._java_obj.getFeatureSubsetStrategy())
# print("Accuracy : " +  str(MulticlassClassificationEvaluator()\
#                            .evaluate(cvmodel.bestModel.transform(testing_set_5))))