In [1]:
#!pip uninstall dist-keras -y

In [2]:
#!pip install --upgrade keras

In [3]:
#!pip install dist-keras

In [4]:
import numpy as np

import time

import requests

from keras.optimizers import *
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation

from pyspark import SparkContext
from pyspark import SparkConf

from pyspark.ml.feature import StandardScaler
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StringIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.mllib.evaluation import BinaryClassificationMetrics

from distkeras.trainers import *
from distkeras.predictors import *
from distkeras.transformers import *
from distkeras.evaluators import *
from distkeras.utils import *

Using TensorFlow backend.


In [5]:
def setHadoopConfig(name):
    """Configure the swift2d Hadoop connector for the service called `name`.

    All settings are written under the ``fs.swift2d.service.<name>`` prefix on
    the Hadoop configuration of the notebook-level SparkContext ``sc``.

    Credentials are read from the environment instead of being embedded in the
    notebook:

        SWIFT_TENANT    -> <prefix>.tenant
        SWIFT_USERNAME  -> <prefix>.username
        SWIFT_PASSWORD  -> <prefix>.password

    Args:
        name: service name used in the configuration prefix.

    Raises:
        RuntimeError: if any of the three environment variables is unset or
            empty. The check runs before any Hadoop setting is modified.
    """
    import os  # local import so this cell does not depend on the import cell

    credential_vars = (
        ('.tenant', 'SWIFT_TENANT'),
        ('.username', 'SWIFT_USERNAME'),
        ('.password', 'SWIFT_PASSWORD'),
    )
    missing = [var for _, var in credential_vars if not os.environ.get(var)]
    if missing:
        raise RuntimeError(
            "Missing Swift credential environment variable(s): " + ", ".join(missing)
        )

    prefix = "fs.swift2d.service." + name
    hconf = sc._jsc.hadoopConfiguration()
    hconf.set(prefix + '.auth.url', 'https://identity.open.softlayer.com'+'/v3/auth/tokens')
    hconf.set(prefix + '.auth.endpoint.prefix', 'endpoints')
    for suffix, var in credential_vars:
        hconf.set(prefix + suffix, os.environ[var])
    hconf.setInt(prefix + '.http.port', 8080)
    hconf.set(prefix + '.region', 'dallas')
    hconf.setBoolean(prefix + '.public', False)

# Register the Swift settings under the "keystone" service name, then read the
# parquet dataset from the MGH container through the swift2d:// scheme.
name = "keystone"
setHadoopConfig(name)

parquet_uri = "swift2d://MGH." + name + "/tempParq/7cases.parquet"
seven_cases = spark.read.parquet(parquet_uri)

In [6]:
# Number of distinct values in the "prediction" column; used as the one-hot
# width here and as the size of the model's output layer.
numClasses = seven_cases.select("prediction").distinct().count()

# Feature dimensionality: bring a single row to the driver and measure its vector.
first_row_pdf = seven_cases.select("pcaFeatures").limit(1).toPandas()
numFeats = len(first_row_pdf['pcaFeatures'][0])

# Keep only the (renamed) label index and the features, then append a one-hot
# "label" column derived from "label_index".
seven_cases = (
    seven_cases
    .withColumnRenamed("prediction", "label_index")
    .select("label_index", "pcaFeatures")
)
encoder = OneHotTransformer(output_dim=numClasses, input_col="label_index", output_col="label")
encoded = encoder.transform(seven_cases)

In [7]:
# Hold out 20% of the rows for evaluation. A fixed seed is passed so the split
# can be reproduced instead of changing on every run.
(training_set, test_set) = encoded.randomSplit([0.8, 0.2], seed=42)

# DataFrames are immutable: repartition() returns a new DataFrame, so its result
# must be bound (the bare call was a no-op). Repartition before caching so the
# cached copy is the 50-partition one.
training_set = training_set.repartition(50).cache()
test_set = test_set.cache()

# Peek at two rows to sanity-check the schema (features, label index, one-hot label).
training_set.take(2)

[Row(label_index=0, pcaFeatures=DenseVector([-0.9483, 0.0061, 0.0236, 0.0304, 0.3027, -0.0294, 0.0723, -0.0325, -0.0037, 0.0106, -0.0104, -0.0076, 0.0037, 0.0031, 0.0039, -0.0076, -0.0015, 0.0007, -0.0005, 0.0009, 0.0, 0.0018, -0.0008, 0.0003, -0.0013, -0.0028, 0.001, 0.0004, -0.0006, 0.0002, 0.0003, 0.0009, -0.0001, 0.0, 0.0003, -0.0003, -0.0001, -0.0004, 0.0002, 0.0, -0.0, 0.0, -0.0, 0.0002, 0.0003, -0.0, -0.0002, -0.0005, -0.0, -0.0, -0.0, -0.0, 0.0, -0.0001, -0.0002, 0.0002, 0.0001, -0.0, 0.0001, -0.0002, -0.0001, 0.0, 0.0, 0.0, -0.0, -0.0, -0.0, 0.0001, 0.0, 0.0, 0.0, 0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, 0.0, -0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.0, -0.0, 0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, 0.0, 0.0]), label=[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]),
 Row(label_index=0, pcaFeatures=DenseVector([-0.9482, 0.0122, 0.02, 0.0251, 0.3029, -0.0327, 0.0754, -0.0225, -0.0051, 0.0152, -0.017, -0.0043, -0.0012, 0.0059

In [8]:
# Fully connected classifier: five ReLU layers narrowing 512 -> 32, with 30%
# dropout after each of the first three, followed by a softmax over numClasses.
model = Sequential([
    Dense(512, input_shape=(numFeats,), activation='relu'),
    Dropout(0.3),
    Dense(256, activation='relu'),
    Dropout(0.3),
    Dense(128, activation='relu'),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(numClasses, activation='softmax'),
])

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 512)               51712     
_________________________________________________________________
dropout_1 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 256)               131328    
_________________________________________________________________
dropout_2 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 128)               32896     
_________________________________________________________________
dropout_3 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_4 (Dense)              (None, 64)                8256      
__________

In [9]:
# Metrics collected per trainer; filled in by add_result().
results = dict()

def evaluate_accuracy(model, test_set, ohLabel, iLabel, pred, feats):
    """Run `model` over `test_set` and return the evaluator's score.

    Args:
        model: Keras model handed to ModelPredictor.
        test_set: DataFrame containing the `feats`, `iLabel` and `ohLabel` columns.
        ohLabel: name of the one-hot label column (only carried through the select).
        iLabel: name of the integer label column the evaluator compares against.
        pred: name of the column that receives the predicted class index and
            that the evaluator reads.
        feats: name of the features column fed to the model.

    Returns:
        The value returned by AccuracyEvaluator.evaluate on the scored DataFrame.

    Note:
        Reads the notebook-level `numClasses` for the index transformer.
    """
    evaluator = AccuracyEvaluator(prediction_col=pred, label_col=iLabel)
    selected = test_set.select(feats, iLabel, ohLabel)
    predictor = ModelPredictor(keras_model=model, features_col=feats)
    predicted = predictor.predict(selected)
    # Write the class index into `pred` explicitly. Previously the transformer
    # used its default output column, so the evaluator only found its input
    # when `pred` happened to equal that default name.
    index_transformer = LabelIndexTransformer(output_dim=numClasses, output_col=pred)
    indexed = index_transformer.transform(predicted)
    score = evaluator.evaluate(indexed)

    return score

def add_result(trainer, accuracy, dt):
    """Record one trainer's metrics in the notebook-level `results` dict and print them.

    Args:
        trainer: key under which the metrics are stored (also printed).
        accuracy: value stored under 'accuracy'.
        dt: value stored under 'time_spent'.

    Any earlier entry for the same `trainer` is replaced.
    """
    # Store the metrics.
    results[trainer] = {'accuracy': accuracy, 'time_spent': dt}
    # Display the metrics.
    report = (
        ("Trainer: ", trainer),
        (" - Accuracy: ", accuracy),
        (" - Training time: ", dt),
    )
    for caption, value in report:
        print(caption + str(value))

In [None]:
# Worker-side optimizer and loss. Alternative optimizer tried: 'adam'.
optimizer = 'adagrad'
loss = 'categorical_crossentropy'

# Distributed ADAG trainer over the one-hot "label" and "pcaFeatures" columns.
trainer = ADAG(
    keras_model=model,
    worker_optimizer=optimizer,
    loss=loss,
    metrics=["accuracy"],
    num_workers=60,
    batch_size=16,
    num_epoch=100,
    communication_window=15,
    features_col="pcaFeatures",
    label_col="label",
)
# Alternative trainer kept for reference:
# trainer = DOWNPOUR(keras_model=model, worker_optimizer=optimizer, loss=loss, num_workers=50,
#                    batch_size=32, communication_window=4, num_epoch=100,
#                    features_col="pcaFeatures", label_col="label")
trainer.set_parallelism_factor(2)

# Fit on the training split; test_set is used for evaluation afterwards.
trained_model = trainer.train(training_set)

In [13]:
# Score the trained model on the held-out split with evaluate_accuracy (predict,
# convert to a class index, evaluate) and record it with the training time.
accuracy = evaluate_accuracy(
    model=trained_model,
    test_set=test_set,
    ohLabel="label",
    iLabel="label_index",
    pred="prediction_index",
    feats="pcaFeatures",
)
dt = trainer.get_training_time()
add_result('adag', accuracy, dt)

Trainer: adag
 - Accuracy: 0.9789330958837577
 - Training time: 391.0852265357971


Epochs - accuracy (configuration notes)<br>
50 - 0.8193942088140799<br>
100 - 0.8898135212294236<br>
200 - 0.9727177334732424 (2 200 hidden layers) <br>
100 - 0.9832014410330102 adag (256, 128) <br>
200 - 0.9884074419358762 (256, 128, 64) <br>
200 - 0.9922518356253296 (512, 256, 128, 64, 32) adagrad

In [None]:
# Predict on the held-out split and show five rows whose true label is not 0,
# with the predicted class index next to the true one.
predictor = ModelPredictor(keras_model=trained_model, features_col="pcaFeatures")
predictions = predictor.predict(test_set)
transformer = LabelIndexTransformer(output_dim=numClasses)
(
    transformer.transform(predictions)
    .select("prediction_index", "label_index")
    .where("label_index != 0")
    .take(5)
)