In [13]:
# Shut down the SparkContext bound to `sc` so that the next cell can create a new one.
# NOTE(review): `sc` is not defined anywhere above this cell, so it is presumably left
# over from an earlier kernel session; on a fresh kernel this line would raise NameError.
sc.stop()

In [14]:
from pyspark import SparkContext, SparkConf
# Assemble the Spark configuration one setting at a time, then start the context from it.
conf = SparkConf()
conf.setAppName('Elephas_App')
conf.setMaster('local[*]')
sc = SparkContext(conf=conf)

In [15]:
# Python 2 compatibility shim; on Python 3 print is already a function, so this is a no-op.
from __future__ import print_function
# Display the context's master and application name to confirm the configuration above.
print(sc)

<SparkContext master=local[*] appName=Elephas_App>


In [88]:
from pyspark.sql import SparkSession
# Obtain a SparkSession for local execution (reuses an existing session when one is active).
spark = (
    SparkSession.builder
    .master('local[*]')
    .appName('deep-learning')
    .getOrCreate()
)

In [21]:
from pyspark.sql import SQLContext
from pyspark.ml.linalg import Vectors
import numpy as np
import random

# Wrap the SparkContext `sc` in a SQLContext; load_data_frame below uses it to turn RDDs into data frames.
sql_context = SQLContext(sc)

def shuffle_csv(csv_file):
    """Shuffle the lines of a text file in place.

    Every line takes part in the shuffle, including a header row if the file
    has one, so readers of the file must be able to recognise the header
    wherever it ends up.

    Args:
        csv_file: Path of the file whose lines are rewritten in random order.
    """
    with open(csv_file) as source:
        lines = source.readlines()
    # A final line without a trailing newline would otherwise be glued onto
    # whichever line follows it once the order changes, corrupting a row.
    if lines and not lines[-1].endswith('\n'):
        lines[-1] += '\n'
    random.shuffle(lines)
    # Context managers guarantee the handles are flushed and closed.
    with open(csv_file, 'w') as target:
        target.writelines(lines)

def load_data_frame(csv_file, shuffle=True, train=True):
    """Load a comma-separated file into a Spark data frame.

    The result has two columns: `features` (a dense vector) and `category`
    (a string). Rows whose first field equals 'id' are treated as the header
    and dropped. The first field of every remaining row is skipped
    (presumably a row id, given the header check -- confirm against the data).

    Args:
        csv_file: Path to the CSV file. The same path is used for the
            optional in-place shuffle and for the Spark read.
        shuffle: When true, the file's lines are shuffled on disk via
            shuffle_csv before loading.
        train: When true, the last field of each row is the category and the
            fields between the first and last are the features. When false,
            every field after the first is a feature and the category is the
            placeholder "Class_1".

    Returns:
        A data frame created through the module-level `sql_context`.
    """
    if shuffle:
        shuffle_csv(csv_file)
    # Read exactly the path that was shuffled above. The previous
    # `data_path + csv_file` relied on a global that this notebook never
    # defines, while callers already pass the full relative path.
    rows = sc.textFile(csv_file).map(lambda line: line.split(','))
    # This is an RDD, which will later be transformed to a data frame
    rows = rows.filter(lambda fields: fields[0] != 'id')
    if train:
        data = rows.map(
            lambda fields: (Vectors.dense(np.asarray(fields[1:-1]).astype(np.float32)),
                            str(fields[-1])))
    else:
        # Test data gets dummy labels. We need the same structure as in Train data
        data = rows.map(
            lambda fields: (Vectors.dense(np.asarray(fields[1:]).astype(np.float32)),
                            "Class_1"))
    return sql_context.createDataFrame(data, ['features', 'category'])

In [22]:
# Training rows are shuffled on disk before loading (the default behaviour).
train_df = load_data_frame("../data_path/train.csv")
# Test rows keep their file order and receive a placeholder category.
test_df = load_data_frame("../data_path/test.csv", train=False, shuffle=False)

# Preview the first ten rows of each frame under its own heading.
for heading, frame in (
    ("Train data frame:", train_df),
    ("Test data frame (note the dummy category):", test_df),
):
    print(heading)
    frame.show(10)

Train data frame:
+--------------------+--------+
|            features|category|
+--------------------+--------+
|[0.0,0.0,1.0,0.0,...| Class_3|
|[1.0,0.0,0.0,0.0,...| Class_2|
|[0.0,0.0,0.0,0.0,...| Class_2|
|[0.0,0.0,0.0,0.0,...| Class_8|
|[0.0,0.0,8.0,6.0,...| Class_6|
|[0.0,0.0,0.0,0.0,...| Class_8|
|[0.0,0.0,0.0,0.0,...| Class_2|
|[1.0,0.0,0.0,0.0,...| Class_8|
|[0.0,1.0,6.0,1.0,...| Class_6|
|[0.0,0.0,0.0,0.0,...| Class_2|
+--------------------+--------+
only showing top 10 rows

Test data frame (note the dummy category):
+--------------------+--------+
|            features|category|
+--------------------+--------+
|[0.0,0.0,0.0,0.0,...| Class_1|
|[2.0,2.0,14.0,16....| Class_1|
|[0.0,1.0,12.0,1.0...| Class_1|
|[0.0,0.0,0.0,1.0,...| Class_1|
|[1.0,0.0,0.0,1.0,...| Class_1|
|[0.0,0.0,0.0,0.0,...| Class_1|
|[0.0,0.0,0.0,0.0,...| Class_1|
|[2.0,0.0,0.0,0.0,...| Class_1|
|[0.0,0.0,0.0,0.0,...| Class_1|
|[0.0,0.0,0.0,0.0,...| Class_1|
+--------------------+--------+
only showing top 10 rows

In [91]:
from pyspark.ml.feature import StringIndexer

# Encode the string `category` column as a numeric `label` column.
string_indexer = StringIndexer().setInputCol("category").setOutputCol("label")
# Learn the category-to-index mapping from the training frame, then apply it to that frame.
fitted_indexer = string_indexer.fit(train_df)
indexed_df = fitted_indexer.transform(train_df)

In [92]:
from pyspark.ml.feature import StandardScaler

# Centre and scale each feature using statistics fitted on the indexed training frame;
# the result is written to a new `scaled_features` column.
scaler = (
    StandardScaler()
    .setInputCol("features")
    .setOutputCol("scaled_features")
    .setWithStd(True)
    .setWithMean(True)
)
fitted_scaler = scaler.fit(indexed_df)
scaled_df = fitted_scaler.transform(indexed_df)

print("The result of indexing and scaling. Each transformation adds new columns to the data frame:")
scaled_df.show(10)

The result of indexing and scaling. Each transformation adds new columns to the data frame:
+--------------------+--------+-----+--------------------+
|            features|category|label|     scaled_features|
+--------------------+--------+-----+--------------------+
|[0.0,0.0,1.0,0.0,...| Class_3|  3.0|[-0.2535060296260...|
|[1.0,0.0,0.0,0.0,...| Class_2|  0.0|[0.40208999583479...|
|[0.0,0.0,0.0,0.0,...| Class_2|  0.0|[-0.2535060296260...|
|[0.0,0.0,0.0,0.0,...| Class_8|  2.0|[-0.2535060296260...|
|[0.0,0.0,8.0,6.0,...| Class_6|  1.0|[-0.2535060296260...|
|[0.0,0.0,0.0,0.0,...| Class_8|  2.0|[-0.2535060296260...|
|[0.0,0.0,0.0,0.0,...| Class_2|  0.0|[-0.2535060296260...|
|[1.0,0.0,0.0,0.0,...| Class_8|  2.0|[0.40208999583479...|
|[0.0,1.0,6.0,1.0,...| Class_6|  1.0|[-0.2535060296260...|
|[0.0,0.0,0.0,0.0,...| Class_2|  0.0|[-0.2535060296260...|
+--------------------+--------+-----+--------------------+
only showing top 10 rows



In [94]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Activation
from tensorflow.keras.utils import to_categorical

# Output width is the number of distinct categories; input width is the length of one feature vector.
nb_classes = train_df.select("category").distinct().count()
first_row = train_df.select("features").first()
input_dim = len(first_row[0])

# Three hidden stages (Dense 512 -> ReLU -> dropout 0.5) followed by a softmax output layer.
model = Sequential([
    Dense(512, input_shape=(input_dim,)),
    Activation('relu'),
    Dropout(0.5),
    Dense(512),
    Activation('relu'),
    Dropout(0.5),
    Dense(512),
    Activation('relu'),
    Dropout(0.5),
    Dense(nb_classes),
    Activation('softmax'),
])

model.compile(optimizer='adam', loss='categorical_crossentropy')

In [98]:
from elephas.ml_model import ElephasEstimator
from tensorflow.keras import optimizers


# `learning_rate` is the supported keyword; `lr` is a deprecated alias that
# newer Keras releases no longer accept.
adam = optimizers.Adam(learning_rate=0.01)
# The estimator takes the optimizer as a serialized config rather than a live object.
opt_conf = optimizers.serialize(adam)

# Initialize SparkML Estimator and set all relevant properties
estimator = ElephasEstimator()
# The next two parameters come directly from pyspark
estimator.setFeaturesCol("scaled_features")
estimator.setLabelCol("label")
# Provide serialized Keras model to the estimator object
# NOTE(review): Model.to_yaml() is unavailable in recent TensorFlow releases; when
# upgrading, check whether the installed elephas expects model.to_json() instead.
estimator.set_keras_model_config(model.to_yaml())
# Labels are class indices from the StringIndexer stage, to be treated as
# categorical with `nb_classes` classes.
estimator.set_categorical_labels(True)
estimator.set_nb_classes(nb_classes)
estimator.set_num_workers(1)
estimator.set_epochs(5)
estimator.set_batch_size(128)
estimator.set_verbosity(1)
estimator.set_validation_split(0.15)
estimator.set_optimizer_config(opt_conf)
estimator.set_mode("synchronous")
estimator.set_loss("categorical_crossentropy")
# Kept as the cell's last expression so the estimator's repr is displayed.
estimator.set_metrics(['acc'])

ElephasEstimator_ff673344560a

In [100]:
from pyspark.ml import Pipeline

# Chain label indexing, feature scaling and the Keras-backed estimator into one SparkML pipeline.
pipeline = Pipeline().setStages([string_indexer, scaler, estimator])
fitted_pipeline = pipeline.fit(train_df)
# Score the training rows themselves; passing test_df to transform() instead
# evaluates the test data with the same code.
prediction = fitted_pipeline.transform(train_df)
# Keep only the two columns needed to compare truth against model output.
pnl = prediction.select("label", "prediction")


Exception ignored in: <function JavaModelWrapper.__del__ at 0x161847290>
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/site-packages/pyspark/mllib/common.py", line 137, in __del__
    self._sc._gateway.detach(self._java_model)
AttributeError: 'MulticlassMetrics' object has no attribute '_sc'


>>> Fit model
>>> Synchronous training complete.


In [102]:
# Peek at the first five (label, prediction) pairs produced by the fitted pipeline.
pnl.show(5)

+-----+--------------------+
|label|          prediction|
+-----+--------------------+
|  3.0|[0.50407135486602...|
|  0.0|[0.52967709302902...|
|  0.0|[0.50407135486602...|
|  2.0|[0.00933281425386...|
|  1.0|[0.0, 1.0, 0.0, 0...|
+-----+--------------------+
only showing top 5 rows



In [135]:
from sklearn.metrics import accuracy_score
import numpy as np

# Bring the scored rows to the driver as a pandas frame.
metrics_df = prediction.toPandas()
labels = metrics_df["label"].values
# Each prediction holds one score per class; the predicted class is the index of the largest score.
score_vectors = metrics_df["prediction"].values
predictions = list(map(np.argmax, score_vectors))
# Fraction of rows whose predicted class index equals the indexed label.
accuracy_score(labels, predictions)