# Spark + Horovod for Burgerking

In [1]:
import configargparse
import argparse
import os
os.environ["OMP_NUM_THREADS"] = "4"

import pandas as pd
import numpy as np

import logging
import time
from datetime import datetime

import pyspark
import pyspark.sql.types as T
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.linalg import Vectors

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
from torch.utils.data import TensorDataset, DataLoader
from torch.utils.data import Dataset
from torch.utils.data.sampler import SubsetRandomSampler

import horovod.spark.torch as hvd
from horovod.spark.common.store import HDFSStore
from horovod.spark.common.backend import SparkBackend 

## Configuration

In [2]:
p = configargparse.ArgParser(default_config_files=['../conf/burgerking.conf'])
p.add_argument("--spark-master-address", type=str)
p.add_argument("--spark-cores-max", type=int)
p.add_argument("--spark-executor-cores", type=int)
p.add_argument("--spark-executor-memory", type=str)
p.add_argument("--spark-default-parallelism", type=int)
p.add_argument("--spark-data-dir", type=str)
p.add_argument("--hdfs-store", type=str)
p.add_argument("--nics", type=str)
options, _ = p.parse_known_args()

## Spark Initialization

In [3]:
spark = pyspark.sql.SparkSession.builder.master(options.spark_master_address)\
                                    .config("spark.cores.max", options.spark_cores_max) \
                                    .config("spark.executor.cores", options.spark_executor_cores) \
                                    .config("spark.executor.memory", options.spark_executor_memory) \
                                    .config("spark.default.parallelism", options.spark_default_parallelism) \
                                    .config("spark.sql.execution.arrow.enabled", "true") \
                                    .config("spark.executorEnv.PATH", os.environ['PATH']) \
                                    .getOrCreate()   
                                    

## Data preparation

In [5]:
start = time.time()
df = spark.read.json(options.spark_data_dir)
train_df, test_df = df.randomSplit([0.999, 0.001], seed=100)
end = time.time()

prepare_time = end - start
print(f"# train data: {train_df.count()}")
print(f"# test data:  {test_df.count()}")
print(f"time:         {prepare_time:.2f}s")

# train data: 99891
# test data:  109
time:         5.80s


In [6]:
store = HDFSStore(options.hdfs_store)

_check_url: hdfs://


## Model Definition

In [7]:
n_plus = 522
n_time = 167
n_bkids = 126
n_weather = 35
n_feels = 20

# Bidirectional recurrent neural network (many-to-one)
class BiRNN(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, fcn_input_size, fcn_output_size):
        super(BiRNN, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.embeds_pluids = nn.Embedding(n_plus, 50, sparse=True)
        self.embeds_bkidx = nn.Embedding(n_bkids, 100, sparse=True)
        self.embeds_timeidx = nn.Embedding(n_time, 100, sparse=True)
        self.embeds_feelsBucket = nn.Embedding(n_feels, 100, sparse=True)
        self.embeds_weather = nn.Embedding(n_weather, 100, sparse=True)

        self.gru = nn.GRU(input_size, hidden_size, num_layers, batch_first=True, bidirectional=True)

        self.hidden1 = nn.Linear(100, 100)
        self.hidden2 = nn.Linear(100, 1)
        self.flatten = nn.Flatten()

        self.drop_layer = nn.Dropout(p=0.3)
        self.fc = nn.Linear(fcn_input_size, fcn_output_size)


    def forward(self, pluids, timeidx, bkidx, weatheridx, feelsBucket):

        pluids = pluids.long()
        timeidx = timeidx.long()
        bkidx = bkidx.long()
        weatheridx = weatheridx.long()
        feelsBucket = feelsBucket.long()
        plu_embed = self.embeds_pluids(pluids)
        bkidx_embed = self.embeds_bkidx(bkidx)
        time_embed = self.embeds_timeidx(timeidx)
        weather_embed = self.embeds_weather(weatheridx)
        feels_embed = self.embeds_feelsBucket(feelsBucket)

        x = plu_embed

        h0 = torch.zeros(self.num_layers * 2, x.size(0), self.hidden_size) # 2 for bidirection 
        
        # Forward propagate gru
        gru_out, _ = self.gru(x, h0)
        ut = torch.tanh(self.hidden1(gru_out))
        # et shape: [batch_size, seq_len, att_hops]
        et = self.hidden2(ut)

        # att shape: [batch_size,  att_hops, seq_len]
        att = F.softmax(torch.transpose(et, 2, 1))

        # output shape [batch_size, att_hops, embedding_width]
        output = torch.matmul(att, gru_out)

        # flatten the output
        attention_output = self.flatten(output)
        context_features = torch.mul(attention_output,(1 + bkidx_embed + time_embed + weather_embed + feels_embed))
        ac1 = F.relu(context_features)

        dropout = self.drop_layer(ac1)
        output = self.fc(dropout)


        return output

In [8]:
batch_size = 16000
num_epoch = 5
loss = nn.CrossEntropyLoss()
num_proc=options.spark_cores_max // options.spark_executor_cores
feature_cols = ["pluids", "timeidx", "bkidx", "weatheridx", "feelsBucket"]

model = BiRNN(50, 50, 1, 100, 522)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

def f(data):
    #avoid omp resource competition
    os.environ["OMP_NUM_THREADS"] = "4"
    iter([os.environ["OMP_NUM_THREADS"]])

## Train

In [9]:
backend = SparkBackend(num_proc, nics=[options.nics], use_mpi=True)
torch_estimator = hvd.TorchEstimator(backend=backend,
                                     store=store,
                                     model=model,
                                     optimizer=optimizer,
                                     loss=lambda input, target: loss(input, target.long()),
                                     feature_cols=feature_cols,
                                     input_shapes=[[-1, 5], [-1], [-1], [-1], [-1]],
                                     label_cols=['label'],
                                     batch_size=batch_size,
                                     epochs=num_epoch,
                                     verbose=2)
start = time.time()
torch_model = torch_estimator.fit(train_df).setOutputCols(['label_prob'])
end = time.time()
train_time = end - start
print(f"train time: {train_time}") 

num_partitions=20
writing dataframes
train_data_path=hdfs://sr257:9000/tmp/intermediate_train_data.0
val_data_path=hdfs://sr257:9000/tmp/intermediate_val_data.0
train_partitions=20


  metadata, avg_row_size = make_metadata_dictionary(train_data_schema)


train_rows=99891


[1,0]<stderr>:  return torch._C._cuda_getDeviceCount() > 0
[1,1]<stderr>:  return torch._C._cuda_getDeviceCount() > 0
[1,0]<stderr>:2020-12-18 09:50:13.947496: W tensorflow/stream_executor/platform/default/dso_loader.cc:59] Could not load dynamic library 'libcudart.so.10.1'; dlerror: libcudart.so.10.1: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /home/bluewhale/Bluewhale/tools/mpi/lib:/usr/lib64/:/opt/rh/devtoolset-7/root/usr/lib64:/opt/rh/devtoolset-7/root/usr/lib:/opt/rh/devtoolset-7/root/usr/lib64/dyninst:/opt/rh/devtoolset-7/root/usr/lib/dyninst:/opt/rh/devtoolset-7/root/usr/lib64:/opt/rh/devtoolset-7/root/usr/lib:/usr/lib64/:
[1,0]<stderr>:2020-12-18 09:50:13.947525: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
[1,1]<stderr>:SLF4J: Class path contains multiple SLF4J bindings.
[1,1]<stderr>:SLF4J: Found binding in [jar:file:/home/bluewhale/envs/hadoop-2.7.7/share/had

[1,0]<stderr>:SLF4J: Found binding in [jar:file:/home/bluewhale/envs/hadoop-2.7.7/share/hadoop/common/lib/slf4j-log4j12-1.7.10.jar!/org/slf4j/impl/StaticLoggerBinder.class]
[1,0]<stderr>:SLF4J: Found binding in [jar:file:/home/bluewhale/envs/hadoop-2.7.7/share/hadoop/httpfs/tomcat/webapps/webhdfs/WEB-INF/lib/slf4j-log4j12-1.7.10.jar!/org/slf4j/impl/StaticLoggerBinder.class]
[1,0]<stderr>:SLF4J: See http://www.slf4j.org/codes.html#multiple_bindings for an explanation.
[1,0]<stderr>:SLF4J: Actual binding is of type [org.slf4j.impl.Log4jLoggerFactory]
[1,0]<stderr>:SLF4J: Class path contains multiple SLF4J bindings.
[1,0]<stderr>:SLF4J: Found binding in [jar:file:/home/bluewhale/envs/hadoop-2.7.7/share/hadoop/kms/tomcat/webapps/kms/WEB-INF/lib/slf4j-log4j12-1.7.10.jar!/org/slf4j/impl/StaticLoggerBinder.class]
[1,0]<stderr>:SLF4J: Found binding in [jar:file:/home/bluewhale/envs/hadoop-2.7.7/share/hadoop/common/lib/slf4j-log4j12-1.7.10.jar!/org/slf4j/impl/StaticLoggerBinder.class]
[1,0]<std

[1,0]<stdout>:epoch:	0	step	0:	{'loss': 6.2590155601501465, 'all_metrics': []}[1,0]<stdout>:
[1,1]<stdout>:{'epoch': 0, 'train': {'loss': 6.258663654327393, 'all_metrics': []}}
[1,0]<stdout>:{'epoch': 0, 'train': {'loss': 6.258663654327393, 'all_metrics': []}}
[1,0]<stdout>:epoch:	1	step	0:	{'loss': 6.257025718688965, 'all_metrics': []}
[1,1]<stdout>:{'epoch': 1, 'train': {'loss': 6.256636142730713, 'all_metrics': []}}
[1,0]<stdout>:{'epoch': 1, 'train': {'loss': 6.256636142730713, 'all_metrics': []}}
[1,0]<stdout>:epoch:	2	step	0:	{'loss': 6.255742073059082, 'all_metrics': []}[1,0]<stdout>:
[1,1]<stdout>:{'epoch': 2, 'train': {'loss': 6.255521297454834, 'all_metrics': []}}
[1,0]<stdout>:{'epoch': 2, 'train': {'loss': 6.255521297454834, 'all_metrics': []}}
[1,0]<stdout>:epoch:	3	step	0:	{'loss': 6.255092620849609, 'all_metrics': []}[1,0]<stdout>:
[1,0]<stdout>:{'epoch': 3, 'train': {'loss': 6.2541399002075195, 'all_metrics': []}}
[1,1]<stdout>:{'epoch': 3, 'train': {'loss': 6.254139900

## Eval

In [12]:
# Evaluate the model on the held-out test DataFrame
pred_df = torch_model.transform(test_df)
argmax = udf(lambda v: float(np.argmax(v)), returnType=T.DoubleType())
pred_df = pred_df.withColumn('label_pred', argmax(pred_df.label_prob))
evaluator = MulticlassClassificationEvaluator(predictionCol='label_pred', labelCol='label', metricName='accuracy')
test_acc = evaluator.evaluate(pred_df)

## Statistics

In [13]:
print(f"Data Preparation: {prepare_time:.2f}s\") 
print(f"Train Total: {train_time:.2f}s / epoch") 
print(f"Train Avg: {train_time/num_epoch:.2f}s / epoch") 
print(f'Test Acc: {test_acc:.2f}')

Data Preparation: 5.80s / epoch
Train Total: 64.64s / epoch
Train Avg: 12.93s / epoch
Test Acc: 0.00
