In [1]:
import findspark
findspark.init()

from pyspark.sql.types import *
import pyspark.sql.functions as F
from pyspark.sql import *
from pyspark.sql.window import Window

import pickle
from datetime import datetime, timedelta, date
from time import time, sleep
from dateutil.relativedelta import relativedelta
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

%matplotlib inline

In [2]:
import sys
import os

# PySpark under Python 3.6 needs these environment variables: the py4j bridge
# on PYTHONPATH, and the same interpreter for the driver and the workers.
spark_home = os.environ['SPARK_HOME']
os.environ.update({
    'PYTHONPATH': spark_home + "/python/lib/py4j-0.10.4-src.zip",
    'PYSPARK_PYTHON': "/dfm0/util/dfm_python/python36/bin/python3.6",
    'PYSPARK_DRIVER_PYTHON': "/dfm0/util/dfm_python/python36/bin/python3.6",
})

# Base directory for this project's local files.
super_dir = '/dfm1/lijli06/DFM_SPARK'

In [3]:
def launch_spark(appName):
    """Return a SparkSession on YARN (client deploy mode) named ``appName``.

    The session is requested with 5 executors of 5 cores and 29G memory each,
    and PYTHONHASHSEED=123 is exported into the executor environment.
    The session is obtained through ``getOrCreate()``.
    """
    settings = [
        ("spark.executor.instances", 5),
        ("spark.executor.cores", 5),
        ("spark.executor.memory", '29G'),
        ("spark.submit.deployMode", 'client'),
        ("spark.executorEnv.PYTHONHASHSEED", 123),
    ]
    builder = SparkSession.builder.master("yarn").appName(appName)
    for option, value in settings:
        builder = builder.config(option, value)
    return builder.getOrCreate()

In [4]:
# Benchmark window (both ends inclusive) and the log type to read.
start_date="2018-07-01"
end_date="2018-07-31"
log_label="sysauditlog"

# Convert the date strings into datetime.date objects.
start_date1 = datetime.strptime(start_date, '%Y-%m-%d').date()
end_date1 = datetime.strptime(end_date, '%Y-%m-%d').date()

# These log types get an extra wildcard directory component after the label.
if log_label in ['aracslog', 'arvelog', 'areslog', 'aracsstat']:
    path_label = log_label + '/*'
else:
    path_label = log_label

# One HDFS glob per calendar day, start_date1 through end_date1 inclusive
# (no extra trailing day is added here).
tmp_paths = []
date_delta = end_date1 - start_date1
path_dates = [start_date1+relativedelta(days=i) for i in range(date_delta.days+1)]
for path_date in path_dates:
    # Layout: <root>/<label>/YYYY/MM/DD/<label>*.avro
    tmp_path = "hdfs://dfm-cluster/DFM/"+path_label+"/" \
                +path_date.strftime("%Y/%m/%d") \
                +"/"+log_label+"*.avro"
    tmp_paths.append(tmp_path)

# Organisation names to keep, read from the ORGNAME column of the group file.
# The file lives under super_dir, so build the path from it instead of
# repeating the directory literal.
orgs = pd.read_csv(os.path.join(super_dir, 'HSBC_Group.txt'), header=0)['ORGNAME'].tolist()

In [9]:
# Baseline run: no explicit repartitioning anywhere in the pipeline.
# Wall-clock seconds of each of the two runs are collected in time_org.
time_org = []
for i in range(2):
    spark = launch_spark("partition_test_none_{}".format(i))
    try:
        # Timing starts after the session exists, so session start-up is excluded.
        start = time()
        spark_df = spark.read.format("com.databricks.spark.avro").load(tmp_paths)
        spark_df = spark_df.withColumn('DATE', F.to_date('DATELOGGED'))
        spark_df = spark_df.filter(spark_df.ORGNAME.isin(orgs))
        tmp_list = (
            spark_df.rdd
            # Collapse to one record per distinct (DATE, ORGNAME, CALLERID).
            .map(lambda x: ((x.DATE, x.ORGNAME, x.CALLERID),1)).reduceByKey(lambda x,y: 1)
            # Count those distinct caller ids per (DATE, ORGNAME).
            .map(lambda x: ((x[0][0], x[0][1]),1)).reduceByKey(lambda x,y: x+y)
            # Flatten to [DATE, ORGNAME, count] and bring the result to the driver.
            .map(lambda x: [x[0][0], x[0][1], x[1]]).collect()
        )
        end = time()
        time_org.append(end-start)
        print('Used time of {:.2f} sec'.format(end-start))
    finally:
        # Stop the session even when the job fails, so it is not left running.
        spark.stop()
    # Pause between runs (presumably to let the cluster release resources).
    sleep(10)

Used time of 445.64 sec
Used time of 452.75 sec


In [10]:
# Variant: coalesce the input RDD to 200 partitions before the map/reduce steps.
# Wall-clock seconds of each of the two runs are collected in time_coalesce.
time_coalesce = []
for i in range(2):
    spark = launch_spark("partition_test_coalesce_{}".format(i))
    try:
        # Timing starts after the session exists, so session start-up is excluded.
        start = time()
        spark_df = spark.read.format("com.databricks.spark.avro").load(tmp_paths)
        spark_df = spark_df.withColumn('DATE', F.to_date('DATELOGGED'))
        spark_df = spark_df.filter(spark_df.ORGNAME.isin(orgs))
        tmp_list = (
            spark_df.rdd.coalesce(200)
            # Collapse to one record per distinct (DATE, ORGNAME, CALLERID).
            .map(lambda x: ((x.DATE, x.ORGNAME, x.CALLERID),1)).reduceByKey(lambda x,y: 1)
            # Count those distinct caller ids per (DATE, ORGNAME).
            .map(lambda x: ((x[0][0], x[0][1]),1)).reduceByKey(lambda x,y: x+y)
            # Flatten to [DATE, ORGNAME, count] and bring the result to the driver.
            .map(lambda x: [x[0][0], x[0][1], x[1]]).collect()
        )
        end = time()
        time_coalesce.append(end-start)
        print('Used time of {:.2f} sec'.format(end-start))
    finally:
        # Stop the session even when the job fails, so it is not left running.
        spark.stop()
    # Pause between runs (presumably to let the cluster release resources).
    sleep(10)

Used time of 204.09 sec
Used time of 330.30 sec


In [5]:
def key_partitioner(keys):
    """Return Python's built-in hash of ``keys`` for use as a partition function.

    The result is only comparable across processes when hashing is
    deterministic there; launch_spark sets PYTHONHASHSEED for the executors.
    """
    hashed = hash(keys)
    return hashed

In [12]:
# Variant: repartition the DataFrame into 200 partitions by DATE before
# dropping to the RDD API.
# Wall-clock seconds of each of the two runs are collected in time_partition.
time_partition = []
for i in range(2):
    spark = launch_spark("partition_test_partition_{}".format(i))
    try:
        # Timing starts after the session exists, so session start-up is excluded.
        start = time()
        spark_df = spark.read.format("com.databricks.spark.avro").load(tmp_paths)
        spark_df = spark_df.withColumn('DATE', F.to_date('DATELOGGED'))
        spark_df = spark_df.filter(spark_df.ORGNAME.isin(orgs))
        spark_df = spark_df.repartition(200, 'DATE')
        tmp_list = (
            spark_df.rdd
            # Collapse to one record per distinct (DATE, ORGNAME, CALLERID).
            .map(lambda x: ((x.DATE, x.ORGNAME, x.CALLERID),1))
            .reduceByKey(lambda x,y: 1)
            # Count those distinct caller ids per (DATE, ORGNAME).
            .map(lambda x: ((x[0][0], x[0][1]),1)).reduceByKey(lambda x,y: x+y)
            # Flatten to [DATE, ORGNAME, count] and bring the result to the driver.
            .map(lambda x: [x[0][0], x[0][1], x[1]]).collect()
        )
        end = time()
        time_partition.append(end-start)
        print('Used time of {:.2f} sec'.format(end-start))
    finally:
        # Stop the session even when the job fails, so it is not left running.
        spark.stop()
    # Pause between runs (presumably to let the cluster release resources).
    sleep(10)

Used time of 510.20 sec
Used time of 432.26 sec


In [6]:
# Variant: ask both reduceByKey steps for 200 output partitions and pass True
# as the second positional argument of the first map.
# Wall-clock seconds of each of the two runs are collected in time_mapred_part1.
time_mapred_part1 = []
for i in range(2):
    spark = launch_spark("partition_test_mapred1_{}".format(i))
    try:
        # Timing starts after the session exists, so session start-up is excluded.
        start = time()
        spark_df = spark.read.format("com.databricks.spark.avro").load(tmp_paths)
        spark_df = spark_df.withColumn('DATE', F.to_date('DATELOGGED'))
        spark_df = spark_df.filter(spark_df.ORGNAME.isin(orgs))
        tmp_list = (
            spark_df.rdd
            # NOTE(review): the trailing True is RDD.map's preservesPartitioning
            # flag, yet this map builds new keys -- confirm it is intended.
            .map(lambda x: ((x.DATE, x.ORGNAME, x.CALLERID),1), True)
            # Collapse to one record per distinct (DATE, ORGNAME, CALLERID).
            .reduceByKey(lambda x,y: 1, 200)
            # Count those distinct caller ids per (DATE, ORGNAME).
            .map(lambda x: ((x[0][0], x[0][1]),1)).reduceByKey(lambda x,y: x+y, 200)
            # Flatten to [DATE, ORGNAME, count] and bring the result to the driver.
            .map(lambda x: [x[0][0], x[0][1], x[1]]).collect()
        )
        end = time()
        time_mapred_part1.append(end-start)
        print('Used time of {:.2f} sec'.format(end-start))
    finally:
        # Stop the session even when the job fails, so it is not left running.
        spark.stop()
    # Pause between runs (presumably to let the cluster release resources).
    sleep(10)

Used time of 867.68 sec
Used time of 723.42 sec


In [None]:
# Variant: both reduceByKey steps use 200 partitions and key_partitioner as
# the partition function.
# Wall-clock seconds of each of the two runs are collected in time_mapred_part2.
time_mapred_part2 = []
for i in range(2):
    spark = launch_spark("partition_test_mapred2_{}".format(i))
    try:
        # Timing starts after the session exists, so session start-up is excluded.
        start = time()
        spark_df = spark.read.format("com.databricks.spark.avro").load(tmp_paths)
        spark_df = spark_df.withColumn('DATE', F.to_date('DATELOGGED'))
        spark_df = spark_df.filter(spark_df.ORGNAME.isin(orgs))
        tmp_list = (
            spark_df.rdd
            # Collapse to one record per distinct (DATE, ORGNAME, CALLERID).
            .map(lambda x: ((x.DATE, x.ORGNAME, x.CALLERID),1))
            .reduceByKey(lambda x,y: 1, 200, key_partitioner)
            # Count those distinct caller ids per (DATE, ORGNAME).
            .map(lambda x: ((x[0][0], x[0][1]),1)).reduceByKey(lambda x,y: x+y, 200, key_partitioner)
            # Flatten to [DATE, ORGNAME, count] and bring the result to the driver.
            .map(lambda x: [x[0][0], x[0][1], x[1]]).collect()
        )
        end = time()
        time_mapred_part2.append(end-start)
        print('Used time of {:.2f} sec'.format(end-start))
    finally:
        # Stop the session even when the job fails, so it is not left running.
        spark.stop()
    # Pause between runs (presumably to let the cluster release resources).
    sleep(10)

In [None]:
# Variant: coalesce the input RDD to 200 partitions, then run both reduceByKey
# steps with 200 partitions and key_partitioner as the partition function.
# Wall-clock seconds of each of the two runs are collected in time_mapred_part3.
time_mapred_part3 = []
for i in range(2):
    spark = launch_spark("partition_test_mapred3_{}".format(i))
    try:
        # Timing starts after the session exists, so session start-up is excluded.
        start = time()
        spark_df = spark.read.format("com.databricks.spark.avro").load(tmp_paths)
        spark_df = spark_df.withColumn('DATE', F.to_date('DATELOGGED'))
        spark_df = spark_df.filter(spark_df.ORGNAME.isin(orgs))
        tmp_list = (
            spark_df.rdd.coalesce(200)
            # Collapse to one record per distinct (DATE, ORGNAME, CALLERID).
            .map(lambda x: ((x.DATE, x.ORGNAME, x.CALLERID),1))
            .reduceByKey(lambda x,y: 1, 200, key_partitioner)
            # Count those distinct caller ids per (DATE, ORGNAME).
            .map(lambda x: ((x[0][0], x[0][1]),1)).reduceByKey(lambda x,y: x+y, 200, key_partitioner)
            # Flatten to [DATE, ORGNAME, count] and bring the result to the driver.
            .map(lambda x: [x[0][0], x[0][1], x[1]]).collect()
        )
        end = time()
        time_mapred_part3.append(end-start)
        print('Used time of {:.2f} sec'.format(end-start))
    finally:
        # Stop the session even when the job fails, so it is not left running.
        spark.stop()
    # Pause between runs (presumably to let the cluster release resources).
    sleep(10)

In [14]:
# Stop whatever session `spark` currently refers to. Each benchmark loop
# already calls spark.stop() after every run, so this is a final safety stop.
spark.stop()