In [5]:
# Stop the session bound to `spark`; the name must already exist in the kernel
# (it is assigned by the SparkSession builder cell of this notebook).
spark.stop()

## DLRM Categorify

In [1]:
###### Start spark job ######
import init

from pyrecdp.data_processor import *
from pyrecdp.utils import *

import logging
from timeit import default_timer as timer
import os
from pyspark import *
from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark.sql.types import *
import numpy as np
import pandas as pd

# Column layout of the raw input: column 0 is the label, columns 1-13 are read
# as integers and columns 14-39 as strings. Every column is named '_c<index>'.
LABEL_COL = 0
INT_COLS = [*range(1, 14)]
CAT_COLS = [*range(14, 40)]

label_fields = [StructField('_c%d' % LABEL_COL, IntegerType())]
int_fields = [StructField('_c%d' % col_idx, IntegerType()) for col_idx in INT_COLS]
str_fields = [StructField('_c%d' % col_idx, StringType()) for col_idx in CAT_COLS]

# Full schema: label first, then the integer columns, then the string columns.
schema = StructType([*label_fields, *int_fields, *str_fields])

# Column indices that categorifyAllFeatures processes by default.
to_be_categorified = [23, 35, 14, 33]

def categorifyAllFeatures(df, proc, output_name="categorified", gen_dict=False, columns=None):
    """Categorify the selected '_c<index>' columns of ``df`` through ``proc``.

    Parameters
    ----------
    df : DataFrame
        Input frame; it is handed to ``proc.generate_dicts`` (when
        ``gen_dict`` is true) and to ``proc.transform``.
    proc : DataProcessor
        Processor that runs the operations. This function calls its
        ``reset_ops`` / ``generate_dicts`` / ``append_ops`` / ``transform``
        methods and reads ``spark``, ``path_prefix``, ``current_path`` and
        ``dicts_path`` from it.
    output_name : str
        Accepted for interface compatibility; not used by this function.
    gen_dict : bool
        True  -> build the dictionaries from ``df`` with a
                 ``GenerateDictionary`` operation.
        False -> load one parquet dictionary per column from
                 ``<path_prefix>/<current_path>/<dicts_path>/<column name>``.
    columns : iterable of int, optional
        Column indices to categorify. Defaults to the module-level
        ``to_be_categorified`` list; pass e.g. ``CAT_COLS`` to process every
        categorical column.

    Returns
    -------
    Whatever ``proc.transform(df)`` returns after the ``Categorify`` and
    ``FillNA`` operations were appended to ``proc``.

    Side effects: prints the elapsed time of the dictionary generation (if any)
    and of the transform.
    """
    if columns is None:
        columns = to_be_categorified
    to_categorify_cols = ['_c%d' % i for i in columns]

    if gen_dict:
        # Only needed when the target dictionaries were not prepared beforehand.
        op_gen_dict = GenerateDictionary(to_categorify_cols, isParquet=False)
        proc.reset_ops([op_gen_dict])
        t1 = timer()
        dict_dfs = proc.generate_dicts(df)
        t2 = timer()
        print("Generate Dictionary took %.3f" % (t2 - t1))
    else:
        # Otherwise load the pre-generated dictionaries, one per column.
        dict_dfs = [
            {
                'col_name': name,
                'dict': proc.spark.read.parquet(
                    "%s/%s/%s/%s" % (proc.path_prefix, proc.current_path, proc.dicts_path, name)),
            }
            for name in to_categorify_cols
        ]

    # Categorify, then replace the nulls left in those columns with 0.
    op_categorify = Categorify(to_categorify_cols, dict_dfs=dict_dfs)
    op_fillna_for_categorified = FillNA(to_categorify_cols, 0)
    # NOTE(review): ops are appended, not reset, so operations already queued on
    # `proc` (including op_gen_dict above) stay in its list — confirm intended.
    proc.append_ops([op_categorify, op_fillna_for_categorified])
    t1 = timer()
    df = proc.transform(df)
    t2 = timer()
    print("Categorify took %.3f" % (t2 - t1))

    return df

/mnt/nvme2/chendi/BlueWhale/frameworks.bigdata.bluewhale/RecDP


In [None]:
# Storage locations used by this run.
path_prefix = "hdfs://"
current_path = "/dlrm/"
csv_folder = "/dlrm/csv_raw_data"
file = "/dlrm/raw_data/day_0"
#path = os.path.join(path_prefix, file)


##### 1. Start spark and initialize data processor #####
t0 = timer()
spark = (
    SparkSession.builder
    .master('yarn')
    .appName("DLRM")
    .config("spark.executor.memory", "20g")
    .config("spark.executor.memoryOverhead", "3g")
    .getOrCreate()
)

files = ["day_%d" % i for i in range(0, 24)]
# os.path.join() discards every component that precedes an absolute one, so
# joining path_prefix with the absolute csv_folder silently dropped "hdfs://".
# Build the names by concatenation to keep the scheme.
file_names = ["%s%s/%s" % (path_prefix, csv_folder, filename) for filename in files]

proc = DataProcessor(spark, path_prefix, current_path=current_path, shuffle_disk_capacity="1200GB")
# Alternative input: read the raw tab-separated CSV files instead of the parquet
# produced by the earlier stage.
#df = spark.read.schema(schema).option('sep', '\t').csv(file_names)
df = spark.read.parquet("/dlrm/categorified_stage1")
# Dictionaries are loaded from storage (gen_dict=False), not regenerated.
df = categorifyAllFeatures(df, proc, output_name="dlrm_categorified", gen_dict=False)

per core memory size is 5.000 GB and shuffle_disk maximum capacity is 1200.000 GB
categorify threshold is 50.000 M rows, flush_threshold is 960.000 GB
('_c23', DataFrame[dict_col: string, dict_col_id: int], 130229466)
('_c35', DataFrame[dict_col: string, dict_col_id: int], 187188509)
('_c14', DataFrame[dict_col: string, dict_col_id: int], 227605431)
('_c33', DataFrame[dict_col: string, dict_col_id: int], 292775613)
etstimated_to_shuffle_size is 725.014 GB, will do shj
etstimated_to_shuffle_size is 700.576 GB, will do shj and spill to disk
etstimated_to_shuffle_size is 676.137 GB, will do shj
etstimated_to_shuffle_size is 651.698 GB, will do shj and spill to disk


* per core memory size is 5.000 GB and shuffle_disk maximum capacity is 1200.000 GB
* Generate Dictionary took 2373.706
* categorify threshold is 134.218 M rows, flush_threshold is 960.000 GB, etstimated_to_shuffle_size is 1287.104 GB
* ('_c19', DataFrame[dict_col: string, dict_col_id: int], 3)
* ('_c30', DataFrame[dict_col: string, dict_col_id: int], 3)
* ('_c26', DataFrame[dict_col: string, dict_col_id: int], 10)
* ('_c32', DataFrame[dict_col: string, dict_col_id: int], 14)
* ('_c39', DataFrame[dict_col: string, dict_col_id: int], 36)
* ('_c22', DataFrame[dict_col: string, dict_col_id: int], 63)
* ('_c38', DataFrame[dict_col: string, dict_col_id: int], 108)
* ('_c29', DataFrame[dict_col: string, dict_col_id: int], 154)
* ('_c31', DataFrame[dict_col: string, dict_col_id: int], 976)
* ('_c21', DataFrame[dict_col: string, dict_col_id: int], 1543)
* ('_c27', DataFrame[dict_col: string, dict_col_id: int], 2208)
* ('_c20', DataFrame[dict_col: string, dict_col_id: int], 7122)
* ('_c17', DataFrame[dict_col: string, dict_col_id: int], 7424)
* ('_c28', DataFrame[dict_col: string, dict_col_id: int], 11938)
* ('_c37', DataFrame[dict_col: string, dict_col_id: int], 12973)
* ('_c16', DataFrame[dict_col: string, dict_col_id: int], 17295)
* ('_c18', DataFrame[dict_col: string, dict_col_id: int], 20265)
* ('_c15', DataFrame[dict_col: string, dict_col_id: int], 39060)
* ('_c25', DataFrame[dict_col: string, dict_col_id: int], 405282)
* ('_c36', DataFrame[dict_col: string, dict_col_id: int], 590151)
* ('_c24', DataFrame[dict_col: string, dict_col_id: int], 3067955)
* ('_c34', DataFrame[dict_col: string, dict_col_id: int], 40790947)
* ('_c23', DataFrame[dict_col: string, dict_col_id: int], 130229466)
* ('_c35', DataFrame[dict_col: string, dict_col_id: int], 187188509)
* ('_c14', DataFrame[dict_col: string, dict_col_id: int], 227605431)
* ('_c33', DataFrame[dict_col: string, dict_col_id: int], 292775613)