In [None]:
# Release any Spark session left over from a previous run of this notebook.
# On a fresh kernel the name `spark` does not exist yet, so tolerate that
# instead of failing the very first cell with a NameError.
try:
    spark.stop()
except NameError:
    pass

### Workflow:
---

A. Convert the JSON files to CSV using Spark

B. Process the CSV data for training
    1. Load data from the CSV-like files.
    2. Create a new negative-records table (each user paired with a false item record and category).
    3. Join category by item_id to form the positive-records table.
    4. Combine the negative table and the positive table into one, adding a label column that indicates which records are positive.
    5. Aggregate each user's history items into two new columns: items_history and category history.
    6. Save the last two records (one positive, one negative) of each user to local_test; save the remaining records to local_train.
    7. Randomly split local_test 1:9 into local_test_splitByUser and local_train_splitByUser.
    8. Build three dictionaries (mid_voc, uid_voc and cat_voc) from local_train_splitByUser using aggregate_count and sort.
    9. Create a new dictionary mapping for mid and cat, and create a mid list according to reviews_info.
    10. Save all files: mid_voc, uid_voc, cat_voc, local_train_splitByUser and local_test_splitByUser.

### DIEN data process

In [None]:
from python_script.init_spark import *
from python_script.utils import *
from python_script.data_processor import DataProcessor
from timeit import default_timer as timer
import pandas as pd
import numpy as np
from pyspark.sql.window import Window
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql import SparkSession

# Directory holding the prepared data, and the same location as a file:// URI.
dir_path = "/mnt/nvme2/chendi/BlueWhale/ai-matrix/macro_benchmark/DIEN_INTEL_TF2/pyspark_data/"
local_prefix_src = "file://" + dir_path

# t0 -> t1 brackets Spark session start-up and configuration.
t0 = timer()
spark = (
    SparkSession.builder
    .master('yarn')
    .appName("DIEN_DATA_PREPARE")
    .getOrCreate()
)
# Turn on Arrow-based columnar data transfers.
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")
t1 = timer()

# t1 -> t3 brackets loading the two input tables and running the processor.
# NOTE(review): load_csv is handed the file:// prefix built above, so it
# presumably reads from the local filesystem rather than HDFS -- confirm.
reviews_info_df, item_info_df = load_csv(spark, local_prefix_src)
data_processor = DataProcessor(spark, reviews_info_df, item_info_df, dir_path)
data_processor.process()
t3 = timer()

# Timing report: overall figures first, then one line per entry that the
# processor recorded in its `elapse_time` mapping.
print("\n==================== Process Time =======================\n")
print("Total process took %.3f secs" % (t3 - t0))
print("Details:")
print("start spark took %.3f secs" % (t1 - t0))
print("process and save took %.3f secs, includes:" % (t3 - t1))
for step_name, step_secs in data_processor.elapse_time.items():
    print("\t%s %.3f" % (step_name, step_secs))
print("\n==========================================================")

==================== Process Time =======================

Total process took 263.191 secs

Details:

start spark took 33.097 secs

process and save took 230.093 secs, includes:

	data_process 162.889
    
	generate_voc 27.437
    
	combine_and_save_negative_positive 36.814

==========================================================

### Convert Json to CSV
#### This may take about 2 minutes; please skip it if you only want to check the data processing
#### Note: The original JSON file seems to contain some errors; when it is converted with PySpark, some rows return null items

In [None]:
# Convert Json to CSV

import findspark
findspark.init()

from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql.window import Window
import numpy as np
import pandas as pd
from time import time
from timeit import default_timer as timer
import random
import os.path
import pickle
import pandas

dir_path = "/mnt/nvme2/chendi/BlueWhale/x-deeplearning/xdl-algorithm-solution/DIEN/data/pyspark_output/"
local_prefix = "file://" + dir_path

def load_json():
    """Read the Books metadata and review JSON files, trimmed to the needed columns.

    Relies on the notebook-level ``spark`` session and ``local_prefix`` location.

    Returns:
        A ``(reviews_info_df, item_info_df)`` tuple. Reviews keep only
        ``reviewerID``, ``asin``, ``overall`` and ``unixReviewTime``; items keep
        ``asin`` plus a ``categories`` column holding the last element of the
        first ``categories`` list.
    """
    raw_items = spark.read.json(local_prefix + 'meta_Books.json')
    raw_reviews = spark.read.json(local_prefix + 'reviews_Books.json')

    # Last element of the first category list, re-exposed under the same name.
    last_category = expr("categories[0][size(categories[0]) - 1] as categories")
    item_info_df = raw_items.select('asin', last_category)

    review_columns = ['reviewerID', 'asin', 'overall', 'unixReviewTime']
    reviews_info_df = raw_reviews.select(*review_columns)

    return reviews_info_df, item_info_df

def list_dir(path):
    """Map each entry directly under ``path`` to the data file it stands for.

    For an entry that is a listable directory, the value is the path of a file
    inside it whose name ends with ``parquet`` or ``csv`` (if several match,
    the last one returned by ``os.listdir`` wins; if none match, the entry is
    omitted). For an entry that cannot be listed (e.g. a plain file), the
    value is the path of the entry itself.

    Args:
        path: Directory to scan.

    Returns:
        dict mapping entry name -> resolved file path.

    Raises:
        OSError: if ``path`` itself cannot be listed.
    """
    source_path_dict = {}
    for entry in os.listdir(path):
        entry_path = os.path.join(path, entry)
        try:
            children = os.listdir(entry_path)
        except OSError:
            # Only filesystem errors (not a directory, permission denied, ...)
            # mean "treat the entry as a file"; anything else should surface
            # instead of being swallowed by a bare except.
            source_path_dict[entry] = entry_path
            continue
        for file_name in children:
            if file_name.endswith(('parquet', 'csv')):
                source_path_dict[entry] = os.path.join(entry_path, file_name)
    return source_path_dict

def result_rename_or_convert():
    """Move the files found for the ``*-spark`` outputs to stable names.

    Looks up ``reviews-info-spark`` and ``item-info-spark`` in the mapping
    returned by ``list_dir(dir_path)`` and renames each resolved file to
    ``reviews-info`` / ``item-info`` directly under ``dir_path``.

    This stays best effort: a missing entry or a failed rename does not raise.
    Unlike a silent ``except: pass``, each failure is reported, and a failure
    on one file no longer prevents the other from being renamed.
    """
    fpath = dir_path
    source_path_dict = list_dir(fpath)
    fix = "-spark"
    for target_name in ("reviews-info", "item-info"):
        source_key = target_name + fix
        try:
            # os.path.join keeps the target inside fpath whether or not fpath
            # ends with a path separator.
            os.rename(source_path_dict[source_key], os.path.join(fpath, target_name))
        except (KeyError, OSError) as err:
            # KeyError: list_dir found no matching output; OSError: rename failed.
            print("Skipped renaming %s: %r" % (source_key, err))


# t0 -> t1: Spark session start-up and configuration.
t0 = timer()
spark = (
    SparkSession.builder
    .master('yarn')
    .appName("DIEN_DATA_PROCESS")
    .getOrCreate()
)
# Turn on Arrow-based columnar data transfers.
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")
t1 = timer()

# t1 -> t2: read both JSON inputs.
reviews_info_df, item_info_df = load_json()
t2 = timer()

# t2 -> t3: write each table as tab-separated CSV from a single partition,
# replacing any earlier output at the same location.
(
    reviews_info_df.repartition(1)
    .write.option("sep", '\t')
    .format('csv')
    .mode('overwrite')
    .save(local_prefix + 'reviews-info-spark')
)
(
    item_info_df.repartition(1)
    .write.option("sep", '\t')
    .format('csv')
    .mode('overwrite')
    .save(local_prefix + 'item-info-spark')
)
t3 = timer()

# t3 -> t4: give the written files their final names.
result_rename_or_convert()
t4 = timer()

# Row-count sanity checks; expected values are 2370585 items and
# 22507155 reviews.
print("Total length of item-info is ", item_info_df.count())
print("Total length of reviews-info is ", reviews_info_df.count())

# Timing report.
print("\n==================== Convert Time =======================\n")
print("Total process took %.3f secs" % (t4 - t0))
print("Details:")
print("start spark took %.3f secs" % (t1 - t0))
print("load from json took %.3f secs" % (t2 - t1))
print("write as csv took %.3f secs" % (t3 - t2))
print("rename csv files took %.3f secs" % (t4 - t3))
print("\n==========================================================")

### Option 2: Convert using the original Python script (no need to run)

In [None]:
# Load original json as CSV
import sys
import random
import time
from time import time

def process_meta(file):
    """Write ``item-info`` (``asin<TAB>category`` per line) from a metadata dump.

    Every non-blank line of ``file`` is evaluated into a dict; the category
    written is the last element of the first list under ``"categories"``.
    The output file ``item-info`` is created in the current working directory.

    Args:
        file: Path of the input file, one record per line.
    """
    # Context managers make sure both handles are flushed and closed, even if
    # a record fails to parse part-way through.
    with open(file, "r") as fi, open("item-info", "w") as fo:
        for line in fi:
            if not line.strip():
                continue  # a blank line would make eval() raise SyntaxError
            # SECURITY: eval() executes arbitrary code -- only run this on
            # trusted files. NOTE(review): presumably the dump is not strict
            # JSON, which is why eval is used; confirm before switching to
            # json.loads or ast.literal_eval.
            obj = eval(line)
            cat = obj["categories"][0][-1]
            print(obj["asin"] + "\t" + cat, file=fo)

def process_reviews(file):
    """Write ``reviews-info`` (user, item, rating, time; tab-separated) from a review dump.

    Every non-blank line of ``file`` is evaluated into a dict, and the fields
    ``reviewerID``, ``asin``, ``overall`` and ``unixReviewTime`` are written in
    that order. The output file ``reviews-info`` is created in the current
    working directory.

    Args:
        file: Path of the input file, one record per line.
    """
    # Context managers make sure both handles are flushed and closed, even if
    # a record fails to parse part-way through.
    with open(file, "r") as fi, open("reviews-info", "w") as fo:
        for line in fi:
            if not line.strip():
                continue  # a blank line would make eval() raise SyntaxError
            # SECURITY: eval() executes arbitrary code -- only run this on
            # trusted files.
            obj = eval(line)
            userID = obj["reviewerID"]
            itemID = obj["asin"]
            rating = obj["overall"]
            # Named review_time so the imported `time` is not shadowed.
            review_time = obj["unixReviewTime"]
            print(userID + "\t" + itemID + "\t" + str(rating) + "\t" + str(review_time), file=fo)

# `timer` is imported here so that this optional cell does not depend on an
# earlier cell having been run first.
from timeit import default_timer as timer

# Directory holding the raw meta_Books.json / reviews_Books.json inputs.
# NOTE: this rebinds the notebook-level `local_prefix` (a file:// URI in the
# Spark conversion cell) to a plain filesystem path.
local_prefix = "/mnt/nvme2/chendi/BlueWhale/x-deeplearning/xdl-algorithm-solution/DIEN/data/"

# Time the two pure-Python conversions together; their outputs (item-info and
# reviews-info) are written to the current working directory.
t0 = timer()
process_meta(local_prefix + 'meta_Books.json')
process_reviews(local_prefix + 'reviews_Books.json')
t1 = timer()

print("Convert initial csv from json took %.3f secs" % (t1 - t0))

### Old processing code (now replaced by the Python module)

In [None]:
# Shut down the Spark session currently bound to `spark`.
spark.stop()